├── Data Modelling ├── Doc2Vec_Model.ipynb ├── Pickle_testing.ipynb └── README.md ├── Data ├── Course_Data │ ├── Coursera_Catalog.csv │ └── Coursera_Catalog_Request.ipynb ├── Job_Data │ ├── Glassdoor_Joblist.csv │ ├── Glassdoor_Joblist_Integration.ipynb │ └── Raw │ │ ├── Data_Job_NY.csv │ │ ├── Data_Job_SF.csv │ │ ├── Data_Job_TX.csv │ │ └── Data_Job_WA.csv └── README.md ├── Exploratory Data Analysis ├── Create_Test_Set.ipynb ├── Job_Posts_EDA.ipynb ├── README.md ├── courses_test_sample.csv └── jobs_test_sample.csv ├── LICENSE ├── Other ├── Course_webpages.ipynb ├── Coursera_data_collection.ipynb ├── Coursera_review_data.ipynb ├── Coursetalk_data.ipynb ├── Indeed_data.ipynb └── coursera_description.ipynb ├── Procfile ├── README.md ├── app.py ├── model.p ├── requirements.txt ├── static └── css │ └── styles.css └── templates ├── form.html └── results.html /Data Modelling/Pickle_testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook checks to make sure the pickled model is working properly." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 12, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pickle\n", 17 | "import gensim\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# Load the pickled model from disk\n", 28 | "model = pickle.load(open('model.p', 'rb'))" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Select a sample job description\n", 38 | "js = \"'\\nData Scientist\\n\\nat Brightidea\\n\\nSan Francisco\\n\\nThe Role\\n\\nWe are seeking machine learning developers with natural language processing experience.\\n\\nIn general, we are looking for people who are self-motivated and passionate about the field of machine learning and the vast applications of it. These folks will have the ability to work with / understand / and build on top of an existing code base using their deep knowledge of various machine learning algorithms (e.g. neural networks, bayesian methods, etc).\\n\\nKey responsibilities include, but not limited to:\\n\\n\\nBuild on top of an existing text processing/classification system\\nWrite, maintain, and develop python machine learning modules & repos\\nRun hyperparameter optimizations + collect, analyze, visualize, and present results\\n\\nWhat You Need to Succeed\\n\\nBS or MS in computer science, mathematics, physics or other hard science/engineering discipline\\nProgramming in Python ~ 2+ years\\nNumpy, scipy, pandas, Jupyter, and scikit-learn background\\nData visualization (e.g. 
matplotlib, seaborn, bokeh, mpld3, etc)\\nAbility to implement machine learning algorithms from scratch\\nExperience with full machine learning pipeline: from data preprocessing, to building/training various models, to hyperparameter optimization, testing, and visualization of results.\\nBackground in deep learning preferred but not required\\n\\nIn Your Application Please Include\\n\\n\\n\\nA past machine learning project you worked on in which highlights your skills, including: What tools/models did you use? What were some problems you encountered along the way, and how did you solve them?\"" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 9, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Preprocess the job description\n", 48 | "doc = gensim.utils.simple_preprocess(js)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 7, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# Vectorize the job description\n", 58 | "vector = model.infer_vector(doc)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 10, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "[(441, 0.5915085673332214),\n", 70 | " (1231, 0.5760758519172668),\n", 71 | " (2849, 0.5734542012214661),\n", 72 | " (3976, 0.5609011650085449),\n", 73 | " (1634, 0.5435008406639099),\n", 74 | " (1074, 0.5411732792854309),\n", 75 | " (3298, 0.5391528010368347),\n", 76 | " (18, 0.5345658659934998),\n", 77 | " (4269, 0.5208688378334045),\n", 78 | " (1656, 0.5193619728088379)]" 79 | ] 80 | }, 81 | "execution_count": 10, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "# Extract the most similar docs from the model\n", 88 | "sims = model.docvecs.most_similar([vector])\n", 89 | "sims" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 13, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/html": [ 100 
| "
\n", 101 | "\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week['en']['VerifiedCert']Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week['en']['VerifiedCert', 'Specialization']Dealing With Missing Data
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " courseType description \\\n", 163 | "0 v2.ondemand Gamification is the application of game elemen... \n", 164 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 165 | "\n", 166 | " domainTypes id \\\n", 167 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 168 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 169 | "\n", 170 | " slug specializations workload \\\n", 171 | "0 gamification [] 4-8 hours/week \n", 172 | "1 missing-data [] 4 weeks of study, 1-2 hours/week \n", 173 | "\n", 174 | " primaryLanguages certificates \\\n", 175 | "0 ['en'] ['VerifiedCert'] \n", 176 | "1 ['en'] ['VerifiedCert', 'Specialization'] \n", 177 | "\n", 178 | " name \n", 179 | "0 Gamification \n", 180 | "1 Dealing With Missing Data " 181 | ] 182 | }, 183 | "execution_count": 13, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "# Read in the course data\n", 190 | "course_df = pd.read_csv('../Data/Course_Data/Coursera_Catalog.csv')\n", 191 | "course_df.head(2)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 14, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# Extract course ids from the similar doc list\n", 201 | "course_ids = [sim[0] for sim in sims]" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 15, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "441 Data Science Math Skills\n", 213 | "1231 Mathematics for economists\n", 214 | "2849 Scalable Machine Learning on Big Data using Ap...\n", 215 | "3976 Big Data Integration and Processing\n", 216 | "1634 Parallel Programming in Java\n", 217 | "1074 Tools for Data Science\n", 218 | "3298 Programming for Everybody (Getting Started wit...\n", 219 | "18 Computer Vision Basics\n", 220 | "4269 Disease Clusters\n", 221 | "1656 
業務効率や生産性向上につながる時間管理\n", 222 | "Name: name, dtype: object" 223 | ] 224 | }, 225 | "execution_count": 15, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "# Display the names of the most similar courses\n", 232 | "course_df.loc[course_ids, 'name']" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.7.3" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 4 264 | } 265 | -------------------------------------------------------------------------------- /Data Modelling/README.md: -------------------------------------------------------------------------------- 1 | # Data Modelling 2 | 3 | Data Modelling is implemented using a Doc2Vec model for matching course descriptions to job descriptions. It trains the model on a corpus of course descriptions (from the Coursera catalog). 4 | Then it evaluates the model by testing it out with a 5 | sample set of job descriptions for which relevant courses have already been pre-selected. Finally, the model is "pickled" for use by a Flask API. 
6 | 7 | The Libraries and Modules used for this purpose are: 8 | 9 | - [Numpy](https://numpy.org/) 10 | - [Pandas](https://pandas.pydata.org/) 11 | - [Gensim](https://pypi.org/project/gensim/) 12 | - [Scipy Spatial](https://docs.scipy.org/doc/scipy/reference/spatial.html) 13 | - [Pickle](https://docs.python.org/3/library/pickle.html) 14 | -------------------------------------------------------------------------------- /Data/Course_Data/Coursera_Catalog_Request.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Coursera Catalog Request\n", 8 | "\n", 9 | "This notebook requests the entire catalog of courses from the Coursera API, converts it into a dataframe, and exports it as a csv file." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 22, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import libraries\n", 19 | "\n", 20 | "import requests\n", 21 | "import time\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 35, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Request status for page 0 is 200.\n", 35 | "Request status for page 1 is 200.\n", 36 | "Request status for page 2 is 200.\n", 37 | "Request status for page 3 is 200.\n", 38 | "Request status for page 4 is 200.\n", 39 | "Request status for page 5 is 200.\n", 40 | "Request status for page 6 is 200.\n", 41 | "Request status for page 7 is 200.\n", 42 | "Request status for page 8 is 200.\n", 43 | "Request status for page 9 is 200.\n", 44 | "Request status for page 10 is 200.\n", 45 | "Request status for page 11 is 200.\n", 46 | "Request status for page 12 is 200.\n", 47 | "Request status for page 13 is 200.\n", 48 | "Request status for page 14 is 200.\n", 49 | "Request status for page 15 is 200.\n", 50 | "Request 
status for page 16 is 200.\n", 51 | "Request status for page 17 is 200.\n", 52 | "Request status for page 18 is 200.\n", 53 | "Request status for page 19 is 200.\n", 54 | "Request status for page 20 is 200.\n", 55 | "Request status for page 21 is 200.\n", 56 | "Request status for page 22 is 200.\n", 57 | "Request status for page 23 is 200.\n", 58 | "Request status for page 24 is 200.\n", 59 | "Request status for page 25 is 200.\n", 60 | "Request status for page 26 is 200.\n", 61 | "Request status for page 27 is 200.\n", 62 | "Request status for page 28 is 200.\n", 63 | "Request status for page 29 is 200.\n", 64 | "Request status for page 30 is 200.\n", 65 | "Request status for page 31 is 200.\n", 66 | "Request status for page 32 is 200.\n", 67 | "Request status for page 33 is 200.\n", 68 | "Request status for page 34 is 200.\n", 69 | "Request status for page 35 is 200.\n", 70 | "Request status for page 36 is 200.\n", 71 | "Request status for page 37 is 200.\n", 72 | "Request status for page 38 is 200.\n", 73 | "Request status for page 39 is 200.\n", 74 | "Request status for page 40 is 200.\n", 75 | "Request status for page 41 is 200.\n", 76 | "Request status for page 42 is 200.\n", 77 | "Request status for page 43 is 200.\n", 78 | "Request status for page 44 is 200.\n", 79 | "Finished. 
The number of courses gotten from the catalog is 4416\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "# Get the entire Coursera catalog.\n", 85 | "\n", 86 | "# Instantiate a list to hold the courses\n", 87 | "courses = []\n", 88 | "\n", 89 | "# Set the base url for making get requests\n", 90 | "base_url = 'https://api.coursera.org/api/courses.v1'\n", 91 | "\n", 92 | "# Add the fields I want to include in my requests\n", 93 | "fields = \"&fields=description,primaryLanguages,certificates,workload,specializations,domainTypes\"\n", 94 | "\n", 95 | "# Loop through all 45 pages of the catalog\n", 96 | "for page in range(45):\n", 97 | " \n", 98 | " # set pagination\n", 99 | " pagination = f\"?start={page*100}&limit=100\"\n", 100 | "\n", 101 | " # make a request\n", 102 | " res = requests.get(base_url + pagination + fields)\n", 103 | " print(f'Request status for page {page} is {res.status_code}.')\n", 104 | " \n", 105 | " # convert from json\n", 106 | " dict = res.json()\n", 107 | " \n", 108 | " # add to the catalog dictionary\n", 109 | " for course in dict['elements']:\n", 110 | " courses.append(course)\n", 111 | " \n", 112 | " # delay time to next request\n", 113 | " time.sleep(2)\n", 114 | "\n", 115 | "print(f'Finished. The number of courses gotten from the catalog is {len(courses)}')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 36, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "(4416, 10)\n" 128 | ] 129 | }, 130 | { 131 | "data": { 132 | "text/html": [ 133 | "
\n", 134 | "\n", 147 | "\n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week[en][VerifiedCert]Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week[en][VerifiedCert, Specialization]Dealing With Missing Data
2v2.ondemandThe Unordered Data Structures course covers th...[{'domainId': 'computer-science', 'subdomainId...sI_-QEBiEemtDRLx7Ne8jgcs-fundamentals-3[][en][VerifiedCert, Specialization]Unordered Data Structures
3v2.ondemandThe vital signs – heart rate, blood pressure, ...[{'subdomainId': 'patient-care', 'domainId': '...5zjIsJq-EeW_wArffOXkOwvital-signs[]3-5 hours/week[en][VerifiedCert]Vital Signs: Understanding What the Body Is Te...
4v2.ondemandThis course “FinTech Disruptive Innovation: Im...[{'subdomainId': 'finance', 'domainId': 'busin...WFanvtoSEeedbRLwgi9a7Afintech-disruption[]Around 4 hours of videos in total, plus a fina...[en][VerifiedCert, Specialization]FinTech Disruptive Innovation: Implications fo...
\n", 231 | "
" 232 | ], 233 | "text/plain": [ 234 | " courseType description \\\n", 235 | "0 v2.ondemand Gamification is the application of game elemen... \n", 236 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 237 | "2 v2.ondemand The Unordered Data Structures course covers th... \n", 238 | "3 v2.ondemand The vital signs – heart rate, blood pressure, ... \n", 239 | "4 v2.ondemand This course “FinTech Disruptive Innovation: Im... \n", 240 | "\n", 241 | " domainTypes id \\\n", 242 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 243 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 244 | "2 [{'domainId': 'computer-science', 'subdomainId... sI_-QEBiEemtDRLx7Ne8jg \n", 245 | "3 [{'subdomainId': 'patient-care', 'domainId': '... 5zjIsJq-EeW_wArffOXkOw \n", 246 | "4 [{'subdomainId': 'finance', 'domainId': 'busin... WFanvtoSEeedbRLwgi9a7A \n", 247 | "\n", 248 | " slug specializations \\\n", 249 | "0 gamification [] \n", 250 | "1 missing-data [] \n", 251 | "2 cs-fundamentals-3 [] \n", 252 | "3 vital-signs [] \n", 253 | "4 fintech-disruption [] \n", 254 | "\n", 255 | " workload primaryLanguages \\\n", 256 | "0 4-8 hours/week [en] \n", 257 | "1 4 weeks of study, 1-2 hours/week [en] \n", 258 | "2 [en] \n", 259 | "3 3-5 hours/week [en] \n", 260 | "4 Around 4 hours of videos in total, plus a fina... [en] \n", 261 | "\n", 262 | " certificates \\\n", 263 | "0 [VerifiedCert] \n", 264 | "1 [VerifiedCert, Specialization] \n", 265 | "2 [VerifiedCert, Specialization] \n", 266 | "3 [VerifiedCert] \n", 267 | "4 [VerifiedCert, Specialization] \n", 268 | "\n", 269 | " name \n", 270 | "0 Gamification \n", 271 | "1 Dealing With Missing Data \n", 272 | "2 Unordered Data Structures \n", 273 | "3 Vital Signs: Understanding What the Body Is Te... \n", 274 | "4 FinTech Disruptive Innovation: Implications fo... 
" 275 | ] 276 | }, 277 | "execution_count": 36, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "# Convert the dictionary to DataFrame\n", 284 | "\n", 285 | "catalog_df = pd.DataFrame(courses)\n", 286 | "print(catalog_df.shape)\n", 287 | "catalog_df.head()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 38, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "# Export the DataFrame as a csv file.\n", 297 | "\n", 298 | "catalog_df.to_csv('./Coursera_Catalog.csv', index=False)" 299 | ] 300 | } 301 | ], 302 | "metadata": { 303 | "kernelspec": { 304 | "display_name": "Python 3", 305 | "language": "python", 306 | "name": "python3" 307 | }, 308 | "language_info": { 309 | "codemirror_mode": { 310 | "name": "ipython", 311 | "version": 3 312 | }, 313 | "file_extension": ".py", 314 | "mimetype": "text/x-python", 315 | "name": "python", 316 | "nbconvert_exporter": "python", 317 | "pygments_lexer": "ipython3", 318 | "version": "3.7.3" 319 | } 320 | }, 321 | "nbformat": 4, 322 | "nbformat_minor": 4 323 | } 324 | -------------------------------------------------------------------------------- /Data/Job_Data/Glassdoor_Joblist_Integration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Glassdoor Joblist Integration\n", 8 | "\n", 9 | "This notebook provides a dataset of job listings for testing the recommender model. It uses public data from Glassdoor in May 2020 (https://www.kaggle.com/atharvap329/glassdoor-data-science-job-data). The notebook integrates the four datasets from this collection into one and generates a new csv file." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 13, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import libraries\n", 19 | "\n", 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 14, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "NY:900, SF:889, TX:643, WA:892\n" 33 | ] 34 | }, 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
2Dental HygienistBatista DentalNJWest New York-1-1Part-time or Full-timedental hygienist positio...NaNNaN2020-05-022020-06-07PART_TIME
3Senior Salesforce DeveloperNational Debt ReliefNYNew York4458782162Principle Duties & Responsibilities:\\n\\nAnalyz...Finance4.02020-05-082020-06-07FULL_TIME
4DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...National Advocates for Pregnant WomenNYNew York125410212901For FULL Job Announcement, visit our website: ...NaNNaN2020-04-282020-06-07FULL_TIME
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " Job_title \\\n", 152 | "0 Chief Marketing Officer (CMO) \n", 153 | "1 Registered Nurse \n", 154 | "2 Dental Hygienist \n", 155 | "3 Senior Salesforce Developer \n", 156 | "4 DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A... \n", 157 | "\n", 158 | " Company State City Min_Salary \\\n", 159 | "0 National Debt Relief NY New York -1 \n", 160 | "1 Queens Boulevard Endoscopy Center NY Rego Park -1 \n", 161 | "2 Batista Dental NJ West New York -1 \n", 162 | "3 National Debt Relief NY New York 44587 \n", 163 | "4 National Advocates for Pregnant Women NY New York 125410 \n", 164 | "\n", 165 | " Max_Salary Job_Desc Industry \\\n", 166 | "0 -1 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance \n", 167 | "1 -1 Queens Boulevard Endoscopy Center, an endoscop... NaN \n", 168 | "2 -1 Part-time or Full-timedental hygienist positio... NaN \n", 169 | "3 82162 Principle Duties & Responsibilities:\\n\\nAnalyz... Finance \n", 170 | "4 212901 For FULL Job Announcement, visit our website: ... 
NaN \n", 171 | "\n", 172 | " Rating Date_Posted Valid_until Job_Type \n", 173 | "0 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 174 | "1 3.0 2020-04-25 2020-06-07 FULL_TIME \n", 175 | "2 NaN 2020-05-02 2020-06-07 PART_TIME \n", 176 | "3 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 177 | "4 NaN 2020-04-28 2020-06-07 FULL_TIME " 178 | ] 179 | }, 180 | "execution_count": 14, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "# Read in csv files with glassdoor job postings.\n", 187 | "\n", 188 | "ny = pd.read_csv('./Raw/Data_Job_NY.csv')\n", 189 | "sf = pd.read_csv('./Raw/Data_Job_SF.csv')\n", 190 | "tx = pd.read_csv('./Raw/Data_Job_TX.csv')\n", 191 | "wa = pd.read_csv('./Raw/Data_Job_WA.csv')\n", 192 | "\n", 193 | "print(f'NY:{ny.shape[0]}, SF:{sf.shape[0]}, TX:{tx.shape[0]}, WA:{wa.shape[0]}')\n", 194 | "ny.head() " 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 15, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "(3324, 12)\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "text/html": [ 212 | "
\n", 213 | "\n", 226 | "\n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
2Dental HygienistBatista DentalNJWest New York-1-1Part-time or Full-timedental hygienist positio...NaNNaN2020-05-022020-06-07PART_TIME
3Senior Salesforce DeveloperNational Debt ReliefNYNew York4458782162Principle Duties & Responsibilities:\\n\\nAnalyz...Finance4.02020-05-082020-06-07FULL_TIME
4DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...National Advocates for Pregnant WomenNYNew York125410212901For FULL Job Announcement, visit our website: ...NaNNaN2020-04-282020-06-07FULL_TIME
\n", 322 | "
" 323 | ], 324 | "text/plain": [ 325 | " Job_title \\\n", 326 | "0 Chief Marketing Officer (CMO) \n", 327 | "1 Registered Nurse \n", 328 | "2 Dental Hygienist \n", 329 | "3 Senior Salesforce Developer \n", 330 | "4 DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A... \n", 331 | "\n", 332 | " Company State City Min_Salary \\\n", 333 | "0 National Debt Relief NY New York -1 \n", 334 | "1 Queens Boulevard Endoscopy Center NY Rego Park -1 \n", 335 | "2 Batista Dental NJ West New York -1 \n", 336 | "3 National Debt Relief NY New York 44587 \n", 337 | "4 National Advocates for Pregnant Women NY New York 125410 \n", 338 | "\n", 339 | " Max_Salary Job_Desc Industry \\\n", 340 | "0 -1 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance \n", 341 | "1 -1 Queens Boulevard Endoscopy Center, an endoscop... NaN \n", 342 | "2 -1 Part-time or Full-timedental hygienist positio... NaN \n", 343 | "3 82162 Principle Duties & Responsibilities:\\n\\nAnalyz... Finance \n", 344 | "4 212901 For FULL Job Announcement, visit our website: ... 
NaN \n", 345 | "\n", 346 | " Rating Date_Posted Valid_until Job_Type \n", 347 | "0 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 348 | "1 3.0 2020-04-25 2020-06-07 FULL_TIME \n", 349 | "2 NaN 2020-05-02 2020-06-07 PART_TIME \n", 350 | "3 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 351 | "4 NaN 2020-04-28 2020-06-07 FULL_TIME " 352 | ] 353 | }, 354 | "execution_count": 15, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "# Merge all data into a single dataframe\n", 361 | "\n", 362 | "jobs_df = pd.concat([ny, sf, tx, wa])\n", 363 | "print(jobs_df.shape)\n", 364 | "jobs_df.head()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 16, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "# Export the integrated dataset to a new csv file.\n", 374 | "\n", 375 | "jobs_df.to_csv('Glassdoor_Joblist.csv', index=False)" 376 | ] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "Python 3", 382 | "language": "python", 383 | "name": "python3" 384 | }, 385 | "language_info": { 386 | "codemirror_mode": { 387 | "name": "ipython", 388 | "version": 3 389 | }, 390 | "file_extension": ".py", 391 | "mimetype": "text/x-python", 392 | "name": "python", 393 | "nbconvert_exporter": "python", 394 | "pygments_lexer": "ipython3", 395 | "version": "3.7.3" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 4 400 | } 401 | -------------------------------------------------------------------------------- /Data/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | To develop the Dataset for Course Data, the Coursera Catalog API was utilized which downloaded the entire Coursera Catalog and 4 | the Dataframes were stacked to develop the Dataset. The Job List Dataset was generated using public data from Glassdoor in form of a [Kaggle](https://www.kaggle.com/atharvap329/glassdoor-data-science-job-data) 5 | Dataset. 
The notebooks given in the required directories integrates the four datasets from this collection into one and generates a new Dataset that can be centrally used. 6 | 7 | The Technologies used for generating the Datasets are: 8 | 9 | - [Request](https://requests.readthedocs.io/en/master/) 10 | - [Pandas](https://pandas.pydata.org/) 11 | - [Time](https://docs.python.org/3/library/time.html) 12 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/Create_Test_Set.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Test Data Selection\n", 8 | "This notebook creates a small set of labeled data that can be used to test the Doc2Vec model. Specifically, in selects 10 sample job descriptions under 2 job titles (data scientist and data engineer). It matches each of these 2 job titles with 5 courses each that I believe the model should recommend. This sample data will then be used to test the accuracy of the model." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 130, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Job test data" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 131, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "(3324, 12)\n" 37 | ] 38 | }, 39 | { 40 | "data": { 41 | "text/html": [ 42 | "
\n", 43 | "\n", 56 | "\n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
\n", 107 | "
" 108 | ], 109 | "text/plain": [ 110 | " Job_title Company State \\\n", 111 | "0 Chief Marketing Officer (CMO) National Debt Relief NY \n", 112 | "1 Registered Nurse Queens Boulevard Endoscopy Center NY \n", 113 | "\n", 114 | " City Min_Salary Max_Salary \\\n", 115 | "0 New York -1 -1 \n", 116 | "1 Rego Park -1 -1 \n", 117 | "\n", 118 | " Job_Desc Industry Rating \\\n", 119 | "0 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance 4.0 \n", 120 | "1 Queens Boulevard Endoscopy Center, an endoscop... NaN 3.0 \n", 121 | "\n", 122 | " Date_Posted Valid_until Job_Type \n", 123 | "0 2020-05-08 2020-06-07 FULL_TIME \n", 124 | "1 2020-04-25 2020-06-07 FULL_TIME " 125 | ] 126 | }, 127 | "execution_count": 131, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "# Read in the jobs data\n", 134 | "jobs_df = pd.read_csv('../Data/Job_Data/Glassdoor_Joblist.csv')\n", 135 | "print(jobs_df.shape)\n", 136 | "jobs_df.head(2)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 132, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "Data Scientist 186\n", 148 | "Data Engineer 129\n", 149 | "Data Analyst 69\n", 150 | "Senior Data Engineer 44\n", 151 | "Senior Data Scientist 39\n", 152 | " ... \n", 153 | "Support Scientist-Ocean Data Assimilation 1\n", 154 | "Insights and Analytics Manager 1\n", 155 | "DHS-NTC Senior Scientist 1\n", 156 | "Document Security Scientist 1\n", 157 | "Sr. 
Healthcare Data Analyst 1\n", 158 | "Name: Job_title, Length: 1619, dtype: int64" 159 | ] 160 | }, 161 | "execution_count": 132, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "# Check the major job titles in the dataset\n", 168 | "jobs_df['Job_title'].value_counts()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 133, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Select 5 job descriptions for data scientist\n", 178 | "ds_jobs = [901, 910, 916, 920, 938]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 134, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# Select 5 job descriptions for data engineer\n", 188 | "de_jobs = [935, 1068, 1089, 1100, 1105]" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 135, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
\n", 200 | "\n", 213 | "\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | "
Job_titleJob_Desc
901Data ScientistWe are looking for Data Scientists who are int...
910Data ScientistThe world's largest and fastest-growing compan...
916Data Scientist\\nRole: Data Scientist.\\n\\nLocation: Foster Ci...
920Data ScientistUpstart is the leading AI lending platform par...
938Data ScientistWhy Divvy?Over the past decade, millions of Am...
935Data EngineerAbout Rocket LawyerWe believe everyone deserve...
1068Data EngineerOur mission is to create a world where mental ...
1089Data EngineerData Engineer \\nIf you are a Data Engineer wit...
1100Data EngineerPrabhav Services Inc. is one of the premier pr...
1105Data EngineerAbout Skupos\\nSkupos is the data platform for ...
\n", 274 | "
" 275 | ], 276 | "text/plain": [ 277 | " Job_title Job_Desc\n", 278 | "901 Data Scientist We are looking for Data Scientists who are int...\n", 279 | "910 Data Scientist The world's largest and fastest-growing compan...\n", 280 | "916 Data Scientist \\nRole: Data Scientist.\\n\\nLocation: Foster Ci...\n", 281 | "920 Data Scientist Upstart is the leading AI lending platform par...\n", 282 | "938 Data Scientist Why Divvy?Over the past decade, millions of Am...\n", 283 | "935 Data Engineer About Rocket LawyerWe believe everyone deserve...\n", 284 | "1068 Data Engineer Our mission is to create a world where mental ...\n", 285 | "1089 Data Engineer Data Engineer \\nIf you are a Data Engineer wit...\n", 286 | "1100 Data Engineer Prabhav Services Inc. is one of the premier pr...\n", 287 | "1105 Data Engineer About Skupos\\nSkupos is the data platform for ..." 288 | ] 289 | }, 290 | "execution_count": 135, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "sample_jobs = jobs_df.loc[[901, 910, 916, 920, 938, 935, 1068, 1089, 1100, 1105], ['Job_title', 'Job_Desc']]\n", 297 | "sample_jobs" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 145, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/html": [ 308 | "
\n", 309 | "\n", 322 | "\n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
Job_titleJob_DescJob_id
901Data ScientistWe are looking for Data Scientists who are int...901
910Data ScientistThe world's largest and fastest-growing compan...910
916Data Scientist\\nRole: Data Scientist.\\n\\nLocation: Foster Ci...916
920Data ScientistUpstart is the leading AI lending platform par...920
938Data ScientistWhy Divvy?Over the past decade, millions of Am...938
935Data EngineerAbout Rocket LawyerWe believe everyone deserve...935
1068Data EngineerOur mission is to create a world where mental ...1068
1089Data EngineerData Engineer \\nIf you are a Data Engineer wit...1089
1100Data EngineerPrabhav Services Inc. is one of the premier pr...1100
1105Data EngineerAbout Skupos\\nSkupos is the data platform for ...1105
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " Job_title Job_Desc \\\n", 398 | "901 Data Scientist We are looking for Data Scientists who are int... \n", 399 | "910 Data Scientist The world's largest and fastest-growing compan... \n", 400 | "916 Data Scientist \\nRole: Data Scientist.\\n\\nLocation: Foster Ci... \n", 401 | "920 Data Scientist Upstart is the leading AI lending platform par... \n", 402 | "938 Data Scientist Why Divvy?Over the past decade, millions of Am... \n", 403 | "935 Data Engineer About Rocket LawyerWe believe everyone deserve... \n", 404 | "1068 Data Engineer Our mission is to create a world where mental ... \n", 405 | "1089 Data Engineer Data Engineer \\nIf you are a Data Engineer wit... \n", 406 | "1100 Data Engineer Prabhav Services Inc. is one of the premier pr... \n", 407 | "1105 Data Engineer About Skupos\\nSkupos is the data platform for ... \n", 408 | "\n", 409 | " Job_id \n", 410 | "901 901 \n", 411 | "910 910 \n", 412 | "916 916 \n", 413 | "920 920 \n", 414 | "938 938 \n", 415 | "935 935 \n", 416 | "1068 1068 \n", 417 | "1089 1089 \n", 418 | "1100 1100 \n", 419 | "1105 1105 " 420 | ] 421 | }, 422 | "execution_count": 145, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "sample_jobs['Job_id'] = sample_jobs.index\n", 429 | "sample_jobs" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 147, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "sample_jobs.to_csv('jobs_test_sample.csv', index=False)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "## Course test data" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 137, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "(4416, 10)\n" 458 | ] 459 | }, 460 | { 461 | "data": { 462 | "text/html": [ 463 | "
\n", 464 | "\n", 477 | "\n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week['en']['VerifiedCert']Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week['en']['VerifiedCert', 'Specialization']Dealing With Missing Data
\n", 522 | "
" 523 | ], 524 | "text/plain": [ 525 | " courseType description \\\n", 526 | "0 v2.ondemand Gamification is the application of game elemen... \n", 527 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 528 | "\n", 529 | " domainTypes id \\\n", 530 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 531 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 532 | "\n", 533 | " slug specializations workload \\\n", 534 | "0 gamification [] 4-8 hours/week \n", 535 | "1 missing-data [] 4 weeks of study, 1-2 hours/week \n", 536 | "\n", 537 | " primaryLanguages certificates \\\n", 538 | "0 ['en'] ['VerifiedCert'] \n", 539 | "1 ['en'] ['VerifiedCert', 'Specialization'] \n", 540 | "\n", 541 | " name \n", 542 | "0 Gamification \n", 543 | "1 Dealing With Missing Data " 544 | ] 545 | }, 546 | "execution_count": 137, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "# Read in the course data\n", 553 | "courses_df = pd.read_csv('../Data/Course_Data/Coursera_Catalog.csv')\n", 554 | "print(courses_df.shape)\n", 555 | "courses_df.head(2)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 138, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "ds_courses = [3823, 143, 3165, 3588, 2517]" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 139, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "de_courses = [545, 1015, 4233, 3763, 1311]" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 140, 579 | "metadata": {}, 580 | "outputs": [ 581 | { 582 | "data": { 583 | "text/html": [ 584 | "
\n", 585 | "\n", 598 | "\n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
namedescription
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...
143Machine LearningMachine learning is the science of getting com...
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...
2517Machine Learning with PythonThis course dives into the basics of machine l...
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...
3763Database Management EssentialsDatabase Management Essentials provides the fo...
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " name \\\n", 663 | "3823 The Data Scientist’s Toolbox \n", 664 | "143 Machine Learning \n", 665 | "3165 Applied Machine Learning in Python \n", 666 | "3588 Data Visualization with Python \n", 667 | "2517 Machine Learning with Python \n", 668 | "545 Databases and SQL for Data Science \n", 669 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 670 | "4233 Big Data Modeling and Management Systems \n", 671 | "3763 Database Management Essentials \n", 672 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 673 | "\n", 674 | " description \n", 675 | "3823 In this course you will get an introduction to... \n", 676 | "143 Machine learning is the science of getting com... \n", 677 | "3165 This course will introduce the learner to appl... \n", 678 | "3588 \"A picture is worth a thousand words\". We are ... \n", 679 | "2517 This course dives into the basics of machine l... \n", 680 | "545 Much of the world's data resides in databases.... \n", 681 | "1015 This 2-week accelerated on-demand course intro... \n", 682 | "4233 Once you’ve identified a big data issue to ana... \n", 683 | "3763 Database Management Essentials provides the fo... \n", 684 | "1311 This is the second course in the Data Warehous... " 685 | ] 686 | }, 687 | "execution_count": 140, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "sample_courses = courses_df.loc[[3823, 143, 3165, 3588, 2517, 545, 1015, 4233, 3763, 1311], ['name', 'description']]\n", 694 | "sample_courses" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 141, 700 | "metadata": {}, 701 | "outputs": [ 702 | { 703 | "data": { 704 | "text/html": [ 705 | "
\n", 706 | "\n", 719 | "\n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | "
namedescriptionjob_title
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...None
143Machine LearningMachine learning is the science of getting com...None
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...None
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...None
2517Machine Learning with PythonThis course dives into the basics of machine l...None
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....None
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...None
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...None
3763Database Management EssentialsDatabase Management Essentials provides the fo...None
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...None
\n", 791 | "
" 792 | ], 793 | "text/plain": [ 794 | " name \\\n", 795 | "3823 The Data Scientist’s Toolbox \n", 796 | "143 Machine Learning \n", 797 | "3165 Applied Machine Learning in Python \n", 798 | "3588 Data Visualization with Python \n", 799 | "2517 Machine Learning with Python \n", 800 | "545 Databases and SQL for Data Science \n", 801 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 802 | "4233 Big Data Modeling and Management Systems \n", 803 | "3763 Database Management Essentials \n", 804 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 805 | "\n", 806 | " description job_title \n", 807 | "3823 In this course you will get an introduction to... None \n", 808 | "143 Machine learning is the science of getting com... None \n", 809 | "3165 This course will introduce the learner to appl... None \n", 810 | "3588 \"A picture is worth a thousand words\". We are ... None \n", 811 | "2517 This course dives into the basics of machine l... None \n", 812 | "545 Much of the world's data resides in databases.... None \n", 813 | "1015 This 2-week accelerated on-demand course intro... None \n", 814 | "4233 Once you’ve identified a big data issue to ana... None \n", 815 | "3763 Database Management Essentials provides the fo... None \n", 816 | "1311 This is the second course in the Data Warehous... None " 817 | ] 818 | }, 819 | "execution_count": 141, 820 | "metadata": {}, 821 | "output_type": "execute_result" 822 | } 823 | ], 824 | "source": [ 825 | "sample_courses['job_title'] = None\n", 826 | "sample_courses" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 142, 832 | "metadata": {}, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/html": [ 837 | "
\n", 838 | "\n", 851 | "\n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | "
namedescriptionjob_title
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...Data Scientist
143Machine LearningMachine learning is the science of getting com...Data Scientist
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...Data Scientist
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...Data Scientist
2517Machine Learning with PythonThis course dives into the basics of machine l...Data Scientist
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....Data Engineer
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...Data Engineer
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...Data Engineer
3763Database Management EssentialsDatabase Management Essentials provides the fo...Data Engineer
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...Data Engineer
\n", 923 | "
" 924 | ], 925 | "text/plain": [ 926 | " name \\\n", 927 | "3823 The Data Scientist’s Toolbox \n", 928 | "143 Machine Learning \n", 929 | "3165 Applied Machine Learning in Python \n", 930 | "3588 Data Visualization with Python \n", 931 | "2517 Machine Learning with Python \n", 932 | "545 Databases and SQL for Data Science \n", 933 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 934 | "4233 Big Data Modeling and Management Systems \n", 935 | "3763 Database Management Essentials \n", 936 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 937 | "\n", 938 | " description job_title \n", 939 | "3823 In this course you will get an introduction to... Data Scientist \n", 940 | "143 Machine learning is the science of getting com... Data Scientist \n", 941 | "3165 This course will introduce the learner to appl... Data Scientist \n", 942 | "3588 \"A picture is worth a thousand words\". We are ... Data Scientist \n", 943 | "2517 This course dives into the basics of machine l... Data Scientist \n", 944 | "545 Much of the world's data resides in databases.... Data Engineer \n", 945 | "1015 This 2-week accelerated on-demand course intro... Data Engineer \n", 946 | "4233 Once you’ve identified a big data issue to ana... Data Engineer \n", 947 | "3763 Database Management Essentials provides the fo... Data Engineer \n", 948 | "1311 This is the second course in the Data Warehous... Data Engineer " 949 | ] 950 | }, 951 | "execution_count": 142, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "sample_courses.loc[ds_courses, 'job_title'] = 'Data Scientist'\n", 958 | "sample_courses.loc[de_courses, 'job_title'] = 'Data Engineer'\n", 959 | "sample_courses" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 148, 965 | "metadata": {}, 966 | "outputs": [ 967 | { 968 | "data": { 969 | "text/html": [ 970 | "
\n", 971 | "\n", 984 | "\n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | "
namedescriptionjob_titlecourse_id
3823The Data Scientist’s ToolboxIn this course you will get an introduction to...Data Scientist3823
143Machine LearningMachine learning is the science of getting com...Data Scientist143
3165Applied Machine Learning in PythonThis course will introduce the learner to appl...Data Scientist3165
3588Data Visualization with Python\"A picture is worth a thousand words\". We are ...Data Scientist3588
2517Machine Learning with PythonThis course dives into the basics of machine l...Data Scientist2517
545Databases and SQL for Data ScienceMuch of the world's data resides in databases....Data Engineer545
1015Google Cloud Platform Big Data and Machine Lea...This 2-week accelerated on-demand course intro...Data Engineer1015
4233Big Data Modeling and Management SystemsOnce you’ve identified a big data issue to ana...Data Engineer4233
3763Database Management EssentialsDatabase Management Essentials provides the fo...Data Engineer3763
1311Data Warehouse Concepts, Design, and Data Inte...This is the second course in the Data Warehous...Data Engineer1311
\n", 1067 | "
" 1068 | ], 1069 | "text/plain": [ 1070 | " name \\\n", 1071 | "3823 The Data Scientist’s Toolbox \n", 1072 | "143 Machine Learning \n", 1073 | "3165 Applied Machine Learning in Python \n", 1074 | "3588 Data Visualization with Python \n", 1075 | "2517 Machine Learning with Python \n", 1076 | "545 Databases and SQL for Data Science \n", 1077 | "1015 Google Cloud Platform Big Data and Machine Lea... \n", 1078 | "4233 Big Data Modeling and Management Systems \n", 1079 | "3763 Database Management Essentials \n", 1080 | "1311 Data Warehouse Concepts, Design, and Data Inte... \n", 1081 | "\n", 1082 | " description job_title \\\n", 1083 | "3823 In this course you will get an introduction to... Data Scientist \n", 1084 | "143 Machine learning is the science of getting com... Data Scientist \n", 1085 | "3165 This course will introduce the learner to appl... Data Scientist \n", 1086 | "3588 \"A picture is worth a thousand words\". We are ... Data Scientist \n", 1087 | "2517 This course dives into the basics of machine l... Data Scientist \n", 1088 | "545 Much of the world's data resides in databases.... Data Engineer \n", 1089 | "1015 This 2-week accelerated on-demand course intro... Data Engineer \n", 1090 | "4233 Once you’ve identified a big data issue to ana... Data Engineer \n", 1091 | "3763 Database Management Essentials provides the fo... Data Engineer \n", 1092 | "1311 This is the second course in the Data Warehous... 
Data Engineer \n", 1093 | "\n", 1094 | " course_id \n", 1095 | "3823 3823 \n", 1096 | "143 143 \n", 1097 | "3165 3165 \n", 1098 | "3588 3588 \n", 1099 | "2517 2517 \n", 1100 | "545 545 \n", 1101 | "1015 1015 \n", 1102 | "4233 4233 \n", 1103 | "3763 3763 \n", 1104 | "1311 1311 " 1105 | ] 1106 | }, 1107 | "execution_count": 148, 1108 | "metadata": {}, 1109 | "output_type": "execute_result" 1110 | } 1111 | ], 1112 | "source": [ 1113 | "sample_courses['course_id'] = sample_courses.index\n", 1114 | "sample_courses" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": 149, 1120 | "metadata": {}, 1121 | "outputs": [], 1122 | "source": [ 1123 | "sample_courses.to_csv('courses_test_sample.csv', index=False)" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": null, 1129 | "metadata": {}, 1130 | "outputs": [], 1131 | "source": [] 1132 | } 1133 | ], 1134 | "metadata": { 1135 | "kernelspec": { 1136 | "display_name": "Python 3", 1137 | "language": "python", 1138 | "name": "python3" 1139 | }, 1140 | "language_info": { 1141 | "codemirror_mode": { 1142 | "name": "ipython", 1143 | "version": 3 1144 | }, 1145 | "file_extension": ".py", 1146 | "mimetype": "text/x-python", 1147 | "name": "python", 1148 | "nbconvert_exporter": "python", 1149 | "pygments_lexer": "ipython3", 1150 | "version": "3.7.6" 1151 | } 1152 | }, 1153 | "nbformat": 4, 1154 | "nbformat_minor": 4 1155 | } 1156 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/Job_Posts_EDA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Job Listings EDA\n", 8 | "\n", 9 | "This notebook examines the dataset of job posts from Glassdoor. It finds that there are issues in this dataset, such as duplicate rows and adverstisements mixed in. 
However, because I have decided only to use a small portion of this data for testing out the recommender model, these issues will not affect the project and do not need to be fixed here." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Import libraries\n", 19 | "\n", 20 | "import pandas as pd" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "(3324, 12)\n" 33 | ] 34 | }, 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
0Chief Marketing Officer (CMO)National Debt ReliefNYNew York-1-1Who We're Looking For:\\n\\nThe Chief Marketing ...Finance4.02020-05-082020-06-07FULL_TIME
1Registered NurseQueens Boulevard Endoscopy CenterNYRego Park-1-1Queens Boulevard Endoscopy Center, an endoscop...NaN3.02020-04-252020-06-07FULL_TIME
2Dental HygienistBatista DentalNJWest New York-1-1Part-time or Full-timedental hygienist positio...NaNNaN2020-05-022020-06-07PART_TIME
3Senior Salesforce DeveloperNational Debt ReliefNYNew York4458782162Principle Duties & Responsibilities:\\n\\nAnalyz...Finance4.02020-05-082020-06-07FULL_TIME
4DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A...National Advocates for Pregnant WomenNYNew York125410212901For FULL Job Announcement, visit our website: ...NaNNaN2020-04-282020-06-07FULL_TIME
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " Job_title \\\n", 152 | "0 Chief Marketing Officer (CMO) \n", 153 | "1 Registered Nurse \n", 154 | "2 Dental Hygienist \n", 155 | "3 Senior Salesforce Developer \n", 156 | "4 DEPUTY EXECUTIVE DIRECTOR, PROGRAM AND LEGAL A... \n", 157 | "\n", 158 | " Company State City Min_Salary \\\n", 159 | "0 National Debt Relief NY New York -1 \n", 160 | "1 Queens Boulevard Endoscopy Center NY Rego Park -1 \n", 161 | "2 Batista Dental NJ West New York -1 \n", 162 | "3 National Debt Relief NY New York 44587 \n", 163 | "4 National Advocates for Pregnant Women NY New York 125410 \n", 164 | "\n", 165 | " Max_Salary Job_Desc Industry \\\n", 166 | "0 -1 Who We're Looking For:\\n\\nThe Chief Marketing ... Finance \n", 167 | "1 -1 Queens Boulevard Endoscopy Center, an endoscop... NaN \n", 168 | "2 -1 Part-time or Full-timedental hygienist positio... NaN \n", 169 | "3 82162 Principle Duties & Responsibilities:\\n\\nAnalyz... Finance \n", 170 | "4 212901 For FULL Job Announcement, visit our website: ... 
NaN \n", 171 | "\n", 172 | " Rating Date_Posted Valid_until Job_Type \n", 173 | "0 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 174 | "1 3.0 2020-04-25 2020-06-07 FULL_TIME \n", 175 | "2 NaN 2020-05-02 2020-06-07 PART_TIME \n", 176 | "3 4.0 2020-05-08 2020-06-07 FULL_TIME \n", 177 | "4 NaN 2020-04-28 2020-06-07 FULL_TIME " 178 | ] 179 | }, 180 | "execution_count": 2, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "# Read in the dataset\n", 187 | "\n", 188 | "df = pd.read_csv('../Data/Job_Data/Glassdoor_Joblist.csv')\n", 189 | "print(df.shape)\n", 190 | "df.head()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 3, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "Job_title 0\n", 202 | "Company 0\n", 203 | "State 2\n", 204 | "City 6\n", 205 | "Min_Salary 0\n", 206 | "Max_Salary 0\n", 207 | "Job_Desc 0\n", 208 | "Industry 624\n", 209 | "Rating 475\n", 210 | "Date_Posted 0\n", 211 | "Valid_until 0\n", 212 | "Job_Type 0\n", 213 | "dtype: int64" 214 | ] 215 | }, 216 | "execution_count": 3, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "# Check for missing values:\n", 223 | "# No missing values in key columns of job title and description.\n", 224 | "\n", 225 | "df.isna().sum()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 4, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "Data Scientist 186\n", 237 | "Data Engineer 129\n", 238 | "Data Analyst 69\n", 239 | "Senior Data Engineer 44\n", 240 | "Senior Data Scientist 39\n", 241 | " ... 
\n", 242 | "Spectral Research Scientist with Security Clearance 1\n", 243 | "Senior Medical Scientist 1\n", 244 | "Senior Scientist, Oncology BioMarker Development 1\n", 245 | "Data Scientist, AMP Commerce/ Payments/ Subscription Analytics 1\n", 246 | "Innovation - Data Science Manager 1\n", 247 | "Name: Job_title, Length: 1619, dtype: int64" 248 | ] 249 | }, 250 | "execution_count": 4, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "# Examine the key category of job title:\n", 257 | "# might need to consolidate these; leave it for now...\n", 258 | "\n", 259 | "df['Job_title'].value_counts()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 5, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/plain": [ 270 | "'Queens Boulevard Endoscopy Center, an endoscopy ASC located in Rego Park, has an exciting opportunity for Full-Time Registered Nurse! Successful candidates will provide quality nursing care in all areas of the Center including pre-assessment, pre-op and pacu Qualified candidates must possess the following:\\n\\nCurrent NY state RN license\\nBLS Certification, ACLS preferred\\nMust be a team-player with excellent multi-tasking and interpersonal skills\\nCompassion for patient needs and a high degree of professionalism\\nChinese Speaking and Spanish Preferred\\n\\nQueens Boulevard Endoscopy Center offers a pleasant professional work environment and no evening or holiday work hours. 
Drug-free work environment and EOE.'" 271 | ] 272 | }, 273 | "execution_count": 5, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "# Examine an example of a job description:\n", 280 | "# Other than \\n line breaks, the text is pretty clean.\n", 281 | "\n", 282 | "df['Job_Desc'][1]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 18, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/html": [ 293 | "
\n", 294 | "\n", 307 | "\n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " 
\n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
901Data ScientistGovTechCASan Francisco78594147225We are looking for Data Scientists who are int...Government3.62020-05-012020-06-05FULL_TIME
902Data ScientistTriplebyteCASan Francisco145000225000This company is in a hiring surge in response ...Information Technology3.62020-04-282020-06-05FULL_TIME
903Data ScientistNotion LabsCASan Francisco105765142959So, what will you do as a Data Scientist at No...Information Technology5.02020-05-042020-06-05FULL_TIME
904Data ScientistSeen by IndeedCASan Francisco110377143329With one application you can be considered for...NaNNaN2020-04-252020-06-05FULL_TIME
907Data ScientistFormationCASan Francisco119642135250Formation provides personalization for the lar...Information Technology3.12020-04-292020-06-05FULL_TIME
909Data ScientistDuettoCASan Francisco108809173353We are an ambitious, well-funded, high-growth ...Information Technology4.42020-04-242020-06-05FULL_TIME
910Data ScientistDemandbaseCASan Francisco148171160387The world's largest and fastest-growing compan...Information Technology4.52020-04-292020-06-05FULL_TIME
916Data ScientistCentrapriseCAFoster City116415143186\\nRole: Data Scientist.\\n\\nLocation: Foster Ci...Information Technology4.22020-05-022020-06-05FULL_TIME
918Data ScientistCyberCodersCASan Francisco-1-1Data Scientist \\nJob Title: Data ScientistLoca...Business Services4.12020-05-062020-06-05FULL_TIME
920Data ScientistUpstartCASan Mateo124204139717Upstart is the leading AI lending platform par...Finance4.02020-04-242020-06-05FULL_TIME
\n", 478 | "
" 479 | ], 480 | "text/plain": [ 481 | " Job_title Company State City Min_Salary \\\n", 482 | "901 Data Scientist GovTech CA San Francisco 78594 \n", 483 | "902 Data Scientist Triplebyte CA San Francisco 145000 \n", 484 | "903 Data Scientist Notion Labs CA San Francisco 105765 \n", 485 | "904 Data Scientist Seen by Indeed CA San Francisco 110377 \n", 486 | "907 Data Scientist Formation CA San Francisco 119642 \n", 487 | "909 Data Scientist Duetto CA San Francisco 108809 \n", 488 | "910 Data Scientist Demandbase CA San Francisco 148171 \n", 489 | "916 Data Scientist Centraprise CA Foster City 116415 \n", 490 | "918 Data Scientist CyberCoders CA San Francisco -1 \n", 491 | "920 Data Scientist Upstart CA San Mateo 124204 \n", 492 | "\n", 493 | " Max_Salary Job_Desc \\\n", 494 | "901 147225 We are looking for Data Scientists who are int... \n", 495 | "902 225000 This company is in a hiring surge in response ... \n", 496 | "903 142959 So, what will you do as a Data Scientist at No... \n", 497 | "904 143329 With one application you can be considered for... \n", 498 | "907 135250 Formation provides personalization for the lar... \n", 499 | "909 173353 We are an ambitious, well-funded, high-growth ... \n", 500 | "910 160387 The world's largest and fastest-growing compan... \n", 501 | "916 143186 \\nRole: Data Scientist.\\n\\nLocation: Foster Ci... \n", 502 | "918 -1 Data Scientist \\nJob Title: Data ScientistLoca... \n", 503 | "920 139717 Upstart is the leading AI lending platform par... 
\n", 504 | "\n", 505 | " Industry Rating Date_Posted Valid_until Job_Type \n", 506 | "901 Government 3.6 2020-05-01 2020-06-05 FULL_TIME \n", 507 | "902 Information Technology 3.6 2020-04-28 2020-06-05 FULL_TIME \n", 508 | "903 Information Technology 5.0 2020-05-04 2020-06-05 FULL_TIME \n", 509 | "904 NaN NaN 2020-04-25 2020-06-05 FULL_TIME \n", 510 | "907 Information Technology 3.1 2020-04-29 2020-06-05 FULL_TIME \n", 511 | "909 Information Technology 4.4 2020-04-24 2020-06-05 FULL_TIME \n", 512 | "910 Information Technology 4.5 2020-04-29 2020-06-05 FULL_TIME \n", 513 | "916 Information Technology 4.2 2020-05-02 2020-06-05 FULL_TIME \n", 514 | "918 Business Services 4.1 2020-05-06 2020-06-05 FULL_TIME \n", 515 | "920 Finance 4.0 2020-04-24 2020-06-05 FULL_TIME " 516 | ] 517 | }, 518 | "execution_count": 18, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "df.loc[df['Job_title'] == 'Data Scientist'].head(10)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 22, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "data": { 534 | "text/plain": [ 535 | "'The world\\'s largest and fastest-growing companies such as Accenture, Adobe, DocuSign and Salesforce rely on Demandbase to drive their Account-Based Marketing strategy and maximize their B2B marketing performance. We pioneered the ABM category nearly a decade ago, and today we lead the category as an indispensable part of the B2B MarTech stack. Our achievements and innovation would not be possible without the driven and collaborative teams here at Demandbase. As a company, we\\'re as committed to growing careers as we are to building word-class technology. 
We invest heavily in people, our culture and the community around us, and have continuously been recognized as one of the best places to work in the Bay Area.\\n\\nDemandbase is currently looking for a Staff Data Scientist to develop ground-breaking insights from our data sets and create a completely new way of data-driven thinking in B2B marketing —providing Sales and Marketing users with unique approaches for account-based advertising and web engagement.\\n\\nAs a Staff Data Scientist, you\\'ll be responsible for developing and testing hypotheses on behavioral responses in B2B marketing, creating models that extract data from, among others, website, digital advertising, and CRM solutions into actionable insights, and defining leading edge thinking on how analytical frameworks can be applied to predictive marketing. You\\'ll engage closely with product managers, engineers, customers and others to turn your models into products that delight customers and create \"A-HA\" moments. You will engage with industry peers and experts and showcase your findings (of course while maintaining company and client confidentiality!). You are both hands-on and strategic—with both a broad ecosystem-level understanding of our market space and the ability to work closely with engineering and product teams to deliver software in an iterative, continual-release environment. 
This is a high-visibility position involving close collaboration across functional groups and with executive stakeholders at customers like the above.\\n\\nWhat you\\'ll be doing:\\n\\nOwn: Be the functional owner of the Data Science role\\nFrame: Use data and insights to explore questions our customers and product team can and should be asking but never asked before.\\nDefine: Work with customers and internal stakeholders to define hypotheses and models, and with engineering teams to define productionalization of data science system\\nDocument: Write clear, concise descriptions of how insights can be converted into repeatable actions.\\nBuild: Write robust machine learning pipelines and data science systems that interface with production infrastructure and APIs\\nTest: Continually test your models and refine assumptions, data sources and more.\\nDrive: Work to spread understanding and buy-in among all stakeholders at all levels.\\nOther duties as assigned\\n\\nWhat we\\'re looking for:\\n\\n2-4 years of data science experience—you have driven more than one greenfield project from concept to production release\\nStrong quantitative and data analysis abilities (statistics, engineering, or financial academic background preferred)—making data actionable must be your thing!\\nGood working knowledge of Spark is a must (we use Scala heavily)\\nAny experience with Google Cloud (especially BQML) and AWS is a huge plus.\\nExperience defining products & solutions containing large data sets from diverse sources— preferably in sales and/or marketing situations.\\nPrior experience in the marketing or sales analytics/data science space desired\\nKnowledge of web site, digital marketing, and CRM technologies and companies a big plus\\n\\nOther important qualities:\\n\\nYou are perfectly comfortable working in a fast paced, market making environment\\nYou love data and data visualization—you love making data actionable for customers\\nYou are a driver and a doer\\nYou are truly 
passionate about asking and answering questions – some never asked before\\nYou have a strong sense of ownership for the products you help build\\n\\nBenefits:\\n\\nOur benefits include 100% paid for Medical, Dental and Vision for you and your entire family, 100% paid for short-term and long-term disability, 100% paid for life insurance, 401k, flexible vacation\\n\\nAbout Demandbase:\\n\\nDemandbase is the leader in Account-Based Marketing (ABM) and an indispensable part of the B2B tech stack. The company offers the only end-to-end ABM platform that helps B2B marketers identify, engage, close and measure progress against best-fit accounts. The biggest and fastest growing companies in the world, such as Accenture, Adobe, DocuSign, GE, Salesforce and others rely on Demandbase to drive their ABM strategy and maximize their marketing performance. The company has been named to the JMP Securities list \"The Hot 100: The Best Privately Held Software Companies,\" the Deloitte Fast 500 and named a Gartner Cool Vendor for Tech Go-To Market. In 2019, Demandbase executives authored the definitive book on ABM, Account-Based Marketing: How to Target and Engage the Companies That Will Grow Your Revenue. For more information, please visit www.demandbase.com or follow the company on Twitter @Demandbase.'" 536 | ] 537 | }, 538 | "execution_count": 22, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "df['Job_Desc'][910]" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 23, 550 | "metadata": {}, 551 | "outputs": [ 552 | { 553 | "data": { 554 | "text/html": [ 555 | "
\n", 556 | "\n", 569 | "\n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " 
\n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | "
Job_titleCompanyStateCityMin_SalaryMax_SalaryJob_DescIndustryRatingDate_PostedValid_untilJob_Type
935Data EngineerRocket LawyerCASan Francisco116784118008About Rocket LawyerWe believe everyone deserve...Information Technology3.52020-04-232020-06-05FULL_TIME
1015Data EngineerSeen by IndeedCASan Francisco100959124595With one application you can be considered for...NaNNaN2020-04-252020-06-05FULL_TIME
1068Data EngineerGingerCASan Francisco102913155464Our mission is to create a world where mental ...Health Care4.12020-04-292020-06-05FULL_TIME
1081Data EngineerAllstateCASan Francisco97656112456Avail is a new car sharing platform focused on...Insurance3.42020-05-062020-06-05FULL_TIME
1089Data EngineerCyberCodersCASan Francisco-1-1Data Engineer \\nIf you are a Data Engineer wit...Business Services4.12020-05-022020-06-05FULL_TIME
1100Data EngineerPrabhav Services IncCASan Francisco-1-1Prabhav Services Inc. is one of the premier pr...Information Technology4.62020-05-062020-06-05FULL_TIME
1105Data EngineerSkuposCASan Francisco8306899451About Skupos\\nSkupos is the data platform for ...Information Technology5.02020-04-242020-06-05FULL_TIME
1140Data EngineerModern HealthCASan Francisco100959124595Modern Health-Modern Health is a mental health...Information Technology5.02020-04-302020-06-05FULL_TIME
1144Data EngineerZypmediaCASan Francisco99278122333Data Engineer\\n\\nZypMedia has built an enterpr...Business Services4.22020-05-012020-06-05FULL_TIME
1165Data EngineerDotSolved Systems, Inc.CASan Francisco-1-1Data Engineer Minimum 7- 8 years experience No...Information Technology4.92020-05-052020-06-05FULL_TIME
\n", 740 | "
" 741 | ], 742 | "text/plain": [ 743 | " Job_title Company State City Min_Salary \\\n", 744 | "935 Data Engineer Rocket Lawyer CA San Francisco 116784 \n", 745 | "1015 Data Engineer Seen by Indeed CA San Francisco 100959 \n", 746 | "1068 Data Engineer Ginger CA San Francisco 102913 \n", 747 | "1081 Data Engineer Allstate CA San Francisco 97656 \n", 748 | "1089 Data Engineer CyberCoders CA San Francisco -1 \n", 749 | "1100 Data Engineer Prabhav Services Inc CA San Francisco -1 \n", 750 | "1105 Data Engineer Skupos CA San Francisco 83068 \n", 751 | "1140 Data Engineer Modern Health CA San Francisco 100959 \n", 752 | "1144 Data Engineer Zypmedia CA San Francisco 99278 \n", 753 | "1165 Data Engineer DotSolved Systems, Inc. CA San Francisco -1 \n", 754 | "\n", 755 | " Max_Salary Job_Desc \\\n", 756 | "935 118008 About Rocket LawyerWe believe everyone deserve... \n", 757 | "1015 124595 With one application you can be considered for... \n", 758 | "1068 155464 Our mission is to create a world where mental ... \n", 759 | "1081 112456 Avail is a new car sharing platform focused on... \n", 760 | "1089 -1 Data Engineer \\nIf you are a Data Engineer wit... \n", 761 | "1100 -1 Prabhav Services Inc. is one of the premier pr... \n", 762 | "1105 99451 About Skupos\\nSkupos is the data platform for ... \n", 763 | "1140 124595 Modern Health-Modern Health is a mental health... \n", 764 | "1144 122333 Data Engineer\\n\\nZypMedia has built an enterpr... \n", 765 | "1165 -1 Data Engineer Minimum 7- 8 years experience No... 
\n", 766 | "\n", 767 | " Industry Rating Date_Posted Valid_until Job_Type \n", 768 | "935 Information Technology 3.5 2020-04-23 2020-06-05 FULL_TIME \n", 769 | "1015 NaN NaN 2020-04-25 2020-06-05 FULL_TIME \n", 770 | "1068 Health Care 4.1 2020-04-29 2020-06-05 FULL_TIME \n", 771 | "1081 Insurance 3.4 2020-05-06 2020-06-05 FULL_TIME \n", 772 | "1089 Business Services 4.1 2020-05-02 2020-06-05 FULL_TIME \n", 773 | "1100 Information Technology 4.6 2020-05-06 2020-06-05 FULL_TIME \n", 774 | "1105 Information Technology 5.0 2020-04-24 2020-06-05 FULL_TIME \n", 775 | "1140 Information Technology 5.0 2020-04-30 2020-06-05 FULL_TIME \n", 776 | "1144 Business Services 4.2 2020-05-01 2020-06-05 FULL_TIME \n", 777 | "1165 Information Technology 4.9 2020-05-05 2020-06-05 FULL_TIME " 778 | ] 779 | }, 780 | "execution_count": 23, 781 | "metadata": {}, 782 | "output_type": "execute_result" 783 | } 784 | ], 785 | "source": [ 786 | "df.loc[df['Job_title'] == 'Data Engineer'].head(10)" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": 24, 792 | "metadata": {}, 793 | "outputs": [ 794 | { 795 | "data": { 796 | "text/plain": [ 797 | "\"Our mission is to create a world where mental health is never an obstacle.\\n\\nGinger is transforming how behavioral healthcare is delivered by making it easy for people to get the support they need, when they need it, through on-demand coaching, teletherapy, telepsychiatry, and guided self-care.\\n\\nBusinesses purchase Ginger for their employee's or member's benefit. Through Ginger's secure mobile app, people around the world receive immediate emotional support that's personalized to their needs, and completely confidential. Ginger's high-quality, team-based care works — 70% of people show significant improvement after 12 weeks of using Ginger.\\n\\nAt Ginger, people are at the heart of what we do. We believe that diverse and inclusive teams make our company better. 
Teams with individuals that bring different perspectives to challenges are more innovative, collaborative, and create better solutions. We're building a workplace that actively embraces a diversity of people, ideas, talents, and experiences. Come join us!\\n\\nFast Company called Ginger one of The World's Top 10 Most Innovative Companies in Healthcare and the World Economic Forum named us a Technology Pioneer.\\n\\nAbout the Role:\\n\\nAt Ginger, we aim to provide better mental health care to humanity at a scale larger than has ever been possible before. This is no small task and as an expanding team we are working on a number of initiatives to achieve this, including aggressively building tools to simultaneously grow our reach and improve quality of care.\\n\\nWhat You'll Do:\\n\\nStanding at the center of multiple teams (data science, engineering) and core systems, you'll..\\n\\n\\nOpen up our data to uncover important patterns at the level of individuals and sub-populations.\\nSurface, serve and persist key actionable insights in mental health, healthy habit formation, goal pursuit, and care efficacy.\\nHelp us scale our services using modern distributed processing tools and GPUs in the cloud (AWS)\\nCollaborate with product to ideate and unlock features which derive as much actionable information from our data (text, media, activity etc) as possible.\\nHelp architect systems for near-real-time delivery of recommendations, care insights and other time-critical information to coaches and members.\\nDesign lightweight data schemas appropriate for storing, organizing and joining processed communication and care analytics.\\nDevise the tooling that takes us from algorithm prototype to production and can track data/model lineage and statistical drift through time.\\nDevelop pipelines that efficiently and reliably route output of machine learning algorithms to consumer processes and persistence mechanisms.\\nOwn operational scalability of our algorithms, systems and 
data models.\\nStand up infrastructure for optimal extraction, transformation, and loading of data from a wide variety of data sources using SQL, Python and AWS tools.\\nWork with a variety of stakeholders including the Data, Product, Engineering, Security and Executive teams to support their data accessibility needs.\\n\\nNecessary Skills:\\n\\nDatabases SQL/NoSQL 4+ years\\nCloud platform experience 3+ years\\nSQL 4+ years\\nSchema design 2+ years\\nAmazon Web Services (AWS) 2+ years\\nDeployment pipelines 2+ years\\nPython 2+ years\\nDeploying to production systems with active customers 2+ years\\nDistributed computing (e.g Spark, Hadoop etc.) 3+ years\\nInfrastructure monitoring 1+ years\\nWide variety of data warehouse, data lake (s3) etc familiarity\\nAnalytics experience working with structured and unstructured data\\nProject lead (self-managing) 1+ years\\nBachelors in technical field or experiential equivalent\\n\\nIdeal Skills:\\n\\nAmazon Web Services (AWS) 3+ years\\nAWS Lambda, Sagemaker\\nDocker / Kubernetes\\nDB performance engineering\\nMachine Learning (ML) 1+ years\\nRunning ML on GPUs 1+ years\\nPython 3+ years\\nStrong analytics intuition grounded in significant experience\\nExperience in the healthcare space\\nMasters in technical field or experiential equivalent\\n\"" 798 | ] 799 | }, 800 | "execution_count": 24, 801 | "metadata": {}, 802 | "output_type": "execute_result" 803 | } 804 | ], 805 | "source": [ 806 | "df['Job_Desc'][1068]" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": {}, 813 | "outputs": [], 814 | "source": [] 815 | } 816 | ], 817 | "metadata": { 818 | "kernelspec": { 819 | "display_name": "Python 3", 820 | "language": "python", 821 | "name": "python3" 822 | }, 823 | "language_info": { 824 | "codemirror_mode": { 825 | "name": "ipython", 826 | "version": 3 827 | }, 828 | "file_extension": ".py", 829 | "mimetype": "text/x-python", 830 | "name": "python", 831 | 
"nbconvert_exporter": "python", 832 | "pygments_lexer": "ipython3", 833 | "version": "3.7.3" 834 | } 835 | }, 836 | "nbformat": 4, 837 | "nbformat_minor": 4 838 | } 839 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/README.md: -------------------------------------------------------------------------------- 1 | # Exploratory Data Analysis 2 | 3 | In this Module, Test Datasets have been created to check out the efficiency of our Model. This notebook `Job_Posts_EDA.ipynb` examines the dataset of 4 | job posts from Glassdoor. It finds that there are issues in this dataset, such as duplicate rows and advertisements mixed in. However, because 5 | only a small portion of this data is used for testing out the recommender model, these issues will not affect the project and do not need to be fixed here. 6 | 7 | Later a small set of labeled data is taken in `Create_Test_Set.ipynb` that can be used to test the Doc2Vec model. Specifically, 10 sample job descriptions under 2 job titles (data scientist and data 8 | engineer) are selected. It matches each of these 2 job titles with 5 courses that I believe the model should recommend. 9 | This sample data will then be used to test the accuracy of the model. 10 | 11 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/courses_test_sample.csv: -------------------------------------------------------------------------------- 1 | name,description,job_title,course_id 2 | The Data Scientist’s Toolbox,"In this course you will get an introduction to the main tools and ideas in the data scientist's toolbox. The course gives an overview of the data, questions, and tools that data analysts and data scientists work with. There are two components to this course. The first is a conceptual introduction to the ideas behind turning data into actionable knowledge. 
The second is a practical introduction to the tools that will be used in the program like version control, markdown, git, GitHub, R, and RStudio.",Data Scientist,3823 3 | Machine Learning,"Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI. 4 | 5 | This course provides a broad introduction to machine learning, datamining, and statistical pattern recognition. Topics include: (i) Supervised learning (parametric/non-parametric algorithms, support vector machines, kernels, neural networks). (ii) Unsupervised learning (clustering, dimensionality reduction, recommender systems, deep learning). (iii) Best practices in machine learning (bias/variance theory; innovation process in machine learning and AI). 
The course will also draw from numerous case studies and applications, so that you'll also learn how to apply learning algorithms to building smart robots (perception, control), text understanding (web search, anti-spam), computer vision, medical informatics, audio, database mining, and other areas.",Data Scientist,143 6 | Applied Machine Learning in Python,"This course will introduce the learner to applied machine learning, focusing more on the techniques and methods than on the statistics behind these methods. The course will start with a discussion of how machine learning is different than descriptive statistics, and introduce the scikit learn toolkit through a tutorial. The issue of dimensionality of data will be discussed, and the task of clustering data, as well as evaluating those clusters, will be tackled. Supervised approaches for creating predictive models will be described, and learners will be able to apply the scikit learn predictive modelling methods while understanding process issues related to data generalizability (e.g. cross validation, overfitting). The course will end with a look at more advanced techniques, such as building ensembles, and practical limitations of predictive models. By the end of this course, students will be able to identify the difference between a supervised (classification) and unsupervised (clustering) technique, identify which technique they need to apply for a particular dataset and need, engineer features to meet that need, and write python code to carry out an analysis. 7 | 8 | This course should be taken after Introduction to Data Science in Python and Applied Plotting, Charting & Data Representation in Python and before Applied Text Mining in Python and Applied Social Analysis in Python.",Data Scientist,3165 9 | Data Visualization with Python,"""A picture is worth a thousand words"". We are all familiar with this expression. 
It especially applies when trying to explain the insight obtained from the analysis of increasingly large datasets. Data visualization plays an essential role in the representation of both small and large-scale data. 10 | 11 | One of the key skills of a data scientist is the ability to tell a compelling story, visualizing data and findings in an approachable and stimulating way. Learning how to leverage a software tool to visualize data will also enable you to extract information, better understand the data, and make more effective decisions. 12 | 13 | The main goal of this Data Visualization with Python course is to teach you how to take data that at first glance has little meaning and present that data in a form that makes sense to people. Various techniques have been developed for presenting data visually but in this course, we will be using several data visualization libraries in Python, namely Matplotlib, Seaborn, and Folium. 14 | 15 | LIMITED TIME OFFER: Subscription is only $39 USD per month for access to graded materials and a certificate.",Data Scientist,3588 16 | Machine Learning with Python,"This course dives into the basics of machine learning using an approachable, and well-known programming language, Python. 17 | In this course, we will be reviewing two main components: 18 | First, you will be learning about the purpose of Machine Learning and where it applies to the real world. 19 | Second, you will get a general overview of Machine Learning topics such as supervised vs unsupervised learning, model evaluation, and Machine Learning algorithms. 20 | 21 | In this course, you practice with real-life examples of Machine learning and see how it affects society in ways you may not have guessed! 22 | 23 | By just putting in a few hours a week for the next few weeks, this is what you’ll get. 
24 | 1) New skills to add to your resume, such as regression, classification, clustering, sci-kit learn and SciPy 25 | 2) New projects that you can add to your portfolio, including cancer detection, predicting economic trends, predicting customer churn, recommendation engines, and many more. 26 | 3) And a certificate in machine learning to prove your competency, and share it anywhere you like online or offline, such as LinkedIn profiles and social media. 27 | 28 | If you choose to take this course and earn the Coursera course certificate, you will also earn an IBM digital badge upon successful completion of the course.",Data Scientist,2517 29 | Databases and SQL for Data Science,"Much of the world's data resides in databases. SQL (or Structured Query Language) is a powerful language which is used for communicating with and extracting data from databases. A working knowledge of databases and SQL is a must if you want to become a data scientist. 30 | 31 | The purpose of this course is to introduce relational database concepts and help you learn and apply foundational knowledge of the SQL language. It is also intended to get you started with performing SQL access in a data science environment. 32 | 33 | The emphasis in this course is on hands-on and practical learning . As such, you will work with real databases, real data science tools, and real-world datasets. You will create a database instance in the cloud. Through a series of hands-on labs you will practice building and running SQL queries. You will also learn how to access databases from Jupyter notebooks using SQL and Python. 34 | 35 | No prior knowledge of databases, SQL, Python, or programming is required. 36 | 37 | Anyone can audit this course at no-charge. If you choose to take this course and earn the Coursera course certificate, you can also earn an IBM digital badge upon successful completion of the course. 
38 | 39 | LIMITED TIME OFFER: Subscription is only $39 USD per month for access to graded materials and a certificate.",Data Engineer,545 40 | Google Cloud Platform Big Data and Machine Learning Fundamentals,"This 2-week accelerated on-demand course introduces participants to the Big Data and Machine Learning capabilities of Google Cloud Platform (GCP). It provides a quick overview of the Google Cloud Platform and a deeper dive of the data processing capabilities. 41 | 42 | At the end of this course, participants will be able to: 43 | • Identify the purpose and value of the key Big Data and Machine Learning products in the Google Cloud Platform 44 | • Use CloudSQL and Cloud Dataproc to migrate existing MySQL and Hadoop/Pig/Spark/Hive workloads to Google Cloud Platform 45 | • Employ BigQuery and Cloud Datalab to carry out interactive data analysis 46 | • Choose between Cloud SQL, BigTable and Datastore 47 | • Train and use a neural network using TensorFlow 48 | • Choose between different data processing products on the Google Cloud Platform 49 | 50 | Before enrolling in this course, participants should have roughly one (1) year of experience with one or more of the following: 51 | • A common query language such as SQL 52 | • Extract, transform, load activities 53 | • Data modeling 54 | • Machine learning and/or statistics 55 | • Programming in Python 56 | 57 | Google Account Notes: 58 | • Google services are currently unavailable in China.",Data Engineer,1015 59 | Big Data Modeling and Management Systems,"Once you’ve identified a big data issue to analyze, how do you collect, store and organize your data using Big Data solutions? In this course, you will experience various data genres and management tools appropriate for each. You will be able to describe the reasons behind the evolving plethora of new big data platforms from the perspective of big data management systems and analytical tools. 
Through guided hands-on tutorials, you will become familiar with techniques using real-time and semi-structured data examples. Systems and tools discussed include: AsterixDB, HP Vertica, Impala, Neo4j, Redis, SparkSQL. This course provides techniques to extract value from existing untapped data sources and discovering new data sources. 60 | 61 | At the end of this course, you will be able to: 62 | * Recognize different data elements in your own work and in everyday life problems 63 | * Explain why your team needs to design a Big Data Infrastructure Plan and Information System Design 64 | * Identify the frequent data operations required for various types of data 65 | * Select a data model to suit the characteristics of your data 66 | * Apply techniques to handle streaming data 67 | * Differentiate between a traditional Database Management System and a Big Data Management System 68 | * Appreciate why there are so many data management systems 69 | * Design a big data information system for an online game company 70 | 71 | This course is for those new to data science. Completion of Intro to Big Data is recommended. No prior programming experience is needed, although the ability to install applications and utilize a virtual machine is necessary to complete the hands-on assignments. Refer to the specialization technical requirements for complete hardware and software specifications. 72 | 73 | Hardware Requirements: 74 | (A) Quad Core Processor (VT-x or AMD-V support recommended), 64-bit; (B) 8 GB RAM; (C) 20 GB disk free. How to find your hardware information: (Windows): Open System by clicking the Start button, right-clicking Computer, and then clicking Properties; (Mac): Open Overview by clicking on the Apple menu and clicking “About This Mac.” Most computers with 8 GB RAM purchased in the last 3 years will meet the minimum requirements.You will need a high speed internet connection because you will be downloading files up to 4 Gb in size. 
75 | 76 | Software Requirements: 77 | This course relies on several open-source software tools, including Apache Hadoop. All required software can be downloaded and installed free of charge (except for data charges from your internet provider). Software requirements include: Windows 7+, Mac OS X 10.10+, Ubuntu 14.04+ or CentOS 6+ VirtualBox 5+.",Data Engineer,4233 78 | Database Management Essentials,"Database Management Essentials provides the foundation you need for a career in database development, data warehousing, or business intelligence, as well as for the entire Data Warehousing for Business Intelligence specialization. In this course, you will create relational databases, write SQL statements to extract information to satisfy business reporting requests, create entity relationship diagrams (ERDs) to design databases, and analyze table designs for excessive redundancy. As you develop these skills, you will use either Oracle, MySQL, or PostgreSQL to execute SQL statements and a database diagramming tool such as the ER Assistant or Visual Paradigm to create ERDs. We’ve designed this course to ensure a common foundation for specialization learners. Everyone taking the course can jump right in with writing SQL statements in Oracle, MySQL, or PostgreSQL.",Data Engineer,3763 79 | "Data Warehouse Concepts, Design, and Data Integration","This is the second course in the Data Warehousing for Business Intelligence specialization. Ideally, the courses should be taken in sequence. 80 | 81 | In this course, you will learn exciting concepts and skills for designing data warehouses and creating data integration workflows. These are fundamental skills for data warehouse developers and administrators. You will have hands-on experience for data warehouse design and use open source products for manipulating pivot tables and creating data integration workflows. In the data integration assignment, you can use either Oracle, MySQL, or PostgreSQL databases. 
You will also gain conceptual background about maturity models, architectures, multidimensional models, and management practices, providing an organizational perspective about data warehouse development. If you are currently a business or information technology professional and want to become a data warehouse designer or administrator, this course will give you the knowledge and skills to do that. By the end of the course, you will have the design experience, software background, and organizational context that prepares you to succeed with data warehouse development projects. 82 | 83 | In this course, you will create data warehouse designs and data integration workflows that satisfy the business intelligence needs of organizations. When you’re done with this course, you’ll be able to: 84 | * Evaluate an organization for data warehouse maturity and business architecture alignment; 85 | * Create a data warehouse design and reflect on alternative design methodologies and design goals; 86 | * Create data integration workflows using prominent open source software; 87 | * Reflect on the role of change data, refresh constraints, refresh frequency trade-offs, and data quality goals in data integration process design; and 88 | * Perform operations on pivot tables to satisfy typical business analysis requests using prominent open source software",Data Engineer,1311 89 | -------------------------------------------------------------------------------- /Exploratory Data Analysis/jobs_test_sample.csv: -------------------------------------------------------------------------------- 1 | Job_title,Job_Desc,Job_id 2 | Data Scientist,"We are looking for Data Scientists who are interested in using data to draw insights that will result in policy changes or business process optimisation, benefiting the public. 
The applicant will be scoping projects with stakeholders, using data sets across Government Agencies, applying business acumen to tease out relevant impactful insights, and presenting insights in a clear, concise manner by using appropriate visualisations. 3 | 4 | He/she should have some training and working experiences on data analytics, and should be comfortable with hands-on data manipulation, data modelling and data visualisation. He/she should also be comfortable with engaging stakeholders on sharpening their business problems. 5 | 6 | The analytics work that we do are typically action oriented and cross-cutting across various domains such as social, economic and infrastructure sectors. Over time, he/she will gain exposure to various policy and ops domains and become more adept in bridging between business users and technical expertise. 7 | 8 | What to Expect: 9 | 10 | Work closely with stakeholders to understand their business challenges, scope the problem and develop business case on how to turn data into critical information and knowledge that are actionable and impactful,. Perform data cleaning, pre-processing, feature engineering and build relevant models to conduct meaningful analysis. Apply appropriate visualisation techniques to communicate the insight effectively. Iterate with the stakeholders to perform subsequent deep dives based on the initial insights.Depending on the use case, design of dashboards and interactive visualisations as tools for data exploration and storytelling may be expected. Potentially deployed to other Government Agencies to be their resident Data Scientist. This will involve formulating and implementing strategies to build strong pipeline of impactful projects at the Agency and executing these projects. 11 | 12 | How to Succeed: 13 | 14 | 15 | 16 | Bachelor Degree in Computer Science, Statistics, Economics, Quantitative Social Science, or related degrees. Advanced degrees preferred. 
We will also factor in relevant certifications (e.g., Coursera)Minimum 2 years of relevant working experience, preferably in public sector or data science fieldAbility to take a broad, strategic perspective as well as drill deep to understand business needs and challengesUnderstand key concepts, techniques and considerations in machine learning and data analyticsTraining and relevant experience in one or more of the following areas: 17 | 18 | 19 | Data science tools such as R, PythonVisual analytics technologies like Tableau, Qlik 20 | Excellent communication skills, both oral and written, with ability to pitch ideas and influence stakeholdersStrong analytical, conceptualisation and problem solving skillsTeam player with strong organization and people handling skillsPassion for the use of analytics and data to improve Public Service 21 | ",901 22 | Data Scientist,"The world's largest and fastest-growing companies such as Accenture, Adobe, DocuSign and Salesforce rely on Demandbase to drive their Account-Based Marketing strategy and maximize their B2B marketing performance. We pioneered the ABM category nearly a decade ago, and today we lead the category as an indispensable part of the B2B MarTech stack. Our achievements and innovation would not be possible without the driven and collaborative teams here at Demandbase. As a company, we're as committed to growing careers as we are to building word-class technology. We invest heavily in people, our culture and the community around us, and have continuously been recognized as one of the best places to work in the Bay Area. 23 | 24 | Demandbase is currently looking for a Staff Data Scientist to develop ground-breaking insights from our data sets and create a completely new way of data-driven thinking in B2B marketing —providing Sales and Marketing users with unique approaches for account-based advertising and web engagement. 
25 | 26 | As a Staff Data Scientist, you'll be responsible for developing and testing hypotheses on behavioral responses in B2B marketing, creating models that extract data from, among others, website, digital advertising, and CRM solutions into actionable insights, and defining leading edge thinking on how analytical frameworks can be applied to predictive marketing. You'll engage closely with product managers, engineers, customers and others to turn your models into products that delight customers and create ""A-HA"" moments. You will engage with industry peers and experts and showcase your findings (of course while maintaining company and client confidentiality!). You are both hands-on and strategic—with both a broad ecosystem-level understanding of our market space and the ability to work closely with engineering and product teams to deliver software in an iterative, continual-release environment. This is a high-visibility position involving close collaboration across functional groups and with executive stakeholders at customers like the above. 27 | 28 | What you'll be doing: 29 | 30 | Own: Be the functional owner of the Data Science role 31 | Frame: Use data and insights to explore questions our customers and product team can and should be asking but never asked before. 32 | Define: Work with customers and internal stakeholders to define hypotheses and models, and with engineering teams to define productionalization of data science system 33 | Document: Write clear, concise descriptions of how insights can be converted into repeatable actions. 34 | Build: Write robust machine learning pipelines and data science systems that interface with production infrastructure and APIs 35 | Test: Continually test your models and refine assumptions, data sources and more. 36 | Drive: Work to spread understanding and buy-in among all stakeholders at all levels. 
37 | Other duties as assigned 38 | 39 | What we're looking for: 40 | 41 | 2-4 years of data science experience—you have driven more than one greenfield project from concept to production release 42 | Strong quantitative and data analysis abilities (statistics, engineering, or financial academic background preferred)—making data actionable must be your thing! 43 | Good working knowledge of Spark is a must (we use Scala heavily) 44 | Any experience with Google Cloud (especially BQML) and AWS is a huge plus. 45 | Experience defining products & solutions containing large data sets from diverse sources— preferably in sales and/or marketing situations. 46 | Prior experience in the marketing or sales analytics/data science space desired 47 | Knowledge of web site, digital marketing, and CRM technologies and companies a big plus 48 | 49 | Other important qualities: 50 | 51 | You are perfectly comfortable working in a fast paced, market making environment 52 | You love data and data visualization—you love making data actionable for customers 53 | You are a driver and a doer 54 | You are truly passionate about asking and answering questions – some never asked before 55 | You have a strong sense of ownership for the products you help build 56 | 57 | Benefits: 58 | 59 | Our benefits include 100% paid for Medical, Dental and Vision for you and your entire family, 100% paid for short-term and long-term disability, 100% paid for life insurance, 401k, flexible vacation 60 | 61 | About Demandbase: 62 | 63 | Demandbase is the leader in Account-Based Marketing (ABM) and an indispensable part of the B2B tech stack. The company offers the only end-to-end ABM platform that helps B2B marketers identify, engage, close and measure progress against best-fit accounts. The biggest and fastest growing companies in the world, such as Accenture, Adobe, DocuSign, GE, Salesforce and others rely on Demandbase to drive their ABM strategy and maximize their marketing performance. 
The company has been named to the JMP Securities list ""The Hot 100: The Best Privately Held Software Companies,"" the Deloitte Fast 500 and named a Gartner Cool Vendor for Tech Go-To Market. In 2019, Demandbase executives authored the definitive book on ABM, Account-Based Marketing: How to Target and Engage the Companies That Will Grow Your Revenue. For more information, please visit www.demandbase.com or follow the company on Twitter @Demandbase.",910 64 | Data Scientist," 65 | Role: Data Scientist. 66 | 67 | Location: Foster City, CA 68 | 69 | Hire Type: 12 Months Contract 70 | 71 | Job Description: 72 | 73 | Advanced degree in Data Science, Statistics, Computer Science, or similar. 74 | 75 | Extensive experience as a Data Scientist. 76 | 77 | Proficiency in R or Python, where the former is preferred. 78 | 79 | In-depth understanding of SQL. 80 | 81 | Competent in machine learning principles and techniques. 82 | 83 | Demonstrable history of devising and overseeing data-centered projects. 84 | 85 | Ability to relay insights in layman's terms, such that these can be used to inform business decisions. 86 | 87 | Outstanding supervision and mentorship abilities. 88 | 89 | Capacity to foster a healthy, stimulating work environment that frequently harnesses teamwork. 90 | 91 | ",916 92 | Data Scientist,"Upstart is the leading AI lending platform partnering with banks to expand access to affordable credit. Forbes recently ranked Upstart #12 on its list of ""most promising AI companies in America."" By leveraging Upstart's AI platform, Upstart-powered banks can have higher approval rates and lower loss rates, while simultaneously delivering the exceptional digital-first lending experience their customers demand. Upstart's patent-pending platform is the first to receive a no-action letter from the Consumer Financial Protection Bureau related to fair lending. Upstart is based in San Mateo, California and Columbus, Ohio. 
93 | 94 | The Role 95 | 96 | Our data science team consists of full-stack generalists as well as specialists in statistical modeling or machine learning. Because our challenges are so new, data scientists at Upstart need strong creative problem-solving skills and the technical background to implement solutions. Our research environment affords team members the opportunity to utilize a variety of statistical and machine learning methods with the freedom and encouragement to pursue alternative approaches to solving problems. Whether developing new products or identifying novel approaches to core models, we are continuously seeking the next big ideas to move our business forward. 97 | 98 | Our current Data Scientists summarize some of their favorite aspects of our team as: 99 | 100 | 101 | Having a direct impact on the company's success 102 | Collaborative, intelligent and open team 103 | Mentorship, growth and friendship 104 | Leaders committed to challenging and growing team members 105 | Feeling safe asking for help when it's necessary; feeling trusted to get the job done when it's not 106 | 107 | Hiring Profile 108 | 109 | Strong academic credentials with a M.S. in Computer Science, Statistics, Data Science or a related field of study with a preference for Ph.D. 110 | Comfort with programming (ideally in Python and R) 111 | Rigorous quantitative background 112 | Predictive modeling experience is preferred 113 | Enthusiasm for and alignment with Upstart's mission and values 114 | Strong sense of intellectual curiosity balanced with humility 115 | Numerically-savvy with ability to operate at a speedy pace 116 | 117 | Most Upstarters join us because they connect with our mission of enabling access to effortless credit based on true risk. 
If you are energized by the impact you can make at Upstart, we would love to hear from you!",920 118 | Data Scientist,"Why Divvy?Over the past decade, millions of Americans have been forced to put their dreams of homeownership on hold. Home prices have outpaced wage growth while mortgage requirements continue to tighten. As a result, renters are missing out on a critical wealth-building opportunity: owning a home.At Divvy, we're building an on-ramp to homeownership – one that's more affordable, more flexible, and an overall better fit for the modern American family – and it’s working.We’re looking for a Data Scientist to join our growing company. In this role, you’ll ensure the financial viability of our business by developing our underwriting and/or pricing models. Developing this model will also mean simulating new financial product offerings which match customer needs to Divvy’s capacities. Day to day, this will include a mix of dataset acquisition, statistical modeling, exploratory data analysis, and software engineering. 
You’ll report directly to Divvy’s Head of Data Science and work alongside a team of 8-10 software engineers and data scientists.ResponsibilitiesBuild and refine our default and/or pricing models using structured dataIdentify, analyze, and acquire new data sources to improve model accuracyInfluence Divvy’s product offerings based on quantitative insightsBecome a domain expert in risk and/or pricingWork ExperienceYou have 3+ years of experience in machine learning, data science or analyticsYou have experience in either R or PythonYou have a strong understanding of statistical modeling techniquesYou demonstrate the ability to clearly communicate analysisBonus points for previous credit default modeling experience, risk management experience, and/or real estate pricing (AVM) experiencePerksCompetitive salary + equity Full benefits (medical, dental, vision, 401k, commuter) A beautiful dog-friendly office Diverse, smart, and witty co-workersCommitment to Diversity & InclusionWe prioritize a commitment to diversity in our team building process. We enthusiastically encourage individuals from a variety of lived experiences to reach out.",938 119 | Data Engineer,"About Rocket LawyerWe believe everyone deserves access to simple and affordable legal services. 120 | Founded in 2008, Rocket Lawyer is the largest and most widely used online legal service platform in the world. With offices in North America and Europe, Rocket Lawyer has helped over 20 million people create over 50 million legal documents, and get their legal questions answered. 121 | We are in a unique position to enhance and expand the Rocket Lawyer platform to a scale never seen before in the company’s history, to capture audiences worldwide. We are expanding our team to take on this challenge! 122 | About the RoleRocket Lawyer is looking for a Data Engineer that will contribute in all aspects of creating an analytical data driven environment. 
The core data engineering team is responsible for the building out the data pipeline, gathering internal and external data, generating metrics, managing and monitoring batch and streaming jobs, and implementing analytical tools to drive strategic decision making.A Day in the Life 123 | 124 | 125 | Evangelize Modern Big Data Practices Design warehouse schemas that accurately represent our business, and facilitate analysis and building of reportsHelp build batch and streaming data ingestion pipeline using Hadoop, Hive, Pig, Storm, and Kafka StreamsWrite ETL jobs to transform raw data into business information to drive decision makingDevelop analytical environment using internal and external reporting toolsIntegrate internal and external data with warehouse and external tools 126 | 127 | 128 | Experience 129 | 130 | 131 | Excellent technical skills including expert knowledge of the Hadoop ecosystemExperience of the analysis, design and development of Data Warehouse and Big Data solutions, including analyzing source systems, developing ETL design patterns and templates, ETL development, data profiling and data quality issues resolution.Project and team management experience Excellent communication skills and presentation skillsStrong SQL, Java, and Python skillsDatabase (relational & NoSQL), Data Warehouse knowledgeStream processing experience (Storm, Kafka Streams)Passion and enthusiasm for learning new technologies and techniqueComfortable with LinuxBS or MS in computer scienceDetail oriented and organizedDesire to learn broad set of technologies 132 | 133 | 134 | Benefits and Perks 135 | 136 | 137 | Comprehensive health plans (including Medical, Dental and Vision insurance for full-time employees)Unlimited PTOCompetitive salary packages401k programLife insuranceDisability benefitsFlexible Spending AccountsCommuter/Transit ProgramYour choice of a MAC or PCMonthly onsite masseuse sessionsWeekly Friday catered lunchesCompany sponsored events, both on- and off-site 138 
| 139 | 140 | ",935 141 | Data Engineer,"Our mission is to create a world where mental health is never an obstacle. 142 | 143 | Ginger is transforming how behavioral healthcare is delivered by making it easy for people to get the support they need, when they need it, through on-demand coaching, teletherapy, telepsychiatry, and guided self-care. 144 | 145 | Businesses purchase Ginger for their employee's or member's benefit. Through Ginger's secure mobile app, people around the world receive immediate emotional support that's personalized to their needs, and completely confidential. Ginger's high-quality, team-based care works — 70% of people show significant improvement after 12 weeks of using Ginger. 146 | 147 | At Ginger, people are at the heart of what we do. We believe that diverse and inclusive teams make our company better. Teams with individuals that bring different perspectives to challenges are more innovative, collaborative, and create better solutions. We're building a workplace that actively embraces a diversity of people, ideas, talents, and experiences. Come join us! 148 | 149 | Fast Company called Ginger one of The World's Top 10 Most Innovative Companies in Healthcare and the World Economic Forum named us a Technology Pioneer. 150 | 151 | About the Role: 152 | 153 | At Ginger, we aim to provide better mental health care to humanity at a scale larger than has ever been possible before. This is no small task and as an expanding team we are working on a number of initiatives to achieve this, including aggressively building tools to simultaneously grow our reach and improve quality of care. 154 | 155 | What You'll Do: 156 | 157 | Standing at the center of multiple teams (data science, engineering) and core systems, you'll.. 158 | 159 | 160 | Open up our data to uncover important patterns at the level of individuals and sub-populations. 
161 | Surface, serve and persist key actionable insights in mental health, healthy habit formation, goal pursuit, and care efficacy. 162 | Help us scale our services using modern distributed processing tools and GPUs in the cloud (AWS) 163 | Collaborate with product to ideate and unlock features which derive as much actionable information from our data (text, media, activity etc) as possible. 164 | Help architect systems for near-real-time delivery of recommendations, care insights and other time-critical information to coaches and members. 165 | Design lightweight data schemas appropriate for storing, organizing and joining processed communication and care analytics. 166 | Devise the tooling that takes us from algorithm prototype to production and can track data/model lineage and statistical drift through time. 167 | Develop pipelines that efficiently and reliably route output of machine learning algorithms to consumer processes and persistence mechanisms. 168 | Own operational scalability of our algorithms, systems and data models. 169 | Stand up infrastructure for optimal extraction, transformation, and loading of data from a wide variety of data sources using SQL, Python and AWS tools. 170 | Work with a variety of stakeholders including the Data, Product, Engineering, Security and Executive teams to support their data accessibility needs. 171 | 172 | Necessary Skills: 173 | 174 | Databases SQL/NoSQL 4+ years 175 | Cloud platform experience 3+ years 176 | SQL 4+ years 177 | Schema design 2+ years 178 | Amazon Web Services (AWS) 2+ years 179 | Deployment pipelines 2+ years 180 | Python 2+ years 181 | Deploying to production systems with active customers 2+ years 182 | Distributed computing (e.g Spark, Hadoop etc.) 
3+ years 183 | Infrastructure monitoring 1+ years 184 | Wide variety of data warehouse, data lake (s3) etc familiarity 185 | Analytics experience working with structured and unstructured data 186 | Project lead (self-managing) 1+ years 187 | Bachelors in technical field or experiential equivalent 188 | 189 | Ideal Skills: 190 | 191 | Amazon Web Services (AWS) 3+ years 192 | AWS Lambda, Sagemaker 193 | Docker / Kubernetes 194 | DB performance engineering 195 | Machine Learning (ML) 1+ years 196 | Running ML on GPUs 1+ years 197 | Python 3+ years 198 | Strong analytics intuition grounded in significant experience 199 | Experience in the healthcare space 200 | Masters in technical field or experiential equivalent 201 | ",1068 202 | Data Engineer,"Data Engineer 203 | If you are a Data Engineer with several years of relevant experience, please read on!We are poised to triple our customer base AGAIN in 2020 and we need a Data Engineer to help us manage the growth! Our tech stack: AWS, Aptible, Postgres, Redis, Rails, Python, Airflow, Mode Analytics, Android, React, and React Native. 204 | 205 | What You Will Be Doing 206 | - Maintain our current ETL-lite while scaling it for the future- Create and maintain views and expand use of rollup tables- Identify opportunities to improve the integrity of our datasets and implement the fixes- Assist in building out our payments platform for managing medical claims- Help explore options for delivering data to clients, including possible API access- Inform our 2020 objectives and key results around scaling and data needs 207 | What You Need for this Position 208 | Requirements: - Bachelors degree in C.S. 
or comparable degree preferred- Minimum of 3 years relevant experience in data engineering- Ability to collaborate and problem solve across teams- Excellent communication skills, both written and verbal- Python: using community-standards, linting, and testing at all appropriate levels.- SQL: comfort with joins, unions, views, rollups, windowing functions, testing- JSON parsing and fluency with RESTful APIs- Operational competency with cloud-hosted systems such as AWS, Aptible, or Heroku- Ability to correlate data across multiple sources: RDBs, csv, json- Understands how to write efficient code and can optimize existing software and queriesNice to Have: - Prior experience with healthcare data (PHI/PII/HIPAA requirements)- Experience developing software in Ruby on Rails- Understanding of user experience principles- History of technical writing 209 | What's In It for You 210 | - Competitive compensation with meaningful stock options- Medical, dental, vision- 401K match - 3 months paid parental leave- Daily lunch- Professional development budget - Monthly fitness/gym reimbursement - Annual mental wellness benefit - Noise-cancelling headphones - Work from home policy- Opportunity to join a fantastically talented, diverse, and passionate team at a pivotal time in the companys lifecycle 211 | 212 | So, if you are a Data Engineer with the required experience, please apply today! 213 | - Applicants must be authorized to work in the U.S. 214 | CyberCoders, Inc is proud to be an Equal Opportunity Employer 215 | 216 | All qualified applicants will receive consideration for employment without regard to race, color, religion, sex, national origin, disability, protected veteran status, or any other characteristic protected by law. 
217 | 218 | Your Right to Work In compliance with federal law, all persons hired will be required to verify identity and eligibility to work in the United States and to complete the required employment eligibility verification document form upon hire.",1089 219 | Data Engineer,"Prabhav Services Inc. is one of the premier preferred vendor with many end clients, we have offices in USA, Canada and India, we do sponsor H1B for right candidate and do the Greencard immediately as required, we are looking for candidates for next year as well so if you or any of your friends are looking for job feel free to refer to me on parinprabhavonline.com Currently we are seeking the candidates for Data Engineer with excellent in implement automation, data modeling, data wrangling, data analysis, and data vision solutions to complex problems, processes, and scenarios. Familiarity with common data structures and languages. Responsibilities Perform data stream design, integration engineering including a full understanding and support for a typical Capture-Ingestion-Storage-Validation-Analysis-Visualization process Process mapping and automation Wrangling structured, unstructured and poorly structured data into appropriate data structures Develop data architectures that improve automation, processes, data flow and analyses (including recommendations in systems owned by other organizations) Establish objectives, formulate methodologies, and help coordinate fusion of data science, data architecture, data visualization, and data management streams and teams Identify opportunities to further build out our IoT strategy ExperienceSkillsAbilities At least 3 years of experience working as a Systems Integrator, Data Engineer, Software Engineer or similar position demonstrating the ability to design and implement automation, data modeling, data wrangling, data analysis, and data vision solutions to complex problems, processes, and scenarios. Familiarity with common data structures and languages. 
BS Computer Science, ComputerElectrical Engineering, or Math Degree or relevant experience. Experience with IoT, cloud computing, distributed data systems Experience working with statistical teams andor data scientists ToolsProgramming Experience SQL, Python, R, JS, HTML, CSS, BI, Tableau, AWS (to work and reorder this), excel, R, DB conceptsprogramming, object-orientated languages(e.g. Java, C++), other scripting languages, programming skills Friendly and approachable, with strong communication and presentation skills Desire to keep current with a challenging and evolving environment Team focused and self-motivated. Able to work as part of a coordinated team, yet independently when necessary Proven abilities to take initiative and to be innovative have an analytical mind with a problem-solving aptitude",1100 220 | Data Engineer,"About Skupos 221 | Skupos is the data platform for the convenience retail industry. Retailers, distributors, and brands connect to the Skupos network to create value from disparate data. Convenience retail is a long-standing industry with limited technology adoption, but is responsible for more revenue annually than all of e-commerce in the United States. Skupos leverages our massive datasets to build tools that help the industry succeed. 222 | Skupos software integrates at a retailer’s point of sale, generates analytical insights, and automates the inventory and ordering process. For distributors and CPG brands, we provide real-time visibility into consumer purchasing decisions and enable automated promotional discounts at the point of sale. We view our company as revolutionizing a brick-and-mortar industry by bringing cutting-edge technology to physical stores, and helping harness data to create a frictionless connection between millions of people and the products they need. 223 | What You'll Do 224 | Skupos is seeking a Data Engineer to help build the foundation of our big data platform. 
As we pave the way for our data product offerings, you will architect, develop and deploy data solutions at scale using modern data technologies. You will have an opportunity to drive the tech stack for this platform. Come, join us and be in charge of your career trajectory and leverage coaching/mentorship opportunities with your manager to write your own success story at Skupos and beyond. 225 | 226 | Build data pipelines for the end to end data ecosystem:Data integration and Ingestion from multiple external data providers/partners.Data processing in accordance with product requirements, ensuring data security and compliance throughout the pipeline. Data Storage layer - Maintain a Single Source of Truth. Data Access layer - Make data available for reporting, dashboards, analytics, business intelligence needs.Data Science and Machine Learning modeling. Collaborate with cross-functional technical, product and business teams to take ownership of data projects to ensure a complete end to end customer experience.Research and recommend technologies to build data solutions at scale with near real-time processing of data using service oriented architecture. Improve project delivery and decrease process redundancy and overhead.Foster a lean agile development culture within the team with emphasis on code quality and software best practices. Join the foundational core data engineering team and play an instrumental role in hiring your future teammates. 227 | 228 | 229 | What You Should Have 230 | 231 | 232 | BA/BS in business, computer science; or similar degree in a related field or equivalent experience with demonstrated proficiency3+ years of hands-on experience building big data pipelines using streaming technologies (Kafka, Spark, or similar) in cloud environments. 
5+ years experience in database technologies, including RDBMS, NoSQL, Document storage, graphs, and distributed file systems.Advanced skills with functional programming languages (Scala, Python, R, Java, or similar)Experience with data warehouse architecture and data modeling for Business Intelligence.Working knowledge of BI tools (Tableau, Looker, Snowflake, or similar)Excellent attention to detail and focused on execution through rapid iterations. Self Motivated individual with strong ethics who brings the best version of themselves to raise the bar for the entire team. 233 | 234 | 235 | What Makes You A Great Fit 236 | 237 | 238 | Startup experienceSubject matter knowledge on retail industryExperience building SAAS software 239 | 240 | 241 | What We Offer 242 | • Competitive salary• Healthcare benefits• 401K• Commuter benefits• Major role in a strong, small and growing development team• Be a part of a key platform of product offerings to the retail convenience store industry 243 | What are your goals and aspirations? Build your technical skills, business acumen, and leadership with Skupos.",1105 244 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Harsh Bardhan Mishra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Other/Course_webpages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The purpose of this notebook is to try to retrieve the web addresses for the courses. They do not seem to be available through the API." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import requests\n", 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 8, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Request status for page 0 is 200.\n", 31 | "Request status for page 1 is 200.\n", 32 | "Finished. 
The number of courses gotten from the catalog is 200\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Get the entire Coursera catalog.\n", 38 | "\n", 39 | "# Instantiate a list to hold the courses\n", 40 | "courses = []\n", 41 | "\n", 42 | "# Set the base url for making get requests\n", 43 | "base_url = 'https://api.coursera.org/api/courses.v1'\n", 44 | "\n", 45 | "# Add the fields I want to include in my requests\n", 46 | "fields = \"&fields=previewLink,photoURL\"\n", 47 | "\n", 48 | "# Loop through all 45 pages of the catalog\n", 49 | "for page in range(2):\n", 50 | " \n", 51 | " # set pagination\n", 52 | " pagination = f\"?start={page*100}&limit=100\"\n", 53 | "\n", 54 | " # make a request\n", 55 | " res = requests.get(base_url + pagination + fields)\n", 56 | " print(f'Request status for page {page} is {res.status_code}.')\n", 57 | " \n", 58 | " # convert from json\n", 59 | " dict = res.json()\n", 60 | " \n", 61 | " # add to the catalog dictionary\n", 62 | " for course in dict['elements']:\n", 63 | " courses.append(course)\n", 64 | " \n", 65 | " # delay time to next request\n", 66 | " time.sleep(2)\n", 67 | "\n", 68 | "print(f'Finished. The number of courses gotten from the catalog is {len(courses)}')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 9, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "(200, 4)\n" 81 | ] 82 | }, 83 | { 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | "
courseTypeidslugname
0v2.ondemand69Bku0KoEeWZtA4u62x6lQgamificationGamification
1v2.ondemand0HiU7Oe4EeWTAQ4yevf_oQmissing-dataDealing With Missing Data
2v2.ondemandsI_-QEBiEemtDRLx7Ne8jgcs-fundamentals-3Unordered Data Structures
3v2.ondemand5zjIsJq-EeW_wArffOXkOwvital-signsVital Signs: Understanding What the Body Is Te...
4v2.ondemandWFanvtoSEeedbRLwgi9a7Afintech-disruptionFinTech Disruptive Innovation: Implications fo...
\n", 148 | "
" 149 | ], 150 | "text/plain": [ 151 | " courseType id slug \\\n", 152 | "0 v2.ondemand 69Bku0KoEeWZtA4u62x6lQ gamification \n", 153 | "1 v2.ondemand 0HiU7Oe4EeWTAQ4yevf_oQ missing-data \n", 154 | "2 v2.ondemand sI_-QEBiEemtDRLx7Ne8jg cs-fundamentals-3 \n", 155 | "3 v2.ondemand 5zjIsJq-EeW_wArffOXkOw vital-signs \n", 156 | "4 v2.ondemand WFanvtoSEeedbRLwgi9a7A fintech-disruption \n", 157 | "\n", 158 | " name \n", 159 | "0 Gamification \n", 160 | "1 Dealing With Missing Data \n", 161 | "2 Unordered Data Structures \n", 162 | "3 Vital Signs: Understanding What the Body Is Te... \n", 163 | "4 FinTech Disruptive Innovation: Implications fo... " 164 | ] 165 | }, 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "# Convert the dictionary to DataFrame\n", 173 | "\n", 174 | "catalog_df = pd.DataFrame(courses)\n", 175 | "print(catalog_df.shape)\n", 176 | "catalog_df.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.7.6" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 4 208 | } 209 | -------------------------------------------------------------------------------- /Other/Coursera_data_collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook makes a preliminary attempt to pull data from the Coursera 
API." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "base_url = 'https://api.coursera.org/api/courses.v1'" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "200" 37 | ] 38 | }, 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "# Search for courses on machine learning\n", 46 | "res = requests.get(base_url + '?q=search&query=machine+learning')\n", 47 | "res.status_code" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 6, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Convert JSON code to Python dictionary\n", 57 | "ml = res.json()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 7, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "dict_keys(['elements', 'paging', 'linked'])" 69 | ] 70 | }, 71 | "execution_count": 7, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# Check the keys of the dictionary\n", 78 | "ml.keys()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 19, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "{'courseType': 'v2.ondemand',\n", 90 | " 'id': 'Gtv4Xb1-EeS-ViIACwYKVQ',\n", 91 | " 'slug': 'machine-learning',\n", 92 | " 'name': 'Machine Learning'}" 93 | ] 94 | }, 95 | "execution_count": 19, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "# Check the first element (a course)\n", 102 | "ml['elements'][0]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | 
"execution_count": 21, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "'/Gtv4Xb1-EeS-ViIACwYKVQ'" 114 | ] 115 | }, 116 | "execution_count": 21, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "# Get the course id for this course\n", 123 | "ml_id = ml['elements'][0]['id']\n", 124 | "ml_id = '/' + ml_id # need to add the backslash for the request\n", 125 | "ml_id" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 22, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "200" 137 | ] 138 | }, 139 | "execution_count": 22, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "# Now make another request for this specific course\n", 146 | "res2 = requests.get(base_url + ml_id)\n", 147 | "res2.status_code" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 25, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "dict_keys(['elements', 'paging', 'linked'])" 159 | ] 160 | }, 161 | "execution_count": 25, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "ml_course = res2.json()\n", 168 | "ml_course.keys()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 28, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "[{'courseType': 'v2.ondemand',\n", 180 | " 'id': 'Gtv4Xb1-EeS-ViIACwYKVQ',\n", 181 | " 'slug': 'machine-learning',\n", 182 | " 'name': 'Machine Learning'}]" 183 | ] 184 | }, 185 | "execution_count": 28, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "ml_course['elements']" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 37, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | 
"text/plain": [ 202 | "200" 203 | ] 204 | }, 205 | "execution_count": 37, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "# Now try requesting the course with additional fields\n", 212 | "\n", 213 | "fields = \"?ids=Gtv4Xb1-EeS-ViIACwYKVQ&fields=description\"\n", 214 | "\n", 215 | "res3 = requests.get(base_url + fields)\n", 216 | "res3.status_code" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 38, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "dict_keys(['elements', 'paging', 'linked'])" 228 | ] 229 | }, 230 | "execution_count": 38, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "ml_fields = res3.json()\n", 237 | "ml_fields.keys()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 39, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "[{'courseType': 'v2.ondemand',\n", 249 | " 'description': \"Machine learning is the science of getting computers to act without being explicitly programmed. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. Many researchers also think it is the best way to make progress towards human-level AI. In this class, you will learn about the most effective machine learning techniques, and gain practice implementing them and getting them to work for yourself. More importantly, you'll learn about not only the theoretical underpinnings of learning, but also gain the practical know-how needed to quickly and powerfully apply these techniques to new problems. 
Finally, you'll learn about some of Silicon Valley's best practices in innovation as it pertains to machine learning and AI.\\n\\nThis course provides a broad introduction to machine learning, datamining, and statistical pattern recognition. Topics include: (i) Supervised learning (parametric/non-parametric algorithms, support vector machines, kernels, neural networks). (ii) Unsupervised learning (clustering, dimensionality reduction, recommender systems, deep learning). (iii) Best practices in machine learning (bias/variance theory; innovation process in machine learning and AI). The course will also draw from numerous case studies and applications, so that you'll also learn how to apply learning algorithms to building smart robots (perception, control), text understanding (web search, anti-spam), computer vision, medical informatics, audio, database mining, and other areas.\",\n", 250 | " 'id': 'Gtv4Xb1-EeS-ViIACwYKVQ',\n", 251 | " 'slug': 'machine-learning',\n", 252 | " 'name': 'Machine Learning'}]" 253 | ] 254 | }, 255 | "execution_count": 39, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "ml_fields['elements']" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 40, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "{'total': 1}" 273 | ] 274 | }, 275 | "execution_count": 40, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "ml_fields['paging']" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 41, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "{}" 293 | ] 294 | }, 295 | "execution_count": 41, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "ml_fields['linked']" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 
308 | "outputs": [], 309 | "source": [] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Python 3", 315 | "language": "python", 316 | "name": "python3" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 3 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython3", 328 | "version": "3.7.6" 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 4 333 | } 334 | -------------------------------------------------------------------------------- /Other/Coursera_review_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook pulls course reviews from Coursera." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 5, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "200" 30 | ] 31 | }, 32 | "execution_count": 5, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "url = \"https://www.coursera.org/learn/machine-learning/reviews?page=1&sort=recent\"\n", 39 | "res = requests.get(url)\n", 40 | "res.status_code" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 8, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "soup = BeautifulSoup(res.content, 'lxml')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 10, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "reviews_list = soup.find('div', {'data-e2e': 'reviews-list'})" 59 | ] 60 | }, 61 | { 62 | 
"cell_type": "code", 63 | "execution_count": 14, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "review_text = reviews_list.find_all('div', {'class': \"reviewText\"})" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 20, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "reviews = []\n", 77 | "for review in review_text:\n", 78 | " reviews.append(review.text)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 21, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "\"Thank you so much Prof. Andrew. I am very thankful to coursera for providing such a valuable course. For me math is love and Andrew's mathematical explanation behind every concept was a great plus point for me and the course wasn't easy or too hard either. Again thanks to coursera and Andrew sir for growing my interest in Machine Learning. Thanks.\"" 90 | ] 91 | }, 92 | "execution_count": 21, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "reviews[0]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.7.6" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 4 130 | } 131 | -------------------------------------------------------------------------------- /Other/Coursetalk_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook scrapes reviews from coursetalk.com" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests\n", 17 | "from bs4 import BeautifulSoup\n", 18 | "import pandas as pd" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "200" 30 | ] 31 | }, 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "url = \"https://www.coursetalk.com/providers/coursera/courses/machine-learning\"\n", 39 | "res = requests.get(url)\n", 40 | "res.status_code" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "soup = BeautifulSoup(res.content, 'lxml')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 20, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "review_list = soup.find('div', {'class': 'reviews-list'})" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 34, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "rating_list = review_list.find_all('meta', {'itemprop': 'ratingValue'})" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 37, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "ratings = []\n", 77 | "for rating in rating_list:\n", 78 | " ratings.append(rating['content'])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 38, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "['10',\n", 90 | " '10',\n", 91 | " '8',\n", 92 | " '10',\n", 93 | " '10',\n", 94 | " '8',\n", 95 | " '8',\n", 96 | " '10',\n", 97 | " '10',\n", 98 | " '10',\n", 99 | " '10',\n", 100 | " '10',\n", 101 | " '8',\n", 102 | " 
'10',\n", 103 | " '10',\n", 104 | " '10',\n", 105 | " '8',\n", 106 | " '10',\n", 107 | " '10',\n", 108 | " '10',\n", 109 | " '10',\n", 110 | " '10',\n", 111 | " '10',\n", 112 | " '10',\n", 113 | " '8',\n", 114 | " '10',\n", 115 | " '8',\n", 116 | " '10',\n", 117 | " '8',\n", 118 | " '8']" 119 | ] 120 | }, 121 | "execution_count": 38, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "ratings" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.7.6" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 4 159 | } 160 | -------------------------------------------------------------------------------- /Other/Indeed_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The purpose of this notebook is to pull job descriptions off Indeed (using RapidAPI)." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 4, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "{\"context\":{\"results_count\":519,\"current_page\":1,\"page_count\":52,\"search_url\":\"https://www.indeed.com/jobs?q=data+scientist&l=san+francisco&start=00\"},\"jobs\":[{\"job_id\":\"54ab161b978a26b3\",\"company\":\"University of California San Francisco\",\"company_url\":\"https://www.indeed.com/cmp/University-of-California---SAN-Francisco\",\"company_rating\":4.2,\"location\":\"San Francisco, CA 94143 (Haight-Ashbury area)\",\"description\":\"Additionally, this position requires strong multitasking skills as the Research Data Scientist may also support data management and analysis needs of…\",\"publication_date\":\"2020-05-28T12:00:00.000Z\"},{\"job_id\":\"281c22f20aaf3dc9\",\"company\":\"Twitter\",\"company_url\":\"https://www.indeed.com/cmp/Twitter\",\"company_rating\":4.1,\"location\":\"San Francisco, CA 94103 (South of Market area)\",\"description\":\"You’re passionate to work on large datasets to generate knowledge on behaviors and trends and have a diverse interest and skill set covering data analysis,…\",\"publication_date\":\"2020-05-06T12:00:00.000Z\"},{\"job_id\":\"7e5b1dd0315dd25a\",\"company\":\"Pinterest\",\"company_url\":\"https://www.indeed.com/cmp/Pinterest\",\"company_rating\":4.2,\"location\":\"San Francisco, CA 94103 (South of Market area)\",\"description\":\"6+ years of industry experience with proven ability to apply scientific methods to solve real-world problems on web-scale data.\",\"publication_date\":\"2020-05-31T12:00:00.000Z\"},{\"job_id\":\"544e3faafc2bf1d7\",\"company\":\"Blue Owl\",\"location\":\"San Francisco, CA\",\"salary\":\"$200,000 - $350,000 a 
year\",\"description\":\"Demonstrable expertise building and supporting machine learning models deployed to production. You have built time series models using econometric approaches as…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"45811e1c376e78ca\",\"company\":\"Adobe\",\"company_url\":\"https://www.indeed.com/cmp/Adobe\",\"company_rating\":4.3,\"location\":\"San Francisco, CA 94107 (South of Market area)\",\"description\":\"Exposure to applied machine learning in an industrial setting. This will involve thinking hard about product quality, the role of machine learning in those…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"ecacf58f2f41884b\",\"company\":\"Notion\",\"location\":\"San Francisco, CA 94110 (Mission area)\",\"description\":\"You have experience building predictive statistical and machine learning models, and you can build reproducible backtests for proposed models to demonstrate…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"7ce9e0a3bb536d7a\",\"company\":\"SentiLink\",\"location\":\"San Francisco, CA 94103 (South of Market area)\",\"description\":\"A graduate degree in a technical field and 1+ years relevant work experience OR 3+ years relevant work experience (e.g. data scientist, machine learning…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"69f3b39791a4d24d\",\"company\":\"Eaze\",\"location\":\"San Francisco, CA\",\"description\":\"Building production data science models utilized by many departments at Eaze as well as our core product, including predictive, vehicle routing, monte-carlo,…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"19559f2a996703a4\",\"company\":\"Y Combinator\",\"location\":\"San Francisco, CA 94108 (Chinatown area)\",\"description\":\"Build machine learning models to support admissions processes. 
You will be the point person for data pipeline, analysis and modeling efforts, primarily focusing…\",\"publication_date\":\"30+ days ago\"},{\"job_id\":\"b575ede49cdd6689\",\"company\":\"The Climate Corporation\",\"company_url\":\"https://www.indeed.com/cmp/The-Climate-Corporation\",\"company_rating\":3.6,\"location\":\"San Francisco, CA\",\"description\":\"Working with engineering and scientific leaders, you will set the strategic direction of productizing large scale scientific problems that inform our products…\",\"publication_date\":\"30+ days ago\"}]}\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "# Use the code copied from Rapid API.\n", 34 | "\n", 35 | "url = \"https://indeed9.p.rapidapi.com/search\"\n", 36 | "\n", 37 | "payload = \"page=1&position=data%20scientist&city=san%20francisco\"\n", 38 | "headers = {\n", 39 | " 'x-rapidapi-host': \"indeed9.p.rapidapi.com\",\n", 40 | " 'x-rapidapi-key': \"\",\n", 41 | " 'content-type': \"application/x-www-form-urlencoded\"\n", 42 | " }\n", 43 | "\n", 44 | "response = requests.request(\"POST\", url, data=payload, headers=headers)\n", 45 | "\n", 46 | "print(response.text)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 5, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "jobs = response.json()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "dict_keys(['context', 'jobs'])" 67 | ] 68 | }, 69 | "execution_count": 6, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "jobs.keys()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "{'results_count': 519,\n", 87 | " 'current_page': 1,\n", 88 | " 'page_count': 52,\n", 89 | " 'search_url': 'https://www.indeed.com/jobs?q=data+scientist&l=san+francisco&start=00'}" 90 | ] 
91 | }, 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "jobs['context']" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "{'job_id': '54ab161b978a26b3',\n", 110 | " 'company': 'University of California San Francisco',\n", 111 | " 'company_url': 'https://www.indeed.com/cmp/University-of-California---SAN-Francisco',\n", 112 | " 'company_rating': 4.2,\n", 113 | " 'location': 'San Francisco, CA 94143 (Haight-Ashbury area)',\n", 114 | " 'description': 'Additionally, this position requires strong multitasking skills as the Research Data Scientist may also support data management and analysis needs of…',\n", 115 | " 'publication_date': '2020-05-28T12:00:00.000Z'}" 116 | ] 117 | }, 118 | "execution_count": 8, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "# What does one job entry look like?\n", 125 | "# Does not include the full description!\n", 126 | "\n", 127 | "jobs['jobs'][0]" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "10" 139 | ] 140 | }, 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "# Only getting ten jobs per request.\n", 148 | "len(jobs['jobs'])" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 3", 162 | "language": "python", 163 | "name": "python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": 
"text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.7.3" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 4 180 | } 181 | -------------------------------------------------------------------------------- /Other/coursera_description.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "df = pd.read_csv('../Data/Course_Data/Coursera_Catalog.csv')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/html": [ 29 | "
\n", 30 | "\n", 43 | "\n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | "
courseTypedescriptiondomainTypesidslugspecializationsworkloadprimaryLanguagescertificatesname
0v2.ondemandGamification is the application of game elemen...[{'subdomainId': 'design-and-product', 'domain...69Bku0KoEeWZtA4u62x6lQgamification[]4-8 hours/week['en']['VerifiedCert']Gamification
1v2.ondemandThis course will cover the steps used in weigh...[{'subdomainId': 'data-analysis', 'domainId': ...0HiU7Oe4EeWTAQ4yevf_oQmissing-data[]4 weeks of study, 1-2 hours/week['en']['VerifiedCert', 'Specialization']Dealing With Missing Data
\n", 88 | "
" 89 | ], 90 | "text/plain": [ 91 | " courseType description \\\n", 92 | "0 v2.ondemand Gamification is the application of game elemen... \n", 93 | "1 v2.ondemand This course will cover the steps used in weigh... \n", 94 | "\n", 95 | " domainTypes id \\\n", 96 | "0 [{'subdomainId': 'design-and-product', 'domain... 69Bku0KoEeWZtA4u62x6lQ \n", 97 | "1 [{'subdomainId': 'data-analysis', 'domainId': ... 0HiU7Oe4EeWTAQ4yevf_oQ \n", 98 | "\n", 99 | " slug specializations workload \\\n", 100 | "0 gamification [] 4-8 hours/week \n", 101 | "1 missing-data [] 4 weeks of study, 1-2 hours/week \n", 102 | "\n", 103 | " primaryLanguages certificates \\\n", 104 | "0 ['en'] ['VerifiedCert'] \n", 105 | "1 ['en'] ['VerifiedCert', 'Specialization'] \n", 106 | "\n", 107 | " name \n", 108 | "0 Gamification \n", 109 | "1 Dealing With Missing Data " 110 | ] 111 | }, 112 | "execution_count": 3, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "df.head(2)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "'By enrolling in this specialization you agree to the Qwiklabs Terms of Service as set out in the FAQ and located at: htt...'" 130 | ] 131 | }, 132 | "execution_count": 10, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "df.iloc[920]['description'][0:120] + \"...\"" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | 
"nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.7.3" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn app:app 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MargSetu 2 | 3 |

4 | 5 | Logo 6 | 7 | 8 | ## 📌 Introduction 9 | 10 | This Web Application powered by Machine Learning and Flask API is a Recommender System which can be used to recommend Massive Open Online Courses (MOOCs) to Students and 11 | Professionals according to their needs and proficiency. The [Dataset](https://github.com/HarshCasper/MargSetu/tree/master/Data) used to process the Data Model and power the Application has been scraped from Coursera's Public Catalog, which consists of more than 4,000 Courses for various roles such as Data Scientist, DevOps Engineer and Cloud Developer. Using Gensim for Natural Language Processing, a Doc2Vec Model was used to generate predictions for a given role. 12 | 13 | ## 🎯 Purpose of the Project 14 | 15 | Massive Open Online Courses (MOOCs) are increasingly being relied on by Students and Professionals to learn new skills and get the know-how of various technologies and toolkits. While this has raised awareness among people, it has also led them to take multiple unreliable course materials that simply don't do them justice. This Machine Learning Application tries to recommend the appropriate courses to Students and Professionals according to the Job-Profile they are aiming for. 16 | 17 | Our Model performs fairly well when it comes to recommending the appropriate courses and hence allows the right recommendations to be generated as per the technology or tooling that someone is aiming to learn. 18 | 19 | ## 🏁 Technology Stack 20 | 21 | * [Flask](https://github.com/pallets/flask) 22 | * [HTML](https://www.w3.org/TR/html52/) 23 | * [CSS](https://developer.mozilla.org/en-US/docs/Web/CSS) 24 | * [Gensim](https://pypi.org/project/gensim/) 25 | * [Pandas](https://pandas.pydata.org/) 26 | 27 | ## 🏃‍♂️ Local Installation 28 | 29 | 1. Drop a ⭐ on the Github Repository. 30 | 2. 
Clone the Repo by going to your local Git Client and running the command: 31 | 32 | ```sh 33 | git clone https://github.com/HarshCasper/MargSetu.git 34 | ``` 35 | 36 | 3. Install the Packages: 37 | ```sh 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | 4. At last, run the command: 42 | ```sh 43 | python app.py 44 | ``` 45 | 46 | 5. Go to `http://127.0.0.1:5000/` and enjoy the application. 47 | 48 | ## 📋 Further Changes to be Done 49 | 50 | - [ ] Deploying the Web Application on Cloud. 51 | - [ ] Development of the Model using Tensorflow/PyTorch. 52 | - [ ] Enhance the User-Interface using HTML/CSS. 53 | - [ ] Set the Application on Docker. 54 | - [ ] Improve the Quality of Predictions. 55 | - [ ] Add a more interactive User-Interface and integrate various other parameters. 56 | 57 | ## 📜 LICENSE 58 | 59 | [MIT](https://github.com/HarshCasper/MargSetu/blob/master/LICENSE) 60 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | from flask import Flask, request, Response, render_template, jsonify 4 | import gensim 5 | import pandas as pd 6 | 7 | # Initializing the Flask Application 8 | app = Flask('cr_app') 9 | 10 | # Route 1: Shows a Form to the User to fill in 11 | @app.route('/') 12 | def home(): 13 | return render_template('form.html') 14 | 15 | # Route 2: Accept the Form Submission and process it 16 | @app.route('/submit') 17 | def submit(): 18 | jd = request.args["JobDesc"] 19 | doc = gensim.utils.simple_preprocess(jd) 20 | model = pickle.load(open('./model.p', 'rb')) 21 | vector = model.infer_vector(doc) 22 | top = 5 23 | sims = model.docvecs.most_similar([vector], topn=top) 24 | course_ids = [sim[0] for sim in sims] 25 | df = pd.read_csv('./Data/Course_Data/Coursera_Catalog.csv') 26 | course_names = [df.iloc[id]['name'] for id in course_ids] 27 | course_descriptions = 
[df.iloc[id]['description'][0:150] + "..." for id in course_ids] 28 | return render_template('results.html', len=top, names=course_names, descriptions=course_descriptions) 29 | 30 | if __name__ == '__main__': 31 | app.run(debug=True) 32 | -------------------------------------------------------------------------------- /model.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarshCasper/MargSetu/fce01bddb1672d33cf74cdd7338894e83013ec0e/model.p -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | attrs==19.3.0 2 | autopep8==1.4.4 3 | Babel==2.8.0 4 | backcall==0.1.0 5 | backports.functools-lru-cache==1.6.1 6 | backports.shutil-get-terminal-size==1.0.0 7 | backports.tempfile==1.0 8 | backports.weakref==1.0.post1 9 | beautifulsoup4==4.8.2 10 | bitarray==1.2.1 11 | bkcharts==0.2 12 | bleach==3.1.4 13 | bokeh==2.0.0 14 | boto==2.49.0 15 | boto3==1.13.5 16 | botocore==1.16.5 17 | Bottleneck==1.3.2 18 | branca==0.4.1 19 | cachetools==4.1.0 20 | certifi==2019.11.28 21 | cffi==1.14.0 22 | cftime==1.1.3 23 | chardet==3.0.4 24 | Click==7.0 25 | click-plugins==1.1.1 26 | cligj==0.5.0 27 | cloudpickle==1.3.0 28 | clyent==1.2.2 29 | colorama==0.4.3 30 | conda==4.8.3 31 | conda-build==3.18.11 32 | conda-package-handling==1.6.0 33 | conda-verify==3.4.2 34 | gast==0.3.3 35 | gensim==3.8.3 36 | geopandas==0.7.0 37 | gevent==1.4.0 38 | glob2==0.7 39 | gmpy2==2.0.8 40 | greenlet==0.4.15 41 | grpcio==1.29.0 42 | gunicorn==20.0.4 43 | h5py==2.10.0 44 | HeapDict==1.0.1 45 | html5lib==1.0.1 46 | hypothesis==5.5.4 47 | inflection==0.4.0 48 | isort==4.3.21 49 | itsdangerous==1.1.0 50 | jdcal==1.4.1 51 | jedi==0.14.1 52 | jellyfish==0.6.1 53 | Jinja2==2.11.1 54 | jmespath==0.9.5 55 | joblib==0.14.1 56 | json5==0.9.1 57 | jsonschema==3.2.0 58 | jupyter==1.0.0 59 | jupyter-client==5.3.4 60 | 
jupyter-console==6.1.0 61 | jupyter-core==4.6.1 62 | jupyterlab==1.2.6 63 | jupyterlab-server==1.0.6 64 | keyring==21.1.0 65 | kiwisolver==1.1.0 66 | lazy-object-proxy==1.4.3 67 | libarchive-c==2.8 68 | lief==0.9.0 69 | llvmlite==0.31.0 70 | locket==0.2.0 71 | lxml==4.5.0 72 | Markdown==3.2.2 73 | MarkupSafe==1.1.1 74 | matplotlib==3.2.1 75 | mccabe==0.6.1 76 | mistune==0.8.4 77 | mkl-fft==1.0.15 78 | mkl-random==1.1.0 79 | mkl-service==2.3.0 80 | mock==4.0.1 81 | more-itertools==8.2.0 82 | mpmath==1.1.0 83 | msgpack==0.6.1 84 | multipledispatch==0.6.0 85 | multitasking==0.0.9 86 | munch==2.5.0 87 | navigator-updater==0.2.1 88 | nbconvert==5.6.1 89 | nbformat==5.0.4 90 | netCDF4==1.5.3 91 | networkx==2.4 92 | nltk==3.4.5 93 | nose==1.3.7 94 | notebook==6.0.3 95 | numba==0.48.0 96 | numexpr==2.7.1 97 | numpy==1.18.2 98 | numpydoc==0.9.2 99 | olefile==0.46 100 | openpyxl==3.0.3 101 | opt-einsum==3.2.1 102 | packaging==20.1 103 | pandas==1.0.3 104 | pandocfilters==1.4.2 105 | parso==0.5.2 106 | partd==1.1.0 107 | path==13.1.0 108 | pathlib2==2.3.5 109 | pathtools==0.1.2 110 | patsy==0.5.1 111 | pep8==1.7.1 112 | pexpect==4.8.0 113 | pickleshare==0.7.5 114 | pipenv==2020.6.2 115 | pkginfo==1.5.0.1 116 | requests==2.22.0 117 | rope==0.16.0 118 | Rtree==0.9.3 119 | ruamel-yaml==0.15.87 120 | scikit-learn==0.22.2.post1 121 | scipy==1.4.1 122 | seaborn==0.10.0 123 | spyder-kernels==1.8.1 124 | SQLAlchemy==1.3.13 125 | sqlparse==0.3.1 126 | statsmodels==0.11.0 127 | sympy==1.5.1 128 | tables==3.6.1 129 | tblib==1.6.0 130 | urllib3==1.25.8 131 | virtualenv==20.0.21 132 | virtualenv-clone==0.5.4 133 | webencodings==0.5.1 134 | Werkzeug==1.0.0 135 | -------------------------------------------------------------------------------- /static/css/styles.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: sans-serif; 3 | margin: 0; 4 | padding: 0; 5 | } 6 | 7 | header { 8 | text-align: center; 9 | color: black; 10 | 
font-family: Arial Black, sans-serif; 11 | /* font-variant: small-caps; */ 12 | font-size: 23px; 13 | font-weight: 700; 14 | font-style: normal; 15 | padding-top: 2rem; 16 | /* padding-bottom: 0rem; */ 17 | } 18 | 19 | footer { 20 | text-align: center; 21 | padding-top: 1rem; 22 | padding-bottom: 2rem; 23 | } 24 | 25 | .submission-form { 26 | text-align: center; 27 | padding-left: 1rem; 28 | padding-right: 1rem; 29 | } 30 | 31 | textarea { 32 | margin: auto; 33 | outline: none; 34 | resize: none; 35 | text-align: left; 36 | font-family: sans-serif; 37 | } 38 | 39 | button { 40 | padding: 10px; 41 | font-weight: 600; 42 | } 43 | 44 | .results-list { 45 | padding-top: 2rem; 46 | padding-bottom: 0rem; 47 | padding-left: 2rem; 48 | padding-right: 2rem; 49 | } 50 | 51 | dt { 52 | font-weight: bold; 53 | } 54 | 55 | .button-container { 56 | text-align: center; 57 | } 58 | 59 | .info-container { 60 | text-align: center; 61 | display: inline-block; 62 | } 63 | 64 | .info { 65 | color: gray; 66 | } 67 | -------------------------------------------------------------------------------- /templates/form.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Career Course Recommender 8 | 9 | 10 | 11 | 12 | 13 | 14 |

15 |

Career Course Recommender

16 |
17 |
18 |
19 |
20 |

21 | 22 |
23 |
24 |
25 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /templates/results.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Career Course Recommender 8 | 9 | 10 | 11 | 12 | 13 |
14 |
15 |
16 | {%for i in range(len)%} 17 |
{{names[i]}}
18 |
{{descriptions[i]}}
19 |
20 | {%endfor%} 21 |
22 |
23 |
24 | 25 |
26 |
27 |
28 |
29 | 34 | 35 | 36 | 37 | --------------------------------------------------------------------------------