├── 01_PQ_YT_Notebook.ipynb ├── 01_ValueInvestorHoldings31122021.xlsx ├── 02_DataCleaning_Video.ipynb ├── 05_ValueInvestorHoldings31122022_2.xlsx ├── 06_ValueInvestorHoldings31032023.xlsx ├── 28_Lost_Customers_NW_Video.pbix ├── 28_New_Customers_NW_video.pbix ├── ActualDataset.xlsx ├── BaSensei_webscrape_Bitcoin_YTVideo.ipynb ├── BrokerStatement.xlsx ├── BrokerStatement2.xlsx ├── BrokerStatement3.xlsx ├── CleanderDataScience.csv ├── DAX_Dates.txt ├── DataProfiling_Video.ipynb ├── DataScientist2.csv ├── DataScientist3.html ├── File1.xlsx ├── File3.xlsx ├── Holdings_30062022.xlsx ├── Holdings_30092022.xlsx ├── Holdings_31032022.xlsx ├── Holdings_31122022.xlsx ├── HorizontalDatasheet.xlsx ├── PBI_CrossOverChart_SVB_CS.pbix ├── PBI_Holdings_Source.xlsx ├── PBI_MOM_Video.pbix ├── PBI_Simple_Moving_Average_Video.pbix ├── PQ_Advanced_Grouping_With_Max.xlsx ├── PQ_AllSeasons_Video_2.xlsx ├── PQ_BUFFER_VIDEO.xlsx ├── PQ_BankStatement_Video.xlsx ├── PQ_BlankRow_video.xlsx ├── PQ_Budget_Video.xlsx ├── PQ_ChatGPT_MarketVideo.pbix ├── PQ_ClosestNextHoliday_Video.xlsx ├── PQ_ColumnGroups_BaSensei_Video.xlsx ├── PQ_Consecutivenumbers_Video2.xlsx ├── PQ_Counter_Column_Video.xlsx ├── PQ_DOUBLE_BARREL_2_Video.xlsx ├── PQ_DoubleBarrel_Video.xlsx ├── PQ_Double_Headers_YT_Video.xlsx ├── PQ_Dups_Video.xlsx ├── PQ_DynamicSplit_Video.xlsx ├── PQ_Dynamic_Split_Header_Names.xlsx ├── PQ_Dynamic_T_B_Video.xlsx ├── PQ_EACH_VIDEO.xlsx ├── PQ_EMAIL_VIDEO.xlsx ├── PQ_Expense_Allocations_Video.xlsx ├── PQ_FILTER_PARAM_VIDEO.xlsx ├── PQ_FILTER_TEXT_INPUT_OR_POSITION.xlsx ├── PQ_Filter_Before_Expand_Video.xlsx ├── PQ_Filter_Columns_Once_Video.xlsx ├── PQ_Grouping_Video_2.xlsx ├── PQ_HorzontalStack_Video.xlsx ├── PQ_Jagged_Stacked_Video.xlsx ├── PQ_Jagged_Tables_Source.xlsx ├── PQ_JunkRows_Video.xlsx ├── PQ_LOOPING_VIDEO_2.pbix ├── PQ_LastNTotal_Video.xlsx ├── PQ_ListAccum_NewColumns_Video.xlsx ├── PQ_ListAlernate_Video.xlsx ├── PQ_Listaccumulate_video.xlsx ├── PQ_LsatDate_Video.xlsx ├── 
PQ_Max_Value_Row_Video.xlsx ├── PQ_Merge_Video.xlsx ├── PQ_OCCURANCE_COUNT_VIDEO.xlsx ├── PQ_OpenAI_Pyhton.xlsx ├── PQ_PadMiddle_Video.xlsx ├── PQ_ParsingDelimitedDataToTable_Video2.xlsx ├── PQ_Pattern_Extraction_Video.xlsx ├── PQ_PercentageOfTotal_Video.xlsx ├── PQ_Portfolio_Comparer_Video.xlsx ├── PQ_PreviousRowsEmployeeTimeLine.xlsx ├── PQ_PreviousRowsEmployeeTimeLine_Video.xlsx ├── PQ_RatingsData.xlsx ├── PQ_ReportheaderIntoReportVideo.xlsx ├── PQ_ReverseFillDown_Video.xlsx ├── PQ_SPRatings_ListAccum_Source_Video2.xlsx ├── PQ_Song_List_Stacked_Video.xlsx ├── PQ_SourceData_Portfolios.xlsx ├── PQ_StepsProcess_Video.xlsx ├── PQ_Stock_Groupings_Video.xlsx ├── PQ_Subtotals_Video.xlsx ├── PQ_TextReverseSorting_Video.xlsx ├── PQ_Unstack_Data.xlsx ├── PQ_Unstack_Data_Uneven_Video.xlsx ├── PQ_VALUE_ALL_COLUMNS_VIDEO.xlsx ├── PQ_Working Days_2_Video.xlsx ├── PQ_Working Hours_Video.xlsx ├── PQ_bulkReplace_YT_Video.xlsx ├── PQ_combineWithdifferntColumnNames.xlsx ├── Portfolios1.xlsx ├── Portfolios2.xlsx ├── Portfolios3.xlsx ├── PowerQuery_Dynamically_Clean_DataSet_Headers.xlsx ├── Pq_Address_Split_Video.xlsx ├── Pq_AttribuetDescription_Video.xlsx ├── Pq_Conditionally_Replace_Video.xlsx ├── Pq_DynamicSortColumns_Video.xlsx ├── Pq_DynamicTRansformColumnNames_Video.xlsx ├── Pq_Name_Changes_Video.xlsx ├── Pq_Portfolio_Video.xlsx ├── Pq_Query_Referencing_Video.xlsx ├── Pq_SplitDynamically_Source.xlsx ├── StockQuotesData.xlsx ├── StockQuotes_SVB_CS.xlsx ├── Trades_video.xlsx ├── pattern2Video.xlsx ├── us_stock_sales_2024.csv └── us_stock_sales_2024_extended.csv /01_PQ_YT_Notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "collapsed": true, 8 | "ExecuteTime": { 9 | "end_time": "2023-11-24T12:06:28.192716100Z", 10 | "start_time": "2023-11-24T12:06:28.178737Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import openai\n", 16 
| "import os" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 5, 22 | "outputs": [], 23 | "source": [ 24 | "API_Key = \"xxxxx\"\n", 25 | "openai.api_key = API_Key" 26 | ], 27 | "metadata": { 28 | "collapsed": false, 29 | "ExecuteTime": { 30 | "end_time": "2023-11-24T12:06:56.082223300Z", 31 | "start_time": "2023-11-24T12:06:56.079230400Z" 32 | } 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "outputs": [], 39 | "source": [ 40 | "openai.api_key = os.getenv(\"OPENAI_API_KEY\")" 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "ExecuteTime": { 45 | "end_time": "2023-11-24T12:08:05.620389800Z", 46 | "start_time": "2023-11-24T12:08:05.605957900Z" 47 | } 48 | } 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 9, 53 | "outputs": [], 54 | "source": [ 55 | "response = openai.ChatCompletion.create(\n", 56 | " model=\"gpt-3.5-turbo\",\n", 57 | " messages = [{\"role\":\"user\",\"content\": \"Hello World!\"}],\n", 58 | " temperature=1,\n", 59 | " max_tokens=600,\n", 60 | " top_p=1,\n", 61 | " frequency_penalty=0,\n", 62 | " presence_penalty=0\n", 63 | ")" 64 | ], 65 | "metadata": { 66 | "collapsed": false, 67 | "ExecuteTime": { 68 | "end_time": "2023-11-24T12:09:45.928571500Z", 69 | "start_time": "2023-11-24T12:09:45.245885700Z" 70 | } 71 | } 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 10, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "{\n", 82 | " \"id\": \"chatcmpl-8OPI8QjpySYFAf9U2zxOKKbZC5Bg0\",\n", 83 | " \"object\": \"chat.completion\",\n", 84 | " \"created\": 1700827784,\n", 85 | " \"model\": \"gpt-3.5-turbo-0613\",\n", 86 | " \"choices\": [\n", 87 | " {\n", 88 | " \"index\": 0,\n", 89 | " \"message\": {\n", 90 | " \"role\": \"assistant\",\n", 91 | " \"content\": \"Hello! 
How can I assist you today?\"\n", 92 | " },\n", 93 | " \"finish_reason\": \"stop\"\n", 94 | " }\n", 95 | " ],\n", 96 | " \"usage\": {\n", 97 | " \"prompt_tokens\": 10,\n", 98 | " \"completion_tokens\": 9,\n", 99 | " \"total_tokens\": 19\n", 100 | " }\n", 101 | "}\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "print(response)" 107 | ], 108 | "metadata": { 109 | "collapsed": false, 110 | "ExecuteTime": { 111 | "end_time": "2023-11-24T12:09:55.968293500Z", 112 | "start_time": "2023-11-24T12:09:55.964800800Z" 113 | } 114 | } 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 12, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": "'Hello! How can I assist you today?'" 123 | }, 124 | "execution_count": 12, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "response[\"choices\"][0][\"message\"][\"content\"]" 131 | ], 132 | "metadata": { 133 | "collapsed": false, 134 | "ExecuteTime": { 135 | "end_time": "2023-11-24T12:10:56.926079200Z", 136 | "start_time": "2023-11-24T12:10:56.917035700Z" 137 | } 138 | } 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 24, 143 | "outputs": [], 144 | "source": [ 145 | "def ask_chatgpt(messages):\n", 146 | " model_response = openai.ChatCompletion.create(\n", 147 | " model=\"gpt-3.5-turbo\",\n", 148 | " messages=messages,\n", 149 | " temperature=1,\n", 150 | " max_tokens=600,\n", 151 | " top_p=1,\n", 152 | " frequency_penalty=0,\n", 153 | " presence_penalty=0\n", 154 | " )\n", 155 | " return model_response[\"choices\"][0][\"message\"][\"content\"]" 156 | ], 157 | "metadata": { 158 | "collapsed": false, 159 | "ExecuteTime": { 160 | "end_time": "2023-11-24T12:25:18.049776100Z", 161 | "start_time": "2023-11-24T12:25:18.034961700Z" 162 | } 163 | } 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 25, 168 | "outputs": [], 169 | "source": [ 170 | "prompt_role='You are a Financial Analyst who needs to provide Market updates 
for your clients. Context : the daily market information is provided to you daily. Task : Summarize in 140 words the following Market news for a twitter post. The Market news to summarize:'" 171 | ], 172 | "metadata": { 173 | "collapsed": false, 174 | "ExecuteTime": { 175 | "end_time": "2023-11-24T12:25:35.608621800Z", 176 | "start_time": "2023-11-24T12:25:35.599095Z" 177 | } 178 | } 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 26, 183 | "outputs": [], 184 | "source": [ 185 | "Commentary = 'The JSE ended weaker on Tuesday, reversing some of the gains from the previous day. The market was pulled down by resource and industrial stocks. On the currency front, the rand traded firmer against the dollar, mainly supported by recent hawkish remarks from the South African Reserve Bank ahead of a crucial budget statement. The JS E All Share and blue-chip Top 40 Index decreased by 0.3% and 0.6%. European markets closed broadly stronger as investors reacted to a slew of earnings updates and economic data from the region. The UK’s FTSE 100 declined by 0.1%, while Germany’s DAX and France’s CAC 40 advanced by 0.6% and 0.9% respectively. US stocks moved mostly higher over the course of the trading session following some early weakness. Selling pressure faded shortly after the start of trading, as investors seemed reluctant to make significant moves ahead of the Federal Reserves monetary policy announcement on Wednesday. The S&P 500 and the Nasdaq Composite jumped by 0.6% and 0.5%, while the Dow Jones Industrial Average improved by 0.4%. Asian markets are up this morning. 
The Nikkei, the Hang Seng and the Shanghai Composite are 2.1%, 0.1% and 0.3% in the green.'" 186 | ], 187 | "metadata": { 188 | "collapsed": false, 189 | "ExecuteTime": { 190 | "end_time": "2023-11-24T12:25:56.068731600Z", 191 | "start_time": "2023-11-24T12:25:56.055501700Z" 192 | } 193 | } 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 27, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": "196" 202 | }, 203 | "execution_count": 27, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "len(Commentary.split())" 210 | ], 211 | "metadata": { 212 | "collapsed": false, 213 | "ExecuteTime": { 214 | "end_time": "2023-11-24T12:26:11.427217800Z", 215 | "start_time": "2023-11-24T12:26:11.411246500Z" 216 | } 217 | } 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 28, 222 | "outputs": [], 223 | "source": [ 224 | "def assist_analyst(prole, MainComment):\n", 225 | " prompt = f'{prole}{MainComment}'\n", 226 | " return ask_chatgpt([{\"role\": \"user\", \"content\": prompt}])\n" 227 | ], 228 | "metadata": { 229 | "collapsed": false, 230 | "ExecuteTime": { 231 | "end_time": "2023-11-24T12:28:37.191711800Z", 232 | "start_time": "2023-11-24T12:28:37.186727100Z" 233 | } 234 | } 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 29, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": "\"Market Update: JSE weakened, dragged by resource and industrial stocks. Rand traded firmer supported by hawkish statements from SA Reserve Bank ahead of budget statement. European markets closed stronger on earnings and economic data. UK's FTSE 100 declined, Germany's DAX and France's CAC 40 advanced. US stocks mostly higher, investors cautious before Fed's policy announcement. S&P 500 and Nasdaq Composite jumped, Dow Jones Industrial Average improved. Asian markets up this morning, Nikkei, Hang Seng, and Shanghai Composite in the green. 
#marketnews #JSE #rand #Europeanmarkets #USstocks #Asianmarkets\"" 243 | }, 244 | "execution_count": 29, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "assist_analyst(prompt_role, Commentary)" 251 | ], 252 | "metadata": { 253 | "collapsed": false, 254 | "ExecuteTime": { 255 | "end_time": "2023-11-24T12:28:52.661715400Z", 256 | "start_time": "2023-11-24T12:28:48.236489400Z" 257 | } 258 | } 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 30, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "text/plain": "90" 267 | }, 268 | "execution_count": 30, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "len(assist_analyst(prompt_role, Commentary).split())" 275 | ], 276 | "metadata": { 277 | "collapsed": false, 278 | "ExecuteTime": { 279 | "end_time": "2023-11-24T12:29:15.726871Z", 280 | "start_time": "2023-11-24T12:29:11.926260500Z" 281 | } 282 | } 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "outputs": [], 288 | "source": [], 289 | "metadata": { 290 | "collapsed": false 291 | } 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 2 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython2", 310 | "version": "2.7.6" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 0 315 | } 316 | -------------------------------------------------------------------------------- /01_ValueInvestorHoldings31122021.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/01_ValueInvestorHoldings31122021.xlsx -------------------------------------------------------------------------------- /02_DataCleaning_Video.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Import Libraries" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 193, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd" 18 | ], 19 | "metadata": { 20 | "collapsed": false, 21 | "ExecuteTime": { 22 | "end_time": "2023-08-01T18:50:16.909365600Z", 23 | "start_time": "2023-08-01T18:50:16.894906300Z" 24 | } 25 | } 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "source": [ 30 | "# Import Dataframe from CSV" 31 | ], 32 | "metadata": { 33 | "collapsed": false 34 | } 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 194, 39 | "outputs": [], 40 | "source": [ 41 | "df = pd.read_csv('DataScientist2.csv')" 42 | ], 43 | "metadata": { 44 | "collapsed": false, 45 | "ExecuteTime": { 46 | "end_time": "2023-08-01T18:50:17.354097500Z", 47 | "start_time": "2023-08-01T18:50:17.246945900Z" 48 | } 49 | } 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 195, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": " Unnamed: 0 index Job Title \\\n0 0 0 Senior Data Scientist \n1 1 1 Data Scientist, Product Analytics \n2 2 2 Data Science Manager \n3 3 3 Data Analyst \n4 4 4 Director, Data Science \n\n Salary Estimate \\\n0 $111K-$181K (Glassdoor est.) \n1 $111K-$181K (Glassdoor est.) \n2 $111K-$181K (Glassdoor est.) \n3 $111K-$181K (Glassdoor est.) \n4 $111K-$181K (Glassdoor est.) \n\n Job Description Rating \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... 3.5 \n1 At Noom, we use scientifically proven methods ... 
4.5 \n2 Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ... -1.0 \n3 NaN 3.4 \n4 Director, Data Science - (200537)\\nDescription... 3.4 \n\n Company Name Location Headquarters \\\n0 Hopper\\n3.5 New York, NY Montreal, Canada \n1 Noom US\\n4.5 New York, NY New York, NY \n2 Decode_M New York, NY New York, NY \n3 Sapphire Digital\\n3.4 Lyndhurst, NJ NaN \n4 United Entertainment Group\\n3.4 New York, NY New York, NY \n\n Size Founded Type of ownership \\\n0 501 to 1000 employees 2007 Company - Private \n1 1001 to 5000 employees 2008 Company - Private \n2 1 to 50 employees -1 Unknown \n3 201 to 500 employees 2019 Company - Private \n4 51 to 200 employees 2007 Company - Private \n\n Industry Sector \\\n0 Travel Agencies Travel & Tourism \n1 Health, Beauty, & Fitness Consumer Services \n2 -1 -1 \n3 NaN Information Technology \n4 NaN Business Services \n\n Revenue Competitors Easy Apply \n0 Unknown / Non-Applicable -1 -1 \n1 Unknown / Non-Applicable -1 -1 \n2 Unknown / Non-Applicable -1 TRUE \n3 Unknown / Non-Applicable Zocdoc, Healthgrades -1 \n4 Unknown / Non-Applicable BBDO, Grey Group, Droga5 -1 ", 58 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0indexJob TitleSalary EstimateJob DescriptionRatingCompany NameLocationHeadquartersSizeFoundedType of ownershipIndustrySectorRevenueCompetitorsEasy Apply
000Senior Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...3.5Hopper\\n3.5New York, NYMontreal, Canada501 to 1000 employees2007Company - PrivateTravel AgenciesTravel & TourismUnknown / Non-Applicable-1-1
111Data Scientist, Product Analytics$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...4.5Noom US\\n4.5New York, NYNew York, NY1001 to 5000 employees2008Company - PrivateHealth, Beauty, & FitnessConsumer ServicesUnknown / Non-Applicable-1-1
222Data Science Manager$111K-$181K (Glassdoor est.)Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ...-1.0Decode_MNew York, NYNew York, NY1 to 50 employees-1Unknown-1-1Unknown / Non-Applicable-1TRUE
333Data Analyst$111K-$181K (Glassdoor est.)NaN3.4Sapphire Digital\\n3.4Lyndhurst, NJNaN201 to 500 employees2019Company - PrivateNaNInformation TechnologyUnknown / Non-ApplicableZocdoc, Healthgrades-1
444Director, Data Science$111K-$181K (Glassdoor est.)Director, Data Science - (200537)\\nDescription...3.4United Entertainment Group\\n3.4New York, NYNew York, NY51 to 200 employees2007Company - PrivateNaNBusiness ServicesUnknown / Non-ApplicableBBDO, Grey Group, Droga5-1
\n
" 59 | }, 60 | "execution_count": 195, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "df.head()" 67 | ], 68 | "metadata": { 69 | "collapsed": false, 70 | "ExecuteTime": { 71 | "end_time": "2023-08-01T18:50:17.463406200Z", 72 | "start_time": "2023-08-01T18:50:17.454269600Z" 73 | } 74 | } 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 196, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": "(3912, 17)" 83 | }, 84 | "execution_count": 196, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "df.shape" 91 | ], 92 | "metadata": { 93 | "collapsed": false, 94 | "ExecuteTime": { 95 | "end_time": "2023-08-01T18:50:17.619805700Z", 96 | "start_time": "2023-08-01T18:50:17.604856100Z" 97 | } 98 | } 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "source": [ 103 | "# 1 - Dropping Columns" 104 | ], 105 | "metadata": { 106 | "collapsed": false 107 | } 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 197, 112 | "outputs": [], 113 | "source": [ 114 | "to_drop = ['index','Rating','Headquarters','Size','Founded','Type of ownership'\n", 115 | " ,'Revenue','Sector','Easy Apply','Competitors','Unnamed: 0']" 116 | ], 117 | "metadata": { 118 | "collapsed": false, 119 | "ExecuteTime": { 120 | "end_time": "2023-08-01T18:50:18.007820200Z", 121 | "start_time": "2023-08-01T18:50:17.998343700Z" 122 | } 123 | } 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 198, 128 | "outputs": [], 129 | "source": [ 130 | "df.drop(to_drop, inplace=True, axis=1)" 131 | ], 132 | "metadata": { 133 | "collapsed": false, 134 | "ExecuteTime": { 135 | "end_time": "2023-08-01T18:50:18.209356300Z", 136 | "start_time": "2023-08-01T18:50:18.195895800Z" 137 | } 138 | } 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 199, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": "(3912, 6)" 147 | }, 148 | "execution_count": 199, 149 | 
"metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "df.shape" 155 | ], 156 | "metadata": { 157 | "collapsed": false, 158 | "ExecuteTime": { 159 | "end_time": "2023-08-01T18:50:18.410704200Z", 160 | "start_time": "2023-08-01T18:50:18.385899500Z" 161 | } 162 | } 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 200, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": " Job Title Salary Estimate \\\n0 Senior Data Scientist $111K-$181K (Glassdoor est.) \n1 Data Scientist, Product Analytics $111K-$181K (Glassdoor est.) \n2 Data Science Manager $111K-$181K (Glassdoor est.) \n3 Data Analyst $111K-$181K (Glassdoor est.) \n4 Director, Data Science $111K-$181K (Glassdoor est.) \n\n Job Description \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... \n1 At Noom, we use scientifically proven methods ... \n2 Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ... \n3 NaN \n4 Director, Data Science - (200537)\\nDescription... \n\n Company Name Location Industry \n0 Hopper\\n3.5 New York, NY Travel Agencies \n1 Noom US\\n4.5 New York, NY Health, Beauty, & Fitness \n2 Decode_M New York, NY -1 \n3 Sapphire Digital\\n3.4 Lyndhurst, NJ NaN \n4 United Entertainment Group\\n3.4 New York, NY NaN ", 171 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
0Senior Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies
1Data Scientist, Product Analytics$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness
2Data Science Manager$111K-$181K (Glassdoor est.)Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ...Decode_MNew York, NY-1
3Data Analyst$111K-$181K (Glassdoor est.)NaNSapphire Digital\\n3.4Lyndhurst, NJNaN
4Director, Data Science$111K-$181K (Glassdoor est.)Director, Data Science - (200537)\\nDescription...United Entertainment Group\\n3.4New York, NYNaN
\n
" 172 | }, 173 | "execution_count": 200, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "df.head()" 180 | ], 181 | "metadata": { 182 | "collapsed": false, 183 | "ExecuteTime": { 184 | "end_time": "2023-08-01T18:50:18.595738200Z", 185 | "start_time": "2023-08-01T18:50:18.589758100Z" 186 | } 187 | } 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "source": [ 192 | "# 2 - Removing Duplicates" 193 | ], 194 | "metadata": { 195 | "collapsed": false 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 201, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": " Job Title \\\n3906 Security Analytics Data Engineer \n3907 Security Analytics Data Engineer \n3908 Patient Safety Physician or Safety Scientist -... \n3909 Security Analytics Data Engineer \n3910 Security Analytics Data Engineer \n3911 Patient Safety Physician or Safety Scientist -... \n\n Salary Estimate \\\n3906 $55K-$112K (Glassdoor est.) \n3907 $55K-$112K (Glassdoor est.) \n3908 $55K-$112K (Glassdoor est.) \n3909 $55K-$112K (Glassdoor est.) \n3910 $55K-$112K (Glassdoor est.) \n3911 $55K-$112K (Glassdoor est.) \n\n Job Description \\\n3906 Job DescriptionThe Security Analytics Data Eng... \n3907 The Security Analytics Data Engineer will inte... \n3908 Help us transform patients' lives.\\nAt UCB, we... \n3909 Job DescriptionThe Security Analytics Data Eng... \n3910 The Security Analytics Data Engineer will inte... \n3911 Help us transform patients' lives.\\nAt UCB, we... \n\n Company Name Location Industry \n3906 PDS Tech, Inc.\\n3.8 Dublin, OH Staffing & Outsourcing \n3907 Data Resource Technologies\\n4.0 Dublin, OH Accounting \n3908 UCB\\n3.7 Slough, OH Biotech & Pharmaceuticals \n3909 PDS Tech, Inc.\\n3.8 Dublin, OH Staffing & Outsourcing \n3910 Data Resource Technologies\\n4.0 Dublin, OH Accounting \n3911 UCB\\n3.7 Slough, OH Biotech & Pharmaceuticals ", 205 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
3906Security Analytics Data Engineer$55K-$112K (Glassdoor est.)Job DescriptionThe Security Analytics Data Eng...PDS Tech, Inc.\\n3.8Dublin, OHStaffing & Outsourcing
3907Security Analytics Data Engineer$55K-$112K (Glassdoor est.)The Security Analytics Data Engineer will inte...Data Resource Technologies\\n4.0Dublin, OHAccounting
3908Patient Safety Physician or Safety Scientist -...$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\nAt UCB, we...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
3909Security Analytics Data Engineer$55K-$112K (Glassdoor est.)Job DescriptionThe Security Analytics Data Eng...PDS Tech, Inc.\\n3.8Dublin, OHStaffing & Outsourcing
3910Security Analytics Data Engineer$55K-$112K (Glassdoor est.)The Security Analytics Data Engineer will inte...Data Resource Technologies\\n4.0Dublin, OHAccounting
3911Patient Safety Physician or Safety Scientist -...$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\nAt UCB, we...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
\n
" 206 | }, 207 | "execution_count": 201, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "df[df.duplicated(keep = False)]" 214 | ], 215 | "metadata": { 216 | "collapsed": false, 217 | "ExecuteTime": { 218 | "end_time": "2023-08-01T18:50:18.950391100Z", 219 | "start_time": "2023-08-01T18:50:18.934443800Z" 220 | } 221 | } 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 202, 226 | "outputs": [], 227 | "source": [ 228 | "df.drop_duplicates(keep = 'first', inplace=True)" 229 | ], 230 | "metadata": { 231 | "collapsed": false, 232 | "ExecuteTime": { 233 | "end_time": "2023-08-01T18:50:19.180643Z", 234 | "start_time": "2023-08-01T18:50:19.165920500Z" 235 | } 236 | } 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 203, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": "Empty DataFrame\nColumns: [Job Title, Salary Estimate, Job Description, Company Name, Location, Industry]\nIndex: []", 245 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
\n
" 246 | }, 247 | "execution_count": 203, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "df[df.duplicated(keep = False)]" 254 | ], 255 | "metadata": { 256 | "collapsed": false, 257 | "ExecuteTime": { 258 | "end_time": "2023-08-01T18:50:19.396364100Z", 259 | "start_time": "2023-08-01T18:50:19.363076100Z" 260 | } 261 | } 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "source": [ 266 | "# 3 - Remove Irrelevant Rows" 267 | ], 268 | "metadata": { 269 | "collapsed": false 270 | } 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 204, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": "21" 279 | }, 280 | "execution_count": 204, 281 | "metadata": {}, 282 | "output_type": "execute_result" 283 | } 284 | ], 285 | "source": [ 286 | "len(df[df['Salary Estimate'].str.contains('Per Hour', case = False)])" 287 | ], 288 | "metadata": { 289 | "collapsed": false, 290 | "ExecuteTime": { 291 | "end_time": "2023-08-01T18:50:19.764144400Z", 292 | "start_time": "2023-08-01T18:50:19.750184200Z" 293 | } 294 | } 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 205, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": "(3909, 6)" 303 | }, 304 | "execution_count": 205, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "df.shape" 311 | ], 312 | "metadata": { 313 | "collapsed": false, 314 | "ExecuteTime": { 315 | "end_time": "2023-08-01T18:50:19.999297900Z", 316 | "start_time": "2023-08-01T18:50:19.988335Z" 317 | } 318 | } 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 206, 323 | "outputs": [], 324 | "source": [ 325 | "df = df.drop(df[df['Salary Estimate'].str.contains('Per Hour', case = False) == True].index)" 326 | ], 327 | "metadata": { 328 | "collapsed": false, 329 | "ExecuteTime": { 330 | "end_time": "2023-08-01T18:50:20.225073100Z", 331 | "start_time": "2023-08-01T18:50:20.211120100Z" 332 | } 333 | 
} 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 207, 338 | "outputs": [ 339 | { 340 | "data": { 341 | "text/plain": "(3888, 6)" 342 | }, 343 | "execution_count": 207, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "df.shape" 350 | ], 351 | "metadata": { 352 | "collapsed": false, 353 | "ExecuteTime": { 354 | "end_time": "2023-08-01T18:50:20.413462400Z", 355 | "start_time": "2023-08-01T18:50:20.400996Z" 356 | } 357 | } 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 208, 362 | "outputs": [ 363 | { 364 | "data": { 365 | "text/plain": "0" 366 | }, 367 | "execution_count": 208, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "len(df[df['Salary Estimate'].str.contains('Per Hour', case = False)])" 374 | ], 375 | "metadata": { 376 | "collapsed": false, 377 | "ExecuteTime": { 378 | "end_time": "2023-08-01T18:50:20.626157200Z", 379 | "start_time": "2023-08-01T18:50:20.594753Z" 380 | } 381 | } 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "source": [ 386 | "# 4 - Removing Nulls and Blanks" 387 | ], 388 | "metadata": { 389 | "collapsed": false 390 | } 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 209, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": "Job Title 1\nSalary Estimate 0\nJob Description 1\nCompany Name 0\nLocation 0\nIndustry 2\ndtype: int64" 399 | }, 400 | "execution_count": 209, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "df.isna().sum()" 407 | ], 408 | "metadata": { 409 | "collapsed": false, 410 | "ExecuteTime": { 411 | "end_time": "2023-08-01T18:50:21.006062200Z", 412 | "start_time": "2023-08-01T18:50:20.980636600Z" 413 | } 414 | } 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 210, 419 | "outputs": [], 420 | "source": [ 421 | "df = df.dropna(subset=['Job Description'])" 422 | ], 423 | 
"metadata": { 424 | "collapsed": false, 425 | "ExecuteTime": { 426 | "end_time": "2023-08-01T18:50:21.162088200Z", 427 | "start_time": "2023-08-01T18:50:21.141158300Z" 428 | } 429 | } 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 211, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/plain": "Job Title 1\nSalary Estimate 0\nJob Description 0\nCompany Name 0\nLocation 0\nIndustry 1\ndtype: int64" 438 | }, 439 | "execution_count": 211, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "df.isna().sum()" 446 | ], 447 | "metadata": { 448 | "collapsed": false, 449 | "ExecuteTime": { 450 | "end_time": "2023-08-01T18:50:21.362437800Z", 451 | "start_time": "2023-08-01T18:50:21.344498100Z" 452 | } 453 | } 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 212, 458 | "outputs": [], 459 | "source": [ 460 | "df = df.dropna()" 461 | ], 462 | "metadata": { 463 | "collapsed": false, 464 | "ExecuteTime": { 465 | "end_time": "2023-08-01T18:50:21.549832600Z", 466 | "start_time": "2023-08-01T18:50:21.531892700Z" 467 | } 468 | } 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 213, 473 | "outputs": [ 474 | { 475 | "data": { 476 | "text/plain": "Job Title 0\nSalary Estimate 0\nJob Description 0\nCompany Name 0\nLocation 0\nIndustry 0\ndtype: int64" 477 | }, 478 | "execution_count": 213, 479 | "metadata": {}, 480 | "output_type": "execute_result" 481 | } 482 | ], 483 | "source": [ 484 | "df.isna().sum()" 485 | ], 486 | "metadata": { 487 | "collapsed": false, 488 | "ExecuteTime": { 489 | "end_time": "2023-08-01T18:50:21.738086500Z", 490 | "start_time": "2023-08-01T18:50:21.727123400Z" 491 | } 492 | } 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 214, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": " Job Title \\\n0 Senior Data Scientist \n1 Data Scientist, Product Analytics \n2 Data Science Manager \n5 Data Scientist \n7 Quantitative 
Research Associate \n.. ... \n98 Data Scientist \n99 Machine Learning Engineer/Scientist \n100 Point72 Healthcare Data Scientist \n101 Senior Data Scientist - Automated Marketing \n102 Data Scientist \n\n Salary Estimate \\\n0 $111K-$181K (Glassdoor est.) \n1 $111K-$181K (Glassdoor est.) \n2 $111K-$181K (Glassdoor est.) \n5 $111K-$181K (Glassdoor est.) \n7 $111K-$181K (Glassdoor est.) \n.. ... \n98 $74K-$124K (Glassdoor est.) \n99 $74K-$124K (Glassdoor est.) \n100 $74K-$124K (Glassdoor est.) \n101 $74K-$124K (Glassdoor est.) \n102 $74K-$124K (Glassdoor est.) \n\n Job Description \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... \n1 At Noom, we use scientifically proven methods ... \n2 Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ... \n5 Job Brief\\n\\nThe ideal candidate will have pre... \n7 Seeking a quant to work with senior researcher... \n.. ... \n98 Data Scientist\\n\\n\\nNew York, NY | Full Time\\n... \n99 About the Position\\n\\n\\nJane Street is seeking... \n100 About Point72\\nPoint72 Asset Management is a g... \n101 We are looking for a Senior Data Scientist wit... \n102 We are the most sought after hedge fund in New... \n\n Company Name Location \\\n0 Hopper\\n3.5 New York, NY \n1 Noom US\\n4.5 New York, NY \n2 Decode_M New York, NY \n5 IFG Companies\\n2.9 New York, NY \n7 Enlightenment Research New York, NY \n.. ... ... \n98 CKM Advisors\\n2.9 New York, NY \n99 Jane Street\\n4.8 New York, NY \n100 Point72\\n3.9 New York, NY \n101 Spotify\\n3.8 New York, NY \n102 Averity\\n5.0 New York, NY \n\n Industry \n0 Travel Agencies \n1 Health, Beauty, & Fitness \n2 -1 \n5 Insurance Carriers \n7 -1 \n.. ... \n98 Consulting \n99 Investment Banking & Asset Management \n100 Investment Banking & Asset Management \n101 Internet \n102 Staffing & Outsourcing \n\n[100 rows x 6 columns]", 501 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
0Senior Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies
1Data Scientist, Product Analytics$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness
2Data Science Manager$111K-$181K (Glassdoor est.)Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ...Decode_MNew York, NY-1
5Data Scientist$111K-$181K (Glassdoor est.)Job Brief\\n\\nThe ideal candidate will have pre...IFG Companies\\n2.9New York, NYInsurance Carriers
7Quantitative Research Associate$111K-$181K (Glassdoor est.)Seeking a quant to work with senior researcher...Enlightenment ResearchNew York, NY-1
.....................
98Data Scientist$74K-$124K (Glassdoor est.)Data Scientist\\n\\n\\nNew York, NY | Full Time\\n...CKM Advisors\\n2.9New York, NYConsulting
99Machine Learning Engineer/Scientist$74K-$124K (Glassdoor est.)About the Position\\n\\n\\nJane Street is seeking...Jane Street\\n4.8New York, NYInvestment Banking & Asset Management
100Point72 Healthcare Data Scientist$74K-$124K (Glassdoor est.)About Point72\\nPoint72 Asset Management is a g...Point72\\n3.9New York, NYInvestment Banking & Asset Management
101Senior Data Scientist - Automated Marketing$74K-$124K (Glassdoor est.)We are looking for a Senior Data Scientist wit...Spotify\\n3.8New York, NYInternet
102Data Scientist$74K-$124K (Glassdoor est.)We are the most sought after hedge fund in New...Averity\\n5.0New York, NYStaffing & Outsourcing
\n

100 rows × 6 columns

\n
" 502 | }, 503 | "execution_count": 214, 504 | "metadata": {}, 505 | "output_type": "execute_result" 506 | } 507 | ], 508 | "source": [ 509 | "df.head(100)" 510 | ], 511 | "metadata": { 512 | "collapsed": false, 513 | "ExecuteTime": { 514 | "end_time": "2023-08-01T18:50:21.942425300Z", 515 | "start_time": "2023-08-01T18:50:21.923488700Z" 516 | } 517 | } 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 215, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": " Job Title \\\n2 Data Science Manager \n7 Quantitative Research Associate \n40 Data Scientist \n44 Data Science Analyst \n55 Data Scientist, Analytics & Inference \n... ... \n3880 Data Science Technical Lead / Architect \n3882 Big Data Engineer \n3883 Columbus Opportunities: Finance, Accounting, D... \n3886 Big Data Engineer \n3897 Senior Data Engineer \n\n Salary Estimate \\\n2 $111K-$181K (Glassdoor est.) \n7 $111K-$181K (Glassdoor est.) \n40 $120K-$140K (Glassdoor est.) \n44 $120K-$140K (Glassdoor est.) \n55 $120K-$140K (Glassdoor est.) \n... ... \n3880 $39K-$86K (Glassdoor est.) \n3882 $55K-$112K (Glassdoor est.) \n3883 $55K-$112K (Glassdoor est.) \n3886 $55K-$112K (Glassdoor est.) \n3897 $55K-$112K (Glassdoor est.) \n\n Job Description \\\n2 Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ... \n7 Seeking a quant to work with senior researcher... \n40 We make small businesses more successful throu... \n44 Job Description\\nOur client, a music streaming... \n55 Hello, World! Codecademy has helped over 45 mi... \n... ... \n3880 Urgent need for DATASCIENCE TECHNICAL LEAD ARC... \n3882 RESPONSIBILITIES Kforce has a client in search... \n3883 Job Description\\nWe are experts at conducting ... \n3886 Job Description:\\nAct as a strong developer us... \n3897 Job Responsibility:\\nBased on business strateg... 
\n\n Company Name Location Industry \n2 Decode_M New York, NY -1 \n7 Enlightenment Research New York, NY -1 \n40 NorthOne\\n4.3 New York, NY -1 \n44 MUSIC & Entertainment New York, NY -1 \n55 Codeacademy New York, NY -1 \n... ... ... ... \n3880 DATAECONOMY\\n5.0 Columbus, OH -1 \n3882 Kforce Technology Staffing Columbus, OH -1 \n3883 Rainmaker Resources, LLC Columbus, OH -1 \n3886 Zllius Inc. Columbus, OH -1 \n3897 Kognetics\\n3.6 Gahanna, OH -1 \n\n[548 rows x 6 columns]", 526 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
2Data Science Manager$111K-$181K (Glassdoor est.)Decode_M\\n\\nhttps://www.decode-m.com/\\n\\nData ...Decode_MNew York, NY-1
7Quantitative Research Associate$111K-$181K (Glassdoor est.)Seeking a quant to work with senior researcher...Enlightenment ResearchNew York, NY-1
40Data Scientist$120K-$140K (Glassdoor est.)We make small businesses more successful throu...NorthOne\\n4.3New York, NY-1
44Data Science Analyst$120K-$140K (Glassdoor est.)Job Description\\nOur client, a music streaming...MUSIC & EntertainmentNew York, NY-1
55Data Scientist, Analytics & Inference$120K-$140K (Glassdoor est.)Hello, World! Codecademy has helped over 45 mi...CodeacademyNew York, NY-1
.....................
3880Data Science Technical Lead / Architect$39K-$86K (Glassdoor est.)Urgent need for DATASCIENCE TECHNICAL LEAD ARC...DATAECONOMY\\n5.0Columbus, OH-1
3882Big Data Engineer$55K-$112K (Glassdoor est.)RESPONSIBILITIES Kforce has a client in search...Kforce Technology StaffingColumbus, OH-1
3883Columbus Opportunities: Finance, Accounting, D...$55K-$112K (Glassdoor est.)Job Description\\nWe are experts at conducting ...Rainmaker Resources, LLCColumbus, OH-1
3886Big Data Engineer$55K-$112K (Glassdoor est.)Job Description:\\nAct as a strong developer us...Zllius Inc.Columbus, OH-1
3897Senior Data Engineer$55K-$112K (Glassdoor est.)Job Responsibility:\\nBased on business strateg...Kognetics\\n3.6Gahanna, OH-1
\n

548 rows × 6 columns

\n
" 527 | }, 528 | "execution_count": 215, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "df[df['Industry'].str.contains('-1')]" 535 | ], 536 | "metadata": { 537 | "collapsed": false, 538 | "ExecuteTime": { 539 | "end_time": "2023-08-01T18:50:22.128714100Z", 540 | "start_time": "2023-08-01T18:50:22.117750800Z" 541 | } 542 | } 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 216, 547 | "outputs": [], 548 | "source": [ 549 | "df['Industry'].replace('-1', 'Unknown', inplace= True)" 550 | ], 551 | "metadata": { 552 | "collapsed": false, 553 | "ExecuteTime": { 554 | "end_time": "2023-08-01T18:50:22.319097600Z", 555 | "start_time": "2023-08-01T18:50:22.308134400Z" 556 | } 557 | } 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 217, 562 | "outputs": [ 563 | { 564 | "data": { 565 | "text/plain": " Job Title Salary Estimate \\\n684 Data Engineer $136K-$164K (Glassdoor est.) \n838 Data Analyst $39K-$81K (Glassdoor est.) \n874 CSI Data Analyst $76K-$147K (Glassdoor est.) \n1051 Senior Data Analyst $47K-$82K (Glassdoor est.) \n1544 Senior Data Engineer $97K-$111K (Glassdoor est.) \n\n Job Description \\\n684 Company Overview:\\nAge of Learning is a leadin... \n838 Chicago Public Schools (CPS) is the third larg... \n874 Chicago Public Schools (CPS) is the third larg... \n1051 Chicago Public Schools (CPS) is the third larg... \n1544 If you’re bright, highly motivated and want to... \n\n Company Name Location Industry \n684 Age of Learning\\n3.3 Glendale, CA K-12 Education \n838 Chicago Public Schools\\n3.5 Chicago, IL K-12 Education \n874 Chicago Public Schools\\n3.5 Chicago, IL K-12 Education \n1051 Chicago Public Schools\\n3.5 Chicago, IL K-12 Education \n1544 StrongMind\\n3.8 Chandler, AZ K-12 Education ", 566 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
684Data Engineer$136K-$164K (Glassdoor est.)Company Overview:\\nAge of Learning is a leadin...Age of Learning\\n3.3Glendale, CAK-12 Education
838Data Analyst$39K-$81K (Glassdoor est.)Chicago Public Schools (CPS) is the third larg...Chicago Public Schools\\n3.5Chicago, ILK-12 Education
874CSI Data Analyst$76K-$147K (Glassdoor est.)Chicago Public Schools (CPS) is the third larg...Chicago Public Schools\\n3.5Chicago, ILK-12 Education
1051Senior Data Analyst$47K-$82K (Glassdoor est.)Chicago Public Schools (CPS) is the third larg...Chicago Public Schools\\n3.5Chicago, ILK-12 Education
1544Senior Data Engineer$97K-$111K (Glassdoor est.)If you’re bright, highly motivated and want to...StrongMind\\n3.8Chandler, AZK-12 Education
\n
" 567 | }, 568 | "execution_count": 217, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "df[df['Industry'].str.contains('-1')]" 575 | ], 576 | "metadata": { 577 | "collapsed": false, 578 | "ExecuteTime": { 579 | "end_time": "2023-08-01T18:50:22.667213700Z", 580 | "start_time": "2023-08-01T18:50:22.647262800Z" 581 | } 582 | } 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 218, 587 | "outputs": [ 588 | { 589 | "data": { 590 | "text/plain": "Unknown 543\nIT Services 471\nStaffing & Outsourcing 313\nBiotech & Pharmaceuticals 291\nComputer Hardware & Software 263\n ... \nCommercial Equipment Rental 1\nMetals Brokers 1\nTruck Rental & Leasing 1\nBeauty & Personal Accessories Stores 1\nAuto Repair & Maintenance 1\nName: Industry, Length: 96, dtype: int64" 591 | }, 592 | "execution_count": 218, 593 | "metadata": {}, 594 | "output_type": "execute_result" 595 | } 596 | ], 597 | "source": [ 598 | "df['Industry'].value_counts()" 599 | ], 600 | "metadata": { 601 | "collapsed": false, 602 | "ExecuteTime": { 603 | "end_time": "2023-08-01T18:50:22.864332100Z", 604 | "start_time": "2023-08-01T18:50:22.830444800Z" 605 | } 606 | } 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "source": [ 611 | "# 5 - Standardise Values" 612 | ], 613 | "metadata": { 614 | "collapsed": false 615 | } 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 219, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/plain": " Job Title \\\n0 Senior Data Scientist \n1 Data Scientist, Product Analytics \n5 Data Scientist \n8 AI Scientist \n10 Data Scientist \n... ... \n3872 IGM - Post Doctoral Scientist - Chaudhari Lab \n3884 Senior Research Scientist - RI IPP Cooper \n3893 Biotransformation Scientist and DMPK Design Lead \n3895 Senior/Principal Scientist - Display Technolog... \n3908 Patient Safety Physician or Safety Scientist -... \n\n Salary Estimate \\\n0 $111K-$181K (Glassdoor est.) 
\n1 $111K-$181K (Glassdoor est.) \n5 $111K-$181K (Glassdoor est.) \n8 $111K-$181K (Glassdoor est.) \n10 $111K-$181K (Glassdoor est.) \n... ... \n3872 $39K-$86K (Glassdoor est.) \n3884 $55K-$112K (Glassdoor est.) \n3893 $55K-$112K (Glassdoor est.) \n3895 $55K-$112K (Glassdoor est.) \n3908 $55K-$112K (Glassdoor est.) \n\n Job Description \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... \n1 At Noom, we use scientifically proven methods ... \n5 Job Brief\\n\\nThe ideal candidate will have pre... \n8 Paige is a software company helping pathologis... \n10 Company Description:\\n\\nQuartet is a pioneerin... \n... ... \n3872 JOB POSTING - Post Doctoral Scientist IGM\\nFul... \n3884 JOB POSTING – Senior Research Scientist – RI I... \n3893 Help us transform patients' lives.\\n\\nAt UCB, ... \n3895 Help us transform patients' lives.\\n\\nAt UCB, ... \n3908 Help us transform patients' lives.\\nAt UCB, we... \n\n Company Name Location \\\n0 Hopper\\n3.5 New York, NY \n1 Noom US\\n4.5 New York, NY \n5 IFG Companies\\n2.9 New York, NY \n8 Paige\\n5.0 New York, NY \n10 Quartet Health\\n3.9 New York, NY \n... ... ... \n3872 Nationwide Children's Hospital\\n3.7 Columbus, OH \n3884 Nationwide Children's Hospital\\n3.7 Columbus, OH \n3893 UCB\\n3.7 Slough, OH \n3895 UCB\\n3.7 Slough, OH \n3908 UCB\\n3.7 Slough, OH \n\n Industry \n0 Travel Agencies \n1 Health, Beauty, & Fitness \n5 Insurance Carriers \n8 Enterprise Software & Network Solutions \n10 Enterprise Software & Network Solutions \n... ... \n3872 Health Care Services & Hospitals \n3884 Health Care Services & Hospitals \n3893 Biotech & Pharmaceuticals \n3895 Biotech & Pharmaceuticals \n3908 Biotech & Pharmaceuticals \n\n[1746 rows x 6 columns]", 624 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
0Senior Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies
1Data Scientist, Product Analytics$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness
5Data Scientist$111K-$181K (Glassdoor est.)Job Brief\\n\\nThe ideal candidate will have pre...IFG Companies\\n2.9New York, NYInsurance Carriers
8AI Scientist$111K-$181K (Glassdoor est.)Paige is a software company helping pathologis...Paige\\n5.0New York, NYEnterprise Software & Network Solutions
10Data Scientist$111K-$181K (Glassdoor est.)Company Description:\\n\\nQuartet is a pioneerin...Quartet Health\\n3.9New York, NYEnterprise Software & Network Solutions
.....................
3872IGM - Post Doctoral Scientist - Chaudhari Lab$39K-$86K (Glassdoor est.)JOB POSTING - Post Doctoral Scientist IGM\\nFul...Nationwide Children's Hospital\\n3.7Columbus, OHHealth Care Services & Hospitals
3884Senior Research Scientist - RI IPP Cooper$55K-$112K (Glassdoor est.)JOB POSTING – Senior Research Scientist – RI I...Nationwide Children's Hospital\\n3.7Columbus, OHHealth Care Services & Hospitals
3893Biotransformation Scientist and DMPK Design Lead$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\n\\nAt UCB, ...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
3895Senior/Principal Scientist - Display Technolog...$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\n\\nAt UCB, ...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
3908Patient Safety Physician or Safety Scientist -...$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\nAt UCB, we...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
\n

1746 rows × 6 columns

\n
" 625 | }, 626 | "execution_count": 219, 627 | "metadata": {}, 628 | "output_type": "execute_result" 629 | } 630 | ], 631 | "source": [ 632 | "df[df['Job Title'].str.contains('scientist', case = False)]" 633 | ], 634 | "metadata": { 635 | "collapsed": false, 636 | "ExecuteTime": { 637 | "end_time": "2023-08-01T18:50:23.608187100Z", 638 | "start_time": "2023-08-01T18:50:23.580219500Z" 639 | } 640 | } 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 220, 645 | "outputs": [], 646 | "source": [ 647 | "df.loc[df['Job Title'].str.contains('scientist', case = False), 'Job Title'] = 'Data Scientist'" 648 | ], 649 | "metadata": { 650 | "collapsed": false, 651 | "ExecuteTime": { 652 | "end_time": "2023-08-01T18:50:23.794077400Z", 653 | "start_time": "2023-08-01T18:50:23.774142500Z" 654 | } 655 | } 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 221, 660 | "outputs": [ 661 | { 662 | "data": { 663 | "text/plain": " Job Title Salary Estimate \\\n0 Data Scientist $111K-$181K (Glassdoor est.) \n1 Data Scientist $111K-$181K (Glassdoor est.) \n5 Data Scientist $111K-$181K (Glassdoor est.) \n8 Data Scientist $111K-$181K (Glassdoor est.) \n10 Data Scientist $111K-$181K (Glassdoor est.) \n... ... ... \n3872 Data Scientist $39K-$86K (Glassdoor est.) \n3884 Data Scientist $55K-$112K (Glassdoor est.) \n3893 Data Scientist $55K-$112K (Glassdoor est.) \n3895 Data Scientist $55K-$112K (Glassdoor est.) \n3908 Data Scientist $55K-$112K (Glassdoor est.) \n\n Job Description \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... \n1 At Noom, we use scientifically proven methods ... \n5 Job Brief\\n\\nThe ideal candidate will have pre... \n8 Paige is a software company helping pathologis... \n10 Company Description:\\n\\nQuartet is a pioneerin... \n... ... \n3872 JOB POSTING - Post Doctoral Scientist IGM\\nFul... \n3884 JOB POSTING – Senior Research Scientist – RI I... \n3893 Help us transform patients' lives.\\n\\nAt UCB, ... 
\n3895 Help us transform patients' lives.\\n\\nAt UCB, ... \n3908 Help us transform patients' lives.\\nAt UCB, we... \n\n Company Name Location \\\n0 Hopper\\n3.5 New York, NY \n1 Noom US\\n4.5 New York, NY \n5 IFG Companies\\n2.9 New York, NY \n8 Paige\\n5.0 New York, NY \n10 Quartet Health\\n3.9 New York, NY \n... ... ... \n3872 Nationwide Children's Hospital\\n3.7 Columbus, OH \n3884 Nationwide Children's Hospital\\n3.7 Columbus, OH \n3893 UCB\\n3.7 Slough, OH \n3895 UCB\\n3.7 Slough, OH \n3908 UCB\\n3.7 Slough, OH \n\n Industry \n0 Travel Agencies \n1 Health, Beauty, & Fitness \n5 Insurance Carriers \n8 Enterprise Software & Network Solutions \n10 Enterprise Software & Network Solutions \n... ... \n3872 Health Care Services & Hospitals \n3884 Health Care Services & Hospitals \n3893 Biotech & Pharmaceuticals \n3895 Biotech & Pharmaceuticals \n3908 Biotech & Pharmaceuticals \n\n[1746 rows x 6 columns]", 664 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustry
0Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies
1Data Scientist$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness
5Data Scientist$111K-$181K (Glassdoor est.)Job Brief\\n\\nThe ideal candidate will have pre...IFG Companies\\n2.9New York, NYInsurance Carriers
8Data Scientist$111K-$181K (Glassdoor est.)Paige is a software company helping pathologis...Paige\\n5.0New York, NYEnterprise Software & Network Solutions
10Data Scientist$111K-$181K (Glassdoor est.)Company Description:\\n\\nQuartet is a pioneerin...Quartet Health\\n3.9New York, NYEnterprise Software & Network Solutions
.....................
3872Data Scientist$39K-$86K (Glassdoor est.)JOB POSTING - Post Doctoral Scientist IGM\\nFul...Nationwide Children's Hospital\\n3.7Columbus, OHHealth Care Services & Hospitals
3884Data Scientist$55K-$112K (Glassdoor est.)JOB POSTING – Senior Research Scientist – RI I...Nationwide Children's Hospital\\n3.7Columbus, OHHealth Care Services & Hospitals
3893Data Scientist$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\n\\nAt UCB, ...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
3895Data Scientist$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\n\\nAt UCB, ...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
3908Data Scientist$55K-$112K (Glassdoor est.)Help us transform patients' lives.\\nAt UCB, we...UCB\\n3.7Slough, OHBiotech & Pharmaceuticals
\n

1746 rows × 6 columns

\n
" 665 | }, 666 | "execution_count": 221, 667 | "metadata": {}, 668 | "output_type": "execute_result" 669 | } 670 | ], 671 | "source": [ 672 | "df[df['Job Title'].str.contains('scientist', case = False)]" 673 | ], 674 | "metadata": { 675 | "collapsed": false, 676 | "ExecuteTime": { 677 | "end_time": "2023-08-01T18:50:23.995303400Z", 678 | "start_time": "2023-08-01T18:50:23.968512700Z" 679 | } 680 | } 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 222, 685 | "outputs": [], 686 | "source": [ 687 | "df.loc[df['Job Title'].str.contains('analyst', case = False), 'Job Title'] = 'Data Analyst'" 688 | ], 689 | "metadata": { 690 | "collapsed": false, 691 | "ExecuteTime": { 692 | "end_time": "2023-08-01T18:50:24.528641500Z", 693 | "start_time": "2023-08-01T18:50:24.517293800Z" 694 | } 695 | } 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 223, 700 | "outputs": [], 701 | "source": [ 702 | "df.loc[df['Job Title'].str.contains('engineer', case = False), 'Job Title'] = 'Data Engineer'" 703 | ], 704 | "metadata": { 705 | "collapsed": false, 706 | "ExecuteTime": { 707 | "end_time": "2023-08-01T18:50:24.747497200Z", 708 | "start_time": "2023-08-01T18:50:24.721583900Z" 709 | } 710 | } 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 224, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/plain": "Data Scientist 1746\nData Engineer 878\nData Analyst 874\nData Modeler 17\nMicrosoft Analytics Consultant 9\n ... \nManaging Consultant - Data Science 1\nRevenue Management and Strategic Analytics Manager 1\nHead of Data Science, Americas 1\nData Modeler Architect 1\nColumbus Data Science Tutor Jobs 1\nName: Job Title, Length: 279, dtype: int64" 719 | }, 720 | "execution_count": 224, 721 | "metadata": {}, 722 | "output_type": "execute_result" 723 | } 724 | ], 725 | "source": [ 726 | "df['Job Title']. 
value_counts()" 727 | ], 728 | "metadata": { 729 | "collapsed": false, 730 | "ExecuteTime": { 731 | "end_time": "2023-08-01T18:50:24.932897300Z", 732 | "start_time": "2023-08-01T18:50:24.913960800Z" 733 | } 734 | } 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 225, 739 | "outputs": [], 740 | "source": [ 741 | "threshold = 800\n", 742 | "df_counts = df['Job Title']. value_counts()" 743 | ], 744 | "metadata": { 745 | "collapsed": false, 746 | "ExecuteTime": { 747 | "end_time": "2023-08-01T18:50:25.159079500Z", 748 | "start_time": "2023-08-01T18:50:25.140142700Z" 749 | } 750 | } 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 226, 755 | "outputs": [], 756 | "source": [ 757 | "df_thresdrop = df['Job Title'].isin(df_counts.index[df_counts < threshold])" 758 | ], 759 | "metadata": { 760 | "collapsed": false, 761 | "ExecuteTime": { 762 | "end_time": "2023-08-01T18:50:25.764778Z", 763 | "start_time": "2023-08-01T18:50:25.740859600Z" 764 | } 765 | } 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": 227, 770 | "outputs": [], 771 | "source": [ 772 | "df = df[~df_thresdrop]" 773 | ], 774 | "metadata": { 775 | "collapsed": false, 776 | "ExecuteTime": { 777 | "end_time": "2023-08-01T18:50:26.036380100Z", 778 | "start_time": "2023-08-01T18:50:26.024420300Z" 779 | } 780 | } 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 228, 785 | "outputs": [ 786 | { 787 | "data": { 788 | "text/plain": "Data Scientist 1746\nData Engineer 878\nData Analyst 874\nName: Job Title, dtype: int64" 789 | }, 790 | "execution_count": 228, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "df['Job Title'].value_counts()" 797 | ], 798 | "metadata": { 799 | "collapsed": false, 800 | "ExecuteTime": { 801 | "end_time": "2023-08-01T18:50:26.270848600Z", 802 | "start_time": "2023-08-01T18:50:26.252908600Z" 803 | } 804 | } 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | 
"source": [ 809 | "# 6 - Remove a Substring" 810 | ], 811 | "metadata": { 812 | "collapsed": false 813 | } 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 229, 818 | "outputs": [], 819 | "source": [ 820 | "df[['Est Min Salary', 'Est Max Salary']] = df['Salary Estimate'].str.split('-', expand = True)" 821 | ], 822 | "metadata": { 823 | "collapsed": false, 824 | "ExecuteTime": { 825 | "end_time": "2023-08-01T18:50:26.935673200Z", 826 | "start_time": "2023-08-01T18:50:26.924709900Z" 827 | } 828 | } 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 230, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/plain": " Job Title Salary Estimate \\\n0 Data Scientist $111K-$181K (Glassdoor est.) \n1 Data Scientist $111K-$181K (Glassdoor est.) \n5 Data Scientist $111K-$181K (Glassdoor est.) \n8 Data Scientist $111K-$181K (Glassdoor est.) \n10 Data Scientist $111K-$181K (Glassdoor est.) \n\n Job Description Company Name \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... Hopper\\n3.5 \n1 At Noom, we use scientifically proven methods ... Noom US\\n4.5 \n5 Job Brief\\n\\nThe ideal candidate will have pre... IFG Companies\\n2.9 \n8 Paige is a software company helping pathologis... Paige\\n5.0 \n10 Company Description:\\n\\nQuartet is a pioneerin... Quartet Health\\n3.9 \n\n Location Industry Est Min Salary \\\n0 New York, NY Travel Agencies $111K \n1 New York, NY Health, Beauty, & Fitness $111K \n5 New York, NY Insurance Carriers $111K \n8 New York, NY Enterprise Software & Network Solutions $111K \n10 New York, NY Enterprise Software & Network Solutions $111K \n\n Est Max Salary \n0 $181K (Glassdoor est.) \n1 $181K (Glassdoor est.) \n5 $181K (Glassdoor est.) \n8 $181K (Glassdoor est.) \n10 $181K (Glassdoor est.) ", 837 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustryEst Min SalaryEst Max Salary
0Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies$111K$181K (Glassdoor est.)
1Data Scientist$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness$111K$181K (Glassdoor est.)
5Data Scientist$111K-$181K (Glassdoor est.)Job Brief\\n\\nThe ideal candidate will have pre...IFG Companies\\n2.9New York, NYInsurance Carriers$111K$181K (Glassdoor est.)
8Data Scientist$111K-$181K (Glassdoor est.)Paige is a software company helping pathologis...Paige\\n5.0New York, NYEnterprise Software & Network Solutions$111K$181K (Glassdoor est.)
10Data Scientist$111K-$181K (Glassdoor est.)Company Description:\\n\\nQuartet is a pioneerin...Quartet Health\\n3.9New York, NYEnterprise Software & Network Solutions$111K$181K (Glassdoor est.)
\n
" 838 | }, 839 | "execution_count": 230, 840 | "metadata": {}, 841 | "output_type": "execute_result" 842 | } 843 | ], 844 | "source": [ 845 | "df.head()" 846 | ], 847 | "metadata": { 848 | "collapsed": false, 849 | "ExecuteTime": { 850 | "end_time": "2023-08-01T18:50:27.274672500Z", 851 | "start_time": "2023-08-01T18:50:27.249755900Z" 852 | } 853 | } 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": 231, 858 | "outputs": [], 859 | "source": [ 860 | "df[['Est Max Salary', 'Source']] = df['Est Max Salary'].str.split('(', expand = True)" 861 | ], 862 | "metadata": { 863 | "collapsed": false, 864 | "ExecuteTime": { 865 | "end_time": "2023-08-01T18:50:27.455083200Z", 866 | "start_time": "2023-08-01T18:50:27.447110100Z" 867 | } 868 | } 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 232, 873 | "outputs": [ 874 | { 875 | "data": { 876 | "text/plain": " Job Title Salary Estimate \\\n0 Data Scientist $111K-$181K (Glassdoor est.) \n1 Data Scientist $111K-$181K (Glassdoor est.) \n5 Data Scientist $111K-$181K (Glassdoor est.) \n8 Data Scientist $111K-$181K (Glassdoor est.) \n10 Data Scientist $111K-$181K (Glassdoor est.) \n\n Job Description Company Name \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... Hopper\\n3.5 \n1 At Noom, we use scientifically proven methods ... Noom US\\n4.5 \n5 Job Brief\\n\\nThe ideal candidate will have pre... IFG Companies\\n2.9 \n8 Paige is a software company helping pathologis... Paige\\n5.0 \n10 Company Description:\\n\\nQuartet is a pioneerin... Quartet Health\\n3.9 \n\n Location Industry Est Min Salary \\\n0 New York, NY Travel Agencies $111K \n1 New York, NY Health, Beauty, & Fitness $111K \n5 New York, NY Insurance Carriers $111K \n8 New York, NY Enterprise Software & Network Solutions $111K \n10 New York, NY Enterprise Software & Network Solutions $111K \n\n Est Max Salary Source \n0 $181K Glassdoor est.) \n1 $181K Glassdoor est.) \n5 $181K Glassdoor est.) \n8 $181K Glassdoor est.) 
\n10 $181K Glassdoor est.) ", 877 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustryEst Min SalaryEst Max SalarySource
0Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies$111K$181KGlassdoor est.)
1Data Scientist$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness$111K$181KGlassdoor est.)
5Data Scientist$111K-$181K (Glassdoor est.)Job Brief\\n\\nThe ideal candidate will have pre...IFG Companies\\n2.9New York, NYInsurance Carriers$111K$181KGlassdoor est.)
8Data Scientist$111K-$181K (Glassdoor est.)Paige is a software company helping pathologis...Paige\\n5.0New York, NYEnterprise Software & Network Solutions$111K$181KGlassdoor est.)
10Data Scientist$111K-$181K (Glassdoor est.)Company Description:\\n\\nQuartet is a pioneerin...Quartet Health\\n3.9New York, NYEnterprise Software & Network Solutions$111K$181KGlassdoor est.)
\n
" 878 | }, 879 | "execution_count": 232, 880 | "metadata": {}, 881 | "output_type": "execute_result" 882 | } 883 | ], 884 | "source": [ 885 | "df.head()" 886 | ], 887 | "metadata": { 888 | "collapsed": false, 889 | "ExecuteTime": { 890 | "end_time": "2023-08-01T18:50:27.813243300Z", 891 | "start_time": "2023-08-01T18:50:27.805935300Z" 892 | } 893 | } 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": 233, 898 | "outputs": [], 899 | "source": [ 900 | "replace_dict = {'\\$' : '', 'K':'000'}" 901 | ], 902 | "metadata": { 903 | "collapsed": false, 904 | "ExecuteTime": { 905 | "end_time": "2023-08-01T18:50:28.310579700Z", 906 | "start_time": "2023-08-01T18:50:28.296617600Z" 907 | } 908 | } 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 234, 913 | "outputs": [], 914 | "source": [ 915 | "df['Est Min Salary'] = df['Est Min Salary'].replace(replace_dict, regex = True)" 916 | ], 917 | "metadata": { 918 | "collapsed": false, 919 | "ExecuteTime": { 920 | "end_time": "2023-08-01T18:50:28.575615700Z", 921 | "start_time": "2023-08-01T18:50:28.552692100Z" 922 | } 923 | } 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 235, 928 | "outputs": [ 929 | { 930 | "data": { 931 | "text/plain": " Job Title Salary Estimate \\\n0 Data Scientist $111K-$181K (Glassdoor est.) \n1 Data Scientist $111K-$181K (Glassdoor est.) \n5 Data Scientist $111K-$181K (Glassdoor est.) \n8 Data Scientist $111K-$181K (Glassdoor est.) \n10 Data Scientist $111K-$181K (Glassdoor est.) \n\n Job Description Company Name \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... Hopper\\n3.5 \n1 At Noom, we use scientifically proven methods ... Noom US\\n4.5 \n5 Job Brief\\n\\nThe ideal candidate will have pre... IFG Companies\\n2.9 \n8 Paige is a software company helping pathologis... Paige\\n5.0 \n10 Company Description:\\n\\nQuartet is a pioneerin... 
Quartet Health\\n3.9 \n\n Location Industry Est Min Salary \\\n0 New York, NY Travel Agencies 111000 \n1 New York, NY Health, Beauty, & Fitness 111000 \n5 New York, NY Insurance Carriers 111000 \n8 New York, NY Enterprise Software & Network Solutions 111000 \n10 New York, NY Enterprise Software & Network Solutions 111000 \n\n Est Max Salary Source \n0 $181K Glassdoor est.) \n1 $181K Glassdoor est.) \n5 $181K Glassdoor est.) \n8 $181K Glassdoor est.) \n10 $181K Glassdoor est.) ", 932 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustryEst Min SalaryEst Max SalarySource
0Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies111000$181KGlassdoor est.)
1Data Scientist$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness111000$181KGlassdoor est.)
5Data Scientist$111K-$181K (Glassdoor est.)Job Brief\\n\\nThe ideal candidate will have pre...IFG Companies\\n2.9New York, NYInsurance Carriers111000$181KGlassdoor est.)
8Data Scientist$111K-$181K (Glassdoor est.)Paige is a software company helping pathologis...Paige\\n5.0New York, NYEnterprise Software & Network Solutions111000$181KGlassdoor est.)
10Data Scientist$111K-$181K (Glassdoor est.)Company Description:\\n\\nQuartet is a pioneerin...Quartet Health\\n3.9New York, NYEnterprise Software & Network Solutions111000$181KGlassdoor est.)
\n
" 933 | }, 934 | "execution_count": 235, 935 | "metadata": {}, 936 | "output_type": "execute_result" 937 | } 938 | ], 939 | "source": [ 940 | "df.head()" 941 | ], 942 | "metadata": { 943 | "collapsed": false, 944 | "ExecuteTime": { 945 | "end_time": "2023-08-01T18:50:28.793899600Z", 946 | "start_time": "2023-08-01T18:50:28.779947700Z" 947 | } 948 | } 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": 236, 953 | "outputs": [], 954 | "source": [ 955 | "df['Est Max Salary'] = df['Est Max Salary'].replace(replace_dict, regex = True)" 956 | ], 957 | "metadata": { 958 | "collapsed": false, 959 | "ExecuteTime": { 960 | "end_time": "2023-08-01T18:50:29.063058600Z", 961 | "start_time": "2023-08-01T18:50:29.036148700Z" 962 | } 963 | } 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 237, 968 | "outputs": [ 969 | { 970 | "data": { 971 | "text/plain": " Job Title Salary Estimate \\\n0 Data Scientist $111K-$181K (Glassdoor est.) \n1 Data Scientist $111K-$181K (Glassdoor est.) \n5 Data Scientist $111K-$181K (Glassdoor est.) \n8 Data Scientist $111K-$181K (Glassdoor est.) \n10 Data Scientist $111K-$181K (Glassdoor est.) \n\n Job Description Company Name \\\n0 ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... Hopper\\n3.5 \n1 At Noom, we use scientifically proven methods ... Noom US\\n4.5 \n5 Job Brief\\n\\nThe ideal candidate will have pre... IFG Companies\\n2.9 \n8 Paige is a software company helping pathologis... Paige\\n5.0 \n10 Company Description:\\n\\nQuartet is a pioneerin... Quartet Health\\n3.9 \n\n Location Industry Est Min Salary \\\n0 New York, NY Travel Agencies 111000 \n1 New York, NY Health, Beauty, & Fitness 111000 \n5 New York, NY Insurance Carriers 111000 \n8 New York, NY Enterprise Software & Network Solutions 111000 \n10 New York, NY Enterprise Software & Network Solutions 111000 \n\n Est Max Salary Source \n0 181000 Glassdoor est.) \n1 181000 Glassdoor est.) \n5 181000 Glassdoor est.) \n8 181000 Glassdoor est.) 
\n10 181000 Glassdoor est.) ", 972 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleSalary EstimateJob DescriptionCompany NameLocationIndustryEst Min SalaryEst Max SalarySource
0Data Scientist$111K-$181K (Glassdoor est.)ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...Hopper\\n3.5New York, NYTravel Agencies111000181000Glassdoor est.)
1Data Scientist$111K-$181K (Glassdoor est.)At Noom, we use scientifically proven methods ...Noom US\\n4.5New York, NYHealth, Beauty, & Fitness111000181000Glassdoor est.)
5Data Scientist$111K-$181K (Glassdoor est.)Job Brief\\n\\nThe ideal candidate will have pre...IFG Companies\\n2.9New York, NYInsurance Carriers111000181000Glassdoor est.)
8Data Scientist$111K-$181K (Glassdoor est.)Paige is a software company helping pathologis...Paige\\n5.0New York, NYEnterprise Software & Network Solutions111000181000Glassdoor est.)
10Data Scientist$111K-$181K (Glassdoor est.)Company Description:\\n\\nQuartet is a pioneerin...Quartet Health\\n3.9New York, NYEnterprise Software & Network Solutions111000181000Glassdoor est.)
\n
" 973 | }, 974 | "execution_count": 237, 975 | "metadata": {}, 976 | "output_type": "execute_result" 977 | } 978 | ], 979 | "source": [ 980 | "df.head()" 981 | ], 982 | "metadata": { 983 | "collapsed": false, 984 | "ExecuteTime": { 985 | "end_time": "2023-08-01T18:50:29.451701400Z", 986 | "start_time": "2023-08-01T18:50:29.424791Z" 987 | } 988 | } 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 238, 993 | "outputs": [], 994 | "source": [ 995 | "df.drop('Source', inplace=True, axis=1)" 996 | ], 997 | "metadata": { 998 | "collapsed": false, 999 | "ExecuteTime": { 1000 | "end_time": "2023-08-01T18:50:30.158161500Z", 1001 | "start_time": "2023-08-01T18:50:30.140222100Z" 1002 | } 1003 | } 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "source": [ 1008 | "# 7 - Convert Datatypes" 1009 | ], 1010 | "metadata": { 1011 | "collapsed": false 1012 | } 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 239, 1017 | "outputs": [ 1018 | { 1019 | "data": { 1020 | "text/plain": "Job Title object\nSalary Estimate object\nJob Description object\nCompany Name object\nLocation object\nIndustry object\nEst Min Salary object\nEst Max Salary object\ndtype: object" 1021 | }, 1022 | "execution_count": 239, 1023 | "metadata": {}, 1024 | "output_type": "execute_result" 1025 | } 1026 | ], 1027 | "source": [ 1028 | "df.dtypes" 1029 | ], 1030 | "metadata": { 1031 | "collapsed": false, 1032 | "ExecuteTime": { 1033 | "end_time": "2023-08-01T18:50:30.799392800Z", 1034 | "start_time": "2023-08-01T18:50:30.788007300Z" 1035 | } 1036 | } 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": 240, 1041 | "outputs": [], 1042 | "source": [ 1043 | "df['Est Min Salary'] = df['Est Min Salary'].astype(int)" 1044 | ], 1045 | "metadata": { 1046 | "collapsed": false, 1047 | "ExecuteTime": { 1048 | "end_time": "2023-08-01T18:50:31.227812200Z", 1049 | "start_time": "2023-08-01T18:50:31.205247Z" 1050 | } 1051 | } 1052 | }, 1053 | { 1054 | 
"cell_type": "code", 1055 | "execution_count": 241, 1056 | "outputs": [], 1057 | "source": [ 1058 | "df['Est Max Salary'] = df['Est Max Salary'].astype(int)" 1059 | ], 1060 | "metadata": { 1061 | "collapsed": false, 1062 | "ExecuteTime": { 1063 | "end_time": "2023-08-01T18:50:32.576341100Z", 1064 | "start_time": "2023-08-01T18:50:32.553417400Z" 1065 | } 1066 | } 1067 | }, 1068 | { 1069 | "cell_type": "code", 1070 | "execution_count": 242, 1071 | "outputs": [ 1072 | { 1073 | "data": { 1074 | "text/plain": "Job Title object\nSalary Estimate object\nJob Description object\nCompany Name object\nLocation object\nIndustry object\nEst Min Salary int32\nEst Max Salary int32\ndtype: object" 1075 | }, 1076 | "execution_count": 242, 1077 | "metadata": {}, 1078 | "output_type": "execute_result" 1079 | } 1080 | ], 1081 | "source": [ 1082 | "df.dtypes" 1083 | ], 1084 | "metadata": { 1085 | "collapsed": false, 1086 | "ExecuteTime": { 1087 | "end_time": "2023-08-01T18:50:33.184123400Z", 1088 | "start_time": "2023-08-01T18:50:33.163193800Z" 1089 | } 1090 | } 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "execution_count": 243, 1095 | "outputs": [], 1096 | "source": [ 1097 | "df = df.drop('Salary Estimate', axis=1)" 1098 | ], 1099 | "metadata": { 1100 | "collapsed": false, 1101 | "ExecuteTime": { 1102 | "end_time": "2023-08-01T18:50:33.820676300Z", 1103 | "start_time": "2023-08-01T18:50:33.804221900Z" 1104 | } 1105 | } 1106 | }, 1107 | { 1108 | "cell_type": "markdown", 1109 | "source": [ 1110 | "# 8 - Stripping Letters from strings" 1111 | ], 1112 | "metadata": { 1113 | "collapsed": false 1114 | } 1115 | }, 1116 | { 1117 | "cell_type": "code", 1118 | "execution_count": 244, 1119 | "outputs": [], 1120 | "source": [ 1121 | "df['Company Name'] = df['Company Name'].str[:-4]" 1122 | ], 1123 | "metadata": { 1124 | "collapsed": false, 1125 | "ExecuteTime": { 1126 | "end_time": "2023-08-01T18:51:37.723942300Z", 1127 | "start_time": "2023-08-01T18:51:37.717313200Z" 1128 | } 1129 | } 
1130 | }, 1131 | { 1132 | "cell_type": "code", 1133 | "execution_count": 245, 1134 | "outputs": [ 1135 | { 1136 | "data": { 1137 | "text/plain": " Job Title Job Description \\\n0 Data Scientist ABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ... \n1 Data Scientist At Noom, we use scientifically proven methods ... \n5 Data Scientist Job Brief\\n\\nThe ideal candidate will have pre... \n8 Data Scientist Paige is a software company helping pathologis... \n10 Data Scientist Company Description:\\n\\nQuartet is a pioneerin... \n11 Data Scientist PulsePoint™, a global programmatic advertising... \n12 Data Scientist Medidata: Conquering Diseases Together\\n\\nMedi... \n13 Data Scientist A Career with Point72’s MI Data team\\nAt Point... \n14 Data Scientist Two Sigma is a different kind of investment ma... \n15 Data Scientist Data Scientist\\nAffinity Solutions / Marketing... \n\n Company Name Location Industry \\\n0 Hopper New York, NY Travel Agencies \n1 Noom US New York, NY Health, Beauty, & Fitness \n5 IFG Companies New York, NY Insurance Carriers \n8 Paige New York, NY Enterprise Software & Network Solutions \n10 Quartet Health New York, NY Enterprise Software & Network Solutions \n11 PulsePoint New York, NY Internet \n12 Medidata Solutions New York, NY Enterprise Software & Network Solutions \n13 Point72 New York, NY Investment Banking & Asset Management \n14 Two Sigma New York, NY Investment Banking & Asset Management \n15 Affinity Solutions New York, NY Advertising & Marketing \n\n Est Min Salary Est Max Salary \n0 111000 181000 \n1 111000 181000 \n5 111000 181000 \n8 111000 181000 \n10 111000 181000 \n11 111000 181000 \n12 111000 181000 \n13 111000 181000 \n14 111000 181000 \n15 111000 181000 ", 1138 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Job TitleJob DescriptionCompany NameLocationIndustryEst Min SalaryEst Max Salary
0Data ScientistABOUT HOPPER\\n\\nAt Hopper, we’re on a mission ...HopperNew York, NYTravel Agencies111000181000
1Data ScientistAt Noom, we use scientifically proven methods ...Noom USNew York, NYHealth, Beauty, & Fitness111000181000
5Data ScientistJob Brief\\n\\nThe ideal candidate will have pre...IFG CompaniesNew York, NYInsurance Carriers111000181000
8Data ScientistPaige is a software company helping pathologis...PaigeNew York, NYEnterprise Software & Network Solutions111000181000
10Data ScientistCompany Description:\\n\\nQuartet is a pioneerin...Quartet HealthNew York, NYEnterprise Software & Network Solutions111000181000
11Data ScientistPulsePoint™, a global programmatic advertising...PulsePointNew York, NYInternet111000181000
12Data ScientistMedidata: Conquering Diseases Together\\n\\nMedi...Medidata SolutionsNew York, NYEnterprise Software & Network Solutions111000181000
13Data ScientistA Career with Point72’s MI Data team\\nAt Point...Point72New York, NYInvestment Banking & Asset Management111000181000
14Data ScientistTwo Sigma is a different kind of investment ma...Two SigmaNew York, NYInvestment Banking & Asset Management111000181000
15Data ScientistData Scientist\\nAffinity Solutions / Marketing...Affinity SolutionsNew York, NYAdvertising & Marketing111000181000
\n
" 1139 | }, 1140 | "execution_count": 245, 1141 | "metadata": {}, 1142 | "output_type": "execute_result" 1143 | } 1144 | ], 1145 | "source": [ 1146 | "df.head(10)" 1147 | ], 1148 | "metadata": { 1149 | "collapsed": false, 1150 | "ExecuteTime": { 1151 | "end_time": "2023-08-01T18:51:41.190987500Z", 1152 | "start_time": "2023-08-01T18:51:41.170378300Z" 1153 | } 1154 | } 1155 | }, 1156 | { 1157 | "cell_type": "markdown", 1158 | "source": [ 1159 | "# Mine Job Description" 1160 | ], 1161 | "metadata": { 1162 | "collapsed": false 1163 | } 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": 246, 1168 | "outputs": [], 1169 | "source": [ 1170 | "skill_set = ['sql','python', 'power bi', 'tableau', 'excel', ' r ']" 1171 | ], 1172 | "metadata": { 1173 | "collapsed": false, 1174 | "ExecuteTime": { 1175 | "end_time": "2023-08-01T18:54:41.775406400Z", 1176 | "start_time": "2023-08-01T18:54:41.757410500Z" 1177 | } 1178 | } 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": 247, 1183 | "outputs": [ 1184 | { 1185 | "name": "stdout", 1186 | "output_type": "stream", 1187 | "text": [ 1188 | "Number of jobs that require sql : 581\n", 1189 | "Number of jobs that require python : 266\n", 1190 | "Number of jobs that require power bi : 98\n", 1191 | "Number of jobs that require tableau : 275\n", 1192 | "Number of jobs that require excel : 533\n", 1193 | "Number of jobs that require r : 63\n" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "for skills in skill_set:\n", 1199 | " print('Number of jobs that require', skills, ' : ' , len(df[(df['Job Description'].str.contains(skills, case = False)) & (df['Job Title']=='Data Analyst')]))" 1200 | ], 1201 | "metadata": { 1202 | "collapsed": false, 1203 | "ExecuteTime": { 1204 | "end_time": "2023-08-01T18:56:50.084458600Z", 1205 | "start_time": "2023-08-01T18:56:49.766296500Z" 1206 | } 1207 | } 1208 | }, 1209 | { 1210 | "cell_type": "code", 1211 | "execution_count": 248, 1212 | "outputs": [ 1213 | { 
1214 | "name": "stdout", 1215 | "output_type": "stream", 1216 | "text": [ 1217 | "Number of jobs that require sql : 641\n", 1218 | "Number of jobs that require python : 980\n", 1219 | "Number of jobs that require power bi : 40\n", 1220 | "Number of jobs that require tableau : 192\n", 1221 | "Number of jobs that require excel : 829\n", 1222 | "Number of jobs that require r : 168\n" 1223 | ] 1224 | } 1225 | ], 1226 | "source": [ 1227 | "for skills in skill_set:\n", 1228 | " print('Number of jobs that require', skills, ' : ' , len(df[(df['Job Description'].str.contains(skills, case = False)) & (df['Job Title']=='Data Scientist')]))" 1229 | ], 1230 | "metadata": { 1231 | "collapsed": false, 1232 | "ExecuteTime": { 1233 | "end_time": "2023-08-01T18:57:05.003171100Z", 1234 | "start_time": "2023-08-01T18:57:04.613465100Z" 1235 | } 1236 | } 1237 | }, 1238 | { 1239 | "cell_type": "code", 1240 | "execution_count": 249, 1241 | "outputs": [ 1242 | { 1243 | "name": "stdout", 1244 | "output_type": "stream", 1245 | "text": [ 1246 | "Number of jobs that require sql : 591\n", 1247 | "Number of jobs that require python : 635\n", 1248 | "Number of jobs that require power bi : 47\n", 1249 | "Number of jobs that require tableau : 133\n", 1250 | "Number of jobs that require excel : 294\n", 1251 | "Number of jobs that require r : 44\n" 1252 | ] 1253 | } 1254 | ], 1255 | "source": [ 1256 | "for skills in skill_set:\n", 1257 | " print('Number of jobs that require', skills, ' : ' , len(df[(df['Job Description'].str.contains(skills, case = False)) & (df['Job Title']=='Data Engineer')]))" 1258 | ], 1259 | "metadata": { 1260 | "collapsed": false, 1261 | "ExecuteTime": { 1262 | "end_time": "2023-08-01T18:57:17.670664200Z", 1263 | "start_time": "2023-08-01T18:57:17.339594300Z" 1264 | } 1265 | } 1266 | }, 1267 | { 1268 | "cell_type": "code", 1269 | "execution_count": null, 1270 | "outputs": [], 1271 | "source": [], 1272 | "metadata": { 1273 | "collapsed": false 1274 | } 1275 | } 1276 | ], 1277 | 
"metadata": { 1278 | "kernelspec": { 1279 | "display_name": "Python 3", 1280 | "language": "python", 1281 | "name": "python3" 1282 | }, 1283 | "language_info": { 1284 | "codemirror_mode": { 1285 | "name": "ipython", 1286 | "version": 2 1287 | }, 1288 | "file_extension": ".py", 1289 | "mimetype": "text/x-python", 1290 | "name": "python", 1291 | "nbconvert_exporter": "python", 1292 | "pygments_lexer": "ipython2", 1293 | "version": "2.7.6" 1294 | } 1295 | }, 1296 | "nbformat": 4, 1297 | "nbformat_minor": 0 1298 | } 1299 | -------------------------------------------------------------------------------- /05_ValueInvestorHoldings31122022_2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/05_ValueInvestorHoldings31122022_2.xlsx -------------------------------------------------------------------------------- /06_ValueInvestorHoldings31032023.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/06_ValueInvestorHoldings31032023.xlsx -------------------------------------------------------------------------------- /28_Lost_Customers_NW_Video.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/28_Lost_Customers_NW_Video.pbix -------------------------------------------------------------------------------- /28_New_Customers_NW_video.pbix: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/28_New_Customers_NW_video.pbix -------------------------------------------------------------------------------- /ActualDataset.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/ActualDataset.xlsx -------------------------------------------------------------------------------- /BaSensei_webscrape_Bitcoin_YTVideo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "ExecuteTime": { 9 | "end_time": "2023-08-13T11:42:32.862157700Z", 10 | "start_time": "2023-08-13T11:42:31.597143300Z" 11 | } 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "from bs4 import BeautifulSoup\n", 16 | "import pandas as pd\n", 17 | "import requests\n", 18 | "import csv\n", 19 | "#from pyspark.sql import dataframe\n", 20 | "from datetime import datetime" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "outputs": [], 27 | "source": [ 28 | "headers = {\"User-Agent\":\"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0\", \"Accept-Encoding\":\"gzip, deflate\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\", \"DNT\":\"1\",\"Connection\":\"close\", \"Upgrade-Insecure-Requests\":\"1\"}" 29 | ], 30 | "metadata": { 31 | "collapsed": false, 32 | "ExecuteTime": { 33 | "end_time": "2023-08-13T11:42:48.740041700Z", 34 | "start_time": "2023-08-13T11:42:48.728055600Z" 35 | } 36 | } 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "outputs": [], 42 | "source": [ 43 | "url = 'https://bitinfocharts.com/top-100-richest-bitcoin-addresses.html'\n", 44 | "response = requests.get(url,headers=headers).text\n", 45 | "soup = BeautifulSoup(response, \"html.parser\")" 46 | ], 47 | "metadata": { 48 | "collapsed": false, 49 | "ExecuteTime": { 50 | "end_time": "2023-08-13T11:43:14.690684900Z", 51 | "start_time": "2023-08-13T11:43:13.906464400Z" 52 | } 
53 | } 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "source": [ 58 | "# Paste ChatGPT Code" 59 | ], 60 | "metadata": { 61 | "collapsed": false 62 | } 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "outputs": [], 68 | "source": [ 69 | "# Find the table with the specified attributes\n", 70 | "table = soup.find('table', {'class': 'table table-condensed bb', 'style': 'max-width:1000px;text-align:center;width: inherit;margin-bottom: 0;'})\n" 71 | ], 72 | "metadata": { 73 | "collapsed": false, 74 | "ExecuteTime": { 75 | "end_time": "2023-08-13T11:43:53.567113Z", 76 | "start_time": "2023-08-13T11:43:53.552132Z" 77 | } 78 | } 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 6, 83 | "outputs": [], 84 | "source": [ 85 | "# Extract the headers\n", 86 | "headers = [th.text for th in table.find('thead').find_all('th')]" 87 | ], 88 | "metadata": { 89 | "collapsed": false, 90 | "ExecuteTime": { 91 | "end_time": "2023-08-13T11:43:59.555082500Z", 92 | "start_time": "2023-08-13T11:43:59.546076Z" 93 | } 94 | } 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "outputs": [], 100 | "source": [ 101 | "# Extract the rows\n", 102 | "rows = []\n", 103 | "for tr in table.find('tbody').find_all('tr'):\n", 104 | " rows.append([td.text.strip() for td in tr.find_all('td')])" 105 | ], 106 | "metadata": { 107 | "collapsed": false, 108 | "ExecuteTime": { 109 | "end_time": "2023-08-13T11:44:05.380681100Z", 110 | "start_time": "2023-08-13T11:44:05.371079600Z" 111 | } 112 | } 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 8, 117 | "outputs": [], 118 | "source": [ 119 | "# Create a DataFrame\n", 120 | "df = pd.DataFrame(rows, columns=headers)" 121 | ], 122 | "metadata": { 123 | "collapsed": false, 124 | "ExecuteTime": { 125 | "end_time": "2023-08-13T11:44:16.715243300Z", 126 | "start_time": "2023-08-13T11:44:16.697255800Z" 127 | } 128 | } 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 
133 | "outputs": [], 134 | "source": [ 135 | "# Add the current date and time column\n", 136 | "df['Date and Time'] = pd.Timestamp.now().strftime('%d/%m/%Y %H:%M')" 137 | ], 138 | "metadata": { 139 | "collapsed": false, 140 | "ExecuteTime": { 141 | "end_time": "2023-08-13T11:44:22.655041400Z", 142 | "start_time": "2023-08-13T11:44:22.643056Z" 143 | } 144 | } 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 10, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": " Balance, BTC Addresses % Addresses (Total) Coins \\\n0 (0 - 0.00001) 3648493 7.48% (100%) 18.62 BTC \n1 [0.00001 - 0.0001) 9578291 19.64% (92.52%) 411.72 BTC \n2 [0.0001 - 0.001) 12084944 24.78% (72.88%) 4,694 BTC \n3 [0.001 - 0.01) 11236056 23.04% (48.1%) 41,775 BTC \n4 [0.01 - 0.1) 7784483 15.96% (25.06%) 261,374 BTC \n\n USD % Coins (Total) Date and Time \n0 $547,352 0% (100%) 13/08/2023 13:44 \n1 $12,104,875 0% (100%) 13/08/2023 13:44 \n2 $138,013,773 0.02% (100%) 13/08/2023 13:44 \n3 $1,228,237,337 0.21% (99.97%) 13/08/2023 13:44 \n4 $7,684,675,479 1.34% (99.76%) 13/08/2023 13:44 ", 153 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Balance, BTCAddresses% Addresses (Total)CoinsUSD% Coins (Total)Date and Time
0(0 - 0.00001)36484937.48% (100%)18.62 BTC$547,3520% (100%)13/08/2023 13:44
1[0.00001 - 0.0001)957829119.64% (92.52%)411.72 BTC$12,104,8750% (100%)13/08/2023 13:44
2[0.0001 - 0.001)1208494424.78% (72.88%)4,694 BTC$138,013,7730.02% (100%)13/08/2023 13:44
3[0.001 - 0.01)1123605623.04% (48.1%)41,775 BTC$1,228,237,3370.21% (99.97%)13/08/2023 13:44
4[0.01 - 0.1)778448315.96% (25.06%)261,374 BTC$7,684,675,4791.34% (99.76%)13/08/2023 13:44
\n
" 154 | }, 155 | "execution_count": 10, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "# Display the first few rows of the DataFrame\n", 162 | "df.head()" 163 | ], 164 | "metadata": { 165 | "collapsed": false, 166 | "ExecuteTime": { 167 | "end_time": "2023-08-13T11:44:27.161515300Z", 168 | "start_time": "2023-08-13T11:44:27.146054300Z" 169 | } 170 | } 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 11, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": " Balance, BTC Addresses % Addresses (Total) Coins \\\n0 (0 - 0.00001) 3648493 0.0748 18.62 BTC \n1 [0.00001 - 0.0001) 9578291 0.1964 411.72 BTC \n2 [0.0001 - 0.001) 12084944 0.2478 4,694 BTC \n3 [0.001 - 0.01) 11236056 0.2304 41,775 BTC \n4 [0.01 - 0.1) 7784483 0.1596 261,374 BTC \n\n USD % Coins (Total) Date and Time \n0 $547,352 0% (100%) 13/08/2023 13:44 \n1 $12,104,875 0% (100%) 13/08/2023 13:44 \n2 $138,013,773 0.02% (100%) 13/08/2023 13:44 \n3 $1,228,237,337 0.21% (99.97%) 13/08/2023 13:44 \n4 $7,684,675,479 1.34% (99.76%) 13/08/2023 13:44 ", 179 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Balance, BTCAddresses% Addresses (Total)CoinsUSD% Coins (Total)Date and Time
0(0 - 0.00001)36484930.074818.62 BTC$547,3520% (100%)13/08/2023 13:44
1[0.00001 - 0.0001)95782910.1964411.72 BTC$12,104,8750% (100%)13/08/2023 13:44
2[0.0001 - 0.001)120849440.24784,694 BTC$138,013,7730.02% (100%)13/08/2023 13:44
3[0.001 - 0.01)112360560.230441,775 BTC$1,228,237,3370.21% (99.97%)13/08/2023 13:44
4[0.01 - 0.1)77844830.1596261,374 BTC$7,684,675,4791.34% (99.76%)13/08/2023 13:44
\n
" 180 | }, 181 | "execution_count": 11, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "# Split the \"% Addresses (Total)\" column by the \"(\" delimiter and keep only the first part\n", 188 | "df['% Addresses (Total)'] = df['% Addresses (Total)'].apply(lambda x: x.split('(')[0].strip())\n", 189 | "\n", 190 | "# Convert the values to percentage\n", 191 | "df['% Addresses (Total)'] = df['% Addresses (Total)'].str.rstrip('%').astype('float') / 100\n", 192 | "\n", 193 | "# Display the updated DataFrame\n", 194 | "df.head()" 195 | ], 196 | "metadata": { 197 | "collapsed": false, 198 | "ExecuteTime": { 199 | "end_time": "2023-08-13T11:45:06.909086800Z", 200 | "start_time": "2023-08-13T11:45:06.898549400Z" 201 | } 202 | } 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 12, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": " Balance, BTC Addresses % Addresses (Total) Coins \\\n0 (0 - 0.00001) 3648493 0.0748 18.62 \n1 [0.00001 - 0.0001) 9578291 0.1964 411.72 \n2 [0.0001 - 0.001) 12084944 0.2478 4694.00 \n3 [0.001 - 0.01) 11236056 0.2304 41775.00 \n4 [0.01 - 0.1) 7784483 0.1596 261374.00 \n\n USD % Coins (Total) Date and Time \n0 $547,352 0% (100%) 13/08/2023 13:44 \n1 $12,104,875 0% (100%) 13/08/2023 13:44 \n2 $138,013,773 0.02% (100%) 13/08/2023 13:44 \n3 $1,228,237,337 0.21% (99.97%) 13/08/2023 13:44 \n4 $7,684,675,479 1.34% (99.76%) 13/08/2023 13:44 ", 211 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Balance, BTCAddresses% Addresses (Total)CoinsUSD% Coins (Total)Date and Time
0(0 - 0.00001)36484930.074818.62$547,3520% (100%)13/08/2023 13:44
1[0.00001 - 0.0001)95782910.1964411.72$12,104,8750% (100%)13/08/2023 13:44
2[0.0001 - 0.001)120849440.24784694.00$138,013,7730.02% (100%)13/08/2023 13:44
3[0.001 - 0.01)112360560.230441775.00$1,228,237,3370.21% (99.97%)13/08/2023 13:44
4[0.01 - 0.1)77844830.1596261374.00$7,684,675,4791.34% (99.76%)13/08/2023 13:44
\n
" 212 | }, 213 | "execution_count": 12, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "# Remove the \"BTC\" string and commas from the \"Coins\" column, then convert to decimal data type\n", 220 | "df['Coins'] = df['Coins'].replace('[BTC,]', '', regex=True).astype(float)\n", 221 | "\n", 222 | "# Round the values to 2 decimal places\n", 223 | "df['Coins'] = df['Coins'].round(2)\n", 224 | "\n", 225 | "# Display the updated DataFrame\n", 226 | "df.head()" 227 | ], 228 | "metadata": { 229 | "collapsed": false, 230 | "ExecuteTime": { 231 | "end_time": "2023-08-13T11:45:30.138855800Z", 232 | "start_time": "2023-08-13T11:45:30.114909900Z" 233 | } 234 | } 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 13, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": " Balance, BTC Addresses % Addresses (Total) Coins \\\n0 (0 - 0.00001) 3648493 0.0748 18.62 \n1 [0.00001 - 0.0001) 9578291 0.1964 411.72 \n2 [0.0001 - 0.001) 12084944 0.2478 4694.00 \n3 [0.001 - 0.01) 11236056 0.2304 41775.00 \n4 [0.01 - 0.1) 7784483 0.1596 261374.00 \n\n USD % Coins (Total) Date and Time \n0 $547,352 0.0000 13/08/2023 13:44 \n1 $12,104,875 0.0000 13/08/2023 13:44 \n2 $138,013,773 0.0002 13/08/2023 13:44 \n3 $1,228,237,337 0.0021 13/08/2023 13:44 \n4 $7,684,675,479 0.0134 13/08/2023 13:44 ", 243 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Balance, BTCAddresses% Addresses (Total)CoinsUSD% Coins (Total)Date and Time
0(0 - 0.00001)36484930.074818.62$547,3520.000013/08/2023 13:44
1[0.00001 - 0.0001)95782910.1964411.72$12,104,8750.000013/08/2023 13:44
2[0.0001 - 0.001)120849440.24784694.00$138,013,7730.000213/08/2023 13:44
3[0.001 - 0.01)112360560.230441775.00$1,228,237,3370.002113/08/2023 13:44
4[0.01 - 0.1)77844830.1596261374.00$7,684,675,4790.013413/08/2023 13:44
\n
" 244 | }, 245 | "execution_count": 13, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "# Split the \"% Coins (Total)\" column by the \"(\" delimiter and keep only the first part\n", 252 | "df['% Coins (Total)'] = df['% Coins (Total)'].apply(lambda x: x.split('(')[0].strip())\n", 253 | "\n", 254 | "# Convert the values to percentage\n", 255 | "df['% Coins (Total)'] = df['% Coins (Total)'].str.rstrip('%').astype('float') / 100\n", 256 | "\n", 257 | "# Display the updated DataFrame\n", 258 | "df.head()" 259 | ], 260 | "metadata": { 261 | "collapsed": false, 262 | "ExecuteTime": { 263 | "end_time": "2023-08-13T11:45:55.422054800Z", 264 | "start_time": "2023-08-13T11:45:55.402557200Z" 265 | } 266 | } 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 15, 271 | "outputs": [], 272 | "source": [ 273 | "# Define the path for the CSV file\n", 274 | "csv_file_path = \"bitcoin_wealth_distribution.csv\"\n", 275 | "\n", 276 | "# Save the DataFrame to a CSV file\n", 277 | "df.to_csv(csv_file_path, index=False)" 278 | ], 279 | "metadata": { 280 | "collapsed": false, 281 | "ExecuteTime": { 282 | "end_time": "2023-08-13T11:46:27.361153Z", 283 | "start_time": "2023-08-13T11:46:27.342234700Z" 284 | } 285 | } 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "outputs": [], 291 | "source": [], 292 | "metadata": { 293 | "collapsed": false 294 | } 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3", 300 | "language": "python", 301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 2 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython2", 313 | "version": "2.7.6" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 0 318 | } 319 | 
-------------------------------------------------------------------------------- /BrokerStatement.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/BrokerStatement.xlsx -------------------------------------------------------------------------------- /BrokerStatement2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/BrokerStatement2.xlsx -------------------------------------------------------------------------------- /BrokerStatement3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jbotes/powerbiTutorials/13ec6bab783b77e7b6b904274c140e0efdf32cb1/BrokerStatement3.xlsx -------------------------------------------------------------------------------- /DAX_Dates.txt: -------------------------------------------------------------------------------- 1 | Dim_Holding_Date = 2 | VAR FirstFiscalMonth = 7 -- First month of fiscal year 3 | VAR FirstDayOfWeek = 0 -- 0 = Sunday, 1 = Monday, ... 4 | VAR FirstYear = -- Customize first year to use 5 | YEAR ( MIN ( 'Fact_Holdings'[PortfolioDate] )) 6 | RETURN 7 | GENERATE ( 8 | FILTER ( 9 | CALENDARAUTO (), 10 | YEAR ( [Date] ) >= FirstYear 11 | ), 12 | VAR Yr = YEAR ( [Date] ) -- Year Number 13 | VAR Mn = MONTH ( [Date] ) -- Month Number (1-12) 14 | VAR Qr = QUARTER ( [Date] ) -- Quarter Number (1-4) 15 | VAR MnQ = Mn - 3 * (Qr - 1) -- Month in Quarter (1-3) 16 | VAR Wd = WEEKDAY ( [Date], 1 ) - 1 -- Week day number (0 = Sunday, 1 = Monday, ...) 
17 | VAR Fyr = -- Fiscal Year Number 18 | Yr + 1 * ( FirstFiscalMonth > 1 && Mn >= FirstFiscalMonth ) 19 | VAR Fqr = -- Fiscal Quarter (string) 20 | FORMAT ( EOMONTH ( [Date], 1 - FirstFiscalMonth ), "\QQ" ) 21 | RETURN ROW ( 22 | "Year", DATE ( Yr, 12, 31 ), 23 | "Year Quarter", FORMAT ( [Date], "\QQ-YYYY" ), 24 | "Year Quarter Date", EOMONTH ( [Date], 3 - MnQ ), 25 | "Quarter", FORMAT ( [Date], "\QQ" ), 26 | -- "Year Month", EOMONTH ( [Date], 0 ), -- use this for end-of-month 27 | "Year Month", EOMONTH ( [Date], -1 ) + 1, -- use this for beginning-of-month 28 | "Month", DATE ( 1900, MONTH ( [Date] ), 1 ), 29 | "Day of Week", DATE ( 1900, 1, 7 + Wd + (7 * (Wd < FirstDayOfWeek)) ), 30 | "Fiscal Year", DATE ( Fyr + (FirstFiscalMonth = 1), FirstFiscalMonth, 1 ) - 1, 31 | "Fiscal Year Quarter", "F" & Fqr & "-" & Fyr, 32 | "Fiscal Year Quarter Date", EOMONTH ( [Date], 3 - MnQ ), 33 | "Fiscal Quarter", "F" & Fqr 34 | ) 35 | ) 36 | -------------------------------------------------------------------------------- /DataProfiling_Video.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Import Libraries" 7 | ], 8 | "metadata": { 9 | "collapsed": false 10 | } 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 11, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd\n", 18 | "from pandas_profiling import ProfileReport" 19 | ], 20 | "metadata": { 21 | "collapsed": false, 22 | "ExecuteTime": { 23 | "end_time": "2023-08-06T16:36:26.140875800Z", 24 | "start_time": "2023-08-06T16:36:26.136780200Z" 25 | } 26 | } 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 12, 31 | "outputs": [], 32 | "source": [ 33 | "#Load the Data\n", 34 | "df = pd.read_csv('CleanderDataScience.csv')" 35 | ], 36 | "metadata": { 37 | "collapsed": false, 38 | "ExecuteTime": { 39 | "end_time": "2023-08-06T16:36:53.770667200Z", 40 | "start_time": 
"2023-08-06T16:36:53.667496Z" 41 | } 42 | } 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 14, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": "(3498, 8)" 51 | }, 52 | "execution_count": 14, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "#check Shape\n", 59 | "df.shape" 60 | ], 61 | "metadata": { 62 | "collapsed": false, 63 | "ExecuteTime": { 64 | "end_time": "2023-08-06T16:37:04.764610200Z", 65 | "start_time": "2023-08-06T16:37:04.719711600Z" 66 | } 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 15, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": " Unnamed: 0 Job Title \\\n1780 2004 Data Analyst \n3028 3367 Data Analyst \n1398 1584 Data Engineer \n626 683 Data Scientist \n1659 1872 Data Scientist \n1138 1279 Data Scientist \n1766 1989 Data Scientist \n733 809 Data Analyst \n327 354 Data Scientist \n2154 2418 Data Scientist \n\n Job Description \\\n1780 Purpose of Job\\nWe are currently seeking a tal... \n3028 Business Intelligence AnalystLocation: Austin,... \n1398 Job Title Senior Data Engineer Location Chandl... \n626 As a Project Scientist, you may assist in prep... \n1659 Site Name: USA - Pennsylvania - Upper Providen... \n1138 Min Qualifications\\n\\nREQUIRED EDUCATION / EXP... \n1766 LMI is a government consulting firm, dedicated... \n733 Kelly Services is seeking a Provider Analytics... \n327 In this role you will join the Aviana Data Sci... \n2154 Description\\nScientist/Associate Scientist Can... \n\n Company Name Location \\\n1780 USAA San Antonio, TX \n3028 Iconma, L.L.C. Austin, TX \n1398 ICST, LLC Chandler, AZ \n626 Cedars-Sinai Los Angeles, CA \n1659 GSK Collegeville, PA \n1138 The University of Texas Medical Branch Webster, TX \n1766 LMI San Antonio, TX \n733 Kelly Chicago, IL \n327 Aviana Global Technologies Brea, CA \n2154 Fate Therapeutics, Inc. 
San Diego, CA \n\n Industry Est Min Salary Est Max Salary \n1780 Insurance Carriers 74000 140000 \n3028 Staffing & Outsourcing 73000 111000 \n1398 Staffing & Outsourcing 84000 101000 \n626 Health Care Services & Hospitals 136000 164000 \n1659 Biotech & Pharmaceuticals 143000 237000 \n1138 Health Care Services & Hospitals 73000 136000 \n1766 Consulting 54000 92000 \n733 Staffing & Outsourcing 31000 56000 \n327 IT Services 102000 164000 \n2154 Accounting 112000 211000 ", 76 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0Job TitleJob DescriptionCompany NameLocationIndustryEst Min SalaryEst Max Salary
17802004Data AnalystPurpose of Job\\nWe are currently seeking a tal...USAASan Antonio, TXInsurance Carriers74000140000
30283367Data AnalystBusiness Intelligence AnalystLocation: Austin,...Iconma, L.L.C.Austin, TXStaffing & Outsourcing73000111000
13981584Data EngineerJob Title Senior Data Engineer Location Chandl...ICST, LLCChandler, AZStaffing & Outsourcing84000101000
626683Data ScientistAs a Project Scientist, you may assist in prep...Cedars-SinaiLos Angeles, CAHealth Care Services & Hospitals136000164000
16591872Data ScientistSite Name: USA - Pennsylvania - Upper Providen...GSKCollegeville, PABiotech & Pharmaceuticals143000237000
11381279Data ScientistMin Qualifications\\n\\nREQUIRED EDUCATION / EXP...The University of Texas Medical BranchWebster, TXHealth Care Services & Hospitals73000136000
17661989Data ScientistLMI is a government consulting firm, dedicated...LMISan Antonio, TXConsulting5400092000
733809Data AnalystKelly Services is seeking a Provider Analytics...KellyChicago, ILStaffing & Outsourcing3100056000
327354Data ScientistIn this role you will join the Aviana Data Sci...Aviana Global TechnologiesBrea, CAIT Services102000164000
21542418Data ScientistDescription\\nScientist/Associate Scientist Can...Fate Therapeutics, Inc.San Diego, CAAccounting112000211000
\n
" 77 | }, 78 | "execution_count": 15, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "#Check sample\n", 85 | "df.sample(10)" 86 | ], 87 | "metadata": { 88 | "collapsed": false, 89 | "ExecuteTime": { 90 | "end_time": "2023-08-06T16:37:14.114374300Z", 91 | "start_time": "2023-08-06T16:37:14.103861100Z" 92 | } 93 | } 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 16, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": " Unnamed: 0 Est Min Salary Est Max Salary\ncount 3498.000000 3498.000000 3498.000000\nmean 1955.409949 82785.591767 135068.610635\nstd 1129.056284 34270.898430 44593.302576\nmin 0.000000 12000.000000 56000.000000\n25% 970.250000 54000.000000 98000.000000\n50% 1970.500000 79000.000000 130000.000000\n75% 2933.750000 111000.000000 165750.000000\nmax 3908.000000 200000.000000 254000.000000", 102 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
Unnamed: 0Est Min SalaryEst Max Salary
count3498.0000003498.0000003498.000000
mean1955.40994982785.591767135068.610635
std1129.05628434270.89843044593.302576
min0.00000012000.00000056000.000000
25%970.25000054000.00000098000.000000
50%1970.50000079000.000000130000.000000
75%2933.750000111000.000000165750.000000
max3908.000000200000.000000254000.000000
\n
" 103 | }, 104 | "execution_count": 16, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "#check description\n", 111 | "df.describe()" 112 | ], 113 | "metadata": { 114 | "collapsed": false, 115 | "ExecuteTime": { 116 | "end_time": "2023-08-06T16:37:24.690856600Z", 117 | "start_time": "2023-08-06T16:37:24.664272600Z" 118 | } 119 | } 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "source": [ 124 | "# Run pandas Profiling" 125 | ], 126 | "metadata": { 127 | "collapsed": false 128 | } 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 17, 133 | "outputs": [], 134 | "source": [ 135 | "report = ProfileReport(df, title=\"Data Science Salaries Profile\")\n" 136 | ], 137 | "metadata": { 138 | "collapsed": false, 139 | "ExecuteTime": { 140 | "end_time": "2023-08-06T16:38:34.197617Z", 141 | "start_time": "2023-08-06T16:38:34.170074800Z" 142 | } 143 | } 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 18, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": "Summarize dataset: 0%| | 0/5 [00:00