├── .gitignore ├── DataCamp_Model_Building.ipynb ├── Data_Camp_Exploration.ipynb ├── Python_intro_hackathon.sublime-project ├── Python_intro_hackathon.sublime-workspace ├── README.md ├── chapter1.md ├── chapter2.md ├── chapter3.md ├── chapter4.md ├── chapter5.md ├── chapter6.md ├── course.yml ├── img ├── author_image.png └── shield_image.png └── requirements.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_STORE 2 | .cache 3 | .ipynb_checkpoints 4 | .spyderproject 5 | -------------------------------------------------------------------------------- /DataCamp_Model_Building.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# preprocessing of data set" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "from sklearn.preprocessing import LabelEncoder\n", 21 | "\n", 22 | "train = pd.read_csv(\"https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv\")\n", 23 | "test = pd.read_csv(\"https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "ApplicantIncome 0\n", 37 | "CoapplicantIncome 0\n", 38 | "Credit_History 79\n", 39 | "Dependents 25\n", 40 | "Education 0\n", 41 | "Gender 24\n", 42 | "LoanAmount 27\n", 43 | "Loan_Amount_Term 20\n", 44 | "Loan_ID 0\n", 45 | "Loan_Status 367\n", 46 | "Married 3\n", 47 | "Property_Area 0\n", 48 | "Self_Employed 55\n", 49 | "Type 0\n", 50 | "dtype: int64" 51 | ] 52 | }, 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 
57 | ], 58 | "source": [ 59 | "#Combining both train and test dataset\n", 60 | "\n", 61 | "train['Type']='Train' #Create a flag for Train and Test Data set\n", 62 | "test['Type']='Test'\n", 63 | "fullData = pd.concat([train,test],axis=0)\n", 64 | "\n", 65 | "#Look at the available missing values in the dataset\n", 66 | "fullData.isnull().sum()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "#Identify categorical and continuous variables\n", 78 | "ID_col = ['Loan_ID']\n", 79 | "target_col = [\"Loan_Status\"]\n", 80 | "cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed']\n", 81 | "\n", 82 | "other_col=['Type'] #Test and Train Data set identifier\n", 83 | "num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(other_col))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "name": "stderr", 95 | "output_type": "stream", 96 | "text": [ 97 | "C:\\Users\\abc\\Anaconda2\\lib\\site-packages\\pandas\\core\\generic.py:3178: SettingWithCopyWarning: \n", 98 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 99 | "\n", 100 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 101 | " self._update_inplace(new_data)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "#Imputing Missing values with mean for continuous variable\n", 107 | "fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean(),inplace=True)\n", 108 | "\n", 109 | "\n", 110 | "#Imputing Missing values with mode for categorical variables\n", 111 | "cat_imput=pd.Series(fullData[cat_cols].mode().values[0])\n", 112 | "cat_imput.index=cat_cols\n", 113 | "fullData[cat_cols] = 
fullData[cat_cols].fillna(cat_imput,inplace=True)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "#Create a new column as Total Income\n", 125 | "\n", 126 | "fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome']\n", 127 | "\n", 128 | "#Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists\n", 129 | "fullData['Log_TotalIncome']=np.log(fullData['TotalIncome'])\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 8, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "name": "stderr", 141 | "output_type": "stream", 142 | "text": [ 143 | "C:\\Users\\abc\\Anaconda2\\lib\\site-packages\\ipykernel\\__main__.py:8: SettingWithCopyWarning: \n", 144 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 145 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 146 | "\n", 147 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "#create label encoders for categorical features\n", 153 | "for var in cat_cols:\n", 154 | " number = LabelEncoder()\n", 155 | " fullData[var] = number.fit_transform(fullData[var].astype('str'))\n", 156 | "\n", 157 | "train_modified=fullData[fullData['Type']=='Train']\n", 158 | "test_modified=fullData[fullData['Type']=='Test']\n", 159 | "train_modified[\"Loan_Status\"] = number.fit_transform(train_modified[\"Loan_Status\"].astype('str'))" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "# Building Logistic Regression" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 9, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "from 
sklearn.linear_model import LogisticRegression\n", 178 | "\n", 179 | "\n", 180 | "predictors=['Credit_History','Education','Gender']\n", 181 | "\n", 182 | "x_train = train_modified[list(predictors)].values\n", 183 | "y_train = train_modified[\"Loan_Status\"].values\n", 184 | "\n", 185 | "x_test=test_modified[list(predictors)].values" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 10, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "name": "stderr", 197 | "output_type": "stream", 198 | "text": [ 199 | "C:\\Users\\abc\\Anaconda2\\lib\\site-packages\\ipykernel\\__main__.py:14: SettingWithCopyWarning: \n", 200 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 201 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 202 | "\n", 203 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "# Create logistic regression object\n", 209 | "model = LogisticRegression()\n", 210 | "\n", 211 | "# Train the model using the training sets\n", 212 | "model.fit(x_train, y_train)\n", 213 | "\n", 214 | "#Predict Output\n", 215 | "predicted= model.predict(x_test)\n", 216 | "\n", 217 | "#Reverse encoding for predicted outcome\n", 218 | "predicted = number.inverse_transform(predicted)\n", 219 | "\n", 220 | "#Store it to test dataset\n", 221 | "test_modified['Loan_Status']=predicted\n", 222 | "\n", 223 | "#Output file to make submission\n", 224 | "test_modified.to_csv(\"Submission1.csv\",columns=['Loan_ID','Loan_Status'])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "# Building Decision Tree Classifier" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 11, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | 
"predictors=['Credit_History','Education','Gender']\n", 243 | "\n", 244 | "x_train = train_modified[list(predictors)].values\n", 245 | "y_train = train_modified[\"Loan_Status\"].values\n", 246 | "\n", 247 | "x_test=test_modified[list(predictors)].values" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 12, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stderr", 259 | "output_type": "stream", 260 | "text": [ 261 | "C:\\Users\\abc\\Anaconda2\\lib\\site-packages\\ipykernel\\__main__.py:16: SettingWithCopyWarning: \n", 262 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 263 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 264 | "\n", 265 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "from sklearn.tree import DecisionTreeClassifier\n", 271 | "\n", 272 | "# Create Decision Tree object\n", 273 | "model = DecisionTreeClassifier()\n", 274 | "\n", 275 | "# Train the model using the training sets\n", 276 | "model.fit(x_train, y_train)\n", 277 | "\n", 278 | "#Predict Output\n", 279 | "predicted= model.predict(x_test)\n", 280 | "\n", 281 | "#Reverse encoding for predicted outcome\n", 282 | "predicted = number.inverse_transform(predicted)\n", 283 | "\n", 284 | "#Store it to test dataset\n", 285 | "test_modified['Loan_Status']=predicted\n", 286 | "\n", 287 | "#Output file to make submission\n", 288 | "test_modified.to_csv(\"Submission2.csv\",columns=['Loan_ID','Loan_Status'])\n" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "# Building Random Forest Classifier" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 13, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "from sklearn.linear_model import 
LogisticRegression\n", 307 | "\n", 308 | "\n", 309 | "predictors=['ApplicantIncome', 'CoapplicantIncome', 'Credit_History','Dependents', 'Education', 'Gender', 'LoanAmount',\n", 310 | " 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'TotalIncome','Log_TotalIncome']\n", 311 | "\n", 312 | "x_train = train_modified[list(predictors)].values\n", 313 | "y_train = train_modified[\"Loan_Status\"].values\n", 314 | "\n", 315 | "x_test=test_modified[list(predictors)].values" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 14, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [ 325 | { 326 | "name": "stderr", 327 | "output_type": "stream", 328 | "text": [ 329 | "C:\\Users\\abc\\Anaconda2\\lib\\site-packages\\ipykernel\\__main__.py:16: SettingWithCopyWarning: \n", 330 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 331 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 332 | "\n", 333 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "from sklearn.ensemble import RandomForestClassifier\n", 339 | "\n", 340 | "# Create Decision Tree object\n", 341 | "model = RandomForestClassifier()\n", 342 | "\n", 343 | "# Train the model using the training sets\n", 344 | "model.fit(x_train, y_train)\n", 345 | "\n", 346 | "#Predict Output\n", 347 | "predicted= model.predict(x_test)\n", 348 | "\n", 349 | "#Reverse encoding for predicted outcome\n", 350 | "predicted = number.inverse_transform(predicted)\n", 351 | "\n", 352 | "#Store it to test dataset\n", 353 | "test_modified['Loan_Status']=predicted\n", 354 | "\n", 355 | "#Output file to make submission\n", 356 | "test_modified.to_csv(\"Submission3.csv\",columns=['Loan_ID','Loan_Status'])\n" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 15, 362 | "metadata": { 363 | "collapsed": false 
364 | }, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "Credit_History 0.232724\n", 371 | "TotalIncome 0.146955\n", 372 | "LoanAmount 0.128687\n", 373 | "ApplicantIncome 0.114424\n", 374 | "Log_TotalIncome 0.113866\n", 375 | "CoapplicantIncome 0.082272\n", 376 | "Dependents 0.038125\n", 377 | "Property_Area 0.036118\n", 378 | "Loan_Amount_Term 0.032650\n", 379 | "Married 0.022713\n", 380 | "Self_Employed 0.022481\n", 381 | "Education 0.016459\n", 382 | "Gender 0.012527\n", 383 | "dtype: float64\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "#Create a series with feature importances:\n", 389 | "featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False)\n", 390 | "print featimp" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 16, 396 | "metadata": { 397 | "collapsed": true 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "number = LabelEncoder()\n", 402 | "train['Gender'] = number.fit_transform(train['Gender'].astype('str'))\n", 403 | " " 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 17, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "0 1\n", 417 | "1 1\n", 418 | "2 1\n", 419 | "3 1\n", 420 | "4 1\n", 421 | "5 1\n", 422 | "6 1\n", 423 | "7 1\n", 424 | "8 1\n", 425 | "9 1\n", 426 | "10 1\n", 427 | "11 1\n", 428 | "12 1\n", 429 | "13 1\n", 430 | "14 1\n", 431 | "15 1\n", 432 | "16 1\n", 433 | "17 0\n", 434 | "18 1\n", 435 | "19 1\n", 436 | "20 1\n", 437 | "21 1\n", 438 | "22 1\n", 439 | "23 2\n", 440 | "24 1\n", 441 | "25 1\n", 442 | "26 1\n", 443 | "27 1\n", 444 | "28 1\n", 445 | "29 0\n", 446 | " ..\n", 447 | "584 1\n", 448 | "585 1\n", 449 | "586 1\n", 450 | "587 0\n", 451 | "588 2\n", 452 | "589 1\n", 453 | "590 1\n", 454 | "591 1\n", 455 | "592 2\n", 456 | "593 1\n", 457 | "594 1\n", 458 | "595 1\n", 459 | "596 1\n", 460 | 
"597 1\n", 461 | "598 1\n", 462 | "599 1\n", 463 | "600 0\n", 464 | "601 1\n", 465 | "602 1\n", 466 | "603 1\n", 467 | "604 0\n", 468 | "605 1\n", 469 | "606 1\n", 470 | "607 1\n", 471 | "608 1\n", 472 | "609 0\n", 473 | "610 1\n", 474 | "611 1\n", 475 | "612 1\n", 476 | "613 0\n", 477 | "Name: Gender, dtype: int64" 478 | ] 479 | }, 480 | "execution_count": 17, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "train.Gender" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": { 493 | "collapsed": true 494 | }, 495 | "outputs": [], 496 | "source": [] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 2", 502 | "language": "python", 503 | "name": "python2" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 2 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython2", 515 | "version": "2.7.11" 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 0 520 | } 521 | -------------------------------------------------------------------------------- /Python_intro_hackathon.sublime-project: -------------------------------------------------------------------------------- 1 | { 2 | "folders": 3 | [ 4 | { 5 | "path": "." 
6 | } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /Python_intro_hackathon.sublime-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "auto_complete": 3 | { 4 | "selected_items": 5 | [ 6 | [ 7 | "text", 8 | "text_size_change" 9 | ], 10 | [ 11 | "get", 12 | "getElementById" 13 | ], 14 | [ 15 | "butt", 16 | "button_text_to_change" 17 | ], 18 | [ 19 | "button", 20 | "button1" 21 | ], 22 | [ 23 | "m", 24 | "myImage" 25 | ], 26 | [ 27 | "on", 28 | "onclick Attr" 29 | ], 30 | [ 31 | "name", 32 | "name" 33 | ], 34 | [ 35 | "format", 36 | "formattedRole" 37 | ], 38 | [ 39 | "formatted", 40 | "formattedName" 41 | ], 42 | [ 43 | "fun", 44 | "funThoughts" 45 | ], 46 | [ 47 | "For", 48 | "ForeignKey" 49 | ], 50 | [ 51 | "resta", 52 | "restaurant" 53 | ], 54 | [ 55 | "nu", 56 | "nullable" 57 | ], 58 | [ 59 | "cre", 60 | "create_engine" 61 | ], 62 | [ 63 | "dec", 64 | "declarative_base" 65 | ] 66 | ] 67 | }, 68 | "buffers": 69 | [ 70 | { 71 | "contents": "\ntitle : Python Libraries and data structures\ndescription : In this chapter, we will take you through the libraries we commonly use in data analysis and introduce some of the most common data structures to you.\nattachments :\n slides_link : https://s3.amazonaws.com/assets.datacamp.com/course/teach/slides_example.pdf\n\n--- type:NormalExercise lang:python xp:100 skills:1 key:af2f6f90f3\n## Create a list\n\nList is one of the most versatile data structure in Python. A list can simply be defined by writing a list of comma separated values in square brackets. Lists might contain items of different types. Python lists are mutable and individual elements of a list can be changed.\n\n```{python}\nCountry =['INDIA','USA','GERMANY','UK','AUSTRALIA']\n\nTemperature =[44, 28, 20, 18, 25, 45, 67]\n```\nWe just created two lists, one for Country names and other one for temperature. 
\n\n####Accessing individual elements of a list\n- Individual elements of a list can be accessed by writting an index number in square bracket. First index of list starts with 0 (zero) not 1.\n- A range of element can be accessed by having start index and end index but it does not return the value available at end index,\n\n*** =instructions\n- Create a list of first five odd numbers and store it in a variable odd_numbers.\n- Print second to fourth element [1, 4, 9] from squares_lis,t\n\n\n*** =hint\n- Use AV[0] to select the first element of a list AV. \n- Use AV[1:3] to select second to third element of a list AV.\n\n\n*** =pre_exercise_code\n```{python}\n# The pre exercise code runs code to initialize the user's workspace. You can use it for several things:\n```\n\n*** =sample_code\n\n```{python}\n\n# Create a list of squared numbers\nsquares_list = [0, 1, 4, 9, 16, 25]\n\n# Now write a code to create list of first five odd numbers and store it into a variable odd_numbers\nodd_numbers=\n\n# Print first element of squares_list\nprint (squares_list[0])\n\n# Print second to fourth elements of squares_list\n\n```\n\n*** =solution\n```{python}\n\n# Create a list of squared numbers\nsquares_list = [0, 1, 4, 9, 16, 25]\n\n# Now write a code to create list of first five odd numbers and store it into a variable odd_numbers\nodd_numbers = [1, 3, 5, 7, 9]\n\n# Print first element of squares_list\nprint (squares_list[0])\n\n# Print second to fourth elements of squares_list\nprint (squares_list[1:4])\n```\n\n*** =sct\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki\n\n# Test for list of odd_numbers\ntest_object(\"odd_numbers\")\n\n# Check second to fourth elements\"\ntest_output_contains(\"[1, 4, 9]\", pattern = False)\nsuccess_msg(\"Great work!\")\n```\n\n--- type:NormalExercise lang:python xp:100 skills:1 key:c7f91e389f\n## Create a String\n\nStrings can simply be defined by use of single ( ‘ ), double ( ” ) or triple ( ”’ ) inverted commas. Strings enclosed in triple quotes ( ”’ ) can span over multiple lines. Please note that Python strings are immutable, so you can not change part of strings.\n\n```{python}\nString =\" Strings elements can also be accessed using index number like list\"\n\nprint (String[0:8])\n\n#Above print command display Strings on screen.\n\n```\n\n\n*** =instructions\n\n- len function returns the lenght of string\n- Strings characters can be accessed using index number (similar like list)\n- Strings can be concatenated with other strings using '+' operator\n\n\n\n*** =hint\n\n- Use str[2] to select the third element of string str \n- Use len(str) to return the length of string\n- Use str1 + str2 to return the concatenated result of both strings str1 and str2\n\n\n\n*** =pre_exercise_code\n\n```{python}\n# The pre exercise code runs code to initialize the user's workspace. 
You can use it for several things:\n```\n\n*** =sample_code\n\n```{python}\n# Create a string str\nstr1 = \"Introduction with strings\"\n\n# Now store the length of string in varible str_len \nstr_len =\n\n# Print last seven characters of strings str\n\n\nstr1 = \"I am doing a course Introduction to Hackathon using \"\nstr2 = \"Python\"\n\n# Write a code to store concatenated string of str1 and str2 into variable str3\nstr3 =\n\n```\n\n*** =solution\n\n```{python}\n\n# Create a string str\nstr1 = \"Introduction with strings\"\n\n# Now store the length of string in varible str_len \nstr_len=len(str1)\n\n# Print last seven characters of strings str\nprint (str1[18:25])\n\nstr1 = \"I am doing a course Introduction to Hackathon using \"\nstr2 = \"Python\"\n\n# Write a code to store concatenated string of str1 and str2 into variable str3\nstr3= str1 + str2\n```\n\n*** =sct\n\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki\n\n# Check length of strings\ntest_object(\"str_len\")\n\n# Check last seven characters\ntest_output_contains(\"strings\", pattern = False)\n\n# Check concatenated strings\"\ntest_object(\"str3\")\nsuccess_msg(\"Great work!\")\n```\n\n--- type:NormalExercise lang:python xp:100 skills:1 key:377e9324f2\n## Create a Dictionary\n\nDictionary is an unordered set of key: value pairs, with the requirement that the keys are unique (within one dictionary). 
A pair of braces creates an empty dictionary: {}.\n\n```{python}\nDICT = {'Name':'Kunal', 'Company':'Analytics Vidhya'}\n\n#Dictionary elements can be accessed by \"keys\"\n\nprint (DICT['Name'])\n\n#Above print statement will print Kunal\n\n```\n\nIn dictonary \"DICT\", Name and Company are dictionary keys where as \"Kunal\" and \"Analytics Vidhya\" are values.\n\n*** =instructions\n\n- To access dictionary elements, you can use the familiar square brackets along with the key to obtain its value\n- Dictionary can be updated by adding a new entry or a key-value pair, modifying or deleting an existing entry\n\n*** =hint\n\n- Use dict['Keys'] = new_value to update the existing value\n- Use dict.keys() to access all keys of dictionary dict\n- Use dict.values() to access all values of dictionary dict\n\n\n*** =pre_exercise_code\n\n```{python}\n# The pre exercise code runs code to initialize the user's workspace. You can use it for several things:\n```\n\n*** =sample_code\n\n```{python}\n\n# Create a dictionary\ndict1 = {'Name': 'Max', 'Age': 16, 'Sports': 'Cricket'}\n\n# Update the value of Age to 18\n\n\n# Print the value of Age\n\n\n# Print all the keys of dictionary dict1\n\n\n```\n\n*** =solution\n\n```{python}\n\n# Create a dictionary\ndict1 = {'Name': 'Max', 'Age': 16, 'Sports': 'Cricket'}\n\n# Update the value of Age to 18\ndict1['Age'] = 18\n\n# Print the value of Age\nprint (dict1['Age'])\n\n# Print all the keys of dictionary dict\nprint (dict1.keys())\n\n```\n\n*** =sct\n\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki\n\n# Check value of Age\ntest_output_contains(\"18\", pattern = False)\n\n# Check keys of dictionary\ntest_output_contains(\"dict_keys(['Name', 'Age', 'Sports'])\", pattern = False)\n\nsuccess_msg(\"Great work!\")\n```\n\n--- type:MultipleChoiceExercise lang:python xp:50 skills:1 key:9a8fd577a9\n## Why python libraries are useful?\n\nLets take one step ahead in our journey to learn Python by getting acquainted with some useful libraries. The first step is obviously to learn to import them into our environment. There are several ways of doing so in Python:\n\n```{python}\nimport math as m\n\nfrom math import *\n```\n\nIn the first manner, we have defined an alias m to library math. We can now use various functions from math library (e.g. factorial) by referencing it using the alias m.factorial().\n\nIn the second manner, you have imported the entire name space in math i.e. you can directly use factorial() without referring to math.\n\nFollowing are a list of libraries, you will need for any scientific computations and data analysis:\n\n* Numpy \n* Scipy \n* Pandas \n* Matplotlib \n* Scikit Learn \n\n\n\n##### Which of the following is a valid import statement for below code?\n```{python}\nprint (factorial(5))\n```\n\n*** =instructions\n- import math\n- from math import factorial\n- import math.factorial\n\n*** =hint\nPython's from statement lets you import specific attributes from a module into the current namespace.\n\n*** =pre_exercise_code\n\n\n*** =sct\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package\n\nmsg_bad = \"Read about importing libraries in python\"\nmsg_success = \"Good Job!\"\n\n# Use test_mc() to grade multiple choice exercises. 
\n# Pass the correct option (Action, option 2 in the instructions) to correct.\n# Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.\ntest_mc(2, [msg_bad, msg_success, msg_bad]) \n```\n\n\n--- type:NormalExercise lang:python xp:100 skills:1 key:50c9218dac\n## Why conditional statement is required?\n\nConditional statements, these are used to execute code fragments based on a condition. The most commonly used construct is if-else, with following syntax:\n\n```{python}\n\nif [condition]:\n __execution if true__\nelse:\n __execution if false__ \n```\n\n*** =instructions\n\n- Store the length of squares_list to square_len\n- Use the if statement to perform one action if one thing is true,or any other actions, if something else is true\n\n\n*** =hint\n\n- Use <, >, <=, >=, == and != for comparison\n- Use len(list) to return length of string\n\n\n*** =pre_exercise_code\n\n```{python}\n# The pre exercise code runs code to initialize the user's workspace. 
You can use it for several things:\n```\n\n*** =sample_code\n\n```{python}\n# Create a two integer variables a and b\na=3\nb=4\n\n# if a is greater than b print a-b else a+b\nif a > b:\n print (a-b)\nelse:\n print (a+b)\n\n# Create a list of squared numbers\nsquares_list = [0, 1, 4, 9, 16, 25]\n\n# Store the length of squares_list in square_len\nsquare_len = \n\n# if square_len is less than 5 then print \"Less than 5\" else \"Greater than 5\"\nif square_len < ___:\n print (\"__________\")\nelse:\n print (\"__________\")\n\n\n```\n\n*** =solution\n\n```{python}\n# Create a two integer variables a and b\na=3\nb=4\n\n# if a is greater than b print a-b else a+b\nif a > b:\n print (a-b)\nelse:\n print (a+b)\n\n# Create a list of squared numbers\nsquares_list = [0, 1, 4, 9, 16, 25]\n\n# Store the length of squares_list in square_len\nsquare_len = len(squares_list)\n\n# if square_len is less than 5 then print \"Less than 5\" else \"Greater than 5\"\nif square_len < 5:\n print (\"Less than 5\")\nelse:\n print (\"Greater than 5\")\n\n```\n\n*** =sct\n\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki\n\n# Check length of strings\ntest_object(\"square_len\")\n\n# Check last seven characters\ntest_output_contains(\"Greater than 5\", pattern = False)\n\nsuccess_msg(\"Great work!\")\n```\n\n\n--- type:NormalExercise lang:python xp:100 skills:1 key:c1b7c2fd5c\n## How iterative statement does help?\n\nComputers are often used to automate repetitive tasks. Repeating identical or similar tasks without making errors is something that computers do well. Repeated execution of a set of statements is called iteration.\n\nLike most languages, Python also has a FOR-loop which is the most widely used method for iteration. 
It has a simple syntax:\n\n```{python}\n\nfor i in [Python Iterable]:\n expression(i)\n\n```\n“Python Iterable” can be a list or other advanced data structures which we will explore in later sections. Let’s take a look at a simple example, determining the factorial of a number.\n\n*** =instructions\n\n- Use list.append() to append values in a list\n- Iterate over list to access each element of list\n\n\n\n*** =hint\n\n- Use <, >, <=, >=, == and != for comparison\n- Use len(list) to return length of string\n- % operator helps to return remainder e.g. 4 % 3 would be 1\n\n*** =pre_exercise_code\n\n```{python}\n# The pre exercise code runs code to initialize the user's workspace. You can use it for several things:\n```\n\n*** =sample_code\n\n```{python}\n# Create a list with first five numbers\nls=[]\nfor x in range(5):\n ls.append(x)\n \nsum=0\n# Store sum all even numbers of the list ls in sum\n\nfor x in ls: \n if ______: \n sum += x\n\n```\n\n*** =solution\n\n```{python}\n# Create a list with first five numbers\nls=[]\nfor x in range(5):\n ls.append(x) # append a value to a list\n \nsum=0\n# Store sum all even numbers of the list ls in sum\n\nfor x in ls: \n if x%2==0: \n sum += x\n\n```\n\n*** =sct\n\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki\n\n# Check length of strings\ntest_object(\"sum\")\n\nsuccess_msg(\"Great work!\")\n```\n", 72 | "file": "chapter2.md", 73 | "file_size": 13909, 74 | "file_write_time": 131096300693332037, 75 | "settings": 76 | { 77 | "buffer_size": 13384, 78 | "line_ending": "Windows" 79 | } 80 | }, 81 | { 82 | "contents": "Analytics Vidhya\nAbout Us\nTeam\nCareers\n\n\nFor Data Scientists\nBlog\nDiscussions\nHackathons\nJobs\n", 83 | "settings": 84 | { 85 | "buffer_size": 94, 86 | "line_ending": "Windows", 87 | "name": "Analytics Vidhya" 88 | } 89 | }, 90 | { 91 | "file": "chapter7.md", 92 | "settings": 93 | { 94 | "buffer_size": 7564, 95 | "line_ending": "Windows" 96 | } 97 | }, 98 | { 99 | "contents": "---\ntitle : Tips and Tricks from the best hackers!\ndescription : Here is the best part of a hackathon - you learn from the best hackers as you compete against them. This chapter just brings out some tips and tricks as shared by the best hackers.\nattachments :\n slides_link : https://s3.amazonaws.com/assets.datacamp.com/course/teach/slides_example.pdf\n\n--- type:VideoExercise lang:python xp:50 skills:1 key:c55198c91d\n## Analyze movie ratings\n\n*** =video_link\n//player.vimeo.com/video/154783078\n\n--- type:MultipleChoiceExercise lang:python xp:50 skills:1 key:9a8fd577a9\n## A really bad movie\n\nHave a look at the plot that showed up in the viewer to the right. Which type of movies have the worst rating assigned to them?\n\n*** =instructions\n- Long movies, clearly\n- Short movies, clearly\n- Long movies, but the correlation seems weak\n- Short movies, but the correlation seems weak\n\n*** =hint\nHave a look at the plot. Do you see a trend in the dots?\n\n*** =pre_exercise_code\n```{r}\n# The pre exercise code runs code to initialize the user's workspace. You can use it for several things:\n\n# 1. 
Pre-load packages, so that users don't have to do this manually.\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n# 2. Preload a dataset. The code below will read the csv that is stored at the URL's location.\n# The movies variable will be available in the user's console.\nmovies = pd.read_csv(\"http://s3.amazonaws.com/assets.datacamp.com/course/introduction_to_r/movies.csv\")\n\n# 3. Create a plot in the viewer, that students can check out while reading the exercise\nplt.scatter(movies.runtime, movies.rating)\nplt.show()\n```\n\n*** =sct\n```{r}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package\n\nmsg_bad = \"That is not correct!\"\nmsg_success = \"Exactly! The correlation is very weak though.\"\n\n# Use test_mc() to grade multiple choice exercises. \n# Pass the correct option (Action, option 2 in the instructions) to correct.\n# Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.\ntest_mc(4, [msg_bad, msg_bad, msg_bad, msg_success]) \n```\n\n--- type:MultipleChoiceExercise lang:python xp:50 skills:1 key:f0e6a8e8a5\n## A really bad movie\n\nHave a look at the plot that showed up in the viewer to the right. Which type of movies have the worst rating assigned to them?\n\n*** =instructions\n- Long movies, clearly\n- Short movies, clearly\n- Long movies, but the correlation seems weak\n- Short movies, but the correlation seems weak\n\n*** =hint\nHave a look at the plot. Do you see a trend in the dots?\n\n*** =pre_exercise_code\n```{python}\n# The pre exercise code runs code to initialize the user's workspace. You can use it for several things:\n\n# 1. Pre-load packages, so that users don't have to do this manually.\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n# 2. Preload a dataset. 
The code below will read the csv that is stored at the URL's location.\n# The movies variable will be available in the user's console.\nmovies = pd.read_csv(\"http://s3.amazonaws.com/assets.datacamp.com/course/introduction_to_r/movies.csv\")\n\n# 3. Create a plot in the viewer, that students can check out while reading the exercise\nplt.scatter(movies.runtime, movies.rating)\nplt.show()\n```\n\n*** =sct\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the\n# pythonwhat Python package\n\nmsg_bad = \"That is not correct!\"\nmsg_success = \"Exactly! The correlation is very weak though.\"\n\n# Use test_mc() to grade multiple choice exercises.\n# Pass the correct option (option 4 in the instructions) to correct.\n# Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.\ntest_mc(4, [msg_bad, msg_bad, msg_bad, msg_success])\n```\n\n--- type:NormalExercise lang:python xp:100 skills:1 key:af2f6f90f3\n## Plot the movies yourself\n\nDo you remember the plot of the last exercise? Let's make an even cooler plot!\n\nA dataset of movies, `movies`, is available in the workspace.\n\n*** =instructions\n- The first function, `np.unique()`, uses the `unique()` function of the `numpy` package to get integer values for the movie genres. You don't have to change this code, just have a look!\n- Import `pyplot` in the `matplotlib` package. Set an alias for this import: `plt`.\n- Use `plt.scatter()` to plot `movies.runtime` onto the x-axis, `movies.rating` onto the y-axis and use `ints` for the color of the dots. 
You should use the first and second positional argument, and the `c` keyword.\n- Show the plot using `plt.show()`.\n\n*** =hint\n- You don't have to program anything for the first instruction, just take a look at the first line of code.\n- Use `import ___ as ___` to import `matplotlib.pyplot` as `plt`.\n- Use `plt.scatter(___, ___, c = ___)` for the third instruction.\n- You'll always have to type in `plt.show()` to show the plot you created.\n\n*** =pre_exercise_code\n```{python}\n# The pre exercise code runs code to initialize the user's workspace. You can use it for several things:\n\n# 1. Preload a dataset. The code below will read the csv that is stored at the URL's location.\n# The movies variable will be available in the user's console.\nimport pandas as pd\nmovies = pd.read_csv(\"http://s3.amazonaws.com/assets.datacamp.com/course/introduction_to_r/movies.csv\")\n\n# 2. Preload a package\nimport numpy as np\n```\n\n*** =sample_code\n```{python}\n# Get integer values for genres\n_, ints = np.unique(movies.genre, return_inverse = True)\n\n# Import matplotlib.pyplot\n\n\n# Make a scatter plot: runtime on x-axis, rating on y-axis and set c to ints\n\n\n# Show the plot\n\n```\n\n*** =solution\n```{python}\n# Get integer values for genres\n_, ints = np.unique(movies.genre, return_inverse = True)\n\n# Import matplotlib.pyplot\nimport matplotlib.pyplot as plt\n\n# Make a scatter plot: runtime on x-axis, rating on y-axis and set c to ints\nplt.scatter(movies.runtime, movies.rating, c=ints)\n\n# Show the plot\nplt.show()\n```\n\n*** =sct\n```{python}\n# The sct section defines the Submission Correctness Tests (SCTs) used to\n# evaluate the student's response. All functions used here are defined in the \n# pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki\n\n# Check if the student changed the np.unique() call\n# If it's not called, we know the student removed the call.\n# If it's called incorrectly, we know the student changed the call.\ntest_function(\"numpy.unique\",\n not_called_msg = \"Don't remove the call of `np.unique` to define `ints`.\",\n incorrect_msg = \"Don't change the call of `np.unique` to define `ints`.\")\n# Check if the student removed the ints object\ntest_object(\"ints\",\n undefined_msg = \"Don't remove the definition of the predefined `ints` object.\",\n incorrect_msg = \"Don't change the definition of the predefined `ints` object.\")\n\n# Check if the student imported matplotlib.pyplot like the solution\n# Let automatic feedback message generation handle the feedback messages\ntest_import(\"matplotlib.pyplot\", same_as = True)\n\n# Check whether the student used the scatter() function correctly\n# If it's used, but incorrectly, tell them to check the instructions again\ntest_function(\"matplotlib.pyplot.scatter\",\n incorrect_msg = \"You didn't use `plt.scatter()` correctly, have another look at the instructions.\")\n\n# Check if the student called the show() function\n# Let automatic feedback message generation handle all feedback messages\ntest_function(\"matplotlib.pyplot.show\")\n\nsuccess_msg(\"Great work!\")\n```", 100 | "file": "chapter6.md", 101 | "file_size": 7859, 102 | "file_write_time": 131091859011149763, 103 | "settings": 104 | { 105 | "buffer_size": 7680, 106 | "line_ending": "Windows" 107 | } 108 | }, 109 | { 110 | "file": "chapter5.md", 111 | "settings": 112 | { 113 | "buffer_size": 27307, 114 | "line_ending": "Windows" 115 | } 116 | }, 117 | { 118 | "file": "chapter4.md", 119 | "settings": 120 | { 121 | "buffer_size": 9866, 122 | "line_ending": "Windows" 123 | } 124 | }, 125 | { 126 | "file": "chapter3.md", 127 | "settings": 128 | { 129 | "buffer_size": 11549, 130 | "line_ending": "Windows" 131 | } 132 | 
}, 133 | { 134 | "file": "course.yml", 135 | "settings": 136 | { 137 | "buffer_size": 708, 138 | "line_ending": "Windows" 139 | } 140 | }, 141 | { 142 | "file": "chapter1.md", 143 | "settings": 144 | { 145 | "buffer_size": 6780, 146 | "line_ending": "Windows" 147 | } 148 | }, 149 | { 150 | "file": "README.md", 151 | "settings": 152 | { 153 | "buffer_size": 1933, 154 | "line_ending": "Windows" 155 | } 156 | }, 157 | { 158 | "contents": "List of possible questions:\n\n1. Where can we host slides? Amazon only or could this be Slideshare or Dropbox as well?", 159 | "settings": 160 | { 161 | "buffer_size": 117, 162 | "line_ending": "Windows", 163 | "name": "List of possible questions:" 164 | } 165 | } 166 | ], 167 | "build_system": "", 168 | "build_system_choices": 169 | [ 170 | ], 171 | "build_varint": "", 172 | "command_palette": 173 | { 174 | "height": 392.0, 175 | "last_filter": "packa", 176 | "selected_items": 177 | [ 178 | [ 179 | "packa", 180 | "Package Control: Install Package" 181 | ], 182 | [ 183 | "", 184 | "Package Control: Install Package" 185 | ] 186 | ], 187 | "width": 512.0 188 | }, 189 | "console": 190 | { 191 | "height": 126.0, 192 | "history": 193 | [ 194 | "import urllib.request,os,hashlib; h = '2915d1851351e5ee549c20394736b442' + '8bc59f460fa1548d1514676163dafc88'; pf = 'Package Control.sublime-package'; ipp = sublime.installed_packages_path(); urllib.request.install_opener( urllib.request.build_opener( urllib.request.ProxyHandler()) ); by = urllib.request.urlopen( 'http://packagecontrol.io/' + pf.replace(' ', '%20')).read(); dh = hashlib.sha256(by).hexdigest(); print('Error validating download (got %s instead of %s), please try manual install' % (dh, h)) if dh != h else open(os.path.join( ipp, pf), 'wb' ).write(by)" 195 | ] 196 | }, 197 | "distraction_free": 198 | { 199 | "menu_visible": true, 200 | "show_minimap": false, 201 | "show_open_files": false, 202 | "show_tabs": false, 203 | "side_bar_visible": false, 204 | "status_bar_visible": false 205 
| }, 206 | "expanded_folders": 207 | [ 208 | "/C/Users/lenovo/python_intro_hackathon" 209 | ], 210 | "file_history": 211 | [ 212 | "/C/Users/lenovo/python_intro_hackathon/chapter1.md", 213 | "/C/Users/lenovo/Downloads/DYD_SEC.py", 214 | "/C/Users/lenovo/Downloads/sub4/sub4/prepData.py", 215 | "/C/Users/lenovo/Downloads/sub4/sub4/finalModel.py", 216 | "/E/Kunal/GitHub/frontend-nanodegree-resume/Log in", 217 | "/E/Kunal/GitHub/javascript_experiments/test.html", 218 | "/E/Kunal/GitHub/frontend-nanodegree-resume/index.html", 219 | "/E/Kunal/GitHub/frontend-nanodegree-resume/js/resumeBuilder.js", 220 | "/E/Kunal/GitHub/frontend-nanodegree-resume/js/helper.js", 221 | "/E/Kunal/GitHub/frontend-nanodegree-resume/js/jQuery.js", 222 | "/E/Kunal/linux/vagrant_machine/python_code/database_setup.py", 223 | "/E/Kunal/linux/vagrant_machine/python_code/lotsofmenus2.py", 224 | "/E/Kunal/linux/vagrant_machine/python_code/fresh_tomatoes.py", 225 | "/E/Kunal/linux/vagrant_machine/python_code/lotsofmenus.py" 226 | ], 227 | "find": 228 | { 229 | "height": 23.0 230 | }, 231 | "find_in_files": 232 | { 233 | "height": 0.0, 234 | "where_history": 235 | [ 236 | ] 237 | }, 238 | "find_state": 239 | { 240 | "case_sensitive": false, 241 | "find_history": 242 | [ 243 | "\";" 244 | ], 245 | "highlight": true, 246 | "in_selection": false, 247 | "preserve_case": false, 248 | "regex": true, 249 | "replace_history": 250 | [ 251 | ], 252 | "reverse": false, 253 | "show_context": true, 254 | "use_buffer2": true, 255 | "whole_word": false, 256 | "wrap": true 257 | }, 258 | "groups": 259 | [ 260 | { 261 | "selected": 1, 262 | "sheets": 263 | [ 264 | { 265 | "buffer": 0, 266 | "file": "chapter2.md", 267 | "semi_transient": false, 268 | "settings": 269 | { 270 | "buffer_size": 13384, 271 | "regions": 272 | { 273 | }, 274 | "selection": 275 | [ 276 | [ 277 | 1357, 278 | 1357 279 | ] 280 | ], 281 | "settings": 282 | { 283 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 284 | }, 285 | "translation.x": 
0.0, 286 | "translation.y": 1470.0, 287 | "zoom_level": 1.0 288 | }, 289 | "stack_index": 1, 290 | "type": "text" 291 | }, 292 | { 293 | "buffer": 1, 294 | "semi_transient": false, 295 | "settings": 296 | { 297 | "buffer_size": 94, 298 | "regions": 299 | { 300 | }, 301 | "selection": 302 | [ 303 | [ 304 | 94, 305 | 94 306 | ] 307 | ], 308 | "settings": 309 | { 310 | "auto_name": "Analytics Vidhya", 311 | "default_dir": "C:\\Users\\lenovo\\python_intro_hackathon", 312 | "syntax": "Packages/Text/Plain text.tmLanguage" 313 | }, 314 | "translation.x": 0.0, 315 | "translation.y": 0.0, 316 | "zoom_level": 1.0 317 | }, 318 | "stack_index": 0, 319 | "type": "text" 320 | }, 321 | { 322 | "buffer": 2, 323 | "file": "chapter7.md", 324 | "semi_transient": false, 325 | "settings": 326 | { 327 | "buffer_size": 7564, 328 | "regions": 329 | { 330 | }, 331 | "selection": 332 | [ 333 | [ 334 | 132, 335 | 132 336 | ] 337 | ], 338 | "settings": 339 | { 340 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 341 | }, 342 | "translation.x": 0.0, 343 | "translation.y": 0.0, 344 | "zoom_level": 1.0 345 | }, 346 | "stack_index": 10, 347 | "type": "text" 348 | }, 349 | { 350 | "buffer": 3, 351 | "file": "chapter6.md", 352 | "semi_transient": false, 353 | "settings": 354 | { 355 | "buffer_size": 7680, 356 | "regions": 357 | { 358 | }, 359 | "selection": 360 | [ 361 | [ 362 | 251, 363 | 251 364 | ] 365 | ], 366 | "settings": 367 | { 368 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 369 | }, 370 | "translation.x": 0.0, 371 | "translation.y": 0.0, 372 | "zoom_level": 1.0 373 | }, 374 | "stack_index": 9, 375 | "type": "text" 376 | }, 377 | { 378 | "buffer": 4, 379 | "file": "chapter5.md", 380 | "semi_transient": false, 381 | "settings": 382 | { 383 | "buffer_size": 27307, 384 | "regions": 385 | { 386 | }, 387 | "selection": 388 | [ 389 | [ 390 | 165, 391 | 165 392 | ] 393 | ], 394 | "settings": 395 | { 396 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 397 | }, 398 | "translation.x": 
0.0, 399 | "translation.y": 0.0, 400 | "zoom_level": 1.0 401 | }, 402 | "stack_index": 8, 403 | "type": "text" 404 | }, 405 | { 406 | "buffer": 5, 407 | "file": "chapter4.md", 408 | "semi_transient": false, 409 | "settings": 410 | { 411 | "buffer_size": 9866, 412 | "regions": 413 | { 414 | }, 415 | "selection": 416 | [ 417 | [ 418 | 186, 419 | 186 420 | ] 421 | ], 422 | "settings": 423 | { 424 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 425 | }, 426 | "translation.x": 0.0, 427 | "translation.y": 2092.0, 428 | "zoom_level": 1.0 429 | }, 430 | "stack_index": 4, 431 | "type": "text" 432 | }, 433 | { 434 | "buffer": 6, 435 | "file": "chapter3.md", 436 | "semi_transient": false, 437 | "settings": 438 | { 439 | "buffer_size": 11549, 440 | "regions": 441 | { 442 | }, 443 | "selection": 444 | [ 445 | [ 446 | 790, 447 | 631 448 | ] 449 | ], 450 | "settings": 451 | { 452 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 453 | }, 454 | "translation.x": 0.0, 455 | "translation.y": 0.0, 456 | "zoom_level": 1.0 457 | }, 458 | "stack_index": 6, 459 | "type": "text" 460 | }, 461 | { 462 | "buffer": 7, 463 | "file": "course.yml", 464 | "semi_transient": false, 465 | "settings": 466 | { 467 | "buffer_size": 708, 468 | "regions": 469 | { 470 | }, 471 | "selection": 472 | [ 473 | [ 474 | 354, 475 | 354 476 | ] 477 | ], 478 | "settings": 479 | { 480 | "syntax": "Packages/YAML/YAML.tmLanguage" 481 | }, 482 | "translation.x": 0.0, 483 | "translation.y": 0.0, 484 | "zoom_level": 1.0 485 | }, 486 | "stack_index": 3, 487 | "type": "text" 488 | }, 489 | { 490 | "buffer": 8, 491 | "file": "chapter1.md", 492 | "semi_transient": false, 493 | "settings": 494 | { 495 | "buffer_size": 6780, 496 | "regions": 497 | { 498 | }, 499 | "selection": 500 | [ 501 | [ 502 | 6780, 503 | 6780 504 | ] 505 | ], 506 | "settings": 507 | { 508 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 509 | }, 510 | "translation.x": 0.0, 511 | "translation.y": 3532.0, 512 | "zoom_level": 1.0 513 | }, 514 | 
"stack_index": 2, 515 | "type": "text" 516 | }, 517 | { 518 | "buffer": 9, 519 | "file": "README.md", 520 | "semi_transient": false, 521 | "settings": 522 | { 523 | "buffer_size": 1933, 524 | "regions": 525 | { 526 | }, 527 | "selection": 528 | [ 529 | [ 530 | 831, 531 | 831 532 | ] 533 | ], 534 | "settings": 535 | { 536 | "syntax": "Packages/Markdown/Markdown.tmLanguage" 537 | }, 538 | "translation.x": 0.0, 539 | "translation.y": 0.0, 540 | "zoom_level": 1.0 541 | }, 542 | "stack_index": 5, 543 | "type": "text" 544 | }, 545 | { 546 | "buffer": 10, 547 | "semi_transient": false, 548 | "settings": 549 | { 550 | "buffer_size": 117, 551 | "regions": 552 | { 553 | }, 554 | "selection": 555 | [ 556 | [ 557 | 117, 558 | 117 559 | ] 560 | ], 561 | "settings": 562 | { 563 | "auto_name": "List of possible questions:", 564 | "default_dir": "C:\\Users\\lenovo\\python_intro_hackathon", 565 | "syntax": "Packages/Text/Plain text.tmLanguage" 566 | }, 567 | "translation.x": 0.0, 568 | "translation.y": 0.0, 569 | "zoom_level": 1.0 570 | }, 571 | "stack_index": 7, 572 | "type": "text" 573 | } 574 | ] 575 | } 576 | ], 577 | "incremental_find": 578 | { 579 | "height": 23.0 580 | }, 581 | "input": 582 | { 583 | "height": 31.0 584 | }, 585 | "layout": 586 | { 587 | "cells": 588 | [ 589 | [ 590 | 0, 591 | 0, 592 | 1, 593 | 1 594 | ] 595 | ], 596 | "cols": 597 | [ 598 | 0.0, 599 | 1.0 600 | ], 601 | "rows": 602 | [ 603 | 0.0, 604 | 1.0 605 | ] 606 | }, 607 | "menu_visible": true, 608 | "output.find_results": 609 | { 610 | "height": 0.0 611 | }, 612 | "pinned_build_system": "", 613 | "project": "Python_intro_hackathon.sublime-project", 614 | "replace": 615 | { 616 | "height": 42.0 617 | }, 618 | "save_all_on_build": true, 619 | "select_file": 620 | { 621 | "height": 0.0, 622 | "last_filter": "", 623 | "selected_items": 624 | [ 625 | ], 626 | "width": 0.0 627 | }, 628 | "select_project": 629 | { 630 | "height": 0.0, 631 | "last_filter": "", 632 | "selected_items": 633 | [ 634 | ], 635 | 
"width": 0.0 636 | }, 637 | "select_symbol": 638 | { 639 | "height": 0.0, 640 | "last_filter": "", 641 | "selected_items": 642 | [ 643 | ], 644 | "width": 0.0 645 | }, 646 | "selected_group": 0, 647 | "settings": 648 | { 649 | }, 650 | "show_minimap": true, 651 | "show_open_files": false, 652 | "show_tabs": true, 653 | "side_bar_visible": true, 654 | "side_bar_width": 275.0, 655 | "status_bar_visible": true, 656 | "template_settings": 657 | { 658 | } 659 | } 660 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to AV Hackathons (using Python) 2 | 3 | 4 | 5 | This is the repository for the course created by Analytics Vidhya to be hosted on DataCamp. This is meant to be an introductory course to hackathons on Analytics Vidhya. Check out DataHack platform on Analytics Vidhya for more details about the hackathon. 6 | 7 | 8 | ## Aim of the course 9 | This course is aimed towards beginners in Data Science industry. The objective of the course is to help people learn Data Science in fun, interactive manner and be ready for a larger stage for competing in various data science hackathons. 10 | 11 | We use one of our popular practice problems to tell you the basics of data science (using Python) and help you get started with building models for this practice hackathon. 12 | 13 | 14 | ##Feedback on the course 15 | If you have any feedback on the course, please feel free to reach out to kunal.jain@analyticsvidhya.com 16 | 17 | -------------------------------------------------------------------------------- /chapter1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Introduction to Python for Data Analysis 3 | description : This chapter will get you started with Python for Data Analysis. 
We will cover the reasons to learn Data Science using Python, provide an overview of the Python ecosystem and get you to write your first code in Python! 4 | 5 | 6 | 7 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:9a8fd577a9 8 | ## Why learn Python for data analysis? 9 | 10 | Python (an interpreted language) has gathered a lot of interest recently as a preferred choice of language for data analysis. Here are some reasons in favour of learning Python: 11 | 12 | * It is open source – free to install and use 13 | * Python has an awesome online community - latest algorithms come to Python in a matter of days 14 | * It is easy to learn 15 | * It can become a common language for data science and production of web-based analytics products 16 | 17 | ####Which of the following is not a reason to learn Python for data analysis? 18 | 19 | 20 | *** =instructions 21 | - Python is easy to learn. 22 | - Python is an interpreted language, so computation times can be higher than compiler based languages in some cases. 23 | - Python has good libraries for data science. 24 | - It is a production ready language (from web & software perspective). 25 | 26 | *** =hint 27 | Interpreted languages are typically easier to learn, but take longer computational time than compiler based languages. 28 | 29 | *** =sct 30 | ```{python} 31 | # The sct section defines the Submission Correctness Tests (SCTs) used to 32 | # evaluate the student's response. All functions used here are defined in the 33 | # pythonwhat Python package 34 | 35 | msg_bad1 = "That is a good reason to learn Python! Think again" 36 | msg_success = "Exactly! Since Python is an interpreted language, the computation times can be higher compared to other compiler based languages." 37 | 38 | # Use test_mc() to grade multiple choice exercises. 39 | # Pass the correct option (Action, option 2 in the instructions) to correct. 
40 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 41 | test_mc(2, [msg_bad1, msg_success, msg_bad1, msg_bad1]) 42 | ``` 43 | 44 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:db5fe12eff 45 | ## Python 2.7 vs. Python 3.5? 46 | 47 | You will come across this question soon after you start using Python. Python has 2 popular competing versions. Both versions have their pros and cons. 48 | 49 | **Benefits of Python 2.7** 50 | 51 | * Awesome online community. Easier to find answers when you get stuck at places. 52 | * Tonnes of third party libraries 53 | 54 | **Benefits of Python 3.5** 55 | 56 | * Cleaner and faster 57 | * It is the future! 58 | 59 | You can read a more detailed answer here 60 | 61 | ####Which version of Python would you recommend to someone who needs to use several third party libraries? 62 | 63 | 64 | *** =instructions 65 | - Python 2.7 66 | - Python 3.5 67 | - Should work on both 68 | 69 | 70 | *** =hint 71 | If you need several third party tools, you should look for a version which has higher community support and integrations. 72 | 73 | 74 | 75 | 76 | *** =sct 77 | ```{python} 78 | # The sct section defines the Submission Correctness Tests (SCTs) used to 79 | # evaluate the student's response. All functions used here are defined in the 80 | # pythonwhat Python package 81 | 82 | msg_bad1 = "Python 3.5 is newer and has lesser third party packages compared to Python 2.7" 83 | msg_success = "Python 2.7 has much higher compatibility with third party libraries." 84 | msg_bad2 = "Think again! One of them is better than the other in this scenario" 85 | 86 | # Use test_mc() to grade multiple choice exercises. 87 | # Pass the correct option (Action, option 2 in the instructions) to correct. 88 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 
89 | test_mc(1, [msg_success, msg_bad1, msg_bad2]) 90 | ``` 91 | 92 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:2f83694db6 93 | ## Python installation 94 | 95 | While DataCamp provides an awesome interface to get you started, you will need to run a local instance of Python for any serious Data Science work. The simplest way would be to download Anaconda. An open source distribution of Python, it has most of the libraries & packages you would need, and removes any version conflicts. 96 | I strongly recommend this for beginners. For this course, we will be using Python 3.x 97 | 98 | 99 | ####Should you install a local instance of Python on your machine to continue this course? 100 | 101 | 102 | *** =instructions 103 | - Yes 104 | - No 105 | - I need some help 106 | 107 | *** =hint 108 | Download Anaconda 109 | 110 | 111 | 112 | 113 | *** =sct 114 | ```{python} 115 | # The sct section defines the Submission Correctness Tests (SCTs) used to 116 | # evaluate the student's response. All functions used here are defined in the 117 | # pythonwhat Python package 118 | 119 | msg_bad = "You should install a Python instance locally before going forward" 120 | msg_success = "Great! You are all set to go ahead" 121 | msg_help = "Drop us a line at help@analyticsvidhya.com" 122 | 123 | # Use test_mc() to grade multiple choice exercises. 124 | # Pass the correct option (Action, option 2 in the instructions) to correct. 125 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 126 | test_mc(1, [msg_success, msg_bad, msg_help]) 127 | ``` 128 | 129 | --- type:NormalExercise lang:python xp:100 skills:2 key:af2f6f90f3 130 | ## Run a few simple programs in Python 131 | 132 | Time to get our hands dirty now. We will use Python to run a simple program! 133 | 134 | *** =instructions 135 | - The first line adds two numbers (1 & 2) and stores it in variable addition1. 
136 | - Write a line of code in line 4, which adds the number 3 and the number 4 and assigns it to a variable addition2 137 | 138 | 139 | 140 | *** =hint 141 | - Think how would you write simple addition. 142 | - Make sure you assign the sum to the variable 'addition2' 143 | - Remember - Python is case sensitive. Check your cases and white spaces 144 | 145 | *** =pre_exercise_code 146 | ```{python} 147 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 148 | ``` 149 | 150 | *** =sample_code 151 | ```{python} 152 | # Add 1 & 2 and assign it to addition1 153 | addition1 = 1 + 2 154 | # Now write code to add 3 & 4 and assign it to addition2 155 | 156 | ``` 157 | 158 | 159 | *** =solution 160 | ```{python} 161 | # Add 1 & 2 and assign it to addition1 162 | addition1 = 1 + 2 163 | # Now write code to add 3 & 4 and assign to addition2 164 | addition2 = 3 + 4 165 | 166 | ``` 167 | 168 | *** =sct 169 | ```{python} 170 | # The sct section defines the Submission Correctness Tests (SCTs) used to 171 | # evaluate the student's response. All functions used here are defined in the 172 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 173 | 174 | # Check if the student typed 3 + 4 175 | test_object("addition2") 176 | success_msg("Great work! Let's print something now!") 177 | ``` 178 | --- type:NormalExercise lang:python xp:100 skills:2 key:b52d6e84c1 179 | ## Printing "Hello World!" in Python! 180 | 181 | Now that you know how to add numbers, let us look at printing "Hello World!" in Python. 182 | 183 | *** =instructions 184 | 185 | - Print "Hello World!" on the console 186 | 187 | 188 | *** =hint 189 | - Remember that the message to be printed should be enclosed in (" ") 190 | - Remember - Python is case sensitive. Check your cases and white spaces 191 | - Hope you are not missing the exclaimation mark ! 
192 | 193 | *** =pre_exercise_code 194 | ```{python} 195 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 196 | ``` 197 | 198 | *** =sample_code 199 | ```{python} 200 | # Print a message 201 | print("Welcome to the joint course from Analytics Vidhya and DataCamp") 202 | 203 | # Now write code to print "Hello World!" 204 | 205 | ``` 206 | 207 | 208 | *** =solution 209 | ```{python} 210 | # Print a message 211 | print("Welcome to the joint course from Analytics Vidhya and DataCamp") 212 | 213 | # Now write a code to Print "Hello World!" 214 | print("Hello World!") 215 | ``` 216 | 217 | *** =sct 218 | ```{python} 219 | # The sct section defines the Submission Correctness Tests (SCTs) used to 220 | # evaluate the student's response. All functions used here are defined in the 221 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 222 | 223 | # Check if the student printed "Hello World!" 224 | test_output_contains("Hello World!", pattern = False, no_output_msg="Did you print Hello World! ?") 225 | success_msg("Great work! Let's move to the next chapter") 226 | ``` 227 | -------------------------------------------------------------------------------- /chapter2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Python Libraries and data structures 3 | description : In this chapter, we will introduce some of the most common data structures in Python to you and take you through some of the libraries we commonly use in data analysis. 4 | 5 | 6 | --- type:NormalExercise lang:python xp:100 skills:2 key:af2f6f90f3 7 | ## Create a List 8 | 9 | Lists are probably the most versatile data structures in Python. A list can be defined by writing a list of comma separated values in square brackets. Lists might contain items of different types. 
Python lists are mutable - individual elements of a list can be changed while the identity does not change. 10 | 11 | ```{python} 12 | Country =['INDIA','USA','GERMANY','UK','AUSTRALIA'] 13 | 14 | Temperature =[44, 28, 20, 18, 25, 45, 67] 15 | ``` 16 | We just created two lists, one for Country names (strings) and another one for Temperature data (whole numbers). 17 | 18 | ####Accessing individual elements of a list 19 | - Individual elements of a list can be accessed by writing an index number in square bracket. The first index of a list starts with 0 (zero) not 1. For example, Country[0] can be used to access the first element, 'INDIA' 20 | - A range of elements can be accessed by using start index and end index but it does not return the value of the end index. For example, Temperature[1:4] returns three elements, the second through fourth elements [28, 20, 18], but not the fifth element 21 | 22 | *** =instructions 23 | - Create a list of the first five odd numbers and store it in the variable odd_numbers 24 | - Print second to fourth element [1, 4, 9] from squares_list 25 | 26 | 27 | *** =hint 28 | - Use AV[0] to select the first element of a list AV. 29 | - Use AV[1:3] to select the second to the third element of a list AV. 30 | 31 | 32 | *** =pre_exercise_code 33 | ```{python} 34 | # The pre-exercise code runs code to initialize the user's workspace. 
You can use it for several things: 35 | ``` 36 | 37 | *** =sample_code 38 | 39 | ```{python} 40 | 41 | # Create a list of squared numbers 42 | squares_list = [0, 1, 4, 9, 16, 25] 43 | 44 | # Now write a line of code to create a list of the first five odd numbers and store it in a variable odd_numbers 45 | odd_numbers= 46 | 47 | # Print the first element of squares_list 48 | print (squares_list[0]) 49 | 50 | # Print the second to fourth elements of squares_list 51 | 52 | ``` 53 | 54 | *** =solution 55 | ```{python} 56 | 57 | # Create a list of squared numbers 58 | squares_list = [0, 1, 4, 9, 16, 25] 59 | 60 | # Now write a code to create list of first five odd numbers and store it in a variable odd_numbers 61 | odd_numbers = [1, 3, 5, 7, 9] 62 | 63 | # Print the first element of squares_list 64 | print (squares_list[0]) 65 | 66 | # Print the second to fourth elements of squares_list 67 | print (squares_list[1:4]) 68 | ``` 69 | 70 | *** =sct 71 | ```{python} 72 | # The sct section defines the Submission Correctness Tests (SCTs) used to 73 | # evaluate the student's response. All functions used here are defined in the 74 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 75 | 76 | # Test for list of odd_numbers 77 | test_object("odd_numbers", incorrect_msg="Are you sure you assigned the correct value to odd_numbers? It should be 1, 3, 5, 7, 9") 78 | 79 | # Check second to fourth elements" 80 | test_output_contains("[1, 4, 9]", pattern = False, no_output_msg="Have you given the right index numbers to squares_list?") 81 | success_msg("Good progress! You just learnt the most versatile data structure in Python!") 82 | ``` 83 | 84 | --- type:NormalExercise lang:python xp:100 skills:2 key:c7f91e389f 85 | ## Create a String 86 | 87 | Strings can simply be defined by use of single ( ‘ ), double ( ” ) or triple ( ”’ ) inverted commas. Strings enclosed in triple quotes ( ”’ ) can span over multiple lines. 
88 | A few things to keep in mind about strings: 89 | 90 | * Strings are immutable in Python, so you can not change the content of a string. 91 | * Function len() can be used to get length of a string 92 | * You can access the elements using indexes as you do for lists 93 | 94 | ```{python} 95 | String ="String elements can also be accessed using index numbers, just like lists" 96 | 97 | print (String[0:7]) 98 | 99 | #Above print command displays "String " on screen. 100 | ``` 101 | 102 | * You can use '+' operator to concatenate two strings 103 | 104 | 105 | *** =instructions 106 | 107 | - Use the len() function to store the length of string 108 | - Use start and end index to access the required characters, e.g. str[0:3] to return first three characters of string str 109 | - '+' operator is used to concatenate (combine) two strings 110 | 111 | 112 | 113 | *** =hint 114 | 115 | - Use str[0] to select the first element of string str 116 | - Use str1 + str2 to return the concatenated result of both strings str1 and str2 117 | 118 | 119 | 120 | *** =pre_exercise_code 121 | 122 | ```{python} 123 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 124 | ``` 125 | 126 | *** =sample_code 127 | 128 | ```{python} 129 | # Create a string str1 130 | str1 = "Introduction with strings" 131 | 132 | # Now store the length of string str1 in variable str_len 133 | str_len = _________ 134 | 135 | str_new = "Machine Learning is awesome!" 136 | # Print last eight characters of string str_new (the length of str_new is 28 characters). 
137 | print __________ 138 | 139 | str2 = "I am doing a course Introduction to Hackathon using " 140 | str3 = "Python" 141 | 142 | # Write a line of code to store concatenated string of str2 and str3 into variable str4 143 | str4 = _________ 144 | 145 | ``` 146 | 147 | *** =solution 148 | 149 | ```{python} 150 | 151 | # Create a string str1 152 | str1 = "Introduction with strings" 153 | 154 | # Now store the length of string str1 in varible str_len 155 | str_len=len(str1) 156 | 157 | str_new = "Machine Learning is awesome!" 158 | # Print last eight characters of string str_new (the length of str_new is 28 characters). 159 | print (str_new[20:28]) 160 | 161 | str2 = "I am doing a course Introduction to Hackathon using " 162 | str3 = "Python" 163 | 164 | # Write a code to store concatenated string of str2 and str3 into variable str4 165 | str4= str2 + str3 166 | ``` 167 | 168 | *** =sct 169 | 170 | ```{python} 171 | # The sct section defines the Submission Correctness Tests (SCTs) used to 172 | # evaluate the student's response. All functions used here are defined in the 173 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 174 | 175 | # Check length of strings 176 | test_object("str_len", incorrect_msg = "Did you use len() function with str1?") 177 | 178 | # Check last seven characters 179 | test_output_contains("awesome!", pattern = False, no_output_msg="Have you used the right start and end index number with str_new to print the last eight characters?") 180 | 181 | # Check concatenated strings" 182 | test_object("str3", incorrect_msg="Are you sure that you have used + sign to concatenate both strings st2 and str3") 183 | success_msg("Great work!") 184 | ``` 185 | 186 | --- type:NormalExercise lang:python xp:100 skills:2 key:377e9324f2 187 | ## Create a Dictionary 188 | 189 | A Dictionary is an unordered set of key:value pairs, with the requirement that the keys are unique (within a Dictionary). 
A few pointers about dictionary: 190 | 191 | * An empty dictionary can be created by a pair of braces: {}. 192 | * Dictionary elements can be accessed by dictionary keys 193 | * DICT.keys() will return all the keys of given dictionary "DICT" 194 | 195 | ```{python} 196 | DICT = { 197 | 'Name':'Kunal', 198 | 'Company':'Analytics Vidhya' 199 | } 200 | 201 | #Dictionary elements can be accessed by keys 202 | 203 | print (DICT['Name']) 204 | 205 | #The above print statement will print Kunal 206 | 207 | ``` 208 | 209 | In dictionary "DICT", Name and Company are dictionary keys whereas "Kunal" and "Analytics Vidhya" are their respective values. 210 | 211 | *** =instructions 212 | 213 | - Print the value associated with key 'Age' in dictionary dict1 214 | - Store all the keys of dictionary dict1 in variable 'dict_keys' 215 | 216 | *** =hint 217 | 218 | - Use dict['Key'] = new_value to update the existing value 219 | 220 | 221 | *** =pre_exercise_code 222 | 223 | ```{python} 224 | # The pre-exercise code runs code to initialize the user's workspace. 
You can use it for several things: 225 | ``` 226 | 227 | *** =sample_code 228 | 229 | ```{python} 230 | 231 | # Create a dictionary dict1 232 | dict1 = { 'Age': 16, 'Name': 'Max', 'Sports': 'Cricket'} 233 | 234 | # Update the value of Age to 18 235 | dict1['Age'] = 18 236 | 237 | # Print the value of Age 238 | print __________ 239 | 240 | # Store the keys of dictionary dict1 to dict_keys 241 | dict_keys = __________ 242 | 243 | ``` 244 | 245 | *** =solution 246 | 247 | ```{python} 248 | 249 | # Create a dictionary 250 | dict1 = {'Age': 16, 'Name': 'Max', 'Sports': 'Cricket'} 251 | 252 | # Update the value of Age to 18 253 | dict1['Age'] = 18 254 | 255 | # Print the value of Age 256 | print (dict1['Age']) 257 | 258 | # Store the keys of dictionary dict1 to dict_keys 259 | dict_keys= dict1.keys() 260 | 261 | ``` 262 | 263 | *** =sct 264 | 265 | ```{python} 266 | # The sct section defines the Submission Correctness Tests (SCTs) used to 267 | # evaluate the student's response. All functions used here are defined in the 268 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 269 | 270 | # Check value of Age 271 | test_output_contains("18", pattern = False, no_output_msg="Have you used the key Age with dictonary dict1") 272 | 273 | 274 | # Store the keys of dictionary dict1 to dict_keys 275 | test_object("dict_keys", incorrect_msg="Have you used keys() with dict?", undefined_msg="Have you used keys() with dict?") 276 | 277 | success_msg("Great work!") 278 | 279 | 280 | ``` 281 | 282 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:9a8fd577a9 283 | ## How to use Python libraries? 284 | First of all - great progress! You now know some of the important data structures in Python. 285 | 286 | Let's take another step ahead in our journey to learn Python, by getting acquainted with some useful libraries. The first step is to learn to import them into your environment. 
There are several ways of doing so in Python: 287 | 288 | ```{python} 289 | import math as m 290 | 291 | from math import * 292 | ``` 293 | 294 | In the first manner, we have defined an alias m to library math. We can now use various functions from math library (e.g. factorial) by referencing it using the alias m.factorial(). 295 | 296 | In the second manner, you have imported the entire name space in math i.e. you can directly use factorial() without referring to math. 297 | 298 | Following are a list of libraries, you will need for any scientific computations and data analysis: 299 | 300 | * Numpy 301 | * Scipy 302 | * Pandas 303 | * Matplotlib 304 | * Scikit Learn 305 | 306 | 307 | 308 | ##### Which of the following is a valid import statement for below code? 309 | ```{python} 310 | print (factorial(5)) 311 | ``` 312 | 313 | *** =instructions 314 | - import math 315 | - from math import factorial 316 | - import math.factorial 317 | 318 | *** =hint 319 | Python's from statement lets you import specific attributes from a module into the current namespace. 320 | 321 | *** =sct 322 | ```{python} 323 | # The sct section defines the Submission Correctness Tests (SCTs) used to 324 | # evaluate the student's response. All functions used here are defined in the 325 | # pythonwhat Python package 326 | 327 | msg_bad = "Read about importing libraries in python" 328 | msg_success = "Good Job!" 329 | 330 | # Use test_mc() to grade multiple choice exercises. 331 | # Pass the correct option (Action, option 2 in the instructions) to correct. 332 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 333 | test_mc(2, [msg_bad, msg_success, msg_bad]) 334 | ``` 335 | 336 | 337 | --- type:NormalExercise lang:python xp:100 skills:2 key:50c9218dac 338 | ## Why are conditional statements required? 339 | 340 | Conditional statements are used to execute code fragments based on a given condition. 
The most commonly used construct is if-else, with the following syntax: 341 | 342 | ```{python} 343 | 344 | if [condition]: 345 | __execution if true__ 346 | else: 347 | __execution if false__ 348 | ``` 349 | 350 | *** =instructions 351 | 352 | - Store the length of `squares_list` to `square_len` using function `len()` 353 | - Comparison operators `<, >, <=, >=, ==` and `!=` help to check whether a condition is true or false 354 | - Write the outcome in each branch of the following conditional code 355 | 356 | *** =hint 357 | 358 | - Use <, >, <=, >=, == and != for comparison 359 | - Use `len(list)` to return the length of a list 360 | 361 | 362 | *** =pre_exercise_code 363 | 364 | ```{python} 365 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 366 | ``` 367 | 368 | *** =sample_code 369 | 370 | ```{python} 371 | # Create two integer variables a and b 372 | a=3 373 | b=4 374 | 375 | # if a is greater than b print a-b else a+b 376 | if a > b: 377 | print (a-b) 378 | else: 379 | print (a+b) 380 | 381 | # Create a list of squared numbers 382 | squares_list = [0, 1, 4, 9, 16, 25] 383 | 384 | # Store the length of squares_list in square_len 385 | square_len = 386 | 387 | # if square_len is less than 5 then print "Less than 5" else "Greater than 5" 388 | if square_len < 5: 389 | print ("__________") 390 | else: 391 | print ("__________") 392 | 393 | 394 | ``` 395 | 396 | *** =solution 397 | 398 | ```{python} 399 | # Create two integer variables a and b 400 | a=3 401 | b=4 402 | 403 | # if a is greater than b print a-b else a+b 404 | if a > b: 405 | print (a-b) 406 | else: 407 | print (a+b) 408 | 409 | # Create a list of squared numbers 410 | squares_list = [0, 1, 4, 9, 16, 25] 411 | 412 | # Store the length of squares_list in square_len 413 | square_len = len(squares_list) 414 | 415 | # if square_len is less than 5 then print "Less than 5" else "Greater than 5" 416 | if square_len < 5: 417 | print ("Less than 5") 418 | 
else: 419 | print ("Greater than 5") 420 | 421 | ``` 422 | 423 | *** =sct 424 | 425 | ```{python} 426 | # The sct section defines the Submission Correctness Tests (SCTs) used to 427 | # evaluate the student's response. All functions used here are defined in the 428 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 429 | 430 | # Check length of strings 431 | test_object("square_len", incorrect_msg = "Have you used len function with list squares_list?") 432 | 433 | # Check last seven characters 434 | test_output_contains("Greater than 5", pattern = False, no_output_msg="Have you given the right statement in True and False block of if statement ?") 435 | 436 | success_msg("Great work!") 437 | ``` 438 | 439 | 440 | --- type:NormalExercise lang:python xp:100 skills:2 key:c1b7c2fd5c 441 | ## How iterative statements help? 442 | 443 | Computers are often used to automate repetitive tasks. Repeating identical or similar tasks without making errors is something that computers do well. Repeated execution of a set of statements is called iteration. 444 | 445 | Like most languages, Python also has a FOR-loop which is the most widely used method for iteration. It has a simple syntax: 446 | 447 | ```{python} 448 | 449 | for i in [Python Iterable]: 450 | expression(i) 451 | 452 | ``` 453 | “Python Iterable” can be a list or other advanced data structures which we will explore in later sections. Let’s take a look at a simple example, determining the factorial of a number. 454 | 455 | *** =instructions 456 | 457 | - Iterate over all values of list using for loop 458 | - Use % modulus operator to return remainder e.g. 4%2 will result in 0 and 5%2 to 1 459 | 460 | 461 | 462 | *** =hint 463 | 464 | - Write an expression x % 2 == 0 to check x is even or not 465 | 466 | 467 | *** =pre_exercise_code 468 | 469 | ```{python} 470 | # The pre-exercise code runs code to initialize the user's workspace. 
You can use it for several things: 471 | ``` 472 | 473 | *** =sample_code 474 | 475 | ```{python} 476 | # Create a list of first five numbers 477 | ls=[] 478 | for x in range(5): 479 | ls.append(x) 480 | 481 | sum=0 482 | # Store sum all the even numbers of the list ls in sum 483 | 484 | for x in ls: 485 | if x%2 == __: 486 | sum += x 487 | 488 | print (sum) 489 | 490 | ``` 491 | 492 | *** =solution 493 | 494 | ```{python} 495 | # Create a list with first five numbers 496 | ls=[] 497 | for x in range(5): 498 | ls.append(x) # append a value to a list 499 | 500 | sum=0 501 | # Store sum all even numbers of the list ls in sum 502 | 503 | for x in ls: 504 | if x%2==0: 505 | sum += x 506 | 507 | print (sum) 508 | 509 | ``` 510 | 511 | *** =sct 512 | 513 | ```{python} 514 | # The sct section defines the Submission Correctness Tests (SCTs) used to 515 | # evaluate the student's response. All functions used here are defined in the 516 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 517 | 518 | # Check length of strings 519 | test_object("sum", incorrect_msg="Are you taking sum of even numbers?") 520 | 521 | 522 | success_msg("Great work! Let's move to the next chapter") 523 | ``` 524 | -------------------------------------------------------------------------------- /chapter3.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title : Exploratory analysis in Python using Pandas 4 | description : We start with the first step of data analysis - the exploratory data analysis. 5 | 6 | --- type:NormalExercise lang:python xp:100 skills:2 key:af2f6f90f3 7 | ## Case study - Who is eligible for loan? 8 | 9 | ###Introduction - Analytics Vidhya (AV) DataHack 10 | At Analytics Vidhya, we are building a knowledge platform for data science professionals across the globe. Among several things, we host several hackathons for our community on our DataHack platform. 
The case study for today's problem is one of the practice problems on our platform. You can check out the practice problem here. 11 | 12 | ###The case study - Dream Housing Finance 13 | 14 | Dream Housing Finance company deals in all home loans. They have a presence across all urban, semi-urban and rural areas. Customers first apply for a home loan; after that, the company validates the customer's eligibility. The company wants to automate the loan eligibility process (real-time) based on customer detail provided while filling the online application form. 15 | 16 | Let's start with loading the training and testing set into your python environment. You will use the training set to build your model, and the test set to validate it. Both the files are stored on the web as CSV files; their URLs are already available as character strings in the sample code. 17 | 18 | You can load this data with the pandas.read_csv() function. It converts the data set to a python dataframe. In simple words, Python dataframe can be imagined as an equivalent of a spreadsheet or a SQL table. 19 | 20 | 21 | *** =instructions 22 | - train.head(n) helps to look at top n observations of the train dataframe. Use it to print top 5 observations of train. 23 | - len(DataFrame) returns the total number of observations. Store the number of observations in train data in variable train_length 24 | - DataFrame.columns returns the column headings of the data set. Store the number of columns in test dataset in variable test_col 25 | 26 | 27 | *** =hint 28 | - Use len(dataframe) to return the total observations 29 | - Use len(dataframe.columns) to return the total available columns 30 | 31 | 32 | *** =pre_exercise_code 33 | 34 | ```{python} 35 | 36 | # The pre-exercise code runs code to initialize the user's workspace. 
You can use it for several things: 37 | 38 | # Import library pandas 39 | import pandas as pd 40 | 41 | # Import train file 42 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 43 | 44 | # Import test file 45 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 46 | 47 | ``` 48 | 49 | *** =sample_code 50 | 51 | ```{python} 52 | 53 | # import library pandas 54 | import pandas as pd 55 | 56 | # Import training data as train 57 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 58 | 59 | # Import testing data as test 60 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 61 | 62 | # Print top 5 observation of train dataset 63 | print (train.____() ) 64 | 65 | # Store total number of observation in training dataset 66 | train_length = len (_____) 67 | 68 | # Store total number of columns in testing data set 69 | test_col = len ( test._____) 70 | 71 | ``` 72 | 73 | *** =solution 74 | 75 | ```{python} 76 | 77 | import pandas as pd 78 | 79 | # Import training data as train 80 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 81 | 82 | # Import testing data as test 83 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 84 | 85 | # Print top 5 observation of test dataset 86 | print (train.head(5)) 87 | 88 | # Store total number of observation in training dataset 89 | train_length = len(train) 90 | 91 | # Store total number of columns in testing data set 92 | test_col = len(test.columns) 93 | 94 | ``` 95 | 96 | *** =sct 97 | 98 | ```{python} 99 | # The sct section defines the Submission Correctness Tests (SCTs) used to 100 | # evaluate the student's response. All functions used here are defined in the 101 | # pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki 102 | 103 | # Test for evaluating top 5 heading of dataframe 104 | test_function("print", incorrect_msg = "Don't forget to print the first 5 observations of `train`!") 105 | 106 | # Test for total observation in training dataset 107 | test_object("train_length", incorrect_msg = "Don't forget to store the length of `train` in train_length") 108 | 109 | # Test for total columns in testing dataset 110 | test_object("test_col", incorrect_msg = "Don't forget to store the number of columns of `test` in test_col") 111 | 112 | success_msg("Great work! Let us look at the data more closely") 113 | ``` 114 | 115 | --- type:NormalExercise lang:python xp:100 skills:2 key:36c3190b26 116 | ## Understanding the Data 117 | 118 | You can look at a summary of numerical fields by using dataframe.describe(). It provides the count, mean, standard deviation (std), min, quartiles and max in its output. 119 | 120 | 121 | ```{python} 122 | dataframe.describe() 123 | ``` 124 | 125 | For the non-numeric values (e.g. Property_Area, Credit_History etc.), we can look at frequency distribution. The frequency table can be printed by the following command: 126 | 127 | 128 | ```{python} 129 | df[column_name].value_counts() 130 | ``` 131 | 132 |
OR
133 | 134 | ```{python} 135 | df.column_name.value_counts() 136 | ``` 137 | 138 | *** =instructions 139 | 140 | - Use `dataframe.describe()` to understand the distribution of numerical variables 141 | - Look at unique values of non-numeric values using `df[column_name].value_counts()` 142 | 143 | 144 | *** =hint 145 | - Store the output of `train.describe()` in a variable df 146 | - Use `train.PropertyArea.value_counts()` to look at frequency distribution 147 | 148 | 149 | *** =pre_exercise_code 150 | 151 | ```{python} 152 | 153 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 154 | 155 | # Import library pandas 156 | import pandas as pd 157 | 158 | # Import training file 159 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 160 | 161 | # Import testing file 162 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 163 | 164 | ``` 165 | 166 | *** =sample_code 167 | 168 | ```{python} 169 | 170 | #Training and Testing data set are loaded in train and test dataframe respectively 171 | 172 | # Look at the summary of numerical variables for train data set 173 | df= train.________() 174 | print (df) 175 | 176 | # Print the unique values and their frequency of variable Property_Area 177 | df1=train.Property_Area.________() 178 | print (df1) 179 | 180 | ``` 181 | 182 | *** =solution 183 | 184 | ```{python} 185 | 186 | # Look at the summary of numerical variables for train data set 187 | df = train.describe() 188 | print (df) 189 | 190 | # Print the unique values and their frequency of variable Property_Area 191 | df1=train.Property_Area.value_counts() 192 | print (df1) 193 | 194 | ``` 195 | 196 | *** =sct 197 | 198 | ```{python} 199 | # The sct section defines the Submission Correctness Tests (SCTs) used to 200 | # evaluate the student's response. All functions used here are defined in the 201 | # pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki 202 | 203 | # Test for describe 204 | test_function("train.describe", not_called_msg = "Did you call the right function with train dataset to see numerical summary?") 205 | # Test for value_counts 206 | test_function("train.Property_Area.value_counts", not_called_msg = "Did you call the right function with train dataset to see frequency table of 'Property_Area'?") 207 | 208 | success_msg("Great work!") 209 | ``` 210 | 211 | 212 | --- type:NormalExercise lang:python xp:100 skills:2, 4 key:85c5d3a079 213 | ## Understanding distribution of numerical variables 214 | 215 | Now that we are familiar with basic data characteristics, let us study the distribution of numerical variables. Let us start with numeric variable "ApplicantIncome". 216 | 217 | Let's start by plotting the histogram of ApplicantIncome using the following command: 218 | 219 | ```{python} 220 | train['ApplicantIncome'].hist(bins=50) 221 | ``` 222 |
OR
223 | 224 | ```{python} 225 | train.ApplicantIncome.hist(bins=50) 226 | ``` 227 | 228 | Next, we can also look at box plots to understand the distributions. Box plot for ApplicantIncome can be plotted by 229 | 230 | 231 | ```{python} 232 | train.boxplot(column='ApplicantIncome') 233 | ``` 234 | 235 | *** =instructions 236 | 237 | - Use hist() to plot histogram 238 | - Use by=categorical_variable with box plot to look at distribution by categories 239 | 240 | ```{python} 241 | train.boxplot(column='ApplicantIncome', by='Gender') 242 | ``` 243 | 244 | *** =hint 245 | - Use dataframe.columnname1.hist() to plot histogram 246 | - Use dataframe.boxplot(column='columnname2', by = 'columnname3' ) to have boxplot by different categories of a categorical variable 247 | 248 | 249 | *** =pre_exercise_code 250 | 251 | ```{python} 252 | 253 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 254 | 255 | # Import library pandas 256 | import pandas as pd 257 | 258 | # Import training file 259 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 260 | 261 | # Import testing file 262 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 263 | 264 | ``` 265 | 266 | *** =sample_code 267 | 268 | ```{python} 269 | 270 | # Training and Testing dataset are loaded in train and test dataframe respectively 271 | # Plot histogram for variable LoanAmount 272 | train.LoanAmount._____ 273 | 274 | # Plot a box plot for variable LoanAmount by variable Gender of training data set 275 | train._______(column='LoanAmount', by = 'Gender') 276 | 277 | ``` 278 | 279 | *** =solution 280 | 281 | ```{python} 282 | 283 | 284 | # Assumed training and testing dataset are loaded in train and test dataframe respectively 285 | # Plot histogram for variable LoanAmount 286 | train.LoanAmount.hist() 287 | 288 | # Plot a box plot for variable LoanAmount by variable Gender of 
training data set 289 | train.boxplot(column='LoanAmount', by ='Gender' ) 290 | 291 | ``` 292 | 293 | *** =sct 294 | 295 | ```{python} 296 | # The sct section defines the Submission Correctness Tests (SCTs) used to 297 | # evaluate the student's response. All functions used here are defined in the 298 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 299 | 300 | # Test for evaluating histogram 301 | test_function("train.LoanAmount.hist", not_called_msg = "Did you call the right function to plot histogram?") 302 | 303 | # Test for evaluating box plot 304 | test_function("train.boxplot", not_called_msg = "Did you call the right function for boxplot?") 305 | 306 | success_msg("Great work!") 307 | ``` 308 | 309 | 310 | 311 | --- type:NormalExercise lang:python xp:100 skills:2, 4 key:708e937aea 312 | ## Understanding distribution of categorical variables 313 | 314 | We have looked at the distributions of ApplicantIncome and LoanIncome, now it's time for looking at categorical variables in more details. For instance, let's see whether Gender is affecting the loan status or not. This can be tested using cross-tabulation as shown below: 315 | 316 | ```{python} 317 | pd.crosstab( train ['Gender'], train ["Loan_Status"], margins=True) 318 | ``` 319 | Next, we can also look at proportions can be more intuitive in making some quick insights. We can do this using the apply function. You can read more about cross tab and apply functions here. 
320 | 321 | 322 | ```{python} 323 | 324 | def percentageConvert(ser): 325 | return ser/float(ser[-1]) 326 | 327 | pd.crosstab(train ["Gender"], train ["Loan_Status"], margins=True).apply(percentageConvert, axis=1) 328 | 329 | ``` 330 | 331 | *** =instructions 332 | 333 | - Use value_counts() with train['LoanStatus'] to look at the frequency distribution 334 | - Use crosstab with Loan_Status and Credit_History to perform bi-variate analysis 335 | 336 | 337 | 338 | *** =hint 339 | train['Loan_Status'].value_counts() return the frequency by each category of categorical variable 340 | 341 | 342 | 343 | *** =pre_exercise_code 344 | 345 | ```{python} 346 | 347 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 348 | 349 | # Import library pandas 350 | import pandas as pd 351 | 352 | # Import training file 353 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 354 | 355 | # Import testing file 356 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 357 | 358 | ``` 359 | 360 | *** =sample_code 361 | 362 | ```{python} 363 | 364 | # Training and Testing dataset are loaded in train and test dataframe respectively 365 | 366 | # Approved Loan in absolute numbers 367 | loan_approval = train['Loan_Status'].________()['Y'] 368 | 369 | # Two-way comparison: Credit History and Loan Status 370 | twowaytable = pd.________(train ["Credit_History"], train ["Loan_Status"], margins=True) 371 | 372 | 373 | 374 | ``` 375 | 376 | *** =solution 377 | 378 | ```{python} 379 | 380 | # Assumed training and testing dataset are loaded in train and test dataframe respectively 381 | 382 | # Approved Loan in absolute numbers 383 | loan_approval = train['Loan_Status'].value_counts()['Y'] 384 | 385 | # Two-way comparison: Credit_History and Loan_Status 386 | twowaytable = pd.crosstab(train ["Credit_History"], train ["Loan_Status"], margins=True) 387 | 388 | ``` 
389 | 390 | *** =sct 391 | 392 | ```{python} 393 | # The sct section defines the Submission Correctness Tests (SCTs) used to 394 | # evaluate the student's response. All functions used here are defined in the 395 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 396 | 397 | # Test for Approved Loan in absolute numbers 398 | test_object("loan_approval", incorrect_msg='Did you look at the frequency distribution?',undefined_msg='Did you look at the frequency distribution?') 399 | 400 | 401 | # Test for two-way comparison Credit_History and Loan_Status 402 | test_object("twowaytable", incorrect_msg='Did you use the right function to generate two way table?', undefined_msg='Did you use the right function to generate two way table?') 403 | 404 | 405 | success_msg("Great work!") 406 | 407 | ``` 408 | -------------------------------------------------------------------------------- /chapter4.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Data Munging in Python using Pandas 3 | description : Pandas is at the heart of data analysis in Python. This chapter gets you started with Data Munging in Python using Pandas 4 | 5 | --- type:NormalExercise lang:python xp:100 skills:2, 4, 8 key:af2f6f90f3 6 | ## The curious case of missing values 7 | 8 | Rarely is the data captured perfectly in real world. People might not disclose few details or those details might not be available in the first place. This data set is no different. There are missing values in variables. 9 | 10 | We need to first find out which variables have missing values, and then see what is the best way to handle these missing values. The way to handle a missing value can depend on the number of missing values, the type of variable and the expected importance of those variables. 
11 | 12 | So, let's start by finding out whether variable "Credit_history" has missing values or not and if so, how many observations are missing. 13 | 14 | ```{python} 15 | 16 | train['Credit_History'].isnull().sum() 17 | 18 | ``` 19 | 20 | * isnull() helps to check the observation has missing value or not (It returns a boolean value TRUE or FALSE) 21 | * sum() used to return the number of records have missing values 22 | 23 | *** =instructions 24 | - Apply isnull() to check the observation has null value or not 25 | - Check number of missing values is greater than 0 or not 26 | 27 | 28 | *** =hint 29 | Use sum() with train['Self_Employed'].isnull() to check number of missing values 30 | 31 | 32 | 33 | *** =pre_exercise_code 34 | 35 | ```{python} 36 | 37 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 38 | 39 | # Import library pandas 40 | import pandas as pd 41 | 42 | # Import training file 43 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 44 | 45 | # Import testing file 46 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 47 | 48 | ``` 49 | 50 | *** =sample_code 51 | 52 | ```{python} 53 | 54 | # How many missing values in variable "Self_Employed" ? 55 | n_missing_value_Self_Employed = train['Self_Employed']._____.sum() 56 | 57 | # Variable Loan amount has missing values or not? 58 | LoanAmount_have_missing_value = train['LoanAmount'].isnull().sum() > ____ 59 | 60 | 61 | ``` 62 | 63 | *** =solution 64 | 65 | ```{python} 66 | 67 | # How many missing values in variable "Self_Employed" ? 68 | n_missing_value_Self_Employed = train['Self_Employed'].isnull().sum() 69 | 70 | # Variable Loan amount has missing values or not? 
71 | LoanAmount_have_missing_value = train['LoanAmount'].isnull().sum() > 0 72 | 73 | 74 | ``` 75 | 76 | *** =sct 77 | 78 | ```{python} 79 | # The sct section defines the Submission Correctness Tests (SCTs) used to 80 | # evaluate the student's response. All functions used here are defined in the 81 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 82 | 83 | # How many missing values in variable "Self_Employed" ? 84 | test_object("n_missing_value_Self_Employed", incorrect_msg='Have you checked the missing values?') 85 | 86 | # Variable Loan amount has missing values or not? 87 | test_object("LoanAmount_have_missing_value", incorrect_msg='Have you checked the column has missing value or not?') 88 | 89 | success_msg("Great work!") 90 | ``` 91 | 92 | 93 | 94 | --- type:NormalExercise lang:python xp:100 skills:2, 4, 8 key:4abbcb0b8d 95 | ## How many variables have missing values? 96 | 97 | Till now, we have checked the variable has missing value or not? Next action is to check how many variables have missing values. One way of doing this check would be to evaluate each individual variable. This would not be easy if we have hundred of columns. This action can be performed simply by using isnull() on dataframe object. 98 | 99 | ```{python} 100 | 101 | train.isnull().sum() 102 | 103 | ``` 104 | 105 | This statement will return the column names with the number of observation that have missing (null) values. 106 | 107 |
108 | 109 | *** =instructions 110 | Apply isnull().sum() with test dataset 111 | 112 | 113 | 114 | *** =hint 115 | Use train.isnull().sum() to check number of missing values in train data set 116 | 117 | 118 | 119 | *** =pre_exercise_code 120 | 121 | ```{python} 122 | 123 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 124 | 125 | # Import library pandas 126 | import pandas as pd 127 | 128 | # Import training file 129 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 130 | 131 | # Import testing file 132 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 133 | 134 | ``` 135 | 136 | *** =sample_code 137 | 138 | ```{python} 139 | 140 | # Check variables have missing values in test data set 141 | number_missing_values_test_data = test.isnull()._____() 142 | 143 | ``` 144 | 145 | *** =solution 146 | 147 | ```{python} 148 | 149 | # Check variables have missing values in test data set 150 | number_missing_values_test_data = test.isnull().sum() 151 | 152 | ``` 153 | 154 | *** =sct 155 | 156 | ```{python} 157 | # The sct section defines the Submission Correctness Tests (SCTs) used to 158 | # evaluate the student's response. All functions used here are defined in the 159 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 160 | 161 | # Check variables have missing values in test data set 162 | test_object("number_missing_values_test_data", incorrect_msg='Have you count the number of missing values in each variable of test data set?') 163 | 164 | 165 | success_msg("Great work!") 166 | ``` 167 | 168 | 169 | --- type:NormalExercise lang:python xp:100 skills:2, 4, 8 key:fd3cdcb726 170 | ## Imputing missing values of LoanAmount 171 | 172 | There are multiple ways to fill the missing values of continuous variables. 
You can replace them with mean, median or estimate values based on other features of the data set. 173 | 174 | For the sake of simplicity, we would impute the missing values of LoanAmount by mean value (Mean of available values of LoanAmount). 175 | 176 | ```{python} 177 | train['LoanAmount'].fillna(train['LoanAmount'].mean(), inplace=True) 178 | ``` 179 | 180 | *** =instructions 181 | Impute missing values with a specific value 168 182 | 183 | 184 | 185 | 186 | 187 | *** =hint 188 | Use dataframe['missingcol'].fillna(225, inplace=True) to impute missing value of column 'missingcol' with 225 189 | 190 | 191 | *** =pre_exercise_code 192 | 193 | ```{python} 194 | 195 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 196 | 197 | # Import library pandas 198 | import pandas as pd 199 | 200 | # Import training file 201 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 202 | 203 | # Import testing file 204 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 205 | 206 | ``` 207 | 208 | *** =sample_code 209 | 210 | ```{python} 211 | 212 | # Impute missing value of LoanAmount with 168 for test data set 213 | test['LoanAmount'].fillna(______, inplace=True) 214 | 215 | ``` 216 | 217 | *** =solution 218 | 219 | ```{python} 220 | 221 | # Impute missing value of LoanAmount with 168 for test data set 222 | test['LoanAmount'].fillna(168, inplace=True) 223 | 224 | ``` 225 | 226 | *** =sct 227 | 228 | ```{python} 229 | # The sct section defines the Submission Correctness Tests (SCTs) used to 230 | # evaluate the student's response. All functions used here are defined in the 231 | # pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki 232 | 233 | # Impute missing value of LoanAmount with 168 for test data set 234 | test_data_frame("test", columns=["LoanAmount"], incorrect_msg='Did you impute missing value with 168?') 235 | success_msg("Great work!") 236 | ``` 237 | 238 | 239 | --- type:NormalExercise lang:python xp:100 skills:2, 4, 8 key:ca19896cae 240 | ## Impute missing values of SelfEmployed 241 | 242 | Similarly, to impute missing values of Categorical variables, we look at the frequency table. The simplest way is to impute with value which has highest frequency because there is a higher probability of success. 243 | 244 | For example, if you look at the distribution of SelfEmployed 500 out of 582 which is ~86% of total values falls under the category "No". Here we will replace missing values of SelfEmployed with "No". 245 | 246 | ```{python} 247 | train['Self_Employed'].fillna('No',inplace=True) 248 | ``` 249 | 250 | *** =instructions 251 | - Impute missing values with more frequent category of Gender and Credit History 252 | - Use value_counts() to check more frequent category of variable 253 | 254 | *** =hint 255 | - Male is more frequent in Gender 256 | - 1 is more frequent in Credit_History 257 | 258 | 259 | *** =pre_exercise_code 260 | 261 | ```{python} 262 | 263 | # The pre-exercise code runs code to initialize the user's workspace. 
You can use it for several things: 264 | 265 | # Import library pandas 266 | import pandas as pd 267 | 268 | # Import training file 269 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 270 | 271 | # Import testing file 272 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 273 | 274 | ``` 275 | 276 | *** =sample_code 277 | 278 | ```{python} 279 | 280 | # Impute missing value of Gender (Male is more frequent category) 281 | train['Gender'].fillna(_____,inplace=True) 282 | 283 | 284 | # Impute missing value of Credit_History ( 1 is more frequent category) 285 | train['Credit_History'].fillna(_____,inplace=True) 286 | 287 | ``` 288 | 289 | *** =solution 290 | 291 | ```{python} 292 | 293 | # Impute missing value of LoanAmount with median for test data set 294 | train['Gender'].fillna('Male',inplace=True) 295 | 296 | # Impute missing value of Credit_History 297 | train['Credit_History'].fillna(1,inplace=True) 298 | 299 | 300 | ``` 301 | 302 | *** =sct 303 | 304 | ```{python} 305 | # The sct section defines the Submission Correctness Tests (SCTs) used to 306 | # evaluate the student's response. All functions used here are defined in the 307 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 308 | 309 | # Impute missing value of LoanAmount with median for test data set 310 | test_data_frame("train", columns=["Gender"], incorrect_msg='Did you impute missing value of Gender with Male?') 311 | 312 | # Impute missing value of Credit_History 313 | test_data_frame("train", columns=["Credit_History"], incorrect_msg='Did you impute missing value of Credit_History with 1?') 314 | 315 | 316 | success_msg("Great work!") 317 | ``` 318 | --- type:NormalExercise lang:python xp:100 skills:2, 4, 8 key:2607b0ce32 319 | 320 | ## Treat / Tranform extreme values of LoanAmount and ApplicantIncome 321 | 322 | Let’s analyze LoanAmount first. 
The extreme values are practically possible, i.e. some people might apply for high-value loans due to specific needs. 323 | 324 | ```{python} 325 | train['LoanAmount'].hist(bins=20) 326 | ``` 327 |
328 | 329 | So instead of treating them as outliers, let’s try a log transformation to nullify their effect: 330 | 331 | ```{python} 332 | import numpy as np 333 | train['LoanAmount_log'] = np.log(train['LoanAmount']) 334 | train['LoanAmount_log'].hist(bins=20) 335 | ``` 336 |
337 | 338 | 339 | Now the distribution looks much closer to normal and effect of extreme values has been significantly subsided. 340 | 341 | *** =instructions 342 | - Add both ApplicantIncome and CoapplicantIncome as TotalIncome 343 | - Take log transformation of TotalIncome to deal with extreme values 344 | 345 | 346 | *** =hint 347 | - Add both train['ApplicantIncome'] and train['CoapplicantIncome'] 348 | - Take log of df['TotalIncome'] 349 | 350 | 351 | *** =pre_exercise_code 352 | 353 | ```{python} 354 | 355 | # The pre-exercise code runs code to initialize the user's workspace. You can use it for several things: 356 | 357 | # Import library pandas 358 | import pandas as pd 359 | import numpy as np 360 | 361 | # Import training file 362 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 363 | 364 | # Import testing file 365 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 366 | train['TotalIncome'] = train['ApplicantIncome'] + train['CoapplicantIncome'] 367 | 368 | ``` 369 | 370 | *** =sample_code 371 | 372 | ```{python} 373 | 374 | # Training and Testing datasets are loaded in variable train and test dataframe respectively 375 | 376 | # Add both ApplicantIncome and CoapplicantIncome to TotalIncome 377 | train['TotalIncome'] = train['ApplicantIncome'] + train[_________] 378 | 379 | # Perform log transformation of TotalIncome to make it closer to normal 380 | train['TotalIncome_log']= np.____(train['TotalIncome']) 381 | 382 | 383 | ``` 384 | 385 | *** =solution 386 | 387 | ```{python} 388 | 389 | # Training and Testing datasets are loaded in variable train and test dataframe respectively 390 | 391 | # Add both ApplicantIncome and CoapplicantIncome to TotalIncome 392 | train['TotalIncome'] = train['ApplicantIncome'] + train['CoapplicantIncome'] 393 | 394 | # Perform log transformation of TotalIncome to make it closer to normal 395 | train['TotalIncome_log'] = 
np.log(train['TotalIncome']) 396 | 397 | 398 | ``` 399 | 400 | *** =sct 401 | 402 | ```{python} 403 | # The sct section defines the Submission Correctness Tests (SCTs) used to 404 | # evaluate the student's response. All functions used here are defined in the 405 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 406 | 407 | # Add both ApplicantIncome and CoapplicantIncome to TotalIncome 408 | test_data_frame("train", columns=["TotalIncome"], incorrect_msg='Have you added both ApplicantIncome and CoapplicantIncome?') 409 | 410 | # Perform log transformation of TotalIncome to make it closer to normal 411 | test_data_frame("train", columns=["TotalIncome_log"], incorrect_msg='Have you taken log of TotalIncome?') 412 | 413 | success_msg("Great work!") 414 | ``` 415 | 416 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:9a8fd577a9 417 | ## iPython / Jupyter notebook for Data Exploration 418 | 419 | The Jupyter Notebook is a web application that allows you to create and share documents that contain live code, equations, visualizations and explanatory text. Uses include: data cleaning and transformation, numerical simulation, statistical modeling, machine learning and much more. 420 | 421 | We have shared the Jupyter notebook for your reference here 422 | 423 | ### Download the jupyter notebook from here. Have you downloaded the jupyter notebook? 424 | 425 | *** =instructions 426 | - Yes, I have downloaded the notebook 427 | - No, I am not able to 428 | 429 | *** =hint 430 | Click on the link and download the Jupyter notebook. 431 | 432 | *** =sct 433 | ```{python} 434 | # The sct section defines the Submission Correctness Tests (SCTs) used to 435 | # evaluate the student's response. All functions used here are defined in the 436 | # pythonwhat Python package 437 | 438 | msg1 = "Awesome! You can proceed to model building now!" 439 | msg2 = "Check the link provided and download the file from there." 
440 | 441 | # Use test_mc() to grade multiple choice exercises. 442 | # Pass the correct option (Action, option 2 in the instructions) to correct. 443 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 444 | test_mc(1, [msg1, msg2]) 445 | 446 | ``` 447 | -------------------------------------------------------------------------------- /chapter5.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Building a Predictive model in Python 3 | description : We build our predictive models and make submissions to the AV DataHack platform in this section. 4 | 5 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 6 key:9a8fd577a9 6 | ## First Step of Model Building 7 | 8 | In Python, Scikit-Learn (sklearn) is the most commonly used library for building predictive / machine learning models. This article provides a good overview of scikit-learn. It has gathered a lot of interest recently for model building. There are few pre-requisite before jumping into a model building exercise: 9 | 10 | * Treat missing values 11 | * Treat outlier/ exponential observation 12 | * All inputs must be numeric array ( Requirement of scikit learn library) 13 | 14 | 15 | ####Can we build a model without treating missing values of a data set? 16 | 17 | 18 | *** =instructions 19 | - True 20 | - False 21 | 22 | *** =hint 23 | Missing value tratment is mandatory step of model building 24 | 25 | 26 | *** =sct 27 | ```{python} 28 | # The sct section defines the Submission Correctness Tests (SCTs) used to 29 | # evaluate the student's response. All functions used here are defined in the 30 | # pythonwhat Python package 31 | 32 | msg_bad1 = "Think again - If the values are missing, how will you make a predictive model?" 33 | msg_success = "Yes! We should always treat missing value" 34 | 35 | # Use test_mc() to grade multiple choice exercises. 
36 | # Pass the correct option (Action, option 2 in the instructions) to correct. 37 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 38 | test_mc(2, [msg_bad1, msg_success]) 39 | ``` 40 | 41 | 42 | 43 | --- type:NormalExercise lang:python xp:100 skills:2, 6 key:2c1cf7aa90 44 | ## Label categories of Gender to number 45 | 46 | Library "Scikit Learn" only works with numeric array. Hence, we need to label all the character variables into a numeric array. For example Variable "Gender" has two labels "Male" and "Female". Hence, we will transform the labels to number as 1 for "Male" and 0 for "Female". 47 | 48 | "Scikit Learn" library has a module called "LabelEncoder" which helps to label character labels into numbers so first import module "LabelEncoder". 49 | 50 | ```{python} 51 | 52 | from sklearn.preprocessing import LabelEncoder 53 | 54 | number = LabelEncoder() 55 | 56 | train['Gender'] = number.fit_transform(train['Gender'].astype(str)) 57 | 58 | ``` 59 | 60 | *** =instructions 61 | Perform Label encoding for categories of variable "Married" and save it as a new variable "Married_new" in the DataFrame 62 | 63 | 64 | *** =hint 65 | Use number.fit_transform() to perform label encoding 66 | 67 | 68 | *** =pre_exercise_code 69 | 70 | ```{python} 71 | 72 | # The pre exercise code runs code to initialize the user's workspace. 
You can use it for several things: 73 | 74 | # Import library pandas 75 | import pandas as pd 76 | import numpy as np 77 | from sklearn.preprocessing import LabelEncoder 78 | 79 | # Import training file 80 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 81 | 82 | # Import testing file 83 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 84 | 85 | ``` 86 | 87 | *** =sample_code 88 | 89 | ```{python} 90 | 91 | #import module for label encoding 92 | from sklearn.preprocessing import LabelEncoder 93 | 94 | #train and test dataset is already loaded in the enviornment 95 | # Perform label encoding for variable 'Married' 96 | number = LabelEncoder() 97 | train['Married_new'] = number.________(train['Married'].astype(str)) 98 | 99 | 100 | ``` 101 | 102 | *** =solution 103 | 104 | ```{python} 105 | 106 | #import module for label encoding 107 | from sklearn.preprocessing import LabelEncoder 108 | 109 | #train and test dataset is already loaded in the enviornment 110 | # Perform label encoding for variable 'Married' 111 | number = LabelEncoder() 112 | train['Married_new'] = number.fit_transform(train['Married'].astype(str)) 113 | ``` 114 | 115 | *** =sct 116 | 117 | ```{python} 118 | # The sct section defines the Submission Correctness Tests (SCTs) used to 119 | # evaluate the student's response. All functions used here are defined in the 120 | # pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki 121 | 122 | # Perform label encoding for Married 123 | test_data_frame("train", columns=["Married"], incorrect_msg='Have you used write methds to perform label encoding for variable Married?') 124 | 125 | success_msg("Great work!") 126 | ``` 127 | 128 | 129 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 6 key:ee5ed17633 130 | ## Selecting the right algorithm 131 | 132 | The basic principle behind selecting the right algorithm is to look at the dependent variable (or target variable). In this challenge "Loan Prediction", we need to classify a customer's eligibility for Loan as "Y" or "N" based on the available information about the customer. Here the dependent variable is categorical and our task is to classify the customer in two groups; eligible for the loan amount and not eligible for the loan amount. 133 | 134 | This is a classification challenge so we will import module of classification algorithms of sklearn library. Below are some commonly used classification algorithms: 135 | * Logistic Regression 136 | * Decision Tree 137 | * Random Forest 138 | 139 | 140 | ####Whether an e-mail is spam or not? Is this problem a classification challenge or regression? 141 | 142 | 143 | *** =instructions 144 | - Classification 145 | - Regression 146 | 147 | *** =hint 148 | - Regression: When we model for continuous variables 149 | - Classification: When we model to classify in different categories 150 | 151 | 152 | 153 | 154 | *** =sct 155 | ```{python} 156 | # The sct section defines the Submission Correctness Tests (SCTs) used to 157 | # evaluate the student's response. All functions used here are defined in the 158 | # pythonwhat Python package 159 | 160 | msg_bad1 = "Try again. Regression challenges require you to predict a quantity, while classification challenge requires you to classify an object in groups." 
161 | msg_success = "Correct - this is a classification challenge" 162 | 163 | # Use test_mc() to grade multiple choice exercises. 164 | # Pass the correct option (Action, option 2 in the instructions) to correct. 165 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 166 | test_mc(1, [msg_success, msg_bad1]) 167 | ``` 168 | 169 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 6 key:bd9b384210 170 | ## Have you performed data preprocessing step? 171 | 172 | As discussed before, you should perform some data pre processing steps for both train and test dataset before jumping into model building exercise. Here are a few things you need to perform at the minimum: 173 | * Missing value imputation 174 | * Outlier treatment 175 | * Label encoding for character variables 176 | * Algorithm selection 177 | 178 | 179 | ####Which of the following steps have you performed till now with both train and test data set? 180 | 181 | 182 | *** =instructions 183 | - Impute missing values of all variables 184 | - Treat outlier and influential observations 185 | - Label encoding for character variables 186 | - All of the above 187 | 188 | *** =hint 189 | All steps are necessary and would impact your model performance 190 | 191 | 192 | 193 | 194 | *** =sct 195 | ```{python} 196 | # The sct section defines the Submission Correctness Tests (SCTs) used to 197 | # evaluate the student's response. All functions used here are defined in the 198 | # pythonwhat Python package 199 | 200 | msg_bad1 = "You should perform all pre processing steps before model building" 201 | msg_success = "Great! Go ahead with modeling exercise" 202 | 203 | # Use test_mc() to grade multiple choice exercises. 204 | # Pass the correct option (Action, option 2 in the instructions) to correct. 205 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 
206 | test_mc(4, [msg_bad1, msg_bad1, msg_bad1, msg_success ]) 207 | ``` 208 | 209 | --- type:NormalExercise lang:python xp:100 skills:2, 6 key:f4c3fbee79 210 | 211 | ## Logistic Regression Introduction 212 | 213 | Logistic Regression is a classification algorithm. It is used to predict a binary outcome (1 / 0, Yes / No, True / False) given a set of independent variables. To represent binary / categorical outcome, we use dummy variables. You can also think of logistic regression as a special case of linear regression when the outcome variable is categorical, where we are using log of odds as the dependent variable. 214 | 215 | In simple words, it predicts the probability of occurrence of an event by fitting data to a logit function, read more about Logistic Regression . 216 | 217 | LogisticRegression() function is part of linear_model module of sklearn and is used to create logistic regression 218 | 219 | Reference: Mathematical working and implementation from scratch for Logistic regression. 220 | 221 | *** =instructions 222 | - Import Linear model of sklearn 223 | - Create object of sklearn.linear_model.LogisticRegression 224 | 225 | 226 | *** =hint 227 | You can import a module of a library as import library.module 228 | 229 | *** =pre_exercise_code 230 | 231 | ```{python} 232 | import sklearn.linear_model 233 | ``` 234 | 235 | *** =sample_code 236 | 237 | ```{python} 238 | 239 | # Import linear model of sklearn 240 | import ______.linear_model 241 | 242 | # Create object of Logistic Regression 243 | model=sklearn.______.LogisticRegression() 244 | 245 | ``` 246 | 247 | *** =solution 248 | 249 | ```{python} 250 | # Import linear model of sklearn 251 | import sklearn.linear_model 252 | 253 | # Create object of Logistic Regression 254 | model=sklearn.linear_model.LogisticRegression() 255 | 256 | ``` 257 | 258 | *** =sct 259 | 260 | ```{python} 261 | # The sct section defines the Submission Correctness Tests (SCTs) used to 262 | # evaluate the student's response. 
All functions used here are defined in the 263 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 264 | 265 | # Test for library import 266 | test_import("sklearn.linear_model", same_as = False) 267 | 268 | # Test for logistic regression 269 | test_function("sklearn.linear_model.LogisticRegression", incorrect_msg='Have you created Logistic Regression object from linear model module of sklearn?') 270 | 271 | success_msg("Great work!") 272 | ``` 273 | 274 | --- type:NormalExercise lang:python xp:100 skills:2 key:6eb60851bc 275 | 276 | ## Build your first logistic regression model 277 | 278 | Let’s build our first Logistic Regression model. One way would be to take all the variables into the model, but this might result in overfitting (don’t worry if you’re unaware of this terminology yet). In simple words, taking all variables might result in the model understanding complex relations specific to the data and will not generalize well. 279 | 280 | We can easily make some intuitive hypothesis to set the ball rolling. The chances of getting a loan will be higher for: 281 | 282 | * Applicants having a credit history 283 | * Applicants with higher applicant and co-applicant income 284 | * Applicants with higher education level 285 | * Properties in urban areas with high growth perspectives 286 | 287 | Ok, time for you to build your first logistics regression model! The pre processed train_modified and test_modifed data are available in your workspace. 
288 | 289 | *** =instructions 290 | - Store input variable in a list "predictors" 291 | - Create an object of logistic regression 292 | 293 | 294 | 295 | *** =hint 296 | Use list ['Credit_History','Education','Gender'] as predictor variable 297 | 298 | *** =pre_exercise_code 299 | 300 | ```{python} 301 | import pandas as pd 302 | import numpy as np 303 | from sklearn.preprocessing import LabelEncoder 304 | 305 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 306 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 307 | 308 | #Combining both train and test dataset 309 | 310 | train['Type']='Train' #Create a flag for Train and Test Data set 311 | test['Type']='Test' 312 | fullData = pd.concat([train,test],axis=0) 313 | 314 | #Identify categorical and continuous variables 315 | 316 | ID_col = ['Loan_ID'] 317 | target_col = ["Loan_Status"] 318 | cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed'] 319 | 320 | other_col=['Type'] #Test and Train Data set identifier 321 | num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(other_col)) 322 | 323 | #Imputing Missing values with mean for continuous variable 324 | fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean(),inplace=True) 325 | 326 | 327 | #Imputing Missing values with mode for categorical variables 328 | cat_imput=pd.Series(fullData[cat_cols].mode().values[0]) 329 | cat_imput.index=cat_cols 330 | fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput,inplace=True) 331 | 332 | #Create a new column as Total Income 333 | fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome'] 334 | 335 | #Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists 336 | fullData['Log_TotalIncome']=np.log(fullData['TotalIncome']) 337 | 338 | #create label encoders for categorical features 
339 | for var in cat_cols: 340 | number = LabelEncoder() 341 | fullData[var] = number.fit_transform(fullData[var].astype('str')) 342 | 343 | train_modified=fullData[fullData['Type']=='Train'] 344 | test_modified=fullData[fullData['Type']=='Test'] 345 | train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str')) 346 | ``` 347 | 348 | *** =sample_code 349 | 350 | ```{python} 351 | 352 | #train_modified and test_modified already loaded in the workspace 353 | #Import module for Logistic regression 354 | import sklearn.linear_model 355 | 356 | # Select three predictors Credit_History, Education and Gender 357 | predictors =[____,_____,_____] 358 | 359 | # Converting predictors and outcome to numpy array 360 | x_train = train_modified[predictors].values 361 | y_train = train_modified['Loan_Status'].values 362 | 363 | # Model Building 364 | model = sklearn.________.LogisticRegression() 365 | model.fit(x_train, y_train) 366 | 367 | ``` 368 | 369 | *** =solution 370 | 371 | ```{python} 372 | # Import module for Logistic regression 373 | import sklearn.linear_model 374 | 375 | # Select three predictors Credit_History, Education and Gender 376 | predictors =['Credit_History','Education','Gender'] 377 | 378 | # Converting predictors and outcome to numpy array 379 | x_train = train_modified[predictors].values 380 | y_train = train_modified['Loan_Status'].values 381 | 382 | # Model Building 383 | model = sklearn.linear_model.LogisticRegression() 384 | model.fit(x_train, y_train) 385 | 386 | ``` 387 | 388 | *** =sct 389 | 390 | ```{python} 391 | # The sct section defines the Submission Correctness Tests (SCTs) used to 392 | # evaluate the student's response. All functions used here are defined in the 393 | # pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki 394 | 395 | # Test for predictor selection 396 | test_object("predictors", incorrect_msg='Have you created the list of given predictors variables?') 397 | 398 | # Test for model 399 | test_function("sklearn.linear_model.LogisticRegression", incorrect_msg='Have you created Logistic Regression object from linear_model module of sklearn?') 400 | 401 | success_msg("Great work!") 402 | ``` 403 | 404 | 405 | 406 | --- type:NormalExercise lang:python xp:100 skills:2, 6 key:207a5629cc 407 | 408 | ## Prediction and submission to DataHack 409 | 410 | To upload a submission to DataHack, you need to predict the loan approval rate for the observations in the test set. This can be done using ".predict()" method with logistic regression object (model). To extract the test features we will need to create a numpy array of input features of test data set in the same way as we did when training the model for training data. 411 | 412 | Next, you need to make sure your output is in line with the submission requirements of DataHack: a csv file with exactly 367 entries and two columns: Loan_ID and Loan_Status. Then create a csv file using to_csv() method from Pandas. 
413 | 414 | 415 | *** =instructions 416 | - Store input variable in list "predictors" 417 | - Use .predict() method for prediction 418 | 419 | 420 | *** =hint 421 | Use model.predict(x_test) for prediction of test dataset 422 | 423 | *** =pre_exercise_code 424 | 425 | ```{python} 426 | import pandas as pd 427 | import numpy as np 428 | from sklearn.preprocessing import LabelEncoder 429 | 430 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 431 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 432 | 433 | #Combining both train and test dataset 434 | 435 | train['Type']='Train' #Create a flag for Train and Test Data set 436 | test['Type']='Test' 437 | fullData = pd.concat([train,test],axis=0) 438 | 439 | #Identify categorical and continuous variables 440 | 441 | ID_col = ['Loan_ID'] 442 | target_col = ["Loan_Status"] 443 | cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed'] 444 | 445 | other_col=['Type'] #Test and Train Data set identifier 446 | num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(other_col)) 447 | 448 | #Imputing Missing values with mean for continuous variable 449 | fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean(),inplace=True) 450 | 451 | 452 | #Imputing Missing values with mode for categorical variables 453 | cat_imput=pd.Series(fullData[cat_cols].mode().values[0]) 454 | cat_imput.index=cat_cols 455 | fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput,inplace=True) 456 | 457 | #Create a new column as Total Income 458 | 459 | fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome'] 460 | 461 | #Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists 462 | fullData['Log_TotalIncome']=np.log(fullData['TotalIncome']) 463 | 464 | #create label encoders for categorical features 465 | for 
var in cat_cols: 466 | number = LabelEncoder() 467 | fullData[var] = number.fit_transform(fullData[var].astype('str')) 468 | 469 | train_modified=fullData[fullData['Type']=='Train'] 470 | test_modified=fullData[fullData['Type']=='Test'] 471 | train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str')) 472 | 473 | # Import module for Logistic regression 474 | from sklearn.linear_model import LogisticRegression 475 | 476 | # Select three predictors Credit_History, Education and Gender 477 | predictors =['Credit_History','Education','Gender'] 478 | 479 | # Converting predictors and outcome to numpy array 480 | x_train = train_modified[predictors].values 481 | y_train = train_modified['Loan_Status'].values 482 | 483 | # Model Building 484 | model = LogisticRegression() 485 | model.fit(x_train, y_train) 486 | ``` 487 | 488 | *** =sample_code 489 | 490 | ```{python} 491 | 492 | #test_modified already loaded in the workspace 493 | 494 | # Select three predictors Credit_History, Education and Gender 495 | predictors =[____,_____,_____] 496 | 497 | # Converting predictors and outcome to numpy array 498 | x_test = test_modified[predictors].values 499 | 500 | #Predict Output 501 | predicted= model._____(x_test) 502 | 503 | #Reverse encoding for predicted outcome 504 | predicted = number.inverse_transform(predicted) 505 | 506 | #Store it to test dataset 507 | test_modified['Loan_Status']=predicted 508 | 509 | #Output file to make submission 510 | test_modified.to_csv("Submission1.csv",columns=['Loan_ID','Loan_Status']) 511 | 512 | ``` 513 | 514 | *** =solution 515 | 516 | ```{python} 517 | #test_modified already loaded in the workspace 518 | 519 | # Select three predictors Credit_History, Education and Gender 520 | predictors =['Credit_History','Education','Gender'] 521 | 522 | # Converting predictors and outcome to numpy array 523 | x_test = test_modified[predictors].values 524 | 525 | #Predict Output 526 | predicted= 
model.predict(x_test) 527 | 528 | #Reverse encoding for predicted outcome 529 | predicted = number.inverse_transform(predicted) 530 | 531 | #Store it to test dataset 532 | test_modified['Loan_Status']=predicted 533 | 534 | #Output file to make submission 535 | test_modified.to_csv("Submission1.csv",columns=['Loan_ID','Loan_Status']) 536 | 537 | ``` 538 | 539 | *** =sct 540 | 541 | ```{python} 542 | # The sct section defines the Submission Correctness Tests (SCTs) used to 543 | # evaluate the student's response. All functions used here are defined in the 544 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 545 | 546 | # Test for predictor selection 547 | test_object("predictors", incorrect_msg='Have you create the list of given predictors variables?') 548 | 549 | # Test for model 550 | test_object("predicted", incorrect_msg='Have you used .predict() method?') 551 | 552 | success_msg("Great work!") 553 | ``` 554 | 555 | --- type:NormalExercise lang:python xp:100 skills:2, 6 key:0f04d6b3e1 556 | 557 | ## Decision Tree Introduction 558 | 559 | Decision trees are mostly used in classification problems. It works for both categorical and continuous input and output variables. In this technique, we split the population or sample into two or more homogeneous sets (or sub-populations) based on most significant splitter / differentiator in input variables, read more about Decision Tree . 
560 | 561 | 562 | *** =instructions 563 | - Import tree module of sklearn 564 | - Create a object of DecisionTreeClassifier 565 | 566 | 567 | *** =hint 568 | Use DecisiontreeClassifier() with sklearn.tree to create object of decision tree 569 | 570 | *** =pre_exercise_code 571 | 572 | ```{python} 573 | from sklearn.tree import DecisionTreeClassifier 574 | 575 | ``` 576 | 577 | *** =sample_code 578 | 579 | ```{python} 580 | 581 | # Import tree module of sklearn 582 | import sklearn._____ 583 | 584 | # Create object of DecisionTreeClassifier 585 | model = sklearn.tree.__________() 586 | 587 | ``` 588 | 589 | *** =solution 590 | 591 | ```{python} 592 | # Import tree module of sklearn 593 | import sklearn.tree 594 | 595 | # Create object of DecisionTreeClassifier 596 | model = sklearn.tree.DecisionTreeClassifier() 597 | 598 | ``` 599 | 600 | *** =sct 601 | 602 | ```{python} 603 | # The sct section defines the Submission Correctness Tests (SCTs) used to 604 | # evaluate the student's response. All functions used here are defined in the 605 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 606 | 607 | # Test for library import 608 | test_import("sklearn.tree", same_as = False) 609 | 610 | # Test for logistic regression 611 | test_function("sklearn.tree.DecisionTreeClassifier", incorrect_msg='Have you created DecisionTree object from tree module of sklearn?') 612 | 613 | success_msg("Great work!") 614 | ``` 615 | 616 | 617 | 618 | --- type:NormalExercise lang:python xp:100 skills:2, 4, 6 key:dcf5c3e2c2 619 | 620 | ## Train model and do prediction using Decision Tree 621 | 622 | Let’s make first Decision Tree model. Similar to Logistic regression, we first select the input features, train our model and finally perform prediction on test data set. 623 | 624 | Ok! time for you to build your first Decision Tree model! The pre processed train_modified and test_modifed data are available in your workspace. 
625 | 626 | 627 | *** =instructions 628 | - Store input variable in list "predictors" 629 | - Create a object of DecisionTreeClassifier 630 | - Do prediction for test data set 631 | - Export test prediction to csv file 632 | 633 | 634 | *** =hint 635 | - Use predictors =['Credit_History','Education','Gender'] as predictor variable 636 | - Use DecisionTreeClassifier with sklearn.tree to create decision tree object 637 | - Use to_csv() with dataframe to export csv file 638 | 639 | 640 | *** =pre_exercise_code 641 | 642 | ```{python} 643 | import pandas as pd 644 | import numpy as np 645 | from sklearn.preprocessing import LabelEncoder 646 | import sklearn.tree 647 | 648 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 649 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 650 | 651 | #Combining both train and test dataset 652 | 653 | train['Type']='Train' #Create a flag for Train and Test Data set 654 | test['Type']='Test' 655 | fullData = pd.concat([train,test],axis=0) 656 | 657 | #Identify categorical and continuous variables 658 | 659 | ID_col = ['Loan_ID'] 660 | target_col = ["Loan_Status"] 661 | cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed'] 662 | 663 | other_col=['Type'] #Test and Train Data set identifier 664 | num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(other_col)) 665 | 666 | #Imputing Missing values with mean for continuous variable 667 | fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean(),inplace=True) 668 | 669 | 670 | #Imputing Missing values with mode for categorical variables 671 | cat_imput=pd.Series(fullData[cat_cols].mode().values[0]) 672 | cat_imput.index=cat_cols 673 | fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput,inplace=True) 674 | 675 | #Create a new column as Total Income 676 | 
fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome'] 677 | 678 | #Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists 679 | fullData['Log_TotalIncome']=np.log(fullData['TotalIncome']) 680 | 681 | #create label encoders for categorical features 682 | for var in cat_cols: 683 | number = LabelEncoder() 684 | fullData[var] = number.fit_transform(fullData[var].astype('str')) 685 | 686 | train_modified=fullData[fullData['Type']=='Train'] 687 | test_modified=fullData[fullData['Type']=='Test'] 688 | train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str')) 689 | ``` 690 | 691 | *** =sample_code 692 | 693 | ```{python} 694 | 695 | #train_modified and test_modified already loaded in the workspace 696 | #Import module for Decision tree 697 | import sklearn.tree 698 | 699 | # Select three predictors Credit_History, Education and Gender 700 | predictors =[____,_____,_____] 701 | 702 | # Converting predictors and outcome to numpy array 703 | x_train = train_modified[predictors].values 704 | y_train = train_modified['Loan_Status'].values 705 | 706 | # Model Building 707 | model = sklearn._____.DecisionTreeClassifier() 708 | model.fit(x_train, y_train) 709 | 710 | # Converting predictors and outcome to numpy array 711 | x_test = test_modified[predictors].values 712 | 713 | #Predict Output 714 | predicted= model._____(x_test) 715 | 716 | #Reverse encoding for predicted outcome 717 | predicted = number.inverse_transform(predicted) 718 | 719 | #Store it to test dataset 720 | test_modified['Loan_Status']=predicted 721 | 722 | #Output file to make submission 723 | test_modified.______("Submission1.csv",columns=['Loan_ID','Loan_Status']) 724 | 725 | 726 | ``` 727 | 728 | *** =solution 729 | 730 | ```{python} 731 | #train_modified and test_modified already loaded in the workspace 732 | #Import module for Decision tree 733 | import sklearn.tree 734 | 735 | # Select three predictors 
Credit_History, Education and Gender 736 | predictors =['Credit_History','Education','Gender'] 737 | 738 | # Converting predictors and outcome to numpy array 739 | x_train = train_modified[predictors].values 740 | y_train = train_modified['Loan_Status'].values 741 | 742 | # Model Building 743 | model = sklearn.tree.DecisionTreeClassifier() 744 | model.fit(x_train, y_train) 745 | 746 | # Converting predictors and outcome to numpy array 747 | x_test = test_modified[predictors].values 748 | 749 | #Predict Output 750 | predicted= model.predict(x_test) 751 | 752 | #Reverse encoding for predicted outcome 753 | predicted = number.inverse_transform(predicted) 754 | 755 | #Store it to test dataset 756 | test_modified['Loan_Status']=predicted 757 | 758 | #Output file to make submission 759 | test_modified.to_csv("Submission1.csv",columns=['Loan_ID','Loan_Status']) 760 | 761 | 762 | ``` 763 | 764 | *** =sct 765 | 766 | ```{python} 767 | # The sct section defines the Submission Correctness Tests (SCTs) used to 768 | # evaluate the student's response. All functions used here are defined in the 769 | # pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki 770 | 771 | # Test for predictor selection 772 | test_object("predictors", incorrect_msg='Have you create the list of given predictors variables?') 773 | 774 | # Test for model 775 | test_function("sklearn.tree.DecisionTreeClassifier", incorrect_msg='Have you created DecisionTree object from tree module of sklearn?') 776 | 777 | # Test for predicted 778 | test_object("predicted", incorrect_msg='Have you used .predict() method?') 779 | 780 | 781 | # Test for csv import 782 | test_function("test_modified.to_csv", incorrect_msg='Have you used the right function to export a csv file?') 783 | 784 | success_msg("Great work!") 785 | ``` 786 | 787 | 788 | 789 | 790 | 791 | --- type:NormalExercise lang:python xp:100 skills:2, 6 key:ff4ced6565 792 | 793 | ## Random Forest Introduction 794 | 795 | Random Forest is a versatile machine learning method capable of performing both regression and classification tasks. It also undertakes dimensional reduction methods, treats missing values, outlier values and other essential steps of data exploration, and does a fairly good job. It is a type of ensemble learning method, where a group of weak models combine to form a powerful model, read more about Random Forest . 
796 | 797 | 798 | *** =instructions 799 | - Import library sklearn.ensemble 800 | - Create a object of RandomForestClassifier 801 | 802 | 803 | *** =hint 804 | Use RandomForestClassifier() with sklearn.ensemble to create object of Random Forest 805 | 806 | 807 | *** =pre_exercise_code 808 | 809 | ```{python} 810 | import sklearn.ensemble 811 | ``` 812 | 813 | *** =sample_code 814 | 815 | ```{python} 816 | 817 | # Import ensemble module from sklearn 818 | import sklearn.______ 819 | 820 | # Create object of RandomForestClassifier 821 | model=sklearn.ensemble.__________ 822 | 823 | ``` 824 | 825 | *** =solution 826 | 827 | ```{python} 828 | # Import ensemble module from sklearn 829 | import sklearn.ensemble 830 | 831 | # Create object of RandomForestClassifier 832 | model=sklearn.ensemble.RandomForestClassifier() 833 | 834 | ``` 835 | 836 | *** =sct 837 | 838 | ```{python} 839 | # The sct section defines the Submission Correctness Tests (SCTs) used to 840 | # evaluate the student's response. All functions used here are defined in the 841 | # pythonwhat Python package. Documentation can also be found at github.com/datacamp/pythonwhat/wiki 842 | 843 | # Test for library import 844 | test_import("sklearn.ensemble", same_as = False) 845 | 846 | # Test for logistic regression 847 | test_function("sklearn.ensemble.RandomForestClassifier", incorrect_msg='Have you created RandomForest object from ensemble module of sklearn?') 848 | 849 | success_msg("Great work!") 850 | ``` 851 | 852 | 853 | --- type:NormalExercise lang:python xp:100 skills:2, 6 key:f0d1f62bb1 854 | 855 | ## Train model and do prediction using Random Forest 856 | 857 | Let’s make first Random Forest model. Similar to Logistic regression and Decision Tree, here we also first select the input features, train model and finally perform prediction on test data set. 858 | 859 | Ok, time for you to build your first Random Forest model! 
The preprocessed train_modified and test_modified data are available in your workspace.
fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome'] 910 | 911 | #Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists 912 | fullData['Log_TotalIncome']=np.log(fullData['TotalIncome']) 913 | 914 | #create label encoders for categorical features 915 | for var in cat_cols: 916 | number = LabelEncoder() 917 | fullData[var] = number.fit_transform(fullData[var].astype('str')) 918 | 919 | train_modified=fullData[fullData['Type']=='Train'] 920 | test_modified=fullData[fullData['Type']=='Test'] 921 | train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str')) 922 | ``` 923 | 924 | *** =sample_code 925 | 926 | ```{python} 927 | 928 | #train_modified and test_modified already loaded in the workspace 929 | #Import module for Random Forest 930 | import sklearn.ensemble 931 | 932 | # Select three predictors Credit_History, Education and Gender 933 | predictors =['Credit_History','Education','Gender'] 934 | 935 | # Converting predictors and outcome to numpy array 936 | x_train = train_modified[predictors].values 937 | y_train = train_modified['Loan_Status'].values 938 | 939 | # Model Building 940 | model = sklearn.ensemble._______ 941 | model.fit(x_train, y_train) 942 | 943 | # Converting predictors and outcome to numpy array 944 | x_test = test_modified[predictors].values 945 | 946 | #Predict Output 947 | predicted= model.______(x_test) 948 | 949 | #Reverse encoding for predicted outcome 950 | predicted = number.inverse_transform(predicted) 951 | 952 | #Store it to test dataset 953 | test_modified['Loan_Status']=predicted 954 | 955 | #Output file to make submission 956 | test_modified._____("Submission1.csv",columns=['Loan_ID','Loan_Status']) 957 | 958 | 959 | ``` 960 | 961 | *** =solution 962 | 963 | ```{python} 964 | #train_modified and test_modified already loaded in the workspace 965 | #Import module for Random Forest 966 | import sklearn.ensemble 967 | 968 | # Select three 
predictors Credit_History, Education and Gender 969 | predictors =['Credit_History','Education','Gender'] 970 | 971 | # Converting predictors and outcome to numpy array 972 | x_train = train_modified[predictors].values 973 | y_train = train_modified['Loan_Status'].values 974 | 975 | # Model Building 976 | model = sklearn.ensemble.RandomForestClassifier() 977 | model.fit(x_train, y_train) 978 | 979 | # Converting predictors and outcome to numpy array 980 | x_test = test_modified[predictors].values 981 | 982 | #Predict Output 983 | predicted= model.predict(x_test) 984 | 985 | #Reverse encoding for predicted outcome 986 | predicted = number.inverse_transform(predicted) 987 | 988 | #Store it to test dataset 989 | test_modified['Loan_Status']=predicted 990 | 991 | #Output file to make submission 992 | test_modified.to_csv("Submission1.csv",columns=['Loan_ID','Loan_Status']) 993 | 994 | 995 | ``` 996 | 997 | *** =sct 998 | 999 | ```{python} 1000 | # The sct section defines the Submission Correctness Tests (SCTs) used to 1001 | # evaluate the student's response. All functions used here are defined in the 1002 | # pythonwhat Python package. 
Documentation can also be found at github.com/datacamp/pythonwhat/wiki 1003 | 1004 | # Test for model 1005 | test_function("sklearn.ensemble.RandomForestClassifier", incorrect_msg='Have you created RandomForest object from ensemble module of sklearn?') 1006 | 1007 | # Test for predicted 1008 | test_object("predicted", incorrect_msg='Have you used .predict() method?') 1009 | 1010 | 1011 | # Test for csv import 1012 | test_function("test_modified.to_csv", incorrect_msg='Have you used the right function to export a csv file?') 1013 | 1014 | success_msg("Great work!") 1015 | 1016 | ``` 1017 | 1018 | 1019 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 6 key:4621632d2a 1020 | ## Selecting important variables for model building 1021 | 1022 | One of the benefits of Random forest is the power of handle large data set with higher dimensionality. It can handle thousands of input variables and identify most significant variables so it is considered as one of the dimensionality reduction methods. Further, the model outputs the importance of the variables, which can be a very handy feature. 1023 | 1024 | ```{python} 1025 | 1026 | featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False) 1027 | 1028 | print (featimp) 1029 | 1030 | ``` 1031 | I have selected all the features available in the train data set and model it using random forest: 1032 | 1033 | ```{python} 1034 | predictors=['ApplicantIncome', 'CoapplicantIncome', 'Credit_History','Dependents', 'Education', 'Gender', 'LoanAmount', 1035 | 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'TotalIncome','Log_TotalIncome'] 1036 | 1037 | 1038 | ``` 1039 | 1040 | Run feature importance command and identify Which variable has the highest impact on the model?? 
1041 | 1042 | 1043 | *** =instructions 1044 | - LoanAmount 1045 | - Dependents 1046 | - Gender 1047 | - Education 1048 | 1049 | *** =hint 1050 | Run feature importance command 1051 | 1052 | *** =pre_exercise_code 1053 | ```{python} 1054 | import pandas as pd 1055 | import numpy as np 1056 | from sklearn.preprocessing import LabelEncoder 1057 | 1058 | train = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv") 1059 | test = pd.read_csv("https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv") 1060 | 1061 | #Combining both train and test dataset 1062 | 1063 | train['Type']='Train' #Create a flag for Train and Test Data set 1064 | test['Type']='Test' 1065 | fullData = pd.concat([train,test],axis=0) 1066 | 1067 | #Identify categorical and continuous variables 1068 | 1069 | ID_col = ['Loan_ID'] 1070 | target_col = ["Loan_Status"] 1071 | cat_cols = ['Credit_History','Dependents','Gender','Married','Education','Property_Area','Self_Employed'] 1072 | 1073 | other_col=['Type'] #Test and Train Data set identifier 1074 | num_cols= list(set(list(fullData.columns))-set(cat_cols)-set(ID_col)-set(target_col)-set(other_col)) 1075 | 1076 | #Imputing Missing values with mean for continuous variable 1077 | fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean(),inplace=True) 1078 | 1079 | 1080 | #Imputing Missing values with mode for categorical variables 1081 | cat_imput=pd.Series(fullData[cat_cols].mode().values[0]) 1082 | cat_imput.index=cat_cols 1083 | fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput,inplace=True) 1084 | 1085 | #Create a new column as Total Income 1086 | 1087 | fullData['TotalIncome']=fullData['ApplicantIncome']+fullData['CoapplicantIncome'] 1088 | 1089 | #Take a log of TotalIncome + 1, adding 1 to deal with zeros of TotalIncome it it exists 1090 | fullData['Log_TotalIncome']=np.log(fullData['TotalIncome']) 1091 | 1092 | #create label encoders for categorical features 1093 | for var in 
cat_cols: 1094 | number = LabelEncoder() 1095 | fullData[var] = number.fit_transform(fullData[var].astype('str')) 1096 | 1097 | train_modified=fullData[fullData['Type']=='Train'] 1098 | test_modified=fullData[fullData['Type']=='Test'] 1099 | train_modified["Loan_Status"] = number.fit_transform(train_modified["Loan_Status"].astype('str')) 1100 | 1101 | # Import module for Random Forest classifier 1102 | from sklearn.ensemble import RandomForestClassifier 1103 | 1104 | # Select three predictors Credit_History, LoanAmount and Log_TotalIncome 1105 | predictors=['ApplicantIncome', 'CoapplicantIncome', 'Credit_History','Dependents', 'Education', 'Gender', 'LoanAmount', 1106 | 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'TotalIncome','Log_TotalIncome'] 1107 | 1108 | # Converting predictors and outcome to numpy array 1109 | x_train = train_modified[predictors].values 1110 | y_train = train_modified['Loan_Status'].values 1111 | x_test = test_modified[predictors].values 1112 | 1113 | # Model Building 1114 | model = RandomForestClassifier() 1115 | model.fit(x_train, y_train) 1116 | 1117 | ``` 1118 | 1119 | 1120 | *** =sct 1121 | ```{python} 1122 | # The sct section defines the Submission Correctness Tests (SCTs) used to 1123 | # evaluate the student's response. All functions used here are defined in the 1124 | # pythonwhat Python package 1125 | 1126 | msg_bad = "That is not correct!" 1127 | msg_success = "You got it right!" 1128 | 1129 | # Use test_mc() to grade multiple choice exercises. 1130 | # Pass the correct option (Action, option 2 in the instructions) to correct. 1131 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 
1132 | test_mc(1, [msg_success, msg_bad, msg_bad, msg_bad]) 1133 | ``` 1134 | -------------------------------------------------------------------------------- /chapter6.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Expert advice to improve model performance 3 | description : This chapter will help to understand the approach of data science experts, "How they do approach a challenge?", "How to select a right algorithm?", "How to combine outputs of multiple algorithms?" and "How to select the right value of model parameter also known as parameter tuning?". 4 | 5 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:9a8fd577a9 6 | ## How to approach a challenge? 7 | 8 | The model development cycle goes through various stages, starting from data collection to model building. Most of us admit that data exploration needs more attention to unleashing the hidden story of data but before exploring the data to understand relationships (in variables), It’s always recommended to perform hypothesis generation. (To know more about hypothesis generation, refer to this link). 9 | 10 | It is important that you spend time thinking about the given problem and gaining the domain knowledge. So, how does it help? 11 | 12 | This practice usually helps in building better features later on, which are not biased by the data available in the dataset. This is a crucial step which usually improves a model’s accuracy. 13 | 14 | At this stage, you are expected to apply structured thinking to the problem i.e. a thinking process which takes into consideration all the possible aspects of a particular problem. 15 | 16 | 17 | ####Which of the following has the right order of model building life cycle? 
18 | 19 | 20 | *** =instructions 21 | - Data Collection --> Data Exploration --> Hypothesis Generation --> Model Building --> Prediction 22 | - Data Collection --> Hypothesis Generation --> Data Exploration --> Model Building --> Prediction 23 | - Hypothesis Generation --> Data Collection --> Data Exploration --> Model Building --> Prediction 24 | 25 | *** =hint 26 | Always perform hypothesis generation before data collection and exploration, it also helps you to collect right data 27 | 28 | 29 | 30 | 31 | *** =sct 32 | ```{python} 33 | # The sct section defines the Submission Correctness Tests (SCTs) used to 34 | # evaluate the student's response. All functions used here are defined in the 35 | # pythonwhat Python package 36 | 37 | msg_bad1 = "Think again!" 38 | msg_success = "Exactly! we always do Hypothesis generation before data collection and exploration" 39 | 40 | # Use test_mc() to grade multiple choice exercises. 41 | # Pass the correct option (Action, option 2 in the instructions) to correct. 42 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 43 | test_mc(3, [msg_bad1, msg_bad1, msg_success]) 44 | ``` 45 | 46 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 4, 6 key:01167ddb1f 47 | ## Feature Engineering 48 | 49 | This step helps to extract more information from existing data. New information is extracted in terms of new features. These features may have a higher ability to explain the variance in the training data. Thus, giving improved model accuracy. 50 | 51 | Feature engineering is highly influenced by hypotheses generation. A good hypothesis results in a good feature. That’s why experts always suggest investing quality time in hypothesis generation. 
Feature engineering process can be divided into two steps: 52 | 53 | * Feature Transformation 54 | * Feature Creation 55 | 56 | ##### Feature Transformation: 57 | 58 | There are various scenarios where feature transformation is required: 59 | * Changing the scale of a variable from original scale to scale between zero and one. 60 | * Some algorithms works well with normally distributed data. Therefore, we must remove skewness of variable(s). There are methods like log, square root or inverse of the values to remove skewness 61 | * Binning of numerical variables 62 | 63 | ##### Feature Creation: 64 | 65 | Deriving new variable(s) from existing variables is known as feature creation. It helps to unleash the hidden relationship of a data set. Let’s say, we want to predict the number of transactions in a store based on transaction dates. Here transaction dates may not have a direct correlation with the number of transaction, but if we look at the day of a week, it may have a higher correlation. In this case, the information about the day of the week is hidden. We need to extract it to make the model better. 66 | 67 | #### Creating a variable based on a mathematical computation on three existing variables is a method of? 68 | 69 | 70 | *** =instructions 71 | - Feature Transformation 72 | - Feature Creation 73 | - Feature Selection 74 | 75 | 76 | *** =hint 77 | Creating a new variable from existing data set is known as feature creation 78 | 79 | 80 | 81 | 82 | *** =sct 83 | ```{python} 84 | # The sct section defines the Submission Correctness Tests (SCTs) used to 85 | # evaluate the student's response. All functions used here are defined in the 86 | # pythonwhat Python package 87 | 88 | msg_bad1 = "Think again!" 89 | msg_success = "Yes! Creating a new feature out of existing ones is known as feature creation" 90 | 91 | # Use test_mc() to grade multiple choice exercises. 92 | # Pass the correct option (Action, option 2 in the instructions) to correct. 
93 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 94 | test_mc(2, [msg_bad1, msg_success, msg_bad1]) 95 | ``` 96 | 97 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 4, 6 key:3c72c926e8 98 | ## Feature Selection 99 | 100 | Feature Selection is a process of finding out the best subset of attributes which better explains the relationship of independent variables with target variable. 101 | 102 | You can select the useful features based on various metrics like: 103 | 104 | * Domain Knowledge: Based on domain experience, we select feature(s) which may have a higher impact on target variable. 105 | * Visualization: As the name suggests, it helps to visualize the relationship between variables, which makes your variable selection process easier. 106 | * Statistical Parameters: We also consider the p-values, information values, and other statistical metrics to select right features. 107 | 108 | #### Variable importance table of random forest classifier can act as feature selection tool? 109 | 110 | 111 | *** =instructions 112 | - TRUE 113 | - FALSE 114 | 115 | 116 | *** =hint 117 | Variable importance table shows the importance of each variable with respect to target variable 118 | 119 | 120 | 121 | 122 | *** =sct 123 | ```{python} 124 | # The sct section defines the Submission Correctness Tests (SCTs) used to 125 | # evaluate the student's response. All functions used here are defined in the 126 | # pythonwhat Python package 127 | 128 | msg_bad1 = "Think again!" 129 | msg_success = "Yes! Creating a new feature out of existing ones is known as feature creation" 130 | 131 | # Use test_mc() to grade multiple choice exercises. 132 | # Pass the correct option (Action, option 2 in the instructions) to correct. 133 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order. 
134 | test_mc(1, [msg_success, msg_bad1]) 135 | ``` 136 | 137 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 6 key:a93345ad36 138 | ## How to select the right value of model parameter? 139 | 140 | We know that machine learning algorithms are driven by parameters. These parameters majorly influence the outcome of the learning process. 141 | 142 | The objective of parameter tuning is to find the optimum value for each parameter to improve the accuracy of the model. To tune these parameters, you must have a good understanding of their meaning and individual impact on the model. You can repeat this process with a number of well-performing models. 143 | 144 | For example: In a random forest, we have various parameters like max_features, number_trees, random_state, oob_score and others. Intuitive optimization of these parameter values will result in better and more accurate models. 145 | 146 | #### Which of the following is not a parameter of random forest algorithm (in Scikit Learn)? 147 | 148 | 149 | *** =instructions 150 | - max_depth 151 | - max_leaf_node 152 | - learning rate 153 | - max_features 154 | 155 | 156 | *** =hint 157 | List of all parameters in random forest scikit learn algorithm: 158 | 159 | RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None,bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False,class_weight=None) 160 | 161 | 162 | 163 | 164 | *** =sct 165 | ```{python} 166 | # The sct section defines the Submission Correctness Tests (SCTs) used to 167 | # evaluate the student's response. All functions used here are defined in the 168 | # pythonwhat Python package 169 | 170 | msg_bad1 = "Look at the hint to know more about parameters of random forest" 171 | msg_success = "Good Job!" 172 | 173 | # Use test_mc() to grade multiple choice exercises. 
174 | # Pass the correct option (option 3 in the instructions) to correct.
175 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.
176 | test_mc(3, [msg_bad1, msg_bad1, msg_success, msg_bad1])
177 | ```
178 | 
179 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 6 key:63b7c07abc
180 | ## Use ensemble methods to combine output of more than one model?
181 | 
182 | This is the most common approach found majorly in winning solutions of Data science competitions. This technique simply combines the results of multiple weak models and produces better results. This can be achieved through many ways:
183 | 
184 | * Bagging (Bootstrap Aggregating)
185 | * Boosting
186 | 
187 | To know more about these methods, you can refer to the article “Introduction to ensemble learning”.
188 | 
189 | It is always a better idea to apply ensemble methods to improve the accuracy of your model. There are two good reasons for this:
190 | * They are generally more complex than traditional methods
191 | * The traditional methods give you a good base level from which you can improve and draw from to create your ensembles.
192 | 
193 | #### Is taking the average of predictions (given by different models) an example of an ensemble model?
194 | 
195 | 
196 | *** =instructions
197 | - TRUE
198 | - FALSE
199 | 
200 | *** =hint
201 | We can combine output of different base models by:
202 | - Taking average of all predictions
203 | - Using maximum vote techniques
204 | 
205 | 
206 | 
207 | 
208 | 
209 | *** =sct
210 | ```{python}
211 | # The sct section defines the Submission Correctness Tests (SCTs) used to
212 | # evaluate the student's response. All functions used here are defined in the
213 | # pythonwhat Python package
214 | 
215 | msg_bad1 = "Read more about ensemble methods"
216 | msg_success = "Good Job!"
217 | 
218 | # Use test_mc() to grade multiple choice exercises.
219 | # Pass the correct option (option 1 in the instructions) to correct.
220 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.
221 | test_mc(1, [msg_success, msg_bad1])
222 | ```
223 | 
224 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2, 6 key:60de1e0b02
225 | ## Cross validation helps to improve your score on out of sample data set
226 | 
227 | Till here, we have seen methods which can improve the accuracy of a model. But, it is not necessary that higher accuracy models always perform better (for unseen data points). Sometimes, the improvement in model’s accuracy can be due to over-fitting too.
228 | 
229 | Here Cross-Validation helps to find the right answer to this question. Cross Validation says, try to leave a sample on which you do not train the model and test the model on this sample before finalizing the model. This method helps us to achieve more generalized relationships. To know more about this cross validation method, you should refer to the article “Improve model performance using cross-validation”.
230 | 
231 | #### Common methods used for Cross-Validation?
232 | 
233 | 
234 | ##### The Validation set Approach:
235 | In this approach, we reserve 50% of the dataset for validation and rest 50% for model training. A major disadvantage of this approach is that we train a model on 50% of the dataset only, it may be possible that we are leaving some interesting information about data i.e. higher bias.
236 | 
237 | ##### Leave one out cross validation (LOOCV)
238 | 
239 | In this approach, we reserve only one data-point of the available data set. And, train model on the rest of data set. This process iterates for each data point. This approach leads to higher variation in testing model effectiveness because we test against one data point. So, our estimation gets highly influenced by that one data point. If the data point turns out to be an outlier, it can lead to higher variation.
240 | 
241 | ##### K-fold cross validation
242 | 
243 | In this method, we follow below steps:
244 | * Randomly split your entire dataset into k "folds".
245 | * For each of the k folds in your dataset, build your model on the other k – 1 folds of the data set.
246 | * Then, test the model to check the effectiveness for the kth fold and record the error you see on each of the predictions.
247 | * Repeat this until each of the k folds has served as the test set.
248 | 
249 | The average of your k recorded errors is called the cross-validation error and will serve as your performance metric for the model.
250 | 
251 | #### How to choose the right value of k for K-fold cross validation?
252 | 
253 | *** =instructions
254 | - Choose lower value of K
255 | - Choose a higher value of K
256 | - Use k=10
257 | 
258 | *** =hint
259 | Always remember, lower value of K is more biased and hence undesirable. On the other hand, a higher value of K is less biased; but it can suffer from large variability. It is good to know that a smaller value of k always takes us towards validation set approach, whereas the higher value of k leads to LOOCV approach. Hence, it is often suggested to use k=10.
260 | 
261 | 
262 | 
263 | 
264 | *** =sct
265 | ```{python}
266 | # The sct section defines the Submission Correctness Tests (SCTs) used to
267 | # evaluate the student's response. All functions used here are defined in the
268 | # pythonwhat Python package
269 | 
270 | msg_bad1 = "Try again! Read more about Cross Validation"
271 | msg_success = "Good Job!"
272 | 
273 | # Use test_mc() to grade multiple choice exercises.
274 | # Pass the correct option (option 3 in the instructions) to correct.
275 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.
276 | test_mc(3, [msg_bad1, msg_bad1, msg_success])
277 | ```
278 | 
279 | --- type:MultipleChoiceExercise lang:python xp:50 skills:1 key:ed0dcad240
280 | ## iPython / Jupyter notebook for Predictive Modeling
281 | 
282 | The Jupyter Notebook is a web application that allows you to create and share documents that contain live code, equations, visualizations and explanatory text. Uses include: data cleaning and transformation, numerical simulation, statistical modeling, machine learning and much more.
283 | 
284 | We have shared the Jupyter notebook for your reference here
285 | 
286 | ### Download the jupyter notebook from here. Have you downloaded the jupyter notebook?
287 | 
288 | 
289 | 
290 | *** =instructions
291 | - Yes, I have downloaded the file
292 | - No, I am not able to
293 | 
294 | 
295 | *** =hint
296 | Click on the link and download the Jupyter notebook.
297 | 
298 | 
299 | 
300 | 
301 | *** =sct
302 | ```{python}
303 | # The sct section defines the Submission Correctness Tests (SCTs) used to
304 | # evaluate the student's response. All functions used here are defined in the
305 | # pythonwhat Python package
306 | 
307 | msg1 = "Awesome! You can check out additional reference!"
308 | msg2 = "Check the link provided and download the file from there."
309 | 
310 | # Use test_mc() to grade multiple choice exercises.
311 | # Pass the correct option (option 1 in the instructions) to correct.
312 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.
313 | test_mc(1, [msg1, msg2])
314 | ```
315 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:6177e4a3f3
316 | ## Thank You & Further studies
317 | 
318 | Thanks for taking up this open course from Analytics Vidhya. We hope you enjoyed the problem solving exercises and our hackathon experience. For more such hackathons, you can always visit our DataHack platform.
319 | 
320 | ### Here are a few more resources you can check out:
321 | 
322 | #### Practice Problems (Hackathons):
323 | - Big Mart Sales Problem.
324 | 
325 | #### All Hackathons:
326 | - All Hackathons.
327 | 
328 | #### Tutorials
329 | - Learning path in Python - Path from beginner to an expert in Data Science
330 | - Learning path in R - Path from beginner to an expert in Data Science
331 | - Essentials of Machine Learning (with codes in Python & R)
332 | - 12 useful Pandas techniques for Data Manipulation
333 | - Complete guide to create a time series forecast (with codes in Python)
334 | 
335 | #### Data Science Discussions
336 | 
337 | 
338 | 
339 | ### What do you want to do next:
340 | 
341 | *** =instructions
342 | - Finish the course
343 | - Stay on this page and explore the references
344 | 
345 | 
346 | *** =hint
347 | Thank You - hope you enjoyed the course.
348 | 
349 | 
350 | 
351 | 
352 | *** =sct
353 | ```{python}
354 | # The sct section defines the Submission Correctness Tests (SCTs) used to
355 | # evaluate the student's response. All functions used here are defined in the
356 | # pythonwhat Python package
357 | 
358 | msg1 = "Thanks for completing the course. Looking forward to interacting with you on DataHack."
359 | msg2 = "No hurry! You can take your own time."
360 | 
361 | # Use test_mc() to grade multiple choice exercises.
362 | # Pass the correct option (option 1 in the instructions) to correct.
363 | # Pass the feedback messages, both positive and negative, to feedback_msgs in the appropriate order.
364 | test_mc(1, [msg1, msg2]) 365 | ``` 366 | -------------------------------------------------------------------------------- /course.yml: -------------------------------------------------------------------------------- 1 | title : Introduction to Python & Machine Learning (with Analytics Vidhya Hackathons) 2 | author_field : Kunal Jain 3 | description : This course introduces basic concepts of data science, data exploration, preparation in Python and then prepares you to participate in exciting machine learning competitions on Analytics Vidhya. 4 | author_bio : Kunal is the Founder & CEO of Analytics Vidhya, a community of data science professionals.
At Analytics Vidhya, we believe that Data Science knowledge should be free and accessible to everyone across the globe. 5 | university : DataCamp 6 | difficulty_level : 2 7 | time_needed : 2 hour 8 | programming_language : python 9 | from : "python-base-prod:20" 10 | -------------------------------------------------------------------------------- /img/author_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/python_intro_hackathon/5038018b5ff61842c60a739d9d2ec94356ed65bc/img/author_image.png -------------------------------------------------------------------------------- /img/shield_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kunalj101/python_intro_hackathon/5038018b5ff61842c60a739d9d2ec94356ed65bc/img/shield_image.png -------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | pip3 install pandas==0.19.1 2 | pip3 install numpy==1.11.0 3 | pip3 install scipy==0.18.1 4 | pip3 install scikit-learn==0.18.1 5 | --------------------------------------------------------------------------------