├── README.md ├── screenshots └── accuracy.png ├── employee_satisfaction_evaluation.xlsx └── Prediction of Which of your Employee will Quit Your Company.py /README.md: -------------------------------------------------------------------------------- 1 | # Predicting-which-of-your-Employee-will-Quit-your-Company-Data-Science-Project 2 | 3 | ## Accuracy Score : 4 | ![accuracy](screenshots/accuracy.png) 5 | 6 | --- 7 | -------------------------------------------------------------------------------- /screenshots/accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydeveloperashish/Predicting-which-of-your-Employee-will-Quit-your-Company-Data-Science-Project/HEAD/screenshots/accuracy.png -------------------------------------------------------------------------------- /employee_satisfaction_evaluation.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydeveloperashish/Predicting-which-of-your-Employee-will-Quit-your-Company-Data-Science-Project/HEAD/employee_satisfaction_evaluation.xlsx -------------------------------------------------------------------------------- /Prediction of Which of your Employee will Quit Your Company.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[136]: 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | hr_df=pd.read_csv(r'C:\Python37\Projects\ALL ML-DL-DS Projects from Udemy and other Sources\Data_Science\Data Science and Deep Learning for Business\datascienceforbusiness-master\hr_data.csv') 11 | 12 | 13 | # In[5]: 14 | 15 | 16 | hr_df 17 | 18 | 19 | # In[ ]: 20 | 21 | 22 | 23 | 24 | 25 | # In[6]: 26 | 27 | 28 | #Numerical Analysis 29 | 30 | 31 | # In[7]: 32 | 33 | 34 | hr_df.shape 35 | 36 | 37 | # In[8]: 38 | 39 | 40 | hr_df.size 41 | 42 | 43 | # In[ ]: 44 | 45 | 46 | 47 | 48 | 49 | # In[9]: 
# Column dtypes and non-null counts of the raw HR table.
hr_df.info()


# In[10]:

hr_df['department'].unique()


# In[11]:

hr_df['salary'].unique()


# In[12]:

# Loading our Employee Satisfaction Data


# In[13]:

# NOTE(review): absolute, machine-specific path -- the workbook has to exist
# at exactly this location for the cell to run.
s_df = pd.read_excel(r'C:\Python37\Projects\ALL ML-DL-DS Projects from Udemy and other Sources\Data_Science\Data Science and Deep Learning for Business\datascienceforbusiness-master\employee_satisfaction_evaluation.xlsx')


# In[14]:

s_df


# In[15]:

# Merging and Joining


# In[16]:

# Join the satisfaction sheet onto the HR table, matching
# hr_df['employee_id'] against s_df['EMPLOYEE #'] (DataFrame.join keeps every
# row of the left frame by default).
main_df = hr_df.set_index('employee_id').join(s_df.set_index('EMPLOYEE #'))


# In[17]:

# Turn the employee_id index back into an ordinary column.
main_df = main_df.reset_index()


# In[18]:

main_df


# In[19]:

main_df.info()


# In[20]:

# Rows that contain at least one missing value.
main_df[main_df.isnull().any(axis=1)]


# In[21]:

main_df.describe()


# In[22]:

# Fill missing values with the per-column mean.  numeric_only=True restricts
# the mean to numeric columns: the frame still carries the text columns
# 'department' and 'salary', and averaging those raises TypeError on
# pandas >= 2.0 (older versions silently skipped them, so the result is the
# same there).
main_df.fillna(main_df.mean(numeric_only=True), inplace=True)


# In[23]:

# Re-run the missing-value check after imputation.
main_df[main_df.isnull().any(axis=1)]


# In[24]:

main_df.loc[main_df['employee_id'] == 1340]
# In[25]:

# The identifier carries no predictive signal, so drop it before analysis.
main_df.drop(columns='employee_id', inplace=True)


# In[26]:

main_df


# In[28]:

# main_df['department'].value_counts()


# In[29]:

# numeric_only=True keeps the text column 'salary' out of the aggregation;
# summing strings would concatenate them instead of adding numbers.
main_df.groupby('department').sum(numeric_only=True)


# In[30]:

# Averaging a text column raises TypeError on pandas >= 2.0, hence
# numeric_only=True here as well.
main_df.groupby('department').mean(numeric_only=True)


# In[31]:

main_df['left'].value_counts()


# In[32]:

# Data Visualization


# In[33]:

import matplotlib.pyplot as plt
import seaborn as sns


# In[34]:

def plot_corr(df, size=10):
    """Draw the pairwise correlation matrix of *df* as a colour-coded grid.

    Args:
        df: pandas DataFrame. Only its numeric and boolean columns are
            correlated; other columns (e.g. text) are ignored.
        size: Width and height of the square figure, in inches.

    Returns:
        None. The figure is created on the current matplotlib backend.
    """
    # Select the numeric columns explicitly: DataFrame.corr() raises on text
    # columns with pandas >= 2.0, and select_dtypes works on every version.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    cax = ax.matshow(corr)
    fig.colorbar(cax)
    ticks = range(len(corr.columns))
    plt.xticks(ticks, corr.columns, rotation='vertical')
    plt.yticks(ticks, corr.columns)


plot_corr(main_df)


# In[35]:

plt.bar(x=main_df['left'], height=main_df['satisfaction_level'])


# In[36]:

sns.barplot(x='left', y='satisfaction_level', data=main_df)


# In[37]:

sns.barplot(x='promotion_last_5years', y='satisfaction_level', data=main_df, hue='left')
# In[38]:

sns.pairplot(main_df, hue='left')


# In[39]:

# Data Preprocessing


# In[40]:

# The two text columns that need to be turned into integer codes.
y = main_df[['department', 'salary']]


# In[41]:

y


# In[42]:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Integer codes for the salary labels.
k = le.fit(main_df['salary']).transform(main_df['salary'])


# In[43]:

k


# In[44]:

main_df


# In[45]:

main_df['salary_num'] = k


# In[46]:

main_df


# In[47]:

# Inspect the rows labelled 'high' next to their new numeric code.
main_df[main_df['salary'] == 'high']


# In[48]:

main_df.drop(columns=['salary'], inplace=True)


# In[49]:

main_df


# In[50]:

# Re-fit the same encoder object, this time on the department labels.
z = le.fit(main_df['department']).transform(main_df['department'])


# In[51]:

z


# In[52]:

main_df['department_num'] = z


# In[53]:

main_df


# In[54]:

main_df[main_df['department'] == 'IT']


# In[55]:

main_df.drop(columns=['department'], inplace=True)


# In[56]:

main_df


# In[57]:

# Feature matrix: every remaining column except the target 'left'.
X = main_df.drop(columns=['left'])


# In[58]:

X


# In[59]:

# Target vector.
y = main_df['left']


# In[60]:

y.size
# In[61]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.  A fixed random_state makes the
# split -- and therefore every accuracy figure below -- reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# In[62]:

X_test


# In[63]:

y_test


# In[64]:

# Standard Scaler


# In[65]:

# Model Classification


# In[66]:

# Decision Tree


# In[67]:

from sklearn.metrics import accuracy_score


# In[68]:

from sklearn.tree import DecisionTreeClassifier

# Seed the tree as well so the fitted model is the same on every run.
dt = DecisionTreeClassifier(random_state=42)

dt.fit(X_train, y_train)


# In[69]:

prediction_dt = dt.predict(X_test)


# In[70]:

prediction_dt


# In[71]:

y_test


# In[152]:

# Accuracy on the held-out rows, expressed as a percentage.
accuracy_dt = accuracy_score(y_test, prediction_dt) * 100


# In[153]:

accuracy_dt


# In[73]:

X_test


# In[74]:

# Human-readable labels, indexed by the predicted class (0 or 1).
Catagory = ['Employee will stay', 'Employee will Leave']


# In[75]:

# One hand-written sample; values are given in the column order of X_train.
custom_dt = [[1, 500, 3, 6, 0, 0.90, 0.89, 1, 8]]


# In[76]:

# Give the sample the training column names so the tree sees the same
# feature names it was fitted with, and take element [0] explicitly:
# int() on a whole array is deprecated since NumPy 1.25.
custom_dt_frame = pd.DataFrame(custom_dt, columns=X_train.columns)
custom_dt_label = int(dt.predict(custom_dt_frame)[0])
print(custom_dt_label)


# In[77]:

Catagory[custom_dt_label]


# In[78]:

dt.feature_importances_


# In[79]:
# Importance of every feature according to the fitted tree, largest first.
feature_importance = pd.DataFrame(
    dt.feature_importances_,
    index=X_train.columns,
    columns=['Importance'],
).sort_values('Importance', ascending=False)


# In[80]:

feature_importance


# In[87]:

X_train


# In[82]:

# KNN


# In[83]:

# Data Processing of KNN


# In[84]:

from sklearn.preprocessing import StandardScaler


# In[86]:

# Fit the scaler on the training rows only, then apply the same scaling to
# both splits.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)


# In[88]:

X_train_std


# In[90]:

X_test_std


# In[130]:

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_std, y_train)


# In[131]:

prediction_knn = knn.predict(X_test_std)


# In[132]:

# Accuracy of the 3-neighbour model, as a percentage.
accuracy_knn = accuracy_score(y_test, prediction_knn) * 100


# In[133]:

accuracy_knn


# In[134]:

prediction_knn


# In[100]:

y_test


# In[126]:

# Accuracy for every neighbour count from 1 to 25.
# `scores` maps k -> accuracy in percent; `scores_list` holds the same values
# as fractions, in k order, for plotting.
k_range = range(1, 26)
scores = {}
scores_list = []

for n_neighbors in k_range:
    # Use loop-local names: rebinding `knn` / `prediction_knn` here would
    # replace the 3-neighbour model that `accuracy_knn` describes with the
    # last model of the sweep.
    knn_k = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_k.fit(X_train_std, y_train)
    accuracy_k = accuracy_score(y_test, knn_k.predict(X_test_std))
    scores[n_neighbors] = accuracy_k * 100
    scores_list.append(accuracy_k)
# In[127]:

scores


# In[128]:

scores_list


# In[129]:

# Test accuracy as a function of the number of neighbours.
plt.plot(k_range, scores_list)


# In[146]:

X_test.head(1)


# In[147]:

# One hand-written sample; values are given in the column order of X_train.
X_knn = np.array([[20, 500, 10, 6, 0, 0.10, 0.30, 1, 8]])
# The scaler was fitted on a DataFrame, so hand it the same column names.
X_knn_std = sc.transform(pd.DataFrame(X_knn, columns=X_train.columns))


# In[148]:

X_knn_std


# In[149]:

X_knn_prediction = knn.predict(X_knn_std)


# In[150]:

X_knn_prediction


# In[151]:

# Label for the KNN prediction computed above.  This cell previously looked
# up the decision-tree result for `custom_dt`, a different sample and model.
Catagory[int(X_knn_prediction[0])]


# In[156]:

algorithms = ['Decision Tree', 'KNN']
scores = [accuracy_dt, accuracy_knn]
# seaborn >= 0.12 only accepts the data vectors as keyword arguments.
sns.barplot(x=algorithms, y=scores)
plt.xlabel("Algorithms")
plt.ylabel("Accuracy Score")
plt.show()