├── data
│   ├── COVID-19_Deaths_by_Hospital_Referral_Region.csv
│   └── arrests_ usa.csv
└── main.py

/data/arrests_ usa.csv:
--------------------------------------------------------------------------------
id,year,population,total_arrests,homicide,rape,robbery,aggravated_assault,burglary,larceny,motor_vehicle_theft,arson,violent_crime,property_crime,other_assault,forgery,fraud,embezzlement,stolen_property,vandalism,weapons,prostitution,other_sex_offenses,drug_abuse,gambling,against_family,dui,liquor_laws,drunkenness,disorderly_conduct,vagrancy,other,suspicion,curfew_loitering
1,2016,323127513,10662252,11788,23632,95754,383977,207325,1050058,86088,9812,515151,1353283,1078808,56661,128531,15937,93981,195951,156777,38306,51063,1572579,3705,88748,1017808,234899,376433,369733,24851,3254871,576,34176
2,2015,320896618,10797088,11092,22863,95572,376154,216010,1160390,77979,8834,505681,1463213,1081019,55333,133138,15909,88576,191015,145358,41877,51388,1488707,4825,94837,1089171,266250,405880,386078,25151,3218880,1389,44802
3,2014,318907401,11205833,10571,21007,94403,372685,237974,1238190,68422,9394,498666,1553980,1093258,56783,141293,16227,88946,198400,140713,47598,55456,1561231,5637,102336,1117852,321125,414854,436014,27380,3274430,1310,53654
4,2013,316497531,11302102,10231,16863,94406,358860,252629,1231580,64566,10509,480360,1559284,1097741,60969,143528,15730,92691,201168,137779,48620,57925,1501043,6024,101247,1166824,354872,443527,467993,25755,3282651,1096,56371
5,2012,313873685,12196959,11075,18098,103661,388362,283582,1282352,68845,11433,521196,1646212,1199476,67046,153535,16023,97670,228463,149286,56575,68355,1552432,7868,107018,1282957,441532,511271,543995,27003,3448856,1532,70190
6,2011,311587816,12408899,10832,19491,106674,397707,296707,1264986,66414,11776,534704,1639883,1241722,70211,168217,16190,93234,237638,153519,57345,69225,1531251,8596,116723,1215077,500648,534218,582158,29203,3532195,1424,76942
7,2010,309330219,13120947,11201,20088,112300,408488,289769,1271410,71487,11296,552077,1643962,1292449,78101,187887,16616,94802,252753,159020,62668,72628,1638846,9941,111062,1412223,512790,560718,615172,32033,3720402,1166,94797
8,2009,307006550,13687241,12418,21407,126725,421215,299351,1334933,81797,12204,581765,1728285,1319458,85844,210255,17920,105303,270439,166334,71355,77326,1663582,10360,114564,1440409,570333,594300,655322,33388,3764672,1975,112593
9,2008,304059724,14005615,12955,22584,129403,429969,308479,1266706,98035,14125,594911,1687345,1298342,90127,234199,21402,111319,285012,179661,75004,79914,1702537,9811,118419,1483396,625939,611069,685985,33852,3835083,1650,133063
10,2007,301621157,14209365,13480,23307,126715,433945,303853,1172762,118231,15242,597447,1610088,1305693,103448,252873,22381,122061,291575,188891,77607,83979,1841182,12161,122812,1427494,633654,589402,709105,33666,3931965,2176,143002
11,2006,299398484,14380370,13435,24535,125605,447948,304801,1081157,137757,16582,611523,1540297,1305757,108823,280693,20012,122722,300679,200782,79673,87252,1889810,12307,131491,1460498,645734,553188,703504,36471,4022068,2482,152907
12,2005,296507061,14094186,14062,25528,114616,449297,298835,1146696,147459,16337,603503,1609327,1301392,118455,321521,18970,133856,279562,193469,84891,91625,1846351,11180,129128,1371919,597838,556167,678231,33227,3863785,3764,140835
13,2004,293656842,13938071,13467,26066,108992,438033,294645,1185619,148429,15504,586558,1644197,1284858,119518,282938,17332,128529,272522,175776,87872,90913,1746570,10755,124936,1433382,612528,552671,657637,36404,3815435,3554,137398
14,2003,290788976,13639479,13190,26350,107553,449933,290956,1145074,152934,16163,597026,1605127,1246698,111823,299138,16826,126775,273431,167972,75190,91546,1678192,10954,136034,1448148,612079,548616,639371,28948,3665543,7163,136461
15,2002,287973924,13741438,14158,28288,105774,472290,288291,1160085,148943,16635,620510,1613954,1288682,115735,337404,18552,126422,276697,164446,79733,95066,1538813,10506,140286,1461746,653819,572735,669938,27295,3662159,8899,141252
16,2001,285317559,13699254,13653,27270,108400,477809,291444,1160821,147451,18749,627132,1618465,1315807,113741,323308,20157,121972,270645,165896,80854,91828,1586902,11112,143683,1434852,610591,618668,621394,27935,3618164,3955,142889
17,2000,281421906,13980297,13227,27469,106130,478417,289844,1166362,148225,16530,625132,1620928,1312169,108654,345732,18952,118641,281305,159181,87620,93399,1579566,10842,147663,1471289,683124,637554,638740,32542,3710434,5682,154711
18,1999,272690813,14031070,14790,28830,108850,483530,296100,1189400,142200,16800,635990,1644500,1294400,106900,363800,17100,121900,278200,172400,92100,92400,1532200,10400,151200,1511300,657900,656100,633100,30000,3728100,7500,167200
19,1998,270248003,14528300,17450,31070,120870,506630,330700,1307100,150700,17200,675900,1805600,1338800,114600,394600,17100,137900,300200,190600,94000,93600,1559100,12800,146400,1402800,630400,710300,696100,30400,3824100,5200,187800
20,1997,267783607,15284300,18290,32060,132450,534920,356000,1472600,167000,20000,717750,2015600,1395800,120100,414600,17400,155300,318400,218900,101600,101900,1583600,15900,155800,1477300,636400,734800,811100,28800,3884600,6500,182700
21,1996,265228572,15168100,19020,33050,156270,521570,364800,1486300,175400,19000,729900,2045600,1329000,121600,465000,15700,151100,320900,216200,99000,95800,1506200,21000,149800,1467300,677400,718700,842600,27800,3786700,4900,185100
22,1995,262803276,15119800,21230,34650,171870,568480,386500,1530200,191900,20000,796250,2128600,1290400,122300,436400,15200,166500,311100,243900,97700,94500,1476100,19500,142900,1436000,594900,708100,748600,25900,3865400,12100,149800
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
'''Versatile data analysis script. Only one line needs to be replaced: the line that reads your data file.'''
# This is a sample Python script.
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


# we will split the data into train and test sets in future sections (a sketch follows below)
# data_raw = pd.read_csv('data/AH_Provisional_COVID-19_Deaths_by_Hospital_Referral_Region.csv')
data_raw = pd.read_csv('data/arrests_ usa.csv')


# to play with our data we'll create a copy
# remember that plain assignment in Python binds a new name to the same object (no copy is made), so we use copy():
# https://stackoverflow.com/questions/46327494/python-pandas-dataframe-copydeep-false-vs-copydeep-true-vs
data1 = data_raw.copy(deep=True)
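# --- Illustrative sketch (not part of the original script): one way the train/test
# --- split mentioned above could look in a later section. It is left commented out
# --- because it assumes scikit-learn is installed and arbitrarily treats
# --- 'total_arrests' as the target column.
# from sklearn.model_selection import train_test_split
# X = data1.drop(columns=['total_arrests'])
# y = data1['total_arrests']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)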

# preview data
print("\n ----------Top 5 Records----------")
print(data_raw.head(5))  # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.head.html
# print(data_raw.tail(5))  # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.tail.html
# print(data_raw.sample(10))  # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sample.html
print("\n -----------Information-----------")
print(data_raw.info())  # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.info.html
print("\n -----------Data Types-----------")
print(data_raw.dtypes)
print("\n ----------Missing values----------")
print(data_raw.isnull().sum())
print("\n ----------Null values----------")
print(data_raw.isna().sum())  # isna() is an alias of isnull(), so these counts match the ones above
print("\n ----------Shape of Data----------")
print(data_raw.shape)
print("\n ----------Number of duplicates----------")
print('Number of duplicates:', len(data_raw[data_raw.duplicated()]))


# Function to calculate missing values by column
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Sort the table by percentage of missing values, descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with the missing-value information
    return mis_val_table_ren_columns


missing_values_data = missing_values_table(data1)
print("\n ----------Missing values table----------")
print(missing_values_data.head(30))

print("\n ----------Number of types----------")
# Number of columns of each dtype
print(data1.dtypes.value_counts())

print("\n ----------Number of uniques----------")
# Let's now look at the number of unique entries in each of the object (categorical) columns.
print(data1.select_dtypes('object').apply(pd.Series.nunique, axis=0))

print("\n ----------Summary statistics----------")
print(data_raw.describe(include='all'))

# correlation matrix
print(data1.corr())

# correlation map
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(data1.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()
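# Optional addition (not in the original script): a numeric view of the strongest
# pairwise correlations, which can be easier to scan than the 33x33 annotated
# heatmap above. Uses only pandas, which is already imported.
corr_matrix = data1.corr()
corr_pairs = corr_matrix.unstack()
# keep each pair once and drop self-correlations
corr_pairs = corr_pairs[corr_pairs.index.get_level_values(0) < corr_pairs.index.get_level_values(1)]
print("\n ----------Strongest pairwise correlations----------")
print(corr_pairs.reindex(corr_pairs.abs().sort_values(ascending=False).index).head(10))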
data1.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)  # in a notebook, a trailing ';' would suppress the verbose matplotlib output

sns.set()
sns.pairplot(data1, height=2.5)  # 'size' was renamed to 'height' in newer seaborn versions
plt.show()
--------------------------------------------------------------------------------
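A possible follow-up for main.py (not part of the original script): with 33 numeric columns the full pairplot above becomes very large, so one option is to restrict it to a few columns of interest before plotting. The column subset below is only an illustration; any other columns from the CSV would work the same way.

    cols = ['year', 'population', 'total_arrests', 'violent_crime', 'property_crime']
    sns.pairplot(data1[cols], height=2.5)
    plt.show()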