# etl project.txt -- exploratory analysis of the Kaggle "US Accidents" dataset.
#
# Notebook-style script: bare expressions such as `df` only render output when
# the code is run cell by cell in Jupyter/IPython.
#
# One-time setup, run from a shell (or as `%pip install ...` inside a
# notebook). As a bare statement it is a SyntaxError in Python:
#     pip install opendatasets --upgrade --quiet

from pathlib import Path

import folium
import opendatasets as od
import pandas as pd
import seaborn as sns
from folium.plugins import HeatMap

# --- Download and load -------------------------------------------------------

download_url = 'https://www.kaggle.com/sobhanmoosavi/us-accidents'

od.download(download_url)

# NOTE(review): assumes opendatasets saves the files into a folder named after
# the last URL segment ("us-accidents") in the working directory -- confirm.
data_dir = Path(download_url.rsplit('/', 1)[-1])
csv_files = sorted(data_dir.glob('*.csv'))
if not csv_files:
    raise FileNotFoundError(f'No CSV file found in {data_dir.resolve()}')
data_filename = csv_files[0]

df = pd.read_csv(data_filename)
df
df.info()
df.describe()

# --- Column overview and missing values --------------------------------------

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

numeric_df = df.select_dtypes(include=numerics)
len(numeric_df.columns)

# NOTE: despite the name these are fractions in [0, 1], not percentages.
missing_percentages = df.isna().sum().sort_values(ascending=False) / len(df)
missing_percentages
type(missing_percentages)
missing_percentages[missing_percentages != 0].plot(kind='barh')

# --- Accidents per city ------------------------------------------------------

df.columns
df.City
cities = df.City.unique()
len(cities)
cities_by_accident = df.City.value_counts()
cities_by_accident
cities_by_accident[:20]
type(cities_by_accident)
cities_by_accident[:20].plot(kind='barh')

sns.set_style("darkgrid")
sns.histplot(cities_by_accident, log_scale=True)
cities_by_accident[cities_by_accident == 1]

# --- Accidents over time -----------------------------------------------------

df.Start_Time
df['Start_Time'] = pd.to_datetime(df.Start_Time)

# histplot(stat='density') replaces the deprecated
# distplot(kde=False, norm_hist=True).
sns.histplot(df.Start_Time.dt.hour, bins=24, stat='density')
sns.histplot(df.Start_Time.dt.dayofweek, bins=7, stat='density')

# dayofweek: Monday == 0 ... Sunday == 6
sundays_start_time = df.Start_Time[df.Start_Time.dt.dayofweek == 6]
sns.histplot(sundays_start_time.dt.hour, bins=24, stat='density')

monday_start_time = df.Start_Time[df.Start_Time.dt.dayofweek == 0]
sns.histplot(monday_start_time.dt.hour, bins=24, stat='density')

df_2019 = df[df.Start_Time.dt.year == 2019]
# Named after the source actually filtered on (was `df_2019_Bing`).
df_2019_mapquest = df_2019[df_2019.Source == 'MapQuest']
sns.histplot(df_2019_mapquest.Start_Time.dt.month, bins=12, stat='density')

# --- Accident locations ------------------------------------------------------

df.Start_Lat
df.Start_Lng

# Plot a 10% random sample to keep the scatter plot responsive.
sample_df = df.sample(int(0.1 * len(df)))
# NOTE(review): `size` is seaborn's semantic grouping variable, not the marker
# size; if tiny markers were intended, `s=<points>` is probably what is wanted.
sns.scatterplot(x=sample_df.Start_Lng, y=sample_df.Start_Lat, size=0.001)

# Positional access, so this does not depend on the index containing label 0.
lat, lon = df.Start_Lat.iloc[0], df.Start_Lng.iloc[0]
lat, lon

# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs.
for _, column in df[['Start_Lat', 'Start_Lng']].sample(100).items():
    print(column)

zip(df.Start_Lat, df.Start_Lng)

# --- Heat map ----------------------------------------------------------------

# A 0.1% sample is enough for the heat map.
sample_df = df.sample(int(0.001 * len(df)))
# Drop rows with missing coordinates before handing them to HeatMap.
coords = sample_df[['Start_Lat', 'Start_Lng']].dropna()
lat_lon_pairs = list(zip(coords.Start_Lat, coords.Start_Lng))

# Named `accident_map` so the builtin `map` is not shadowed.
accident_map = folium.Map()
HeatMap(lat_lon_pairs).add_to(accident_map)
accident_map