├── .gitattributes ├── Basic_OMDBAPI_Title_Request.py ├── Bernoulli_Trial.py ├── Binomial_Dist_plot.py ├── Binomial_Dist_plot_Update_0317.py ├── Bootstrap Variance.py ├── Bootstrap_Data_Analysis.py ├── Bootstrap_test_One Sample.py ├── CSV Import_Panda_Header.py ├── CSV_Concatenate_All.py ├── CSV_Import-All2.py ├── CSV_Reader_Test2.py ├── Calc_Diff_Btw_Columns.py ├── Connect to PostgresQL Database.py ├── Create_DataBase_Engine.py ├── Customer Segmentation_Code_Spyder.py ├── CustomizeSQL_Query_Col_Row_Size.py ├── DB_Tables_Update.py ├── DatStream.py ├── DatVisPython.py ├── DatVis_Bokeh_1.py ├── DatVis_Bokeh_CaseStudy_App_Build_5.py ├── DatVis_Bokeh_High_Level_Charts_3.py ├── DatVis_Bokeh_Intr_App_Build_4.py ├── DatVis_Bokeh_Layout-Int-Annot_2.py ├── DatVis_Images.py ├── Dat_Clean_Analysis.py ├── Dat_Read_Plot.py ├── DataChunkFunc.py ├── DataClean_GS_Analysis5.py ├── DataCombine_Analysis3.py ├── DataFrame_Lambda_Filter_Read.py ├── DataTidy_Analysis2.py ├── DataTypes_Analysis4.py ├── DataXplore_Analysis1.py ├── Data_Corr_Func.py ├── Datchunk_PopPlot.py ├── Deep_Learning_Basics_1.py ├── Deep_Learning_KerasModel_Build_3.py ├── Deep_Learning_KerasModel_Optimise_4.py ├── Deep_Learning_Ntwrk_Optim_2.py ├── Distribution_Check_Theor_ECDF_Data_CDF.py ├── EDA_Analysis_Comarison.py ├── EDA_Hypothesis_Test.py ├── Entry_Count_Check_Exception.py ├── EthicalHackingCourseNotes.py ├── ExtractHist_Image.py ├── Extract_Data_from_HDF5.py ├── File_Import_Multi_DataType.py ├── FilterData_Selected_from_Table_SQLAlchemy.py ├── FilterSQL_Database_Table_Col_Row.py ├── FilterSQL_Database_Table_WHERE.py ├── General Multi_Column DataFrame Analysis.py ├── General Twitter Language Analysis.py ├── Generate from MultiType Data.py ├── HTML_with_BeautifulSoup_GetHypLinktData.py ├── HTML_with_BeautifulSoup_GetTextData.py ├── HTTP_Request_Urllib_Response.py ├── HTTP_Request_Urllib_Response_Read.py ├── HTTP_Request_using_Requests.py ├── Hack_Bern_nprandom.py ├── Hack_Stats_BasicRandGen.py ├── Import_Excel_Pandas.py ├── Import_Excel_Parse.py ├── Import_FlatFile_Web.py ├── Import_HDF5.py ├── Import_MatLab_WorkSpace.py ├── Import_Pickled-Data.py ├── Import_Plot_Web_Flatfile_NonLocal_Save.py ├── Import_SAS7BDAT_.py ├── Import_Stata_File.py ├── Inserting_Multiple_Rows.py ├── Iteration.py ├── LICENSE ├── Lambda_List_Filter.py ├── LinReg_BS_Pairs_func.py ├── Linear_Regression_Anscombe.py ├── ListComp_Gen.py ├── ListComp_timestamped.py ├── List_Dictionary_Full.py ├── Load_Explore_Twitter_Data.py ├── Local_JSon_Load_Explore.py ├── Multidata_tweeter_count_function.py ├── Nested_List_Comp.py ├── NewsArticleClass.py ├── NewsAutosummarize.py ├── Non_Flat_File_Import_Web-Excel.py ├── Numpy_Import_LoadTxt.py ├── Numpy_LoadData_and_Plot.py ├── README.md ├── Random_NLTK.py ├── SQL_Arbitrary_Insert_Row.py ├── SQL_Arbitrary_Table_Create.py ├── SQL_Automatic_Join_Est_Rel.py ├── SQL_CaseStudy_Basic.py ├── SQL_Check_Col_Population_Percentage.py ├── SQL_Data_Count_Group-By.py ├── SQL_Data_Count_Keys_Values.py ├── SQL_Delete_Table.py ├── SQL_Det_Pop_Sum_by_Column.py ├── SQL_Join_Columns_Advanced.py ├── SQL_Join_Table_Columns.py ├── SQL_Leverage_Heirach_Data_Group_By.py ├── SQL_LoadCSV_csv-reader.py ├── SQL_Order_Desc_by_Column.py ├── SQL_Order_by_Data by Column.py ├── SQL_Plot_Results_DataFrame.py ├── SQL_Same_Table_Joined_Query.py ├── Simple_Data_Filter_Select_Where.py ├── TwitterAPI_Authentication_SampleM.py ├── Twitter_Data to DataFrame.py ├── Twitter_Text_dataAnalysis.py ├── Vis_Regressions_FixData.py ├── csv_DataFrame_NumpyArray.py ├── 
draw_bootstrap_reps.py ├── draw_bs_pairs.py └── fixations.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /Basic_OMDBAPI_Title_Request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 16 01:37:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import requests package 9 | import requests 10 | 11 | # Assign URL to variable: url 12 | url = 'http://www.omdbapi.com/?t=this+is+spinal+tap' 13 | 14 | # Package the request, send the request and catch the response: r 15 | r = requests.get(url) 16 | 17 | # Print the text of the response 18 | print(r.text) -------------------------------------------------------------------------------- /Bernoulli_Trial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 22:50:22 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | def perform_bernoulli_trials(n, p): 12 | """Perform n Bernoulli trials with success probability p 13 | and return number of successes.""" 14 | # Initialize number of successes: n_success 15 | n_success = 0 16 | 17 | # Perform trials 18 | for i in range(n): 19 | # Choose random number between zero and one: random_number 20 | random_number = np.random.random() 21 | 22 | # If less than p, it's a success so add one to n_success 23 | if random_number < p: 24 | n_success += 1 25 | 26 | return n_success -------------------------------------------------------------------------------- /Binomial_Dist_plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 23:26:03 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | from Bernoulli_Trial import perform_bernoulli_trials 13 | from ecdf_func import ecdf 14 | 15 | # Seed random number generator 16 | np.random.seed(42) 17 | 18 | # Take 10,000 samples out of the binomial distribution: n_defaults 19 | n_defaults = np.random.binomial(100, 0.05, size=10000) 20 | 21 | # Compute CDF: x, y 22 | x, y = ecdf(n_defaults) 23 | 24 | # Plot the CDF with axis labels 25 | _ = plt.plot(x, y, marker='.', linestyle='none') 26 | plt.margins(0.002) 27 | plt.xlabel('Defaults out of 100 loans') 28 | plt.ylabel('ECDF') 29 | 30 | # Show the plot 31 | plt.show() 32 | 33 | # Seed random number generator 34 | np.random.seed(42) 35 | 36 | # Initialize the number of defaults: n_defaults 37 | n_defaults = np.empty(1000) 38 | 39 | # Compute the number of defaults 40 | for i in range(1000): 41 | n_defaults[i] = perform_bernoulli_trials(100, 0.05) 42 | 43 | 44 | # Plot the histogram with default number of bins; label your axes 45 | _ = plt.hist(n_defaults, density=True)  # density= replaces the removed normed= argument 46 | _ = plt.xlabel('number of defaults out of 100 loans') 47 | _ = plt.ylabel('probability') 48 | 49 | # Show the plot 50 | plt.show() 51 | 52 | # Compute bin edges centred on the integers: bins 53 | bins = np.arange(0, max(n_defaults) + 1.5) - 0.5 54 | 55 | # Generate histogram 56 | _ = plt.hist(n_defaults, density=True, bins=bins) 57 | 58 | # Set margins 59 | plt.margins(0.02) 60 | 61 | # Label axes 62 | _ = plt.xlabel('number of defaults out of 100 loans') 63 | _ = plt.ylabel('Binomial PMF') 64 | # Show the plot 65 | plt.show()
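# NOTE: `ecdf` is imported above from `ecdf_func`, but no ecdf_func.py appears
# in the file tree. A minimal sketch of the conventional helper these scripts
# assume (x = sorted data, y = cumulative fraction of points <= x):
def ecdf(data):
    """Compute x, y values for the empirical CDF of a 1-D array."""
    x = np.sort(data)
    y = np.arange(1, len(data) + 1) / len(data)
    return x, y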
66 | # #################################################################### # 67 | 68 | # Draw 10,000 samples out of Poisson distribution: samples_poisson 69 | samples_poisson = np.random.poisson(10, size=10000) 70 | 71 | # Print the mean and standard deviation 72 | print('Poisson: ', np.mean(samples_poisson), 73 | np.std(samples_poisson)) 74 | 75 | # Specify values of n and p to consider for Binomial: n, p 76 | n = [20, 100, 1000] 77 | p = [0.5, 0.1, 0.01] 78 | 79 | 80 | # Draw 10,000 samples for each n,p pair: samples_binomial 81 | for i in range(3): 82 | samples_binomial = np.random.binomial(n[i], p[i], size=10000) 83 | 84 | # Print results 85 | print('n =', n[i], 'Binom:', np.mean(samples_binomial), 86 | np.std(samples_binomial)) 87 | -------------------------------------------------------------------------------- /Binomial_Dist_plot_Update_0317.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 23:26:03 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | from Bernoulli_Trial import perform_bernoulli_trials 13 | from ecdf_func import ecdf 14 | 15 | # Seed random number generator 16 | np.random.seed(42) 17 | 18 | # Take 10,000 samples out of the binomial distribution: n_defaults 19 | n_defaults = np.random.binomial(100, 0.05, size=10000) 20 | 21 | # Compute CDF: x, y 22 | x, y = ecdf(n_defaults) 23 | 24 | # Plot the CDF with axis labels 25 | _ = plt.plot(x, y, marker='.', linestyle='none') 26 | plt.margins(0.002) 27 | plt.xlabel('Defaults out of 100 loans') 28 | plt.ylabel('ECDF') 29 | 30 | # Show the plot 31 | plt.show() 32 | 33 | # ################################################################## # 34 | 35 | # Seed random number generator 36 | np.random.seed(42) 37 | 38 | # Initialize the number of defaults: n_defaults 39 | n_defaults = np.empty(1000) 40 | 41 | # Compute the number of defaults 42 | for i in range(1000): 43 | n_defaults[i] = perform_bernoulli_trials(100, 0.05) 44 | 45 | 46 | # Plot the histogram with default number of bins; label your axes 47 | _ = plt.hist(n_defaults, density=True) 48 | _ = plt.xlabel('number of defaults out of 100 loans') 49 | _ = plt.ylabel('probability') 50 | 51 | # Show the plot 52 | plt.show() 53 | 54 | # Compute bin edges centred on the integers: bins 55 | bins = np.arange(0, max(n_defaults) + 1.5) - 0.5 56 | 57 | # Generate histogram 58 | _ = plt.hist(n_defaults, density=True, bins=bins) 59 | 60 | # Set margins 61 | plt.margins(0.02) 62 | 63 | # Label axes 64 | _ = plt.xlabel('number of defaults out of 100 loans') 65 | _ = plt.ylabel('Binomial PMF') 66 | plt.show() 67 | 68 | # #################################################################### # 69 | 70 | # Draw 10,000 samples out of Poisson distribution: samples_poisson 71 | samples_poisson = np.random.poisson(10, size=10000) 72 | 73 | # Print the mean and standard deviation 74 | print('Poisson: ', np.mean(samples_poisson), 75 | np.std(samples_poisson)) 76 | 77 | # Specify values of n and p to consider for Binomial: n, p 78 | n = [20, 100, 1000] 79 | p = [0.5, 0.1, 0.01] 80 | 81 | 82 | # Draw 10,000 samples for each n,p pair: samples_binomial 83 | for i in range(3): 84 | samples_binomial = np.random.binomial(n[i], p[i], size=10000) 85 | 86 | # Print results 87 | print('n =', n[i], 'Binom:', np.mean(samples_binomial), 88 | np.std(samples_binomial)) 89 |
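# NOTE: the comparison above works because a Binomial(n, p) with large n and
# small p approaches a Poisson with mean n*p. An illustrative check prints the
# theoretical moments (mean n*p, std sqrt(n*p*(1-p))) next to the sampled ones:
for n_i, p_i in zip(n, p):
    print('n =', n_i, 'theory:', n_i * p_i, np.sqrt(n_i * p_i * (1 - p_i)))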
90 | # ##################################################################### # 91 | 92 | # Plotting the Normal PDFs 93 | 94 | # Draw 100000 samples from Normal distribution with stds of interest: 95 | # samples_std1, samples_std3, samples_std10 96 | samples_std1 = np.random.normal(20, 1, size=100000) 97 | samples_std3 = np.random.normal(20, 3, size=100000) 98 | samples_std10 = np.random.normal(20, 10, size=100000) 99 | 100 | # Make histograms 101 | _ = plt.hist(samples_std1, density=True, histtype='step', bins=100) 102 | _ = plt.hist(samples_std3, density=True, histtype='step', bins=100) 103 | _ = plt.hist(samples_std10, density=True, histtype='step', bins=100) 104 | 105 | # Make a legend, set limits and show plot 106 | _ = plt.legend(('std = 1', 'std = 3', 'std = 10')) 107 | plt.ylim(-0.01, 0.42) 108 | plt.show() 109 | 110 | # ######################################## # 111 | 112 | # Plotting the Normal CDF/ECDF 113 | 114 | # Generate CDFs 115 | x_std1, y_std1 = ecdf(samples_std1) 116 | x_std3, y_std3 = ecdf(samples_std3) 117 | x_std10, y_std10 = ecdf(samples_std10) 118 | 119 | # Plot CDFs 120 | _ = plt.plot(x_std1, y_std1, marker='.', linestyle='none') 121 | _ = plt.plot(x_std3, y_std3, marker='.', linestyle='none') 122 | _ = plt.plot(x_std10, y_std10, marker='.', linestyle='none') 123 | 124 | # Make 2% margin 125 | plt.margins(0.02) 126 | 127 | # Make a legend and show the plot 128 | _ = plt.legend(('std = 1', 'std = 3', 'std = 10'), loc='lower right') 129 | plt.show() 130 | 131 | -------------------------------------------------------------------------------- /Bootstrap Variance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 13 21:36:23 2017 4 | 5 | @author: Shabaka 6 | """ 7 |
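# NOTE: this script uses np, plt and draw_bs_reps() without importing or
# defining them; draw_bootstrap_reps.py in this repo likely provides the
# helper. A minimal sketch of the usual definition, plus the imports this
# file needs (`rainfall` and `nohitter_times` are assumed to be 1-D NumPy
# arrays from the lesson data):
import numpy as np
import matplotlib.pyplot as plt

def draw_bs_reps(data, func, size=1):
    """Draw `size` bootstrap replicates of `func` applied to 1-D `data`."""
    return np.array([func(np.random.choice(data, size=len(data)))
                     for _ in range(size)])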
8 | # Generate 10,000 bootstrap replicates of the variance: bs_replicates 9 | bs_replicates = draw_bs_reps(rainfall, np.var, size=10000) 10 | 11 | # Put the variance in units of square centimeters 12 | bs_replicates /= 100 13 | 14 | # Make a histogram of the results 15 | _ = plt.hist(bs_replicates, bins=50, density=True) 16 | _ = plt.xlabel('variance of annual rainfall (sq. cm)') 17 | _ = plt.ylabel('PDF') 18 | 19 | # Show the plot 20 | plt.show() 21 | 22 | 23 | # Draw bootstrap replicates of the mean no-hitter time (equal to tau): 24 | # bs_replicates 25 | bs_replicates = draw_bs_reps(nohitter_times, np.mean, size=10000) 26 | 27 | # Compute the 95% confidence interval: conf_int 28 | conf_int = np.percentile(bs_replicates, [2.5, 97.5]) 29 | 30 | # Print the confidence interval 31 | print('95% confidence interval =', conf_int, 'games') 32 | 33 | # Plot the histogram of the replicates 34 | _ = plt.hist(bs_replicates, bins=50, density=True) 35 | _ = plt.xlabel(r'$\tau$ (games)') 36 | _ = plt.ylabel('PDF') 37 | 38 | # Show the plot 39 | plt.show() 40 | 41 | 42 | def draw_bs_pairs_linreg(x, y, size=1): 43 | """Perform pairs bootstrap for linear regression.""" 44 | 45 | # Set up array of indices to sample from: inds 46 | inds = np.arange(len(x)) 47 | 48 | # Initialize replicates: bs_slope_reps, bs_intercept_reps 49 | bs_slope_reps = np.empty(size) 50 | bs_intercept_reps = np.empty(size) 51 | 52 | # Generate replicates 53 | for i in range(size): 54 | bs_inds = np.random.choice(inds, size=len(inds)) 55 | bs_x, bs_y = x[bs_inds], y[bs_inds] 56 | bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1) 57 | 58 | return bs_slope_reps, bs_intercept_reps 59 | -------------------------------------------------------------------------------- /Bootstrap_Data_Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 12 21:16:16 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Assumes np, plt, ecdf, draw_bs_reps and the lesson arrays are in scope. 9 | for _ in range(50): 10 | # Generate bootstrap sample: bs_sample 11 | bs_sample = np.random.choice(rainfall, size=len(rainfall)) 12 | 13 | # Compute and plot ECDF from bootstrap sample 14 | x, y = ecdf(bs_sample) 15 | _ = plt.plot(x, y, marker='.', linestyle='none', 16 | color='gray', alpha=0.1) 17 | 18 | # Compute and plot ECDF from original data 19 | x, y = ecdf(rainfall) 20 | _ = plt.plot(x, y, marker='.') 21 | 22 | # Make margins and label axes 23 | plt.margins(0.02) 24 | _ = plt.xlabel('yearly rainfall (mm)') 25 | _ = plt.ylabel('ECDF') 26 | 27 | # Show the plot 28 | plt.show() 29 | 30 | 31 | # # COMPUTE MEAN & SEM OF BOOTSTRAP REPLICATES #### # 32 | 33 | # Take 10,000 bootstrap replicates of the mean: bs_replicates 34 | bs_replicates = draw_bs_reps(rainfall, np.mean, 10000) 35 | 36 | # Compute and print SEM 37 | print(np.std(rainfall) / np.sqrt(len(rainfall))) 38 | 39 | # Compute and print standard deviation of bootstrap replicates 40 | print(np.std(bs_replicates)) 41 | 42 | # Make a histogram of the results 43 | _ = plt.hist(bs_replicates, bins=50, density=True) 44 | _ = plt.xlabel('mean annual rainfall (mm)') 45 | _ = plt.ylabel('PDF') 46 | 47 | # Show the plot 48 | plt.show() 49 | 50 | 51 | # ######### PLOTTING BOOTSTRAP REGRESSIONS ###### # 52 | 53 | # Generate array of x-values for bootstrap lines: x 54 | x = np.array([0, 100]) 55 | 56 | # Plot the bootstrap lines 57 | for i in range(100): 58 | _ = plt.plot(x, bs_slope_reps[i] * x + bs_intercept_reps[i], 59 | linewidth=0.5, alpha=0.2, color='red') 60 | 61 | # Plot the data 62 | _ = plt.plot(illiteracy, fertility, marker='.', linestyle='none') 63 | 64 | # Label axes, set the margins, and show the plot 65 | _ = plt.xlabel('illiteracy') 66 | _ = plt.ylabel('fertility') 67 | plt.margins(0.02) 68 | plt.show() --------------------------------------------------------------------------------
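Note: Bootstrap_Data_Analysis.py above plots bs_slope_reps and bs_intercept_reps without showing where they come from; they are the output of draw_bs_pairs_linreg() defined in Bootstrap Variance.py. An illustrative usage sketch, assuming illiteracy and fertility are the lesson's 1-D arrays:

bs_slope_reps, bs_intercept_reps = draw_bs_pairs_linreg(illiteracy, fertility, size=1000)
print('95% CI for the regression slope:', np.percentile(bs_slope_reps, [2.5, 97.5]))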
/Bootstrap_test_One Sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 20 19:59:42 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | # force_a, force_b, forces_concat, empirical_diff_means and draw_bs_reps come from the lesson context. 10 | # Make an array of translated impact forces: translated_force_b 11 | translated_force_b = force_b - np.mean(force_b) + 0.55 12 | 13 | # bootstrap replicates of Frog B's translated impact forces: bs_replicates 14 | bs_replicates = draw_bs_reps(translated_force_b, np.mean, 10000) 15 | 16 | # Compute the fraction of replicates that are <= the observed Frog B mean force: p 17 | p = np.sum(bs_replicates <= np.mean(force_b)) / 10000 18 | print('p-value =', p) 19 | # ##### two sample bootstrap hypothesis test for diff of means ##### # 20 | # Compute mean of all forces: mean_force 21 | mean_force = np.mean(forces_concat) 22 | 23 | # Generate shifted arrays 24 | force_a_shifted = force_a - np.mean(force_a) + mean_force 25 | force_b_shifted = force_b - np.mean(force_b) + mean_force 26 | 27 | # Compute 10,000 bootstrap replicates from shifted arrays 28 | bs_replicates_a = draw_bs_reps(force_a_shifted, np.mean, 10000) 29 | bs_replicates_b = draw_bs_reps(force_b_shifted, np.mean, 10000) 30 | 31 | # Get replicates of difference of means: bs_replicates 32 | bs_replicates = bs_replicates_a - bs_replicates_b 33 | 34 | # Compute and print p-value: p 35 | p = np.sum(bs_replicates >= empirical_diff_means) / 10000 36 | print('p-value =', p) 37 | --------------------------------------------------------------------------------
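Note: when the null hypothesis is that the two samples come from identical distributions (not just equal means), the usual companion to the bootstrap test above is a permutation test; a minimal sketch reusing the same assumed arrays:

def permutation_diff_means(a, b):
    """Pool the two samples, shuffle, re-split, and return the difference of means."""
    pooled = np.random.permutation(np.concatenate((a, b)))
    return np.mean(pooled[:len(a)]) - np.mean(pooled[len(a):])

perm_reps = np.array([permutation_diff_means(force_a, force_b)
                      for _ in range(10000)])
print('permutation p-value =', np.sum(perm_reps >= empirical_diff_means) / len(perm_reps))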
/CSV Import_Panda_Header.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 10 15:03:52 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import pandas as pd 9 | import pandas as pd 10 | 11 | # Assign the filename: file 12 | file = 'fixations.csv' 13 | file2 = 'gaze_postions.csv' 14 | 15 | # Read the file into a DataFrame: df 16 | df = pd.read_csv(file) 17 | df2 = pd.read_csv(file2) 18 | 19 | # View the head of the DataFrame 20 | 21 | print(df.head()) 22 | print(df2.head()) 23 | -------------------------------------------------------------------------------- /CSV_Concatenate_All.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 12:39:59 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from scipy import stats 15 | from mayavi import mlab  # uncommented: mlab is used by the 3-D density plot below 16 | import multiprocessing 17 | import plotly.plotly as py 18 | import plotly.graph_objs as go 19 | from plotly.graph_objs import Surface 20 | 21 | 22 | path = r'C:\Users\Shabaka\Desktop\Test2 DJI_Corretti\100\TIM' 23 | # path = r'C:\DRO\DCL_rawdata_files' 24 | allFiles = glob.glob(path + "/*.csv") 25 | # frame = pd.DataFrame() 26 | list_TIM = [] 27 | for file_ in allFiles: 28 | df_TIM = pd.read_csv(file_, index_col=None, header=0) 29 | list_TIM.append(df_TIM) 30 | frame = pd.concat(list_TIM) # ignore_index=True) 31 | 32 | print(frame.head()) 33 | 34 | # sns.heatmap(frame.head()) 35 | 36 | plt.show() 37 | 38 | temp = pd.read_csv('C:\\Users\\Shabaka\\Desktop\\Temperatura_Media.csv') 39 | # Plot the temperature time series in blue 40 | print(temp.head()) 41 | plt.plot(temp, color='blue', label='Temp_Median..(yr)') 42 | 43 | plt.show() 44 | 45 | 46 | # Plot the pairwise joint distributions grouped by 'origin' along with 47 | # regression lines 48 | # sns.pairplot(temp, kind='reg', hue='Temp_Med') 49 | # plt.show() 50 | 51 | # urb_pop_reader = pd.read_csv(filename, chunksize=1000) 52 | 53 | """ 54 | files = glob("*.txt") 55 | fig, ax = plt.subplots() 56 | 57 | for f in files: 58 | print("Current file is"+f) 59 | #your csv loading into data 60 | data.plot('time','temp',ax=axes[0]) 61 | 62 | #outside of the for loop 63 | plt.savefig("myplots.png") 64 | 65 | """ 66 | 67 | # ''''''''''''3D Density Map Plot ''''''''''# 68 | 69 | def calc_kde(data): 70 | return kde(data.T)  # uses the global `kde` defined below 71 | 72 | mu, sigma = 0, 0.1 73 | x = 10*np.random.normal(mu, sigma, 5000) 74 | y = 10*np.random.normal(mu, sigma, 5000) 75 | z = 10*np.random.normal(mu, sigma, 5000) 76 | 77 | xyz = np.vstack([x, y, z]) 78 | kde = stats.gaussian_kde(xyz) 79 | 80 | # Evaluate kde on a grid 81 | xmin, ymin, zmin = x.min(), y.min(), z.min() 82 | xmax, ymax, zmax = x.max(), y.max(), z.max() 83 | xi, yi, zi = np.mgrid[xmin:xmax:30j, ymin:ymax:30j, zmin:zmax:30j] 84 | coords = np.vstack([item.ravel() for item in [xi, yi, zi]]) 85 | 86 | # Multiprocessing 87 | cores = multiprocessing.cpu_count() 88 | pool = multiprocessing.Pool(processes=cores) 89 | results = pool.map(calc_kde, np.array_split(coords.T, 2)) 90 | density = np.concatenate(results).reshape(xi.shape) 91 | 92 | # Plot scatter with mayavi 93 | figure = mlab.figure('DensityPlot') 94 | 95 | grid = mlab.pipeline.scalar_field(xi, yi, zi, density) 96 | dmin = density.min()  # renamed from min/max to avoid shadowing the built-ins 97 | dmax = density.max() 98 | mlab.pipeline.volume(grid, vmin=dmin, vmax=dmin + .5*(dmax-dmin)) 99 | 100 | mlab.axes() 101 | mlab.show() 102 | 103 | 104 | # '''''''' Alternative Route'''''''''''''# 105 | filename = 'C:\\Users\\Shabaka\\Desktop\\Temperatura_Media.csv' 106 | raw_data = open(filename, 'rt') 107 | tempdata = pd.read_csv(raw_data, header=0) 108 | print(tempdata.shape) 109 | 110 | print(tempdata.head()) 111 | 112 | plt.plot(tempdata, color='blue', label='Temp_Med') 113 | 114 | plt.show() 115 | 116 | sns.pairplot(tempdata, kind='reg') # hue='Temp_Med') 117 | plt.show() 118 | 119 | surfdata = [go.Surface(tempdata.values)]  # .values replaces the removed .as_matrix() 120 | 121 | layout = go.Layout( 122 | title='Temp_Data Elevation', 123 | autosize=False, 124 | width=500, 125 | height=500, 126 | margin=dict( 127 | l=65, 128 | r=50, 129 | b=65, 130 | t=90 131 | ) 132 | ) 133 | fig = go.Figure(data=surfdata, layout=layout) 134 | py.iplot(fig, filename='elevations-3d-surface', type='surface') 135 | 136 | plt.show() -------------------------------------------------------------------------------- /CSV_Import-All2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 12:51:39 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | 15 | # path = r'C:\DRO\DCL_rawdata_files' 16 | 17 | path = r'C:\Users\Shabaka\Desktop\Test2 DJI_Corretti\100\TIM' 18 | allfiles = glob.glob(os.path.join(path, "*.csv"))  # glob resolves the pattern to real file paths 19 | list2 = [] 20 | for file_ in allfiles: 21 | df = pd.read_csv(file_, index_col=None, header=None) 22 | list2.append(df) 23 | frame = pd.concat(list2, ignore_index=True) 24 | 25 | print(frame.head()) 26 | 27 | 28 | # The same concatenation as a single expression 29 | df = pd.concat((pd.read_csv(f) for f in allfiles)) 30 | 31 | print(df.head()) --------------------------------------------------------------------------------
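Note: CSV_Concatenate_All.py and CSV_Import-All2.py above implement the same pattern; a compact reference version under the same assumptions (`path` as defined in those scripts; sorted() makes the row order deterministic):

import glob
import os
import pandas as pd

files = sorted(glob.glob(os.path.join(path, '*.csv')))
combined = pd.concat(map(pd.read_csv, files), ignore_index=True)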
/CSV_Reader_Test2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 12:18:36 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | 12 | 13 | def concatenate(indir='', outfile='', colnames=None): 14 | os.chdir(indir) 15 | fileList = glob.glob('*.csv') 16 | dfList = [] 17 | 18 | for filename in fileList: 19 | print(filename) 20 | df = pd.read_csv(filename, header=None) 21 | dfList.append(df) 22 | concatDF = pd.concat(dfList, axis=0) 23 | if colnames is not None: 24 | concatDF.columns = colnames 25 | concatDF.to_csv(outfile, index=False) 26 | -------------------------------------------------------------------------------- /Calc_Diff_Btw_Columns.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 23 02:16:54 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | from sqlalchemy import desc 9 | # `select`, `census` and `connection` are assumed from the census-database setup (see the sketch after this file). 10 | # Build query to return state names by population difference from 2008 to 2000: 11 | # stmt 12 | stmt = select([census.columns.state, (census.columns.pop2008 - census.columns.pop2000).label('pop_change')]) 13 | 14 | # Append group by for the state: stmt 15 | stmt = stmt.group_by(census.columns.state) 16 | 17 | # Append order by for pop_change in descending order: stmt 18 | stmt = stmt.order_by(desc('pop_change')) 19 | 20 | # Return only 5 results: stmt 21 | stmt = stmt.limit(5) 22 | 23 | # Use connection to execute the statement and fetch all results 24 | results = connection.execute(stmt).fetchall() 25 | 26 | # Print the state and population change for each record 27 | for result in results: 28 | print('{}-{}'.format(result.state, result.pop_change)) 29 | --------------------------------------------------------------------------------
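Note: Calc_Diff_Btw_Columns.py above (and the census/state_fact statements in DB_Tables_Update.py below) assume a reflected census database. A minimal SQLAlchemy 1.x-style setup sketch, with the database path assumed:

from sqlalchemy import create_engine, MetaData, Table, select

engine = create_engine('sqlite:///census.sqlite')
connection = engine.connect()
metadata = MetaData()
census = Table('census', metadata, autoload=True, autoload_with=engine)
state_fact = Table('state_fact', metadata, autoload=True, autoload_with=engine)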
29 | """ 30 | 31 | # 'Retrieve column of table called Album in the chinook database' 32 | 33 | # Open engine connection: con 34 | con = engine.connect() 35 | 36 | # Perform query: rs 37 | rs = con.execute('SELECT * FROM Album') 38 | 39 | # Save results of the query to DataFrame: df 40 | df = pd.DataFrame(rs.fetchall()) 41 | -------------------------------------------------------------------------------- /Customer Segmentation_Code_Spyder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Feb 25 18:00:11 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | 10 | df_offers = pd.read_excel("http://blog.yhathq.com/static/misc/data/WineKMC.xlsx", sheetname=0) 11 | df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"] 12 | df_offers.head() 13 | 14 | df_transactions = pd.read_excel("http://blog.yhathq.com/static/misc/data/WineKMC.xlsx", sheetname=1) 15 | df_transactions.columns = ["customer_name", "offer_id"] 16 | df_transactions['n'] = 1 17 | df_transactions.head() 18 | 19 | # join the offers and transactions table 20 | df = pd.merge(df_offers, df_transactions) 21 | # create a "pivot table" which will give us the number of times each customer responded to a given offer 22 | matrix = df.pivot_table(index=['customer_name'], columns=['offer_id'], values='n') 23 | # a little tidying up. fill NA values with 0 and make the index into a column 24 | matrix = matrix.fillna(0).reset_index() 25 | # save a list of the 0/1 columns. we'll use these a bit later 26 | x_cols = matrix.columns[1:] 27 | 28 | from sklearn.cluster import KMeans 29 | 30 | cluster = KMeans(n_clusters=5) 31 | # slice matrix so we only include the 0/1 indicator columns in the clustering 32 | matrix['cluster'] = cluster.fit_predict(matrix[matrix.columns[2:]]) 33 | matrix.cluster.value_counts() 34 | 35 | from sklearn.decomposition import PCA 36 | 37 | pca = PCA(n_components=2) 38 | matrix['x'] = pca.fit_transform(matrix[x_cols])[:,0] 39 | matrix['y'] = pca.fit_transform(matrix[x_cols])[:,1] 40 | matrix = matrix.reset_index() 41 | 42 | customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']] 43 | customer_clusters.head() 44 | 45 | df = pd.merge(df_transactions, customer_clusters) 46 | df = pd.merge(df_offers, df) 47 | 48 | from ggplot import * 49 | """ 50 | import matplotlib.pyplot as plt 51 | plt.figure() 52 | plt.plot(rigs2) 53 | plt.plot(customer_clusters) 54 | plt.ion() 55 | plt.show() 56 | """ 57 | ggplot(df, aes(x='x', y='y', color='cluster')) + \ 58 | geom_point(size=75) + \ 59 | ggtitle("Customers Grouped by Cluster") 60 | 61 | cluster_centers = pca.transform(cluster.cluster_centers_) 62 | cluster_centers = pd.DataFrame(cluster_centers, columns=['x', 'y']) 63 | cluster_centers['cluster'] = range(0, len(cluster_centers)) 64 | 65 | ggplot(df, aes(x='x', y='y', color='cluster')) + \ 66 | geom_point(size=75) + \ 67 | geom_point(cluster_centers, size=500) +\ 68 | ggtitle("Customers Grouped by Cluster") 69 | 70 | df['is_4'] = df.cluster==4 71 | df.groupby("is_4").varietal.value_counts() 72 | 73 | df.groupby("is_4")[['min_qty', 'discount']].mean() 74 | -------------------------------------------------------------------------------- /CustomizeSQL_Query_Col_Row_Size.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 14:32:12 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | """ 9 | 10 | 
/CustomizeSQL_Query_Col_Row_Size.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 14:32:12 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | """ 9 | 10 | Open the engine connection as con using the method connect() on the engine. 11 | Execute the query that selects ALL columns from the Album table. Store the 12 | results in rs. 13 | Store all of your query results in the DataFrame df by applying the 14 | fetchall() method to the results rs. 15 | Close the connection! - In Query Script 16 | """ 17 | 18 | # 'This script allows us to perform the following things:' 19 | 20 | # Select specified columns from a table; 21 | # Select a specified number of rows; 22 | # Import column names from the database table. 23 | 24 | 25 | from sqlalchemy import create_engine 26 | import pandas as pd 27 | 28 | engine = create_engine('sqlite:///Chinook.sqlite') 29 | 30 | # Open engine in context manager 31 | # Perform query and save results to DataFrame: df 32 | with engine.connect() as con: 33 | rs = con.execute("SELECT LastName, Title FROM Employee") 34 | df = pd.DataFrame(rs.fetchmany(size=3)) 35 | df.columns = rs.keys() 36 | 37 | # Print the length of the DataFrame df 38 | print(len(df)) 39 | 40 | # Print the head of the DataFrame df 41 | print(df.head()) 42 | -------------------------------------------------------------------------------- /DB_Tables_Update.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 4 17:05:28 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # '''''''''''' Panda SQL Query ''''''''''''# 9 | # Import packages 10 | import sqlite3 11 | from sqlalchemy import create_engine 12 | from sqlalchemy import update 13 | # NOTE: SQLAlchemy has no top-level `connection` import; a Connection object is obtained from engine.connect() below 14 | import pandas as pd 15 | # Import insert and select from sqlalchemy 16 | from sqlalchemy import insert, select 17 | # Create engine: engine 18 | engine = create_engine('sqlite:///Chinook.sqlite') 19 | connection = engine.connect() 20 | # Execute query and store records in DataFrame: df 21 | df = pd.read_sql_query("SELECT * FROM Album", engine) 22 | 23 | # Print head of DataFrame 24 | 25 | print(df.head()) 26 | 27 | # Open engine in context manager 28 | # Perform query and save results to DataFrame: df1 29 | 30 | with engine.connect() as con: 31 | rs = con.execute("SELECT * FROM Album") 32 | df1 = pd.DataFrame(rs.fetchall()) 33 | df1.columns = rs.keys() 34 | 35 | # Confirm that both methods yield the same result: does df = df1 ? 36 | 37 | print(df.equals(df1)) 38 |
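# NOTE: the `data`, `census`, `state_fact` and `flat_census` tables used below
# are never defined in this file; in the lesson they belong to a census
# database. A minimal, illustrative sketch for the `data` table (column names
# and types assumed):
from sqlalchemy import Table, Column, String, Integer, Float, Boolean, MetaData
metadata = MetaData()
data = Table('data', metadata,
             Column('name', String(255)),
             Column('count', Integer()),
             Column('amount', Float()),
             Column('valid', Boolean()))
metadata.create_all(engine)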
39 | # ''''''''''''#######'''''''##################''''''''# 40 | 41 | # Build an insert statement to insert a record into the data table: stmt 42 | 43 | stmt = insert(data).values(name='Anna', count=1, amount=1000.00, valid=True) 44 | 45 | # Execute the statement via the connection: results 46 | 47 | results = connection.execute(stmt) 48 | 49 | # Print result rowcount 50 | 51 | print(results.rowcount) 52 | 53 | # Build a select statement to validate the insert 54 | 55 | stmt = select([data]).where(data.columns.name == 'Anna') 56 | 57 | # Print the result of executing the query. 58 | 59 | print(connection.execute(stmt).first()) 60 | 61 | # '''''''''###########'''''''''''''''' # 62 | # ''''''####'''''''''''''##########'''''''''# 63 | 64 | # Create an insert statement for census: stmt 65 | 66 | stmt = insert(census) 67 | 68 | # Create an empty list and zeroed row count: values_list, total_rowcount 69 | 70 | values_list = [] 71 | total_rowcount = 0 72 | 73 | # Enumerate the rows of csv_reader (a csv.reader over the census CSV, assumed open) 74 | for idx, row in enumerate(csv_reader): 75 | # create data and append to values_list 76 | data = {'state': row[0], 'sex': row[1], 'age': row[2], 'pop2000': row[3], 77 | 'pop2008': row[4]} 78 | values_list.append(data) 79 | 80 | # Check to see if divisible by 51 81 | if idx % 51 == 0: 82 | results = connection.execute(stmt, values_list) 83 | total_rowcount += results.rowcount 84 | values_list = [] 85 | 86 | 87 | # Build a select statement: select_stmt 88 | select_stmt = select([state_fact]).where(state_fact.columns.name == 'New York') 89 | 90 | # Print the results of executing the select_stmt 91 | print(connection.execute(select_stmt).fetchall()) 92 | 93 | # Build a statement to update the fips_state to 36: stmt 94 | stmt = update(state_fact).values(fips_state=36) 95 | 96 | # Append a where clause to limit it to records for New York state 97 | stmt = stmt.where(state_fact.columns.name == 'New York') 98 | 99 | # Execute the statement: results 100 | results = connection.execute(stmt) 101 | 102 | # Print rowcount 103 | print(results.rowcount) 104 | 105 | # Execute the select_stmt again to view the changes 106 | print(connection.execute(select_stmt).fetchall()) 107 | 108 | 109 | # ''''''''''''' Update Multiple Records ''''''# 110 | 111 | # Build a statement to update the notes to 'The Wild West': stmt 112 | stmt = update(state_fact).values(notes='The Wild West') 113 | 114 | # Append a where clause to match the West census region records 115 | stmt = stmt.where(state_fact.columns.census_region_name == 'West') 116 | 117 | # Execute the statement: results 118 | results = connection.execute(stmt) 119 | 120 | # Print rowcount 121 | print(results.rowcount) 122 | 123 | # ''''''''''' Making Correlated Updates ''' ######## 124 | 125 | # Build a statement to select name from state_fact: stmt 126 | fips_stmt = select([state_fact.columns.name]) 127 | 128 | # Append a where clause to match the fips_state to flat_census fips_code 129 | fips_stmt = fips_stmt.where( 130 | state_fact.columns.fips_state == flat_census.columns.fips_code) 131 | 132 | # Build an update statement to set the name to fips_stmt: update_stmt 133 | update_stmt = update(flat_census).values(state_name=fips_stmt) 134 | 135 | # Execute update_stmt: results 136 | results = connection.execute(update_stmt) 137 | 138 | # Print rowcount 139 | print(results.rowcount) 140 | 141 | 142 | -------------------------------------------------------------------------------- /DatStream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 30 21:27:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt  # used by the chunked-plot section at the end of this script 10 | # Open a connection to the file 11 | with open('world_dev_ind.csv') as file: 12 | 13 | # Skip the column names 14 | file.readline() 15 | 16 | # Initialize an empty dictionary: counts_dict 17 | counts_dict = {} 18 | 19 | # Process only the first 1000 rows 20 | for j in range(1000): 21 | 22 | # Split the current line into a list: line 23 | line = file.readline().split(',') 24 | 25 | # Get the value for the first column: first_col 26 | 
first_col = line[0] 27 | 28 | # If the column value is in the dict, increment its value 29 | if first_col in counts_dict.keys(): 30 | counts_dict[first_col] += 1 31 | 32 | # Else, add to the dict and set value to 1 33 | else: 34 | counts_dict[first_col] = 1 35 | 36 | # Print the resulting dictionary 37 | print(counts_dict) 38 | 39 | # ''''''''''''''''' Write Generator to Load Data Chunks ''''''' # 40 | 41 | # Define read_large_file() 42 | def read_large_file(file_object): 43 | """A generator function to read a large file lazily.""" 44 | 45 | # Loop indefinitely until the end of the file 46 | while True: 47 | 48 | # Read a line from the file: data 49 | data = file_object.readline() 50 | 51 | # Break if this is the end of the file 52 | if not data: 53 | break 54 | 55 | # Yield the line of data 56 | yield data 57 | # Open a connection to the file 58 | with open('world_dev_ind.csv') as file: 59 | 60 | # Create a generator object for the file: gen_file 61 | gen_file = read_large_file(file) 62 | 63 | # Print the first three lines of the file 64 | print(next(gen_file)) 65 | print(next(gen_file)) 66 | print(next(gen_file)) 67 | 68 | 69 | # ''''''''''''''' Load Data in Chunks with Generator ''''''''''' '# 70 | # Initialize an empty dictionary: counts_dict 71 | counts_dict = {} 72 | 73 | # Open a connection to the file 74 | with open('world_dev_ind.csv') as file: 75 | 76 | # Iterate over the generator from read_large_file() 77 | for line in read_large_file(file): 78 | 79 | row = line.split(',') 80 | first_col = row[0] 81 | 82 | if first_col in counts_dict.keys(): 83 | counts_dict[first_col] += 1 84 | else: 85 | counts_dict[first_col] = 1 86 | 87 | # Print 88 | print(counts_dict) 89 | 90 | # ''''' Iterator to load data in chunks ''''''''''' # 91 | 92 | # Import the pandas package 93 | 94 | # Initialize reader object: df_reader 95 | df_reader = pd.read_csv('ind_pop.csv', chunksize=10) 96 | 97 | # Print two chunks 98 | print(next(df_reader)) 99 | print(next(df_reader)) 100 | 101 | # ''''''''''''' Iterator to Load Data in Chunks '''''''''''# 102 | 103 | # Initialize reader object: urb_pop_reader 104 | urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000) 105 | 106 | # Get the first dataframe chunk: df_urb_pop 107 | df_urb_pop = next(urb_pop_reader) 108 | 109 | # Check out the head of the dataframe 110 | print(df_urb_pop.head()) 111 | 112 | # Check out specific country: df_pop_ceb 113 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'] 114 | 115 | # Zip dataframe columns of interest: pops 116 | pops = zip(df_pop_ceb['Total Population'], 117 | df_pop_ceb['Urban population (% of total)']) 118 | 119 | # Turn zip object into list: pops_list 120 | pops_list = list(pops) 121 | 122 | # Print pops_list 123 | print(pops_list) 124 | 125 | 126 | # Use list comp to create new dataframe column 'Total Urban Population' 127 | 128 | df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1]) for tup in pops_list] 129 | 130 | # Plot urban population data 131 | 132 | df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population') 133 | plt.show() 134 | 135 | -------------------------------------------------------------------------------- /DatVis_Bokeh_1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 16 17:04:55 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from bokeh.plotting import figure 11 | from bokeh.io import output_file, show 12 | from 
bokeh.plotting import ColumnDataSource 13 | from bokeh.models import HoverTool 14 | 15 | # Create the figure: p 16 | p = figure(x_axis_label='fertility (children per woman)', 17 | y_axis_label='female_literacy (% population)') 18 | 19 | # Add a circle glyph to the figure p 20 | p.circle(fertility, female_literacy) 21 | 22 | # Call the output_file() function and specify the name of the file 23 | output_file('fert_lit.html') 24 | 25 | # Display the plot 26 | show(p) 27 | 28 | # ''''''''''''' Multiple Dta plots ''''''# 29 | 30 | # Create the figure: p 31 | p = figure(x_axis_label='fertility', 32 | y_axis_label='female_literacy (% population)') 33 | 34 | # Add a circle glyph to the figure p 35 | _ = p.circle(fertility_latinamerica, female_literacy_latinamerica) 36 | 37 | # Add an x glyph to the figure p 38 | _ = p.x(fertility_africa, female_literacy_africa) 39 | 40 | # Specify the name of the file 41 | output_file('fert_lit_separate.html') 42 | 43 | # Display the plot 44 | show(p) 45 | 46 | # '''''Scatter Plot Customisation '''''''# 47 | 48 | # Create the figure: p 49 | p = figure(x_axis_label='fertility (children per woman)', 50 | y_axis_label='female_literacy (% population)') 51 | 52 | # Add a blue circle glyph to the figure p 53 | p.circle(fertility_latinamerica, female_literacy_latinamerica, 54 | color='blue', size=10, alpha=0.8) 55 | 56 | # Add a red circle glyph to the figure p 57 | p.circle(fertility_africa, female_literacy_africa, 58 | color='red', size=10, alpha=0.8) 59 | 60 | # Specify the name of the file 61 | output_file('fert_lit_separate_colors.html') 62 | 63 | # Display the plot 64 | show(p) 65 | 66 | 67 | # ''''Bokeh Line PLot '''''''''''# 68 | 69 | # Import figure from bokeh.plotting - to p of file 70 | 71 | # Create a figure with x_axis_type="datetime": p 72 | p = figure(x_axis_type='datetime', 73 | x_axis_label='Date', y_axis_label='US Dollars') 74 | 75 | # Plot date along the x axis and price along the y axis 76 | p.line(date, price, line_width=3) 77 | 78 | # Specify the name of the output file and show the result 79 | output_file('line.html') 80 | show(p) 81 | 82 | # '''''Line and Marker Plot ''''''''# 83 | 84 | # Import figure from bokeh.plotting - top of file 85 | 86 | # Create a figure with x_axis_type='datetime': p 87 | p = figure(x_axis_type='datetime', x_axis_label='Date', 88 | y_axis_label='US Dollars') 89 | 90 | # Plot date along the x-axis and price along the y-axis 91 | p.line(date, price) 92 | 93 | # With date on the x-axis and price on the y-axis, 94 | # add a white circle glyph of size 4 95 | p.circle(date, price, fill_color='white', size=4) 96 | 97 | # Specify the name of the output file and show the result 98 | output_file('line.html') 99 | show(p) 100 | 101 | # ''''''Bokeh Patch Plots 'Maps' ''# 102 | 103 | # Create a list of az_lons, co_lons, nm_lons and ut_lons: x 104 | x = [az_lons, co_lons, nm_lons, ut_lons] 105 | 106 | # Create a list of az_lats, co_lats, nm_lats and ut_lats: y 107 | y = [az_lats, co_lats, nm_lats, ut_lats] 108 | 109 | # Add patches to figure p with line_color=white for x and y 110 | p.patches(x, y, line_color='white') 111 | 112 | # Specify the name of the output file and show the result 113 | output_file('four_corners.html') 114 | show(p) 115 | 116 | 117 | # ''''''''' Plotting from a numpy array ''''''# 118 | 119 | # Import numpy as np - at top of file 120 | 121 | # Create array using np.linspace: x 122 | x = np.linspace(0, 5, 100) 123 | 124 | # Create array using np.cos: y 125 | y = np.cos(x) 126 | 127 | # Add circles at x and y 128 
| p.circle(x, y) 129 | 130 | # Specify the name of the output file and show the result 131 | output_file('numpy.html') 132 | show(p) 133 | 134 | # '''''''' Plotting from Pandas Dataframe ''''''''# 135 | 136 | # Import pandas as pd - top of file 137 | 138 | # Read in the CSV file: df 139 | df = pd.read_csv('auto.csv') 140 | 141 | # Import figure from bokeh.plotting - top of file 142 | 143 | # Create the figure: p 144 | p = figure(x_axis_label='HP', y_axis_label='MPG') 145 | 146 | # Plot mpg vs hp by color 147 | p.circle(df['hp'], df['mpg'], color=df['color'], size=10) 148 | 149 | # Specify the name of the output file and show the result 150 | output_file('auto-df.html') 151 | show(p) 152 | 153 | # '''''''' Plot from ColumnData Source ''''''''# 154 | 155 | # Import the ColumnDataSource class from bokeh.plotting 156 | 157 | # Create a ColumnDataSource from df: source 158 | source = ColumnDataSource(df) 159 | 160 | # Add circle glyphs to the figure p 161 | p.circle('Year', 'Time', source=source, color='color', size=8) 162 | 163 | # Specify the name of the output file and show the result 164 | output_file('sprint.html') 165 | show(p) 166 | 167 | # '''''''Selection and non-Selection Glyph Specification ''''# 168 | 169 | # Create a figure with the "box_select" tool: p 170 | p = figure(x_axis_label='Year', y_axis_label='Time', tools='box_select') 171 | 172 | # Add circle glyphs to the figure p with the selected 173 | # and non-selected properties 174 | 175 | p.circle('Year', 'Time', source=source, 176 | selection_color='red', nonselection_alpha=0.1) 177 | 178 | # Specify the name of the output file and show the result 179 | output_file('selection_glyph.html') 180 | show(p) 181 | 182 | # ''''''making Hover Glyphs '''''''# 183 | 184 | # import the HoverTool - at top of file 185 | 186 | # Add circle glyphs to figure p 187 | p.circle(x, y, size=10, 188 | fill_color='grey', alpha=0.1, line_color=None, 189 | hover_fill_color='firebrick', hover_alpha=0.5, 190 | hover_line_color='white') 191 | 192 | # Create a HoverTool: hover 193 | hover = HoverTool(tooltips=None, mode='vline') 194 | 195 | # Add the hover tool to the figure p 196 | p.add_tools(hover) 197 | 198 | # Specify the name of the output file and show the result 199 | output_file('hover_glyph.html') 200 | show(p) 201 | 202 | # ''''''''' Color Mapping '''''''''''# 203 | 204 | #Import CategoricalColorMapper from bokeh.models 205 | from bokeh.models import CategoricalColorMapper 206 | 207 | # Convert df to a ColumnDataSource: source 208 | source = ColumnDataSource(df) 209 | 210 | # Make a CategoricalColorMapper object: color_mapper 211 | color_mapper = CategoricalColorMapper(factors=['Europe', 'Asia', 'US'], 212 | palette=['red', 'green', 'blue']) 213 | 214 | # Add a circle glyph to the figure p 215 | p.circle('weight', 'mpg', source=source, 216 | color=dict(field='origin', transform=color_mapper), 217 | legend='origin') 218 | 219 | # Specify the name of the output file and show the result 220 | output_file('colormap.html') 221 | show(p) 222 | -------------------------------------------------------------------------------- /DatVis_Bokeh_CaseStudy_App_Build_5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu May 18 15:54:33 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from bokeh.io import output_file, show 12 | 13 | from bokeh.plotting import figure 14 | from bokeh.plotting import ColumnDataSource 15 | 16 | 
from bokeh.layouts import gridplot 17 | from bokeh.layouts import row, column 18 | from bokeh.layouts import widgetbox 19 | 20 | from bokeh.charts import BoxPlot 21 | from bokeh.charts import Scatter 22 | 23 | from bokeh.palettes import Spectral6 24 | 25 | from bokeh.models import Select 26 | from bokeh.models import Slider 27 | from bokeh.models import Button 28 | from bokeh.models import HoverTool 29 | from bokeh.models import CategoricalColorMapper 30 | from bokeh.models import CheckboxGroup, RadioGroup, Toggle 31 | 32 | from bokeh.models.widgets import Panel 33 | from bokeh.models.widgets import Tabs 34 | 35 | # Perform necessary imports 36 | from bokeh.io import curdoc 37 | 38 | data = pd.read_csv('___.csv') 39 | _ = data.head() 40 | _ = data.describe 41 | _ = data.info() 42 | _ = data.shape 43 | 44 | 45 | 46 | # '''' Basic EDA Plot of Gapminder Data set ''''''''# 47 | 48 | # Make the ColumnDataSource: source 49 | source = ColumnDataSource(data={ 50 | 'x' : data.loc[1970].fertility, 51 | 'y' : data.loc[1970].life, 52 | 'country' : data.loc[1970].Country, 53 | }) 54 | 55 | # Create the figure: p 56 | p = figure(title='1970', x_axis_label='Fertility (children per woman)', 57 | y_axis_label='Life Expectancy (years)', 58 | plot_height=400, plot_width=700, 59 | tools=[HoverTool(tooltips='@country')]) 60 | 61 | # Add a circle glyph to the figure p 62 | p.circle(x='x', y='y', source=source) 63 | 64 | # Output the file and show the figure 65 | output_file('gapminder.html') 66 | show(p) 67 | 68 | # ''' Basic Data Plot '''''''# 69 | 70 | # Make the ColumnDataSource: source 71 | source = ColumnDataSource(data={ 72 | 'x' : data.loc[1970].fertility, 73 | 'y' : data.loc[1970].life, 74 | 'country' : data.loc[1970].Country, 75 | 'pop' : (data.loc[1970].population / 20000000) + 2, 76 | 'region' : data.loc[1970].region, 77 | }) 78 | 79 | # Save the minimum and maximum values of the fertility column: xmin, xmax 80 | xmin, xmax = min(data.fertility), max(data.fertility) 81 | 82 | # Save the minimum and maximum values of the life expectancy column: ymin, ymax 83 | ymin, ymax = min(data.life), max(data.life) 84 | 85 | # Create the figure: plot 86 | plot = figure(title='Gapminder Data for 1970', plot_height=400, plot_width=700, 87 | x_range=(xmin, xmax), y_range=(ymin, ymax)) 88 | 89 | # Add circle glyphs to the plot 90 | plot.circle(x='x', y='y', fill_alpha=0.8, source=source) 91 | 92 | # Set the x-axis label 93 | plot.xaxis.axis_label = 'Fertility (children per woman)' 94 | 95 | # Set the y-axis label 96 | plot.yaxis.axis_label = 'Life Expectancy (years)' 97 | 98 | # Add the plot to the current document and add a title 99 | curdoc().add_root(plot) 100 | curdoc().title = 'Gapminder' 101 | 102 | 103 | # ''''' Enhancing the list with some colours ''''# 104 | 105 | # Make a list of the unique values from the region column: regions_list 106 | regions_list = data.region.unique().tolist() 107 | 108 | # Import CategoricalColorMapper from bokeh.models and 109 | # the Spectral6 palette from bokeh.palettes 110 | 111 | # Make a color mapper: color_mapper 112 | color_mapper = CategoricalColorMapper(factors=regions_list, palette=Spectral6) 113 | 114 | # Add the color mapper to the circle glyph 115 | plot.circle(x='x', y='y', fill_alpha=0.8, source=source, 116 | color=dict(field='region', transform=color_mapper), 117 | legend='region') 118 | 119 | # Set the legend.location attribute of the plot to 'top_right' 120 | plot.legend.location = 'top_right' 121 | 122 | # Add the plot to the current document and add the title 
123 | curdoc().add_root(plot) 124 | curdoc().title = 'Gapminder' 125 | 126 | 127 | # '''''' Adding a Slider to vary the year ''''''# 128 | 129 | # Define the callback function: update_plot 130 | def update_plot(attr, old, new): 131 | # set the `yr` name to `slider.value 132 | # and `source.data = new_data` 133 | yr = slider.value 134 | new_data = { 135 | 'x': data.loc[yr].fertility, 136 | 'y': data.loc[yr].life, 137 | 'country': data.loc[yr].Country, 138 | 'pop': (data.loc[yr].population / 20000000) + 2, 139 | 'region': data.loc[yr].region, 140 | } 141 | source.data = new_data 142 | 143 | 144 | # Make a slider object: slider 145 | slider = Slider(start=1970, end=2010, step=1, value=1970, title='Year') 146 | 147 | # Attach the callback to the 'value' property of slider 148 | slider.on_change('value', update_plot) 149 | 150 | # Make a row layout of widgetbox(slider) and plot 151 | # and add it to the current document 152 | layout = row(widgetbox(slider), plot) 153 | curdoc().add_root(layout) 154 | 155 | # ''''' Customised Plot API from user input '''# 156 | 157 | # Define the callback function: update_plot 158 | def update_plot(attr, old, new): 159 | # Assign the value of the slider: yr 160 | yr = slider.value 161 | # Set new_data 162 | new_data = { 163 | 'x' : data.loc[yr].fertility, 164 | 'y' : data.loc[yr].life, 165 | 'country' : data.loc[yr].Country, 166 | 'pop' : (data.loc[yr].population / 20000000) + 2, 167 | 'region' : data.loc[yr].region, 168 | } 169 | # Assign new_data to: source.data 170 | source.data = new_data 171 | 172 | # Add title to figure: plot.title.text 173 | plot.title.text = 'Gapminder data for %d' % yr 174 | 175 | # Make a slider object: slider 176 | slider = Slider(start=1970, end=2010, step=1, value=1970, title='Year') 177 | 178 | # Attach the callback to the 'value' property of slider 179 | slider.on_change('value', update_plot) 180 | 181 | # Make a row layout of widgetbox(slider) and 182 | # plot and add it to the current document 183 | layout = row(widgetbox(slider), plot) 184 | curdoc().add_root(layout) 185 | 186 | # '''' Add Hover info_tool to the API '''''''# 187 | 188 | # Create a HoverTool: hover 189 | hover = HoverTool(tooltips=[('Country', '@country')]) 190 | 191 | # Add the HoverTool to the plot 192 | plot.add_tools(hover) 193 | # Create layout: layout 194 | layout = row(widgetbox(slider), plot) 195 | 196 | # Add layout to current document 197 | curdoc().add_root(layout) 198 | 199 | # '''''''Adding drop-down menu to the App ''''''''''# 200 | 201 | # Define the callback: update_plot 202 | def update_plot(attr, old, new): 203 | # Read the current value off the slider and 2 dropdowns: yr, x, y 204 | yr = slider.value 205 | x = x_select.value 206 | y = y_select.value 207 | # Label axes of plot 208 | plot.xaxis.axis_label = x 209 | plot.yaxis.axis_label = y 210 | # Set new_data 211 | new_data = { 212 | 'x' : data.loc[yr][x], 213 | 'y' : data.loc[yr][y], 214 | 'country' : data.loc[yr].Country, 215 | 'pop' : (data.loc[yr].population / 20000000) + 2, 216 | 'region' : data.loc[yr].region, 217 | } 218 | # Assign new_data to source.data 219 | source.data = new_data 220 | 221 | # Set the range of all axes 222 | plot.x_range.start = min(data[x]) 223 | plot.x_range.end = max(data[x]) 224 | plot.y_range.start = min(data[y]) 225 | plot.y_range.end = max(data[y]) 226 | 227 | # Add title to plot 228 | plot.title.text = 'Gapminder data for %d' % yr 229 | 230 | # Create a dropdown slider widget: slider 231 | slider = Slider(start=1970, end=2010, step=1, value=1970, 
title='Year') 232 | 233 | # Attach the callback to the 'value' property of slider 234 | slider.on_change('value', update_plot) 235 | 236 | # Create a dropdown Select widget for the x data: x_select 237 | x_select = Select( 238 | options=['fertility', 'life', 'child_mortality', 'gdp'], 239 | value='fertility', 240 | title='x-axis data' 241 | ) 242 | 243 | # Attach the update_plot callback to the 244 | # 'value' property of x_select 245 | x_select.on_change('value', update_plot) 246 | 247 | # Create a dropdown Select widget for the y data: y_select 248 | y_select = Select( 249 | options=['fertility', 'life', 'child_mortality', 'gdp'], 250 | value='life', 251 | title='y-axis data' 252 | ) 253 | 254 | # Attach the update_plot callback to 255 | # the 'value' property of y_select 256 | y_select.on_change('value', update_plot) 257 | 258 | # Create layout and add to current document 259 | layout = row(widgetbox(slider, x_select, y_select), plot) 260 | curdoc().add_root(layout) 261 | 262 | -------------------------------------------------------------------------------- /DatVis_Bokeh_High_Level_Charts_3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 17 19:35:15 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from bokeh.plotting import figure 11 | from bokeh.io import output_file, show 12 | from bokeh.plotting import ColumnDataSource 13 | from bokeh.models import HoverTool 14 | from bokeh.layouts import gridplot 15 | from bokeh.models.widgets import Panel 16 | from bokeh.models.widgets import Tabs 17 | from bokeh.layouts import row, column 18 | from bokeh.charts import BoxPlot 19 | from bokeh.charts import Scatter 20 | 21 | # Import Histogram, output_file, and show from bokeh.charts 22 | from bokeh.charts import Histogram 23 | 24 | 25 | # ''''' Basic bokeh Histogram ''''''''# 26 | df = pd.read_csv('fixations.csv') 27 | 28 | df.head() 29 | 30 | # Create a ColumnDataSource from df: source 31 | source = ColumnDataSource(df) 32 | 33 | # Make a Histogram: p 34 | p = Histogram(df, 'duration', title='Gaze_Time', bins=50) 35 | 36 | # Set the x axis label 37 | p.xaxis.axis_label = 'Gaze_Duration' 38 | 39 | # Set the y axis label 40 | p.yaxis.axis_label = 'Pupil DIa' 41 | # Specify the name of the output_file and show the result 42 | output_file('histogram.html') 43 | show(p) 44 | 45 | """ 46 | # Make a Histogram: p 47 | p = Histogram(df, 'female_literacy', title='Female Literacy', 48 | bins=40) 49 | 50 | # Set the x axis label 51 | p.xaxis.axis_label = 'Female Literacy' 52 | 53 | # Set the y axis label 54 | p.yaxis.axis_label = 'Fertility' 55 | # Specify the name of the output_file and show the result 56 | output_file('histogram.html') 57 | show(p) 58 | 59 | """ 60 | # '''''' Multiple Histograms ''''''''# 61 | 62 | # Make a Histogram: p 63 | p = Histogram(df, 'female_literacy', title='Female Literacy', 64 | color='Continent', legend='top_left') 65 | 66 | # Set axis labels 67 | p.xaxis.axis_label = 'Female Literacy (% population)' 68 | p.yaxis.axis_label = 'Number of Countries' 69 | 70 | # Specify the name of the output_file and show the result 71 | output_file('hist_bins.html') 72 | 73 | """ 74 | # '''''' Basic BoxPlot '''''''''# 75 | 76 | # Make a box plot: p 77 | p = BoxPlot(df, values='duration', label='confidence', 78 | title='Gaze Duration (grouped by Avg_Pupil_Size)', 79 | legend='bottom_right') 80 | 81 | # Set the y axis label 82 | p.yaxis.axis_label = 
'Fixations (% Tot_Gaze_Pop)'
83 |
84 | # Specify the name of the output_file and show the result
85 | output_file('boxplot.html')
86 | show(p)
87 | """
88 |
89 | # ''''''''''''''' ################ '''''''''''''' #
90 | # Make a box plot: p
91 | p = BoxPlot(df, values='female_literacy', label='Continent',
92 |             title='Female Literacy (grouped by Continent)',
93 |             legend='bottom_right')
94 |
95 | # Set the y axis label
96 | p.yaxis.axis_label = 'Female literacy (% population)'
97 |
98 | # Specify the name of the output_file and show the result
99 | output_file('boxplot.html')
100 | show(p)
101 |
102 | # ''''''''''' Multicoloured Boxplots ''''''#
103 |
104 | # Make a box plot: p
105 | p = BoxPlot(df, values='female_literacy',
106 |             label='Continent', color='Continent',
107 |             title='Female Literacy (grouped by Continent)',
108 |             legend='bottom_right')
109 |
110 | # Set y-axis label
111 | p.yaxis.axis_label = 'Female literacy (% population)'
112 |
113 | # Specify the name of the output_file and show the result
114 | output_file('boxplot.html')
115 | show(p)
116 |
117 | # ''''''''' Basic Bokeh Scatter Plot ''''''#
118 |
119 | # Make a scatter plot: p
120 | p = Scatter(df, x='population', y='female_literacy',
121 |             title='Female Literacy vs Population')
122 |
123 | # Set the x-axis label
124 | p.xaxis.axis_label = 'Population'
125 |
126 | # Set the y-axis label
127 | p.yaxis.axis_label = 'Female Literacy'
128 | # Specify the name of the output_file and show the result
129 | output_file('scatterplot.html')
130 | show(p)
131 |
132 | # ''''' scatter plot grouping by colour ''''#
133 |
134 | # Make a scatter plot such that each circle
135 | # is colored by its continent: p
136 | p = Scatter(df, x='population', y='female_literacy',
137 |             color='Continent',
138 |             title='Female Literacy vs Population')
139 |
140 | # Set x-axis and y-axis labels
141 | p.xaxis.axis_label = 'Population (millions)'
142 | p.yaxis.axis_label = 'Female literacy (% population)'
143 |
144 | # Specify the name of the output_file and show the result
145 | output_file('scatterplot.html')
146 |
147 | # ''''' Scatter plot shape(marker) grouping '''''#
148 |
149 | # Make a scatter plot such that each continent has a different marker type: p
150 | p = Scatter(df, x='population', y='female_literacy',
151 |             color='Continent',
152 |             marker='Continent',
153 |             title='Female Literacy vs Population')
154 |
155 | # Set x-axis and y-axis labels
156 | p.xaxis.axis_label = 'Population (millions)'
157 | p.yaxis.axis_label = 'Female literacy (% population)'
158 |
159 | # Specify the name of the output_file and show the result
160 | output_file('scatterplot.html')
161 | show(p)
162 |
163 |
--------------------------------------------------------------------------------
/DatVis_Bokeh_Intr_App_Build_4.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed May 17 23:09:01 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 |
11 | from bokeh.io import output_file, show
12 |
13 | from bokeh.plotting import figure
14 | from bokeh.plotting import ColumnDataSource
15 |
16 | from bokeh.layouts import gridplot
17 | from bokeh.layouts import row, column
18 | from bokeh.layouts import widgetbox
19 |
20 | from bokeh.charts import BoxPlot
21 | from bokeh.charts import Scatter
22 |
23 | from bokeh.models import Select
24 | from bokeh.models import Slider
25 | from bokeh.models import Button
26 | from bokeh.models import HoverTool
27 |
from bokeh.models import CheckboxGroup, RadioGroup, Toggle 28 | 29 | from bokeh.models.widgets import Panel 30 | from bokeh.models.widgets import Tabs 31 | 32 | # Perform necessary imports 33 | from bokeh.io import curdoc 34 | 35 | # Create a new plot: plot 36 | plot = figure() 37 | 38 | # Add a line to the plot 39 | plot.line(x=[1, 2, 3, 4, 5], y=[2, 5, 4, 6, 7]) 40 | 41 | # Add the plot to the current document 42 | curdoc().add_root(plot) 43 | 44 | # ''''''''' Add a slider ''''''# 45 | 46 | # Create a slider: slider 47 | slider = Slider(title='my slider', start=0, end=10, step=0.1, value=2) 48 | 49 | # Create a widgetbox layout: layout 50 | layout = widgetbox(slider) 51 | 52 | # Add the layout to the current document 53 | curdoc().add_root(layout) 54 | 55 | # '''''''' Multiple Sliders ''''''''# 56 | 57 | # Create first slider: slider1 58 | slider1 = Slider(title='slider1', start=0, end=10, step=0.1, value=2) 59 | 60 | # Create second slider: slider2 61 | slider2 = Slider(title='slider2', start=10, end=100, step=1, value=20) 62 | 63 | # Add slider1 and slider2 to a widgetbox 64 | layout = widgetbox(slider1, slider2) 65 | 66 | # Add the layout to the current document 67 | curdoc().add_root(layout) 68 | 69 | 70 | # '''' Combining bokeh models into a layout ''''# 71 | 72 | # Create ColumnDataSource: source 73 | source = ColumnDataSource(data={'x': x, 'y': y}) 74 | 75 | # Add a line to the plot 76 | plot.line('x', 'y', source=source) 77 | 78 | # Create a column layout: layout 79 | layout = column(widgetbox(slider), plot) 80 | 81 | # Add the layout to the current document 82 | curdoc().add_root(layout) 83 | 84 | # '' Basic callback on widget ''''''# 85 | 86 | # Define a callback function: callback 87 | def callback(attr, old, new): 88 | 89 | # Read the current value of the slider: scale 90 | scale = slider.value 91 | 92 | # Compute the updated y using np.sin(scale/x): new_y 93 | new_y = np.sin(scale/x) 94 | 95 | # Update source with the new data values 96 | source.data = {'x': x, 'y': new_y} 97 | 98 | # Attach the callback to the 'value' property of slider 99 | slider.on_change('value', callback) 100 | 101 | # Create layout and add to current document 102 | layout = column(widgetbox(slider), plot) 103 | curdoc().add_root(layout) 104 | 105 | # ''''Updating data sources - Drop down in callback '''# 106 | 107 | # Create ColumnDataSource: source 108 | source = ColumnDataSource(data={ 109 | 'x' : fertility, 110 | 'y' : female_literacy 111 | }) 112 | 113 | # Create a new plot: plot 114 | plot = figure() 115 | 116 | # Add circles to the plot 117 | plot.circle('x', 'y', source=source) 118 | 119 | # Define a callback function: update_plot 120 | def update_plot(attr, old, new): 121 | # If the new Selection is 'female_literacy', update 'y' to female_literacy 122 | if new == 'female_literacy': 123 | source.data = { 124 | 'x': fertility, 125 | 'y': female_literacy 126 | } 127 | # Else, update 'y' to population 128 | else: 129 | source.data = { 130 | 'x' : fertility, 131 | 'y' : population 132 | } 133 | 134 | # Create a dropdown Select widget: select 135 | select = Select(title="distribution", 136 | options=['female_literacy', 'population'], 137 | value='female_literacy') 138 | 139 | # Attach the update_plot callback to the 'value' property of select 140 | select.on_change('value', update_plot) 141 | 142 | # Create layout and add to current document 143 | layout = row(select, plot) 144 | curdoc().add_root(layout) 145 | 146 | # ''''''''' Synchronise two dropdowns '''''''''''# 147 | 148 | # Create two 
dropdown Select widgets: select1, select2 149 | 150 | select1 = Select(title='First', options=['A', 'B'], value='A') 151 | select2 = Select(title='Second', options=['1', '2', '3'], value='1') 152 | 153 | # Define a callback function: callback 154 | def callback(attr, old, new): 155 | # If select1 is 'A' 156 | if select1.value == 'A': 157 | # Set select2 options to ['1', '2', '3'] 158 | select2.options = ['1', '2', '3'] 159 | 160 | # Set select2 value to '1' 161 | select2.value = '1' 162 | else: 163 | # Set select2 options to ['100', '200', '300'] 164 | select2.options = ['100', '200', '300'] 165 | 166 | # Set select2 value to '100' 167 | select2.value = '100' 168 | 169 | # Attach the callback to the 'value' property of select1 170 | select1.on_change('value', callback) 171 | 172 | # Create layout and add to current document 173 | layout = widgetbox(select1, select2) 174 | curdoc().add_root(layout) 175 | 176 | 177 | # ''''''''''Basic button widget '''''''''# 178 | 179 | # Create a Button with label 'Update Data' 180 | button = Button(label='Update Data') 181 | 182 | # Define an update callback with no arguments: update 183 | def update(): 184 | 185 | # Compute new y values: y 186 | y = np.sin(x) + np.random.random(N) 187 | 188 | # Update the ColumnDataSource data dictionary 189 | source.data = {'x': x, 'y': y} 190 | 191 | # Add the update callback to the button 192 | button.on_click(update) 193 | 194 | # Create layout and add to current document 195 | layout = column(widgetbox(button), plot) 196 | curdoc().add_root(layout) 197 | 198 | 199 | # ''''''' Button Styles '''''''# 200 | 201 | # Import CheckboxGroup, RadioGroup, Toggle from bokeh.models 202 | 203 | # Add a Toggle: toggle 204 | toggle = Toggle(button_type='success', label='Toggle button') 205 | 206 | # Add a CheckboxGroup: checkbox 207 | checkbox = CheckboxGroup(labels=['Option 1', 'Option 2', 'Option 3']) 208 | 209 | # Add a RadioGroup: radio 210 | radio = RadioGroup(labels=['Option 1', 'Option 2', 'Option 3']) 211 | 212 | # Add widgetbox(toggle, checkbox, radio) to the current document 213 | curdoc().add_root(widgetbox(toggle, checkbox, radio)) -------------------------------------------------------------------------------- /DatVis_Bokeh_Layout-Int-Annot_2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 17 16:30:01 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | from bokeh.plotting import figure 9 | from bokeh.io import output_file, show 10 | from bokeh.plotting import ColumnDataSource 11 | from bokeh.models import HoverTool 12 | from bokeh.layouts import gridplot 13 | from bokeh.models.widgets import Panel 14 | from bokeh.models.widgets import Tabs 15 | from bokeh.layouts import row, column 16 | 17 | # Create a ColumnDataSource from df: source 18 | source = ColumnDataSource(df) 19 | 20 | # '''''' Creating Rows of Plots 21 | 22 | # Create the first figure: p1 23 | p1 = figure(x_axis_label='fertility (children per woman)', 24 | y_axis_label='female_literacy (% population)') 25 | 26 | # Add a circle glyph to p1 27 | p1.circle('fertility', 'female_literacy', source=source) 28 | 29 | # Create the second figure: p2 30 | p2 = figure(x_axis_label='population', 31 | y_axis_label='female_literacy (% population)') 32 | 33 | # Add a circle glyph to p2 34 | p2.circle('population', 'female_literacy', source=source) 35 | 36 | # Put p1 and p2 into a horizontal row: layout 37 | layout = row(p1, p2) 38 | 39 | # Specify the name of the output_file and show 
the result 40 | output_file('fert_row.html') 41 | show(layout) 42 | 43 | # '''''''''''''' Column Plots in Bokeh ''''''# 44 | 45 | # Create a blank figure: p1 46 | p1 = figure(x_axis_label='fertility (children per woman)', 47 | y_axis_label='female_literacy (% population)') 48 | 49 | # Add circle scatter to the figure p1 50 | p1.circle('fertility', 'female_literacy', source=source) 51 | 52 | # Create a new blank figure: p2 53 | p2 = figure(x_axis_label='population', 54 | y_axis_label='female_literacy (% population)') 55 | 56 | # Add circle scatter to the figure p2 57 | p2.circle('population', 'female_literacy', source=source) 58 | 59 | # Put plots p1 and p2 in a column: layout 60 | layout = column(p1, p2) 61 | 62 | # Specify the name of the output_file and show the result 63 | output_file('fert_column.html') 64 | show(layout) 65 | 66 | # ''''''' Nesting Rows & Columns of Plots '''''''# 67 | 68 | # Make a column layout that will be used as the second row: row2 69 | row2 = column([mpg_hp, mpg_weight], sizing_mode='scale_width') 70 | 71 | # Make a row layout that includes the above column layout: layout 72 | layout = row([avg_mpg, row2], sizing_mode='scale_width') 73 | 74 | # Specify the name of the output_file and show the result 75 | output_file('layout_custom.html') 76 | show(layout) 77 | 78 | # '''''Gridded Layouts ''''''''# 79 | 80 | # Create a list containing plots p1 and p2: row1 81 | row1 = [p1, p2] 82 | 83 | # Create a list containing plots p3 and p4: row2 84 | row2 = [p3, p4] 85 | 86 | # Create a gridplot using row1 and row2: layout 87 | layout = gridplot([row1, row2]) 88 | 89 | # Specify the name of the output_file and show the result 90 | output_file('grid.html') 91 | show(layout) 92 | 93 | # ''''''Start Tabbed Layouts ''''#1 Create Panels 94 | 95 | # Create tab1 from plot p1: tab1 96 | tab1 = Panel(child=p1, title='Latin America') 97 | 98 | # Create tab2 from plot p2: tab2 99 | tab2 = Panel(child=p2, title='Africa') 100 | 101 | # Create tab3 from plot p3: tab3 102 | tab3 = Panel(child=p3, title='Asia') 103 | 104 | # Create tab4 from plot p4: tab4 105 | tab4 = Panel(child=p4, title='Europe') 106 | 107 | 108 | # ''''''''''''' Display the tabbed layouts '''''''''''# 109 | 110 | # Create a Tabs layout: layout 111 | layout = Tabs(tabs=[tab1, tab2, tab3, tab4]) 112 | 113 | # Specify the name of the output_file and show the result 114 | output_file('tabs.html') 115 | show(layout) 116 | 117 | # '''''''' Linked Axes Plots '''''''# 118 | 119 | # Link the x_range of p2 to p1: p2.x_range 120 | p2.x_range = p1.x_range 121 | 122 | # Link the y_range of p2 to p1: p2.y_range 123 | p2.y_range = p1.y_range 124 | 125 | # Link the x_range of p3 to p1: p3.x_range 126 | p3.x_range = p1.x_range 127 | 128 | # Link the y_range of p4 to p1: p4.y_range 129 | p4.y_range = p1.y_range 130 | 131 | # Specify the name of the output_file and show the result 132 | output_file('linked_range.html') 133 | show(layout) 134 | 135 | # ' Linked brushed data - brushing ''''''''''''''# 136 | 137 | # Create ColumnDataSource: source 138 | source = ColumnDataSource(data) 139 | 140 | # Create the first figure: p1 141 | p1 = figure(x_axis_label='fertility (children per woman)', 142 | y_axis_label='female literacy (% population)', 143 | tools='box_select,lasso_select') 144 | 145 | # Add a circle glyph to p1 146 | _ = p1.circle('fertility', 'female literacy', source=source) 147 | 148 | # Create the second figure: p2 149 | p2 = figure(x_axis_label='fertility (children per woman)', 150 | y_axis_label='population (millions)', 151 | 
tools='box_select,lasso_select') 152 | 153 | # Add a circle glyph to p2 154 | _ = p2.circle('fertility', 'population', source=source) 155 | 156 | # Create row layout of figures p1 and p2: layout 157 | layout = row(p1, p2) 158 | 159 | # Specify the name of the output_file and show the result 160 | output_file('linked_brush.html') 161 | show(layout) 162 | 163 | # ''''''' Creating Legends '''''''''# 164 | 165 | # Add the first circle glyph to the figure p 166 | p.circle('fertility', 'female_literacy', 167 | source=latin_america, size=10, 168 | color='red', legend='Latin America') 169 | 170 | # Add the second circle glyph to the figure p 171 | p.circle('fertility', 'female_literacy', 172 | source=africa, size=10, 173 | color='blue', legend='Africa') 174 | 175 | # Specify the name of the output_file and show the result 176 | output_file('fert_lit_groups.html') 177 | show(p) 178 | 179 | # '''Legend Position and Style '''''''# 180 | 181 | # Assign the legend to the bottom left: p.legend.location 182 | p.legend.location='bottom_left' 183 | 184 | # Fill the legend background with the color 'lightgray': 185 | # p.legend.background_fill_color 186 | p.legend.background_fill_color='lightgray' 187 | 188 | # Specify the name of the output_file and show the result 189 | output_file('fert_lit_groups.html') 190 | show(p) 191 | 192 | # ''''' Add hover tooltip to plot '''''''# 193 | 194 | # Create a HoverTool object: hover 195 | hover = HoverTool(tooltips=[('Country','@Country')]) 196 | 197 | # Add the HoverTool object to figure p 198 | p.add_tools(hover) 199 | 200 | # Specify the name of the output_file and show the result 201 | output_file('hover.html') 202 | show(p) -------------------------------------------------------------------------------- /DatVis_Images.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 1 19:30:44 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | 11 | # Load the image into an array: img 12 | img = plt.imread('480px-Astronaut-EVA.jpg') 13 | 14 | # Print the shape of the image 15 | print(img.shape) 16 | 17 | # Display the image 18 | plt.imshow(img) 19 | 20 | # Hide the axes 21 | plt.axis('off') 22 | plt.show() 23 | 24 | # ''''''''''' Pseudocolor Plot from Image Data ''''''''''''# 25 | 26 | # Load the image into an array: img 27 | img = plt.imread('480px-Astronaut-EVA.jpg') 28 | 29 | # Print the shape of the image 30 | print(img.shape) 31 | 32 | # Compute the sum of the red, green and blue channels: intensity 33 | intensity = img.sum(axis=2) 34 | 35 | # Print the shape of the intensity 36 | print(intensity.shape) 37 | 38 | # Display the intensity with a colormap of 'gray' 39 | plt.imshow(intensity, cmap='gray') 40 | 41 | # Add a colorbar 42 | plt.colorbar() 43 | 44 | # Hide the axes and show the figure 45 | plt.axis('off') 46 | plt.show() 47 | 48 | # # '''''''''''''Specifying Extents and Aspect Ratio '''''# 49 | 50 | # Load the image into an array: img 51 | img = plt.imread('480px-Astronaut-EVA.jpg') 52 | 53 | # Specify the extent and aspect ratio of the top left subplot 54 | plt.subplot(2, 2, 1) 55 | plt.title('extent=(-1,1,-1,1),\naspect=0.5') 56 | plt.xticks([-1, 0, 1]) 57 | plt.yticks([-1, 0, 1]) 58 | plt.imshow(img, extent=(-1, 1, -1, 1), aspect=0.5) 59 | 60 | # Specify the extent and aspect ratio of the top right subplot 61 | plt.subplot(2, 2, 2) 62 | plt.title('extent=(-1,1,-1,1),\naspect=1') 63 | plt.xticks([-1, 0, 1]) 64 | 
plt.yticks([-1, 0, 1]) 65 | plt.imshow(img, extent=(-1, 1, -1, 1), aspect=1) 66 | 67 | # Specify the extent and aspect ratio of the bottom left subplot 68 | plt.subplot(2, 2, 3) 69 | plt.title('extent=(-1,1,-1,1),\naspect=2') 70 | plt.xticks([-1, 0, 1]) 71 | plt.yticks([-1, 0, 1]) 72 | plt.imshow(img, extent=(-1, 1, -1, 1), aspect=2) 73 | 74 | # Specify the extent and aspect ratio of the bottom right subplot 75 | plt.subplot(2, 2, 4) 76 | plt.title('extent=(-2,2,-1,1),\naspect=2') 77 | plt.xticks([-2, -1, 0, 1, 2]) 78 | plt.yticks([-1, 0, 1]) 79 | plt.imshow(img, extent=(-2, 2, -1, 1), aspect=2) 80 | 81 | # Improve spacing and display the figure 82 | plt.tight_layout() 83 | plt.show() 84 | 85 | # '''''' Rescale Pixel Intensities '''''''''''''# 86 | 87 | # Load the image into an array: image 88 | image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg') 89 | 90 | # Extract minimum and maximum values from the image: pmin, pmax 91 | pmin, pmax = image.min(), image.max() 92 | print("The smallest & largest pixel intensities are %d & %d." % (pmin, pmax)) 93 | 94 | # Rescale the pixels: rescaled_image 95 | rescaled_image = 256*(image - pmin) / (pmax - pmin) 96 | print("The rescaled smallest & largest pixel intensities are %.1f & %.1f." % 97 | (rescaled_image.min(), rescaled_image.max())) 98 | 99 | # Display the original image in the top subplot 100 | plt.subplot(2, 1, 1) 101 | plt.title('original image') 102 | plt.axis('off') 103 | plt.imshow(image, extent=(-2, 2, -1, 1), aspect=2) 104 | 105 | # Display the rescaled image in the bottom subplot 106 | plt.subplot(2, 1, 2) 107 | plt.title('rescaled image') 108 | plt.axis('off') 109 | plt.imshow(rescaled_image, extent=(-2, 2, -1, 1), aspect=2) 110 | 111 | plt.show() -------------------------------------------------------------------------------- /Dat_Clean_Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 7 02:30:02 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # ''Load and View Data ''''''''''# 9 | 10 | # Import pandas 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | # Read the file into a DataFrame: df 16 | # df = pd.read_csv('your_file.csv') . This might also mean filepath. 17 | 18 | df = pd.read_csv('fixations.csv') 19 | df2 = pd.read_csv('flightdata.csv') 20 | 21 | # Print the head of df 22 | print(df.head()) 23 | 24 | # Print the tail of df 25 | print(df.tail()) 26 | 27 | print('AERO DATA OUTPUT') 28 | 29 | 30 | print(df2.head()) 31 | 32 | print(df2.tail()) 33 | 34 | # Print the shape of df 35 | print(df.shape) 36 | 37 | print(df2.shape) 38 | 39 | # Print the columns of df 40 | print(df.columns) 41 | 42 | print(df2.columns) 43 | 44 | # Print the head and tail of df_subset 45 | # print(df.subset.head()) 46 | # print(df.subset.tail()) 47 | 48 | # Print the info of df 49 | print(df.info()) 50 | 51 | print(df2.info()) 52 | 53 | # Print the info of df_subset 54 | # print(df.subset.info()) 55 | 56 | 57 | # '''''''' Frequency counts for Categorical Data 58 | # note that dataframe titles here are actually for 59 | # continuous data. These are simply placeholders. 
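
# A minimal sketch, assuming df as loaded above: value_counts() is most
# informative on truly categorical columns, while a continuous column like
# 'duration' mostly yields counts of unique floats. One option is to bin
# the continuous values with pd.cut first and count observations per bin.

duration_bins = pd.cut(df['duration'], bins=5)
print(duration_bins.value_counts(dropna=False))
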
60 | 61 | # Print the value counts for 'your category - i.e.column titles'' 62 | print(df['duration'].value_counts(dropna=False)) 63 | 64 | print(df['duration'].shape) 65 | 66 | # Print the value_counts for 'next_category' 67 | print(df['confidence'].value_counts(dropna=False)) 68 | 69 | print(df['confidence'].shape) 70 | 71 | # Print the value counts for 'and_another' 72 | print(df['avg_pupil_size'].value_counts(dropna=False)) 73 | 74 | 75 | # ''''''''''' Single Variable Histogram plot ''''''''# 76 | 77 | # Plot the histogram 78 | df['duration'].plot(kind='hist', rot=70, logx=True, logy=True) 79 | 80 | # Display the histogram 81 | plt.show() 82 | 83 | # ''''' Multi Variable Box Plot Visualisation '''''''# 84 | 85 | # Import necessary modules (see top of script) 86 | # doesn't necessarily have to be at the top of the script 87 | # but Spyder likes it this way and it looks 88 | # good too. 89 | 90 | # you want to create the boxplot? 91 | df.boxplot(column='duration', by='avg_pupil_size', rot=90) 92 | 93 | # Display the plot 94 | plt.show() 95 | 96 | # ''''''''''' Multiple variable scatter plot visualisation''''# 97 | 98 | # Import necessary modules -moved to top 99 | # import pandas as pd - at top 100 | # import matplotlib.pyplot as plt - at top 101 | 102 | # Create and display the first scatter plot 103 | df.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70) 104 | plt.show() 105 | 106 | # Create and display the second scatter plot 107 | df_subset.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70) 108 | plt.show() 109 | 110 | -------------------------------------------------------------------------------- /Dat_Read_Plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 18:57:00 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | import mayavi 12 | import seaborn as sns 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | from scipy import stats 16 | from mayavi import mlab 17 | import multiprocessing 18 | import plotly.plotly as py 19 | import plotly.graph_objs as go 20 | from plotly.graph_objs import Surface 21 | 22 | 23 | path = 'C:\\Users\\Shabaka\Desktop\\Test2 DJI_Corretti' 24 | all_files = glob.glob(os.path.join(path, "*Temperatura_Media.csv")) 25 | 26 | df_from_each_file = pd.read_csv(all_files) 27 | conc_df = pd.concat(df_from_each_file, ignore_index=True) 28 | 29 | print(conc_df.head()) 30 | -------------------------------------------------------------------------------- /DataChunkFunc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 1 13:19:38 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | 11 | # Define plot_pop() 12 | def plot_pop(filename, country_code): 13 | 14 | # Initialize reader object: urb_pop_reader 15 | urb_pop_reader = pd.read_csv(filename, chunksize=1000) 16 | 17 | # Initialize empty dataframe: data 18 | data = pd.DataFrame() 19 | 20 | # Iterate over each dataframe chunk 21 | for df_urb_pop in urb_pop_reader: 22 | # Check out specific country: df_pop_ceb 23 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code] 24 | 25 | # Zip dataframe columns of interest: pops 26 | pops = zip(df_pop_ceb['Total Population'], 27 | df_pop_ceb['Urban population (% of total)']) 28 | 29 | # Turn zip object into list: pops_list 30 | 
pops_list = list(pops)
31 |
32 |         # Use list comprehension to create new
33 |         # dataframe column 'Total Urban Population'
34 |         df_pop_ceb['Total Urban Population'] = \
35 |             [int(tup[0] * tup[1]) for tup in pops_list]
36 |
37 |         # Append dataframe chunk to data: data
38 |         data = data.append(df_pop_ceb)
39 |
40 |     # Plot urban population data
41 |     data.plot(kind='scatter', x='Year', y='Total Urban Population')
42 |     plt.show()
--------------------------------------------------------------------------------
/DataClean_GS_Analysis5.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Apr 11 23:54:33 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 | import matplotlib.pyplot as plt
11 |
12 |
13 |
14 | # Load csv files, or scan a file directory for a specific file name
15 |
16 |
17 | # '''''''Basic EDA Instructions to verify Data
18 |
19 |
20 | # ''''''' Carry out some basic Visualisations ''''''''''''#
21 |
22 | # Import matplotlib.pyplot
23 | # import matplotlib.pyplot as plt
24 |
25 | # Create the scatter plot
26 | g1800s.plot(kind='scatter', x='1800', y='1899')
27 |
28 | # Specify axis labels
29 | plt.xlabel('Life Expectancy by Country in 1800')
30 | plt.ylabel('Life Expectancy by Country in 1899')
31 |
32 | # Specify axis limits
33 | plt.xlim(20, 55)
34 | plt.ylim(20, 55)
35 |
36 | # Display the plot
37 | plt.show()
38 |
39 | # Think - Question at hand ''''#
40 |
41 | def check_null_or_valid(row_data):
42 |     """Function that takes a row of data,
43 |     drops all missing values,
44 |     and checks if all remaining values are greater than or equal to 0
45 |     """
46 |     no_na = row_data.dropna()[1:-1]
47 |     numeric = pd.to_numeric(no_na)
48 |     ge0 = numeric >= 0
49 |     return ge0
50 |
51 | # Check whether the first column is 'Life expectancy'
52 | assert g1800s.columns[0] == 'Life expectancy'
53 |
54 | # Check whether the values in the row are valid
55 | assert g1800s.iloc[:, 1:].apply(check_null_or_valid, axis=1).all().all()
56 |
57 | # Check that there is only one instance of each country
58 | assert g1800s['Life expectancy'].value_counts()[0] == 1
59 |
60 |
61 | # ''''''''''' Assemble the Data '''''''''''''#
62 |
63 | # Concatenate the DataFrames row-wise
64 | gapminder = pd.concat([g1800s, g1900s, g2000s])
65 |
66 | # Print the shape of gapminder
67 | print(gapminder.shape)
68 |
69 | # Print the head of gapminder
70 | print(gapminder.head())
71 |
72 |
73 | # ''''Reshape the data to aid easier analysis (if required)''''#
74 |
75 | # Melt gapminder: gapminder_melt
76 | gapminder_melt = pd.melt(gapminder, id_vars='Life expectancy')
77 |
78 | # Rename the columns
79 | gapminder_melt.columns = ['country', 'year', 'life_expectancy']
80 |
81 | # Print the head of gapminder_melt
82 | print(gapminder_melt.head())
83 |
84 | # '''''''''''Check the data types in the dataset ''''''''#
85 |
86 | # Convert the year column to numeric
87 | gapminder.year = pd.to_numeric(gapminder['year'])
88 |
89 | # Test if country is of type object
90 | assert gapminder.country.dtypes == np.object
91 |
92 | # Test if year is of type int64
93 | assert gapminder.year.dtypes == np.int64
94 |
95 | # Test if life_expectancy is of type float64
96 | assert gapminder.life_expectancy.dtypes == np.float64
97 |
98 | # '''''''''''''''''Ex. Country Spellings to Check for Correctness ''''#
99 |
100 | # Create the series of countries: countries
101 | countries = gapminder['country']
102 |
103 | # Drop all the duplicates from countries
104 | countries = countries.drop_duplicates()
105 |
106 | # Write the regular expression: pattern
107 | pattern = r'^[A-Za-z\.\s]*$'
108 |
109 | # Create the Boolean vector: mask
110 | mask = countries.str.contains(pattern)
111 |
112 | # Invert the mask: mask_inverse
113 | mask_inverse = ~mask
114 |
115 | # Subset countries using mask_inverse: invalid_countries
116 | invalid_countries = countries.loc[mask_inverse]
117 |
118 | # Print invalid_countries
119 | print(invalid_countries)
120 |
121 | # '''''''' More Cleaning Ex.''''''''''#
122 |
123 | # Assert that country does not contain any missing values
124 | assert pd.notnull(gapminder.country).all()
125 |
126 | # Assert that year does not contain any missing values
127 | assert pd.notnull(gapminder.year).all()
128 |
129 | # Drop the missing values
130 | gapminder = gapminder.dropna(how='any')
131 |
132 | # Print the shape of gapminder
133 | print(gapminder.shape)
134 |
135 | # Add first subplot
136 | plt.subplot(2, 1, 1)
137 |
138 | # Create a histogram of life_expectancy
139 | gapminder.life_expectancy.plot(kind='hist')
140 |
141 | # Group gapminder: gapminder_agg
142 | gapminder_agg = gapminder.groupby('year')['life_expectancy'].mean()
143 |
144 | # Print the head of gapminder_agg
145 | print(gapminder_agg.head())
146 |
147 | # Print the tail of gapminder_agg
148 | print(gapminder_agg.tail())
149 |
150 | # Add second subplot
151 | plt.subplot(2, 1, 2)
152 |
153 |
154 | # ''''''''' Wrap up with visualisation of cleaned data set'''' Eg.'''#
155 | # Create a line plot of life expectancy per year
156 | gapminder_agg.plot()
157 |
158 | # Add title and specify axis labels
159 | plt.title('Life expectancy over the years')
160 | plt.ylabel('Life expectancy')
161 | plt.xlabel('Year')
162 |
163 | # Display the plots
164 | plt.tight_layout()
165 | plt.show()
166 |
167 | # Save both DataFrames to csv files
168 | gapminder.to_csv('gapminder.csv')
169 | gapminder_agg.to_csv('gapminder_agg.csv')
--------------------------------------------------------------------------------
/DataCombine_Analysis3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 10 14:57:36 2017
4 |
5 | @author: Shabaka
6 | """
7 | import pandas as pd
8 | import matplotlib.pyplot as plt
9 | import glob
10 |
11 |
12 |
13 | # ''''Combining Rows of Data ''''''''''''#
14 |
15 | # Concatenate uber1, uber2, and uber3: row_concat
16 | row_concat = pd.concat([uber1, uber2, uber3])
17 |
18 | # Print the shape of row_concat
19 | print(row_concat.shape)
20 |
21 | # Print the head of row_concat
22 | print(row_concat.head())
23 |
24 | # '''''''''''' Combining Columns of Data '''''''''''#
25 |
26 | # Concatenate ebola_melt and status_country column-wise: ebola_tidy
27 | ebola_tidy = pd.concat([ebola_melt, status_country], axis=1)
28 |
29 | # Print the shape of ebola_tidy
30 | print(ebola_tidy.shape)
31 |
32 | # Print the head of ebola_tidy
33 | print(ebola_tidy.head())
34 |
35 |
36 | # '''Find Files that match a Pattern '''''''' #
37 |
38 | # Import necessary modules (glob is imported at the top)
39 |
40 | # Write the pattern: pattern
41 | pattern = '*.csv'
42 |
43 | # Save all file matches: csv_files
44 | csv_files = glob.glob(pattern)
45 |
46 | # Print the file names
47 | print(csv_files)
48 |
49 | # Load the second file into a DataFrame: csv2
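# Note: glob.glob() returns matches in arbitrary OS order, so csv_files[1]
# below is not guaranteed to be the same file on every run; sorting first,
# e.g. csv_files = sorted(csv_files), makes the indexing reproducible.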
50 | csv2 = pd.read_csv(csv_files[1]) 51 | 52 | # Print the head of csv2 53 | print(csv2.head()) 54 | 55 | # '''''''''''Iterate and Concatenate all Matches ''''''# 56 | 57 | # Create an empty list: frames 58 | frames = [] 59 | 60 | # Iterate over csv_files 61 | for csv in csv_files: 62 | 63 | # Read csv into a DataFrame: df 64 | df = pd.read_csv(csv) 65 | 66 | # Append df to frames 67 | frames.append(df) 68 | 69 | # Concatenate frames into a single DataFrame: uber 70 | uber = pd.concat(frames) 71 | 72 | # Print the shape of uber 73 | print(uber.shape) 74 | 75 | # Print the head of uber 76 | print(uber.head()) 77 | 78 | 79 | # ''''''One to - One Data Merge '# 80 | 81 | # Merge the DataFrames: o2o 82 | o2o = pd.merge(left=site, right=visited, left_on='name', right_on='site') 83 | 84 | # Print o2o 85 | print(o2o) 86 | 87 | # '''''''MAny to One Data MErge ''''# 88 | 89 | # Merge the DataFrames: m2o 90 | m2o = pd.merge(left=site, right=visited, left_on='name', right_on='site') 91 | 92 | # Print m2o 93 | print(m2o) 94 | 95 | # ''''''''''Many To Many Data Merge ''''''''''''# 96 | 97 | # Merge site and visited: m2m 98 | m2m = pd.merge(left=site, right = visited, left_on='name', right_on='site') 99 | 100 | # Merge m2m and survey: m2m 101 | m2m = pd.merge(left=m2m, right=survey, left_on='ident', right_on='taken') 102 | 103 | # Print the first 20 lines of m2m 104 | print(m2m.head(20)) 105 | -------------------------------------------------------------------------------- /DataFrame_Lambda_Filter_Read.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 9 01:21:36 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Select retweets from the Twitter dataframe: result 9 | result = filter(lambda x:x[0:2] == 'RT', tweets_df['text']) 10 | 11 | # Create list from filter object result: res_list 12 | res_list = list(result) 13 | 14 | # Print all retweets in res_list 15 | for tweet in res_list: 16 | print(tweet) -------------------------------------------------------------------------------- /DataTidy_Analysis2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 10 10:02:49 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | 13 | # ' Data Reshape wit melt ''# 14 | 15 | # Print the head of airquality 16 | print(airquality.head()) 17 | 18 | # Melt airquality: airquality_melt 19 | airquality_melt = pd.melt(airquality, id_vars=['Month', 'Day']) 20 | 21 | # Print the head of airquality_melt 22 | print(airquality_melt.head()) 23 | 24 | # ''''Customise melted Data - Change var name & Val'''# 25 | 26 | # Print the head of airquality 27 | print(airquality.head()) 28 | 29 | # Melt airquality: airquality_melt 30 | airquality_melt = pd.melt(airquality, id_vars=['Month', 'Day'], 31 | var_name='measurement', value_name='reading') 32 | 33 | # Print the head of airquality_melt 34 | print(airquality_melt.head()) 35 | 36 | 37 | #''' Pivoting Data''' from melt'''''''''# 38 | 39 | # Print the head of airquality_melt 40 | print(airquality_melt.head()) 41 | 42 | # Pivot airquality_melt: airquality_pivot 43 | airquality_pivot = airquality_melt.pivot_table(index=['Month', 'Day'], columns='measurement', values='reading') 44 | 45 | # Print the head of airquality_pivot 46 | print(airquality_pivot.head()) 47 | 48 | #''''''''''''''''Reset data frame index''''''''''''# 49 | 50 | # 
Print the index of airquality_pivot 51 | print(airquality_pivot.index) 52 | 53 | # Reset the index of airquality_pivot: airquality_pivot 54 | airquality_pivot = airquality_pivot.reset_index() 55 | 56 | # Print the new index of airquality_pivot 57 | print(airquality_pivot.index) 58 | 59 | # Print the head of airquality_pivot 60 | print(airquality_pivot.head()) 61 | 62 | # ''''''' Pivoting Duplicate Values ''''''''''# 63 | 64 | # Pivot airquality_dup: airquality_pivot 65 | airquality_pivot = airquality_dup.pivot_table(index=['Month', 'Day'], 66 | columns='measurement', 67 | values='reading', aggfunc=np.mean) 68 | 69 | # Reset the index of airquality_pivot 70 | airquality_pivot = airquality_pivot.reset_index() 71 | 72 | # Print the head of airquality_pivot 73 | print(airquality_pivot.head()) 74 | 75 | # Print the head of airquality 76 | print(airquality.head()) 77 | 78 | # '''''''''Split column infor using str '''''# 79 | 80 | # Melt tb: tb_melt 81 | tb_melt = pd.melt(frame=tb, id_vars=['country', 'year']) 82 | 83 | # Create the 'gender' column 84 | tb_melt['gender'] = tb_melt.variable.str[0] 85 | 86 | # Create the 'age_group' column 87 | tb_melt['age_group'] = tb_melt.variable.str[1:] 88 | 89 | # '''''' Split a column with .split() and .get() 90 | 91 | # Melt ebola: ebola_melt 92 | ebola_melt = pd.melt(ebola, id_vars=['Date', 'Day'], var_name='type_country', value_name='counts') 93 | 94 | # Create the 'str_split' column 95 | ebola_melt['str_split'] = ebola_melt.type_country.str.split('_') 96 | 97 | # Create the 'type' column 98 | ebola_melt['type'] = ebola_melt.str_split.str.get(0) 99 | 100 | # Create the 'country' column 101 | ebola_melt['country'] = ebola_melt.str_split.str.get(1) 102 | 103 | # Print the head of ebola_melt 104 | print(ebola_melt.head()) 105 | 106 | # ''''Combining Rows of Data ''''''''''''# 107 | 108 | # Concatenate uber1, uber2, and uber3: row_concat 109 | row_concat = pd.concat([uber1, uber2, uber3]) 110 | 111 | # Print the shape of row_concat 112 | print(row_concat.shape) 113 | 114 | # Print the head of row_concat 115 | print(row_concat.head()) 116 | 117 | #'''''''''''' cOMBINING cOLUMNS OF dATA'''''''''''# 118 | 119 | # Concatenate ebola_melt and status_country column-wise: ebola_tidy 120 | ebola_tidy = pd.concat([ebola_melt, status_country], axis=1) 121 | 122 | # Print the shape of ebola_tidy 123 | print(ebola_tidy.shape) 124 | 125 | # Print the head of ebola_tidy 126 | print(ebola_tidy.head()) 127 | -------------------------------------------------------------------------------- /DataTypes_Analysis4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 10 17:29:54 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | import re 12 | 13 | 14 | # Convert the sex column to type 'category' 15 | tips.sex = tips.sex.astype('category') 16 | 17 | # Convert the smoker column to type 'category' 18 | tips.smoker = tips.smoker.astype('category') 19 | 20 | # Print the info of tips 21 | print(tips.info()) 22 | 23 | # '''''Working with Numeric Data - Wrong data types ''''# 24 | 25 | # Convert 'total_bill' to a numeric dtype 26 | tips['total_bill'] = pd.to_numeric(tips['total_bill'], errors='coerce') 27 | 28 | # Convert 'tip' to a numeric dtype 29 | tips['tip'] = pd.to_numeric(tips['tip'], errors='coerce') 30 | 31 | # Print the info of tips 32 | print(tips.info()) 33 | 34 | 35 | # '''' String Parsing with regular 
expression '''# 36 | 37 | # Import the regular expression module 38 | 39 | # Compile the pattern: prog 40 | prog = re.compile('\d{3}-\d{3}-\d{4}') 41 | 42 | # See if the pattern matches 43 | result = prog.match('123-456-7890') 44 | print(bool(result)) 45 | 46 | # See if the pattern matches 47 | result = prog.match('1123-456-7890') 48 | print(bool(result)) 49 | 50 | # ''''''' Find Numeric in sstring '''''''' # 51 | 52 | # Find the numeric values: matches 53 | matches = re.findall('\d+', 'the recipe requires 10 strawberries and 1 banana') 54 | 55 | # Print the matches 56 | print(matches) 57 | 58 | 59 | # ''''' paTTERN maTCHING '''''## 60 | 61 | # Write the first pattern 62 | print(bool(re.match(pattern='\d{3}-\d{3}-\d{4}', string='123-456-7890'))) 63 | 64 | # Write the second pattern 65 | print(bool(re.match(pattern='\$\d*\.\d{2}', string='$123.45'))) 66 | 67 | # Write the third pattern 68 | print(bool(re.match(pattern='[A-Z]\w*', string='Australia'))) 69 | 70 | # '''''''''######## ''''''''''''''''' ##########'''''''''''''''''''# 71 | 72 | # '''''Custom Fxn to clean data in column ( dataframe)''''''''# 73 | 74 | # Define recode_sex() 75 | 76 | 77 | def recode_sex(sex_value): 78 | 79 | # Return 1 if sex_value is 'Male' 80 | if sex_value == 'Male': 81 | return 1 82 | 83 | # Return 0 if sex_value is 'Female' 84 | elif sex_value == 'Female': 85 | return 0 86 | 87 | # Return np.nan 88 | else: 89 | 90 | return np.nan 91 | 92 | 93 | # Apply the function to the sex column 94 | tips['sex_recode'] = tips.sex.apply(recode_sex) 95 | 96 | 97 | #''' Lambda Functions ''''''# 98 | 99 | # Write the lambda function using replace 100 | tips['total_dollar_replace'] = tips.total_dollar.apply(lambda x: x.replace('$', '')) 101 | 102 | # Write the lambda function using regular expressions 103 | tips['total_dollar_re'] = tips.total_dollar.apply(lambda x: re.findall('\d+\.\d+', x)) 104 | 105 | # Print the head of tips 106 | print(tips.head()) 107 | 108 | # '''''''Dropping DUplicate Data '''''''''''''# 109 | 110 | # Create the new DataFrame: tracks 111 | tracks = billboard[['year', 'artist', 'track', 'time']] 112 | 113 | # Print info of tracks 114 | print(tracks.info()) 115 | 116 | # Drop the duplicates: tracks_no_duplicates 117 | tracks_no_duplicates = tracks.drop_duplicates() 118 | 119 | # Print info of tracks 120 | print(tracks_no_duplicates.info()) 121 | 122 | # '''''''''''''''' Fill in MIssing Data ''''''''' # 123 | 124 | # Calculate the mean of the Ozone column: oz_mean 125 | oz_mean = np.mean(airquality.Ozone) 126 | 127 | # Replace all the missing values in the Ozone column with the mean 128 | airquality['Ozone'] = airquality['Ozone'].fillna(oz_mean) 129 | 130 | # Print the info of airquality 131 | print(airquality.info()) 132 | 133 | # ''''''''''''''' Data Test with Assert Statements ''''''# 134 | 135 | # Assert that there are no missing values 136 | assert pd.notnull(ebola).all().all() 137 | 138 | # Assert that all values are >= 0 139 | assert (ebola >= 0).all().all() 140 | 141 | # assert pd.notnull(ebola >= 0).all().all() 142 | 143 | 144 | -------------------------------------------------------------------------------- /DataXplore_Analysis1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 7 02:30:02 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # ''Load and View Data ''''''''''# 9 | 10 | # Import pandas 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | # Read the file into a DataFrame: df 
16 | # df = pd.read_csv('dob_job_application_filings_subset.csv') 17 | 18 | df = pd.read_csv('fixations.csv') 19 | df2 = pd.read_csv('aerodata.csv') 20 | 21 | # Print the head of df 22 | print(df.head()) 23 | 24 | # Print the tail of df 25 | print(df.tail()) 26 | 27 | print('AERO DATA OUTPUT') 28 | 29 | 30 | print(df2.head()) 31 | 32 | print(df2.tail()) 33 | 34 | # Print the shape of df 35 | print(df.shape) 36 | 37 | print(df2.shape) 38 | 39 | # Print the columns of df 40 | print(df.columns) 41 | 42 | print(df2.columns) 43 | 44 | # Print the head and tail of df_subset 45 | # print(df.subset.head()) 46 | # print(df.subset.tail()) 47 | 48 | # Print the info of df 49 | print(df.info()) 50 | 51 | print(df2.info()) 52 | 53 | # Print the info of df_subset 54 | # print(df.subset.info()) 55 | 56 | 57 | # '''''''' Frequency counts for Categorical Data 58 | 59 | # Print the value counts for 'Borough' 60 | print(df['duration'].value_counts(dropna=False)) 61 | 62 | print(df['duration'].shape) 63 | 64 | # Print the value_counts for 'State' 65 | print(df['confidence'].value_counts(dropna=False)) 66 | 67 | print(df['confidence'].shape) 68 | 69 | # Print the value counts for 'Site Fill' 70 | print(df['avg_pupil_size'].value_counts(dropna=False)) 71 | 72 | # ''''''''''' Single Variable Histogram plot ''''''''# 73 | 74 | # Plot the histogram 75 | df['duration'].plot(kind='hist', rot=70, logx=True, logy=True) 76 | 77 | # Display the histogram 78 | plt.show() 79 | 80 | # ''''' Multi Variable Box Plot Visualisation '''''''# 81 | 82 | # Import necessary modules 83 | 84 | # Create the boxplot 85 | df.boxplot(column='duration', by='avg_pupil_size', rot=90) 86 | 87 | # Display the plot 88 | plt.show() 89 | 90 | # ''''''''''' Multiple variable scatter plot visualisation''''# 91 | 92 | # Import necessary modules 93 | # import pandas as pd 94 | # import matplotlib.pyplot as plt 95 | 96 | # Create and display the first scatter plot 97 | df.plot(kind='scatter', x='duration', y='avg_pupil_size', rot=70) 98 | plt.show() 99 | 100 | # Create and display the second scatter plot 101 | df_subset.plot(kind='scatter', x='duration', y='confidence', rot=70) 102 | plt.show() 103 | 104 | -------------------------------------------------------------------------------- /Data_Corr_Func.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 6 18:59:20 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Code for correelate function is copied from pupil labs git @ 9 | # https://github.com/pupil-labs/pupil/wiki/Data-Format 10 | 11 | 12 | def correlate_data(data, timestamps): 13 | ''' 14 | data: list of data : 15 | each datum is a dict with at least: 16 | timestamp: float 17 | 18 | timestamps: timestamps list to correlate data to 19 | 20 | this takes a data list and a timestamps list and makes a new list 21 | with the length of the number of timestamps. 22 | Each slot contains a list that will have 0, 1 or more associated 23 | data points. 24 | 25 | Finally we add an index field to the datum with the associated index 26 | ''' 27 | timestamps = list(timestamps) 28 | data_by_frame = [[] for i in timestamps] 29 | 30 | frame_idx = 0 31 | data_index = 0 32 | 33 | data.sort(key=lambda d: d['timestamp']) 34 | 35 | while True: 36 | try: 37 | datum = data[data_index] 38 | # we can take the midpoint between two frames in time: 39 | # More appropriate for SW timestamps 40 | ts = (timestamps[frame_idx]+timestamps[frame_idx+1]) / 2. 
41 |             # or the time of the next frame:
42 |             # More appropriate for Start of Exposure Timestamps (HW timestamps).
43 |             # ts = timestamps[frame_idx+1]
44 |         except IndexError:
45 |             # we might lose a data point at the end but we don't care
46 |             break
47 |
48 |         if datum['timestamp'] <= ts:
49 |             datum['index'] = frame_idx
50 |             data_by_frame[frame_idx].append(datum)
51 |             data_index += 1
52 |         else:
53 |             frame_idx += 1
54 |
55 |     return data_by_frame
--------------------------------------------------------------------------------
/Datchunk_PopPlot.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Apr 1 13:24:43 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 |
11 | # Define plot_pop()
12 |
13 |
14 | def plot_pop(filename, country_code):
15 |
16 |     # Initialize reader object: urb_pop_reader
17 |     urb_pop_reader = pd.read_csv(filename, chunksize=1000)
18 |
19 |     # Initialize empty dataframe: data
20 |     data = pd.DataFrame()
21 |
22 |     # Iterate over each dataframe chunk
23 |     for df_urb_pop in urb_pop_reader:
24 |         # Check out specific country: df_pop_ceb
25 |         df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code]
26 |
27 |         # Zip dataframe columns of interest: pops
28 |         pops = zip(df_pop_ceb['Total Population'],
29 |                    df_pop_ceb['Urban population (% of total)'])
30 |
31 |         # Turn zip object into list: pops_list
32 |         pops_list = list(pops)
33 |
34 |         # Use list comp to create new dataframe column 'Total Urban Population'
35 |         df_pop_ceb['Total Urban Population'] = \
36 |             [int(tup[0] * tup[1]) for tup in pops_list]
37 |
38 |         # Append dataframe chunk to data: data
39 |         data = data.append(df_pop_ceb)
40 |
41 |     # Plot urban population data
42 |     data.plot(kind='scatter', x='Year', y='Total Urban Population')
43 |     plt.show()
44 |
45 | # Set the filename: fn
46 | fn = 'ind_pop_data.csv'
47 |
48 | # Call plot_pop for country code 'CEB'
49 | plot_pop(fn, 'CEB')
50 |
51 | # Call plot_pop for country code 'ARB'
52 | plot_pop(fn, 'ARB')
--------------------------------------------------------------------------------
/Deep_Learning_Basics_1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Apr 21 11:58:38 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 | import matplotlib.pyplot as plt
11 |
12 |
13 | # ''''''''''Coding the Forward Propagation (FP) Algorithm ''''''''#
14 |
15 | weights = {'node_1': np.array([4, -5]), 'node_0': np.array([2, 4]),
16 |            'output': np.array([2, 7])}
17 |
18 | input_data = [3, 5]
19 |
20 | # Calculate node 0 value: node_0_value
21 | node_0_value = (input_data * weights['node_0']).sum()
22 |
23 | # Calculate node 1 value: node_1_value
24 | node_1_value = (input_data * weights['node_1']).sum()
25 |
26 | # Put node values into array: hidden_layer_outputs
27 | hidden_layer_outputs = np.array([node_0_value, node_1_value])
28 |
29 | # Calculate output: output
30 | output = (hidden_layer_outputs * weights['output']).sum()
31 |
32 | # Print output
33 | print(output, 'is the basic FP output from model')
34 |
35 | # ''''''' Apply the Rectified Linear Activation Function '''''''''''''#
36 |
37 | # NOTE: The activation function is very useful for tuning model weights ''#
38 |
39 |
40 | def relu(input):
41 |     '''Define relu activation function here'''
42 |     # Calculate the value for the output of
the relu function: output 43 | output = max(input, 0) 44 | 45 | # Return the value just calculated 46 | return(output) 47 | 48 | # Calculate node 0 value: node_0_output 49 | node_0_input = (input_data * weights['node_0']).sum() 50 | node_0_output = relu(node_0_input) 51 | 52 | # Calculate node 1 value: node_1_output 53 | node_1_input = (input_data * weights['node_1']).sum() 54 | node_1_output = relu(node_1_input) 55 | 56 | # Put node values into array: hidden_layer_outputs 57 | hidden_layer_outputs = np.array([node_0_output, node_1_output]) 58 | 59 | # Calculate model output (do not apply relu) 60 | model_output = (hidden_layer_outputs * weights['output']).sum() 61 | 62 | # Print model output 63 | print(model_output, 'is the FP_ReLU predicted quantity of transactions') 64 | 65 | 66 | # ''''''''''' Apply Network to many observations/rows of data '''''''# 67 | 68 | # Define predict_with_network() 69 | def predict_with_network(input_data_row, weights): 70 | 71 | # Calculate node 0 value 72 | node_0_input = (input_data_row * weights['node_0']).sum() 73 | node_0_output = relu(node_0_input) 74 | 75 | # Calculate node 1 value 76 | node_1_input = (input_data_row * weights['node_1']).sum() 77 | node_1_output = relu(node_1_input) 78 | 79 | # Put node values into array: hidden_layer_outputs 80 | hidden_layer_outputs = np.array([node_0_output, node_1_output]) 81 | 82 | # Calculate model output 83 | input_to_final_layer = (weights['output'] * hidden_layer_outputs).sum() 84 | model_output = relu(input_to_final_layer) 85 | 86 | # Return model output 87 | return(model_output) 88 | 89 | 90 | # Create empty list to store prediction results 91 | results = [] 92 | for input_data_row in input_data: 93 | # Append prediction to results 94 | results.append(predict_with_network(input_data_row, weights)) 95 | 96 | # Print results 97 | print(results) 98 | 99 | 100 | # ''''''''''''' Behaviour of a Multi Layer Neural Network ''''''''# 101 | 102 | def predict_with_network(input_data): 103 | # Calculate node 0 in the first hidden layer 104 | node_0_0_input = (input_data * weights['node_0_0']).sum() 105 | node_0_0_output = relu(node_0_0_input) 106 | 107 | # Calculate node 1 in the first hidden layer 108 | node_0_1_input = (input_data * weights['node_0_1']).sum() 109 | node_0_1_output = relu(node_0_1_input) 110 | 111 | # Put node values into array: hidden_0_outputs 112 | hidden_0_outputs = np.array([node_0_0_output, node_0_1_output]) 113 | 114 | # Calculate node 0 in the second hidden layer 115 | node_1_0_input = (hidden_0_outputs * weights['node_1_0']).sum() 116 | node_1_0_output = relu(node_1_0_input) 117 | 118 | # Calculate node 1 in the second hidden layer 119 | node_1_1_input = (hidden_0_outputs * weights['node_1_1']).sum() 120 | node_1_1_output = relu(node_1_1_input) 121 | 122 | # Put node values into array: hidden_1_outputs 123 | hidden_1_outputs = np.array([node_1_0_output, node_1_1_output]) 124 | 125 | # Calculate model output: model_output 126 | model_output = (weights['output'] * hidden_1_outputs).sum() 127 | 128 | # Return model_output 129 | return(model_output) 130 | 131 | output = predict_with_network(input_data) 132 | print(output) 133 | 134 | 135 | # ''' Calculating Model Errors - Consideration of weight effects''''### 136 | 137 | # '''''''' Test Case - Bank Transactions Predictions '''''''## 138 | 139 | # ''''''' Coding how weight changes affects accuracy ''''#'''''### 140 | -------------------------------------------------------------------------------- /Deep_Learning_KerasModel_Build_3.py: 
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu May 11 18:46:07 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 | import matplotlib.pyplot as plt
11 | from sklearn.metrics import mean_squared_error
12 | from keras.layers import Dense
13 | from keras.models import Sequential
14 |
15 | # predictors = np.loadtxt('predictors_data.csv', delimiter=',')
16 |
17 | predictors = np.loadtxt('aerodata.csv', delimiter=',')
18 | target = 3  # placeholder only - fitting below really needs an array of targets
19 | # Import necessary modules (sections below also assume: keras.utils.to_categorical, keras.optimizers.SGD, keras.callbacks.EarlyStopping)
20 |
21 | # Save the number of columns in predictors: n_cols
22 | # n_cols = predictors.shape[1]
23 |
24 | # Set up the model: model
25 | # model = Sequential()
26 |
27 | # Add the first layer
28 | # model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
29 |
30 | # Add the second layer
31 | # model.add(Dense(32, activation='relu'))
32 |
33 | # Add the output layer
34 | # model.add(Dense(1))
35 |
36 | # ''''''''' Compile the Model ''''''''''#
37 |
38 | # Specify the model
39 | n_cols = predictors.shape[1]
40 | model = Sequential()
41 | model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
42 | model.add(Dense(32, activation='relu'))
43 | model.add(Dense(1))
44 |
45 | # Compile the model
46 | model.compile(optimizer='adam', loss='mean_squared_error')
47 |
48 | # Verify that model contains information from compiling
49 | print("Loss function: " + model.loss)
50 |
51 | model.fit(predictors, target)
52 |
53 | # ''''''''''Define Classification Model - Titanic dataset example '''#
54 |
55 | # Convert the target to categorical: target (df is the Titanic DataFrame, assumed loaded)
56 | target = to_categorical(df.survived)
57 |
58 | # Set up the model
59 | model = Sequential()
60 |
61 | # Add the first layer
62 | model.add(Dense(32, activation='relu', input_shape=(n_cols,)))
63 |
64 | # Add the output layer
65 | model.add(Dense(2, activation='softmax'))
66 |
67 | # Compile the model
68 | model.compile(optimizer='sgd', loss='categorical_crossentropy',
69 |               metrics=['accuracy'])
70 |
71 | # Fit the model
72 | model.fit(predictors, target)
73 |
74 |
75 | # '''''''''''' Making predictions ''''''''''#
76 |
77 | # Calculate predictions: predictions (pred_data is a held-out predictor array)
78 | predictions = model.predict(pred_data)
79 |
80 | # Calculate predicted probability of survival: predicted_prob_true
81 | predicted_prob_true = predictions[:, 1]
82 |
83 | # print predicted_prob_true
84 | print(predicted_prob_true)
85 |
86 |
87 | # '''''''''' Model Optimisation - (#4)'''''''''''#
88 |
89 |
90 | # Create list of learning rates: lr_to_test
91 | lr_to_test = [.000001, 0.01, 1]
92 |
93 | # Loop over learning rates
94 | for lr in lr_to_test:
95 |     print('\n\nTesting model with learning rate: %f\n' % lr)
96 |
97 |     # Build new model to test, unaffected by previous models (get_new_model() is assumed defined elsewhere)
98 |     model = get_new_model()
99 |
100 |     # Create SGD optimizer with specified learning rate: my_optimizer
101 |     my_optimizer = SGD(lr=lr)
102 |
103 |     # Compile the model
104 |     model.compile(optimizer=my_optimizer, loss='categorical_crossentropy')
105 |
106 |     # Fit the model
107 |     model.fit(predictors, target)
108 |
109 | # ''''''Evaluate model accuracy on validation dataset ''''''#
110 |
111 | # Save the number of columns in predictors: n_cols
112 | n_cols = predictors.shape[1]
113 | input_shape = (n_cols,)
114 |
115 | # Specify the model
116 | model = Sequential()
117 | model.add(Dense(100, activation='relu', input_shape=input_shape))
118 | model.add(Dense(100, activation='relu'))
119 | model.add(Dense(2,
activation='softmax')) 120 | 121 | # Compile the model 122 | model.compile(optimizer='adam', loss='categorical_crossentropy', 123 | metrics=['accuracy']) 124 | 125 | # Fit the model 126 | hist = model.fit(predictors, target, validation_split=0.3) 127 | 128 | 129 | # '''''' Early Stopping - Optimising the optimisation ''''''''''# 130 | 131 | # Import EarlyStopping - already done above 132 | 133 | # Save the number of columns in predictors: n_cols 134 | n_cols = predictors.shape[1] 135 | input_shape = (n_cols,) 136 | 137 | # Specify the model 138 | model = Sequential() 139 | model.add(Dense(100, activation='relu', input_shape=input_shape)) 140 | model.add(Dense(100, activation='relu')) 141 | model.add(Dense(2, activation='softmax')) 142 | 143 | # Compile the model 144 | model.compile(optimizer='adam', loss='categorical_crossentropy', 145 | metrics=['accuracy']) 146 | 147 | # Define early_stopping_monitor 148 | early_stopping_monitor = EarlyStopping(patience=2) 149 | 150 | # Fit the model 151 | model.fit(predictors, target, epochs=30, validation_split=0.3, 152 | callbacks=[early_stopping_monitor]) 153 | 154 | # ''''''''''''' Experimenting with a wider network ''''''# 155 | 156 | # Define early_stopping_monitor 157 | early_stopping_monitor = EarlyStopping(patience=2) 158 | 159 | # Create the new model: model_2 160 | model_2 = Sequential() 161 | 162 | # Add the first and second layers 163 | model_2.add(Dense(100, activation='relu', input_shape=input_shape)) 164 | model_2.add(Dense(100, activation='relu')) 165 | 166 | # Add the output layer 167 | model_2.add(Dense(2, activation='softmax')) 168 | 169 | # Compile model_2 170 | model_2.compile(optimizer='adam', loss='categorical_crossentropy', 171 | metrics=['accuracy']) 172 | 173 | # Fit model_1 174 | model_1_training = model_1.fit(predictors, target, epochs=15, 175 | validation_split=0.2, 176 | callbacks=[early_stopping_monitor], 177 | verbose=False) 178 | 179 | # Fit model_2 180 | model_2_training = model_2.fit(predictors, target, epochs=15, 181 | validation_split=0.2, 182 | callbacks=[early_stopping_monitor], 183 | verbose=False) 184 | 185 | # Create the plot 186 | plt.plot(model_1_training.history['val_loss'], 'r', 187 | model_2_training.history['val_loss'], 'b') 188 | plt.xlabel('Epochs') 189 | plt.ylabel('Validation score') 190 | plt.show() 191 | 192 | -------------------------------------------------------------------------------- /Deep_Learning_KerasModel_Optimise_4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 15 09:52:23 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from keras.layers import Dense 10 | from keras.models import Sequential 11 | from keras.callbacks import EarlyStopping 12 | 13 | # Import the SGD optimizer 14 | from keras.optimizers import SGD 15 | 16 | # Create list of learning rates: lr_to_test 17 | lr_to_test = [.000001, 0.01, 1] 18 | 19 | # Loop over learning rates 20 | for lr in lr_to_test: 21 | print('\n\nTesting model with learning rate: % f \n'% lr) 22 | 23 | # Build new model to test, unaffected by previous models 24 | model = get_new_model() 25 | 26 | # Create SGD optimizer with specified learning rate: my_optimizer 27 | my_optimizer = SGD(lr=lr) 28 | 29 | # Compile the model 30 | model.compile(optimizer=my_optimizer, loss='categorical_crossentropy') 31 | 32 | # Fit the model 33 | model.fit(predictors, target) 34 | 35 | 36 | # ''''''Evaluate model accuracy on validation dataset 
''''''# 37 | 38 | # Save the number of columns in predictors: n_cols 39 | n_cols = predictors.shape[1] 40 | input_shape = (n_cols,) 41 | 42 | # Specify the model 43 | model = Sequential() 44 | model.add(Dense(100, activation='relu', input_shape=input_shape)) 45 | model.add(Dense(100, activation='relu')) 46 | model.add(Dense(2, activation='softmax')) 47 | 48 | # Compile the model 49 | model.compile(optimizer='adam', loss='categorical_crossentropy', 50 | metrics=['accuracy']) 51 | 52 | # Fit the model 53 | hist = model.fit(predictors, target, validation_split=0.3) 54 | 55 | 56 | # '''''' Early Stopping - Optimising the optimisation ''''''''''# 57 | 58 | # Import EarlyStopping - already done above 59 | 60 | # Save the number of columns in predictors: n_cols 61 | n_cols = predictors.shape[1] 62 | input_shape = (n_cols,) 63 | 64 | # Specify the model 65 | model = Sequential() 66 | model.add(Dense(100, activation='relu', input_shape=input_shape)) 67 | model.add(Dense(100, activation='relu')) 68 | model.add(Dense(2, activation='softmax')) 69 | 70 | # Compile the model 71 | model.compile(optimizer='adam', loss='categorical_crossentropy', 72 | metrics=['accuracy']) 73 | 74 | # Define early_stopping_monitor 75 | early_stopping_monitor = EarlyStopping(patience=2) 76 | 77 | # Fit the model 78 | model.fit(predictors, target, epochs=30, validation_split=0.3, 79 | callbacks=[early_stopping_monitor]) 80 | 81 | 82 | # ''''''''''''' Experimenting with a wider network ''''''# 83 | 84 | # Define early_stopping_monitor 85 | early_stopping_monitor = EarlyStopping(patience=2) 86 | 87 | # Create the new model: model_2 88 | model_2 = Sequential() 89 | 90 | # Add the first and second layers 91 | model_2.add(Dense(100, activation='relu', input_shape=input_shape)) 92 | model_2.add(Dense(100, activation='relu')) 93 | 94 | # Add the output layer 95 | model_2.add(Dense(2, activation='softmax')) 96 | 97 | # Compile model_2 98 | model_2.compile(optimizer='adam', loss='categorical_crossentropy', 99 | metrics=['accuracy']) 100 | 101 | # Fit model_1 102 | model_1_training = model_1.fit(predictors, target, epochs=15, 103 | validation_split=0.2, 104 | callbacks=[early_stopping_monitor], 105 | verbose=False) 106 | 107 | # Fit model_2 108 | model_2_training = model_2.fit(predictors, target, epochs=15, 109 | validation_split=0.2, 110 | callbacks=[early_stopping_monitor], 111 | verbose=False) 112 | 113 | # Create the plot 114 | plt.plot(model_1_training.history['val_loss'], 'r', 115 | model_2_training.history['val_loss'], 'b') 116 | plt.xlabel('Epochs') 117 | plt.ylabel('Validation score') 118 | plt.show() 119 | 120 | 121 | # ''''''''' Adding layers to the model ''''''''' # 122 | 123 | # The input shape to use in the first hidden layer 124 | input_shape = (n_cols,) 125 | 126 | # Create the new model: model_2 127 | model_2 = Sequential() 128 | 129 | # Add the first, second, and third hidden layers 130 | model_2.add(Dense(50, activation='relu', input_shape=input_shape)) 131 | model_2.add(Dense(50, activation='relu')) 132 | model_2.add(Dense(50, activation='relu')) 133 | 134 | # Add the output layer 135 | model_2.add(Dense(2, activation='softmax')) 136 | 137 | # Compile model_2 138 | model_2.compile(optimizer='adam', loss='categorical_crossentropy', 139 | metrics=['accuracy']) 140 | 141 | # Fit model 1 142 | model_1_training = model_1.fit(predictors, target, epochs=20, 143 | validation_split=0.4, 144 | callbacks=[early_stopping_monitor], 145 | verbose=False) 146 | 147 | # Fit model 2 148 | model_2_training = model_2.fit(predictors, 
target, epochs=20,
                               validation_split=0.4,
                               callbacks=[early_stopping_monitor],
                               verbose=False)

# Create the plot
plt.plot(model_1_training.history['val_loss'], 'r',
         model_2_training.history['val_loss'], 'b')
plt.xlabel('Epochs')
plt.ylabel('Validation score')
plt.show()


# '''''' Digit Recognition Model '''''''#

# X (flattened 28x28-pixel digit images) and y (one-hot labels) are assumed
# preloaded here, as in the course environment.

# Create the model: model
model = Sequential()

# Add the first hidden layer
model.add(Dense(50, activation='relu', input_shape=(784,)))

# Add the second hidden layer
model.add(Dense(50, activation='relu'))

# Add the output layer
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fit the model
model.fit(X, y, validation_split=0.3)
--------------------------------------------------------------------------------
/Deep_Learning_Ntwrk_Optim_2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon May 1 18:28:26 2017

@author: Shabaka
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


# ''''' Rectified Lin Activation Func. ''''''''''' ##


def relu(input):
    '''Rectified linear activation: max(input, 0).'''
    # Calculate the value for the output of the relu function: output
    output = max(input, 0)

    # Return the value just calculated
    return output


# .............###

# Weights for the two-hidden-layer network used by predict_with_network()
# below. NOTE: the original dict only defined 'node_0', 'node_1' and
# 'output', which the function below would reject with a KeyError; the
# per-node keys and values here are an assumption based on the course
# exercise this file follows.
weights = {'node_0_0': np.array([2, 4]), 'node_0_1': np.array([4, -5]),
           'node_1_0': np.array([-1, 2]), 'node_1_1': np.array([1, 2]),
           'output': np.array([2, 7])}

# '''''' Part 1 End ''''''''''' ###

input_data = [3, 5]
# ''''''''''''' Behaviour of a Multi Layer Neural Network ''''''''#


def predict_with_network(input_data):
    # Calculate node 0 in the first hidden layer
    node_0_0_input = (input_data * weights['node_0_0']).sum()
    node_0_0_output = relu(node_0_0_input)

    # Calculate node 1 in the first hidden layer
    node_0_1_input = (input_data * weights['node_0_1']).sum()
    node_0_1_output = relu(node_0_1_input)

    # Put node values into array: hidden_0_outputs
    hidden_0_outputs = np.array([node_0_0_output, node_0_1_output])

    # Calculate node 0 in the second hidden layer
    node_1_0_input = (hidden_0_outputs * weights['node_1_0']).sum()
    node_1_0_output = relu(node_1_0_input)

    # Calculate node 1 in the second hidden layer
    node_1_1_input = (hidden_0_outputs * weights['node_1_1']).sum()
    node_1_1_output = relu(node_1_1_input)

    # Put node values into array: hidden_1_outputs
    hidden_1_outputs = np.array([node_1_0_output, node_1_1_output])

    # Calculate model output: model_output
    model_output = (weights['output'] * hidden_1_outputs).sum()

    # Return model_output
    return model_output


output = predict_with_network(input_data)
print(output)
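
# ----- Sketches (assumption): helpers the Part 2 sections below rely on ----
# Part 2 calls predict_with_network(row, weights) with TWO arguments and
# one-hidden-layer weights ('node_0', 'node_1', 'output'), and later calls
# get_slope() and get_mse(); none of these are defined in the original file.
# The definitions below are minimal sketches of what the course environment
# is assumed to provide; redefining predict_with_network here mirrors how
# the course redefined it between exercises.


def predict_with_network(input_data_row, weights):
    """Forward pass through a one-hidden-layer network."""
    node_0_output = relu((input_data_row * weights['node_0']).sum())
    node_1_output = relu((input_data_row * weights['node_1']).sum())
    hidden_outputs = np.array([node_0_output, node_1_output])
    return (hidden_outputs * weights['output']).sum()


def get_slope(input_data, target, weights):
    """Slope of the squared error for a model with no hidden layer.

    The sign convention matches the update `weights + 0.01 * slope`
    used in the loop further down.
    """
    preds = (weights * input_data).sum()
    error = target - preds
    return 2 * input_data * error


def get_mse(input_data, target, weights):
    """Squared error of the same no-hidden-layer model on one data point."""
    preds = (weights * input_data).sum()
    return (preds - target) ** 2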

# ''''''''''''''''''''' Deep Learning - Part 2 ''''''''''' ##


# ''' Calculating Model Errors - Consideration of weight effects''''###

# '''''''' Test Case - Bank Transactions Predictions '''''''##

# ''''''' Coding how weight changes affect accuracy ''''#'''''###

# The data point you will make a prediction for

input_data = np.array([0, 3])

# Sample weights
weights_0 = {'node_0': [2, 1],
             'node_1': [1, 2],
             'output': [1, 1]
             }

# The actual target value, used to calculate the error
target_actual = 3
target = 2  # (reused by the gradient sections further down)
# Make prediction using original weights
model_output_0 = predict_with_network(input_data, weights_0)

# Calculate error: error_0
error_0 = model_output_0 - target_actual

# Create weights that cause the network to make perfect prediction (3):
# weights_1
weights_1 = {'node_0': [2, 1],
             'node_1': [1, 2],
             'output': [1, 0]
             }

# Make prediction using new weights: model_output_1
model_output_1 = predict_with_network(input_data, weights_1)

# Calculate error: error_1
error_1 = model_output_1 - target_actual

# Print error_0 and error_1
print(error_0)
print(error_1)


# '''''''''' Scaling up - Multiple Data Points ''''''''''''#

# From here on, input_data is assumed to be a sequence of input rows and
# target_actuals the matching array of true values (both preloaded in the
# course environment).

# Create model_output_0
model_output_0 = []
# Create model_output_1
model_output_1 = []

# Loop over input_data
for row in input_data:
    # Append prediction to model_output_0
    model_output_0.append(predict_with_network(row, weights_0))

    # Append prediction to model_output_1
    model_output_1.append(predict_with_network(row, weights_1))

# Calculate the mean squared error for model_output_0: mse_0
mse_0 = mean_squared_error(model_output_0, target_actuals)

# Calculate the mean squared error for model_output_1: mse_1
mse_1 = mean_squared_error(model_output_1, target_actuals)

# Print mse_0 and mse_1
print("Mean squared error with weights_0 : %f" % mse_0)
print("Mean squared error with weights_1 : %f" % mse_1)

# '''''''' Calculating Slopes '''''#

# (weights and input_data for this no-hidden-layer model are assumed to be
# preloaded 1-D arrays here, as in the course environment)

# Calculate the predictions: preds
preds = (weights * input_data).sum()

# Calculate the error: error
error = target - preds

# Calculate the slope: slope
slope = 2 * input_data * error

# Print the slope
print(slope)

# '''''''''' Improving the model weights '''''''' #

# Set the learning rate: learning_rate
learning_rate = 0.01

# Calculate the predictions: preds
preds = (weights * input_data).sum()

# Calculate the error: error
error = target - preds

# Calculate the slope: slope
slope = 2 * input_data * error

# Update the weights: weights_updated
weights_updated = weights + (learning_rate * slope)

# Get updated predictions: preds_updated
preds_updated = (weights_updated * input_data).sum()

# Calculate updated error: error_updated
error_updated = target - preds_updated

# Print the original error
print(error)

# Print the updated error
print(error_updated)

# ''''''' Making multiple updates to weights ''''''' #

n_updates = 20
mse_hist = []

# Iterate over the number of updates
# (get_slope and get_mse are sketched earlier in this file)
for i in range(n_updates):
    # Calculate the slope: slope
    slope = get_slope(input_data, target, weights)

    # Update the weights: weights
    weights = weights + 0.01 * slope

    # Calculate mse with new weights: mse
    mse = get_mse(input_data, target, weights)

    # Append the mse to mse_hist
mse_hist.append(mse) 201 | 202 | # Plot the mse history 203 | plt.plot(mse_hist) 204 | plt.xlabel('Iterations') 205 | plt.ylabel('Mean Squared Error') 206 | plt.show() 207 | 208 | -------------------------------------------------------------------------------- /Distribution_Check_Theor_ECDF_Data_CDF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 11 16:21:39 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | # import pandas as pd 11 | 12 | from ecdf_func import ecdf 13 | 14 | # Compute mean and standard deviation: mu, sigma 15 | mu = np.mean(belmont_no_outliers) 16 | sigma = np.std(belmont_no_outliers) 17 | 18 | 19 | # Sample out of a normal distribution with this mu and sigma: samples 20 | samples = np.random.normal(mu, sigma, size=10000) 21 | 22 | # Get the CDF of the samples and of the data 23 | x_theor, y_theor = ecdf(samples) 24 | x, y = ecdf(belmont_no_outliers) 25 | 26 | # Plot the CDFs and show the plot 27 | _ = plt.plot(x_theor, y_theor) 28 | _ = plt.plot(x, y, marker='.', linestyle='none') 29 | plt.margins(0.02) 30 | _ = plt.xlabel('Belmont winning time (sec.)') 31 | _ = plt.ylabel('CDF') 32 | plt.show() 33 | 34 | 35 | # Take a million samples out of the Normal distribution: samples 36 | samples = np.random.normal(mu, sigma, size=1000000) 37 | 38 | # Compute the fraction that are faster than 144 seconds: prob 39 | prob = np.sum(samples <= 144)/len(samples) 40 | 41 | # Print the result 42 | print('Probability of besting Secretariat:', prob) 43 | 44 | # #################################### # 45 | 46 | # Determine successive poisson relationship - i.e. total time between 47 | # two poisson processes 48 | 49 | 50 | def successive_poisson(tau1, tau2, size=1): 51 | # Draw samples out of first exponential distribution: t1 52 | t1 = np.random.exponential(tau1, size) 53 | 54 | # Draw samples out of second exponential distribution: t2 55 | t2 = np.random.exponential(tau2, size) 56 | 57 | return t1 + t2 58 | 59 | 60 | # Draw samples of waiting times: waiting_times 61 | waiting_times = successive_poisson(764, 715, size=100000) 62 | 63 | # Make the histogram 64 | _ = plt.hist(waiting_times, normed=True, histtype='step', bins=100) 65 | 66 | 67 | # Label axes 68 | plt.xlabel('waiting_times') 69 | plt.ylabel('successive_poisson') 70 | 71 | 72 | # Show the plot 73 | plt.show() -------------------------------------------------------------------------------- /EDA_Analysis_Comarison.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 12 17:12:24 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # ########## EDA Analysis ######### # 9 | 10 | # Once the dataframe has been created and you can identify the relevant columns of interest 11 | # this example considers the relationship between illiteracy and fertility as per datacamp course example 12 | # Plot the illiteracy rate versus fertility 13 | _ = plt.plot(illiteracy, fertility, marker='.', linestyle='none') 14 | 15 | # Set the margins and label axes 16 | plt.margins(0.02) 17 | _ = plt.xlabel('percent illiterate') 18 | _ = plt.ylabel('fertility') 19 | 20 | # Show the plot 21 | plt.show() 22 | 23 | # Show the Pearson correlation coefficient 24 | print(pearson_r(illiteracy, fertility)) 25 | 26 | # ############ ######### LINEAR REGRESSION ############ # 27 | 28 | # Perform a linear regression using np.polyfit(): a, b 
a, b = np.polyfit(illiteracy, fertility, 1)

# Print the results to the screen
print('slope =', a, 'children per woman / percent illiterate')
print('intercept =', b, 'children per woman')

# Make theoretical line to plot
x = np.array([0, 100])
y = a * x + b

# Add regression line to your plot
_ = plt.plot(x, y)

# Draw the plot
plt.show()

# ############# IS REGRESSION OPTIMAL? ######## #

# Specify slopes to consider: a_vals
a_vals = np.linspace(0, 0.1, 200)

# Initialize sum of square of residuals: rss
rss = np.empty_like(a_vals)

# Compute sum of square of residuals for each value of a_vals
for i, a in enumerate(a_vals):
    rss[i] = np.sum((fertility - a*illiteracy - b)**2)

# Plot the RSS
plt.plot(a_vals, rss, '-')
plt.xlabel('slope (children per woman / percent illiterate)')
plt.ylabel('sum of square of residuals')

plt.show()

# Notice the minimum on the plot: the slope that minimises the sum of the
# squares of the residuals is the same value obtained from the regression.
--------------------------------------------------------------------------------
/EDA_Hypothesis_Test.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 19:02:21 2017

@author: Shabaka
"""

# The following analysis looks at an example comparing the impact force of
# frogs - interesting :-)
# (numpy as np, seaborn as sns and matplotlib.pyplot as plt, plus the frog
# DataFrame df, the arrays force_a / force_b and the helpers draw_perm_reps
# and draw_bs_reps, are assumed to be preloaded here, as in the course
# environment; sketches of the two helpers follow the one-sample test below.)


# ########### EDA BEFORE HYPOTHESIS TESTING ### #
# Make bee swarm plot
_ = sns.swarmplot(x='ID', y='impact_force', data=df)

# Label axes
_ = plt.xlabel('frog')
_ = plt.ylabel('impact force (N)')

# Show the plot
plt.show()


# ######## permutation test on the data ### #

# make a difference of means function #### #

def diff_of_means(data_1, data_2):
    """Difference in means of two arrays."""

    # The difference of means of data_1, data_2: diff
    diff = np.mean(data_1) - np.mean(data_2)

    return diff


# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(force_a, force_b)

# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(force_a, force_b,
                                 diff_of_means, size=10000)

# Compute p-value: p
p = np.sum(perm_replicates >= empirical_diff_means) / len(perm_replicates)

# Print the result
print('p-value =', p)


# ######## ONE SAMPLE BOOTSTRAP TEST ########### #

# Make an array of translated impact forces: translated_force_b
translated_force_b = force_b - np.mean(force_b) + 0.55

# Take bootstrap replicates of Frog B's translated impact forces: bs_replicates
bs_replicates = draw_bs_reps(translated_force_b, np.mean, 10000)

# Compute fraction of replicates that are less than the observed Frog B force: p
p = np.sum(bs_replicates <= np.mean(force_b)) / 10000
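
# ----- Sketches (assumption): draw_perm_reps / draw_bs_reps -----
# Neither helper is defined in this file; both come from the course
# environment. Minimal sketches consistent with how they are called above
# and below (in a runnable script they would need to precede their first
# use):


def draw_bs_reps(data, func, size=1):
    """Draw `size` bootstrap replicates of func applied to resampled data."""
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_replicates[i] = func(np.random.choice(data, size=len(data)))
    return bs_replicates


def draw_perm_reps(data_1, data_2, func, size=1):
    """Draw `size` permutation replicates of func(perm_sample_1, perm_sample_2)."""
    perm_replicates = np.empty(size)
    for i in range(size):
        # Permute the pooled data and split it back into two samples
        permuted = np.random.permutation(np.concatenate((data_1, data_2)))
        perm_sample_1 = permuted[:len(data_1)]
        perm_sample_2 = permuted[len(data_1):]
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates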

# BOOTSTRAP TEST FOR AN IDENTICAL DISTRIBUTION

# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(force_a, force_b)

# Concatenate forces: forces_concat
forces_concat = np.concatenate((force_a, force_b))

# Initialize bootstrap replicates: bs_replicates
bs_replicates = np.empty(10000)

for i in range(10000):
    # Generate bootstrap sample
    bs_sample = np.random.choice(forces_concat, size=len(forces_concat))

    # Compute replicate - slice both halves at len(force_a) so the
    # concatenated sample is actually partitioned (the original sliced the
    # second half at len(force_b), which only works if the samples happen
    # to be the same size)
    bs_replicates[i] = diff_of_means(bs_sample[:len(force_a)],
                                     bs_sample[len(force_a):])

# Compute and print p-value: p
p = np.sum(bs_replicates >= empirical_diff_means) / 10000
print('p-value =', p)

--------------------------------------------------------------------------------
/Entry_Count_Check_Exception.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 9 01:41:06 2017

@author: Shabaka
"""


# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""

    # Initialize an empty dictionary: cols_count
    cols_count = {}

    # Add try block
    try:
        # Extract column from DataFrame: col
        col = df[col_name]

        # Iterate over the column in dataframe
        for entry in col:

            # If entry is in cols_count, add 1
            if entry in cols_count.keys():
                cols_count[entry] += 1
            # Else add the entry to cols_count, set the value to 1
            else:
                cols_count[entry] = 1

        # Return the cols_count dictionary
        return cols_count

    # Add except block - catch KeyError specifically rather than using a
    # bare except, so unrelated errors are not silently swallowed
    except KeyError:
        print('The dataframe does not have a ' + col_name + ' column.')


# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Print result1
print(result1)

# Call count_entries(): result2
# (there is no 'lang1' column, so this exercises the except branch)
result2 = count_entries(tweets_df, 'lang1')
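
# ----- Optional aside (added sketch): collections.Counter equivalent -----
# The happy path of count_entries() is one line with the standard library
# (tweets_df is assumed preloaded, as above; note Counter raises its own
# KeyError for a missing column instead of printing a message):
from collections import Counter

print(Counter(tweets_df['lang']))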
--------------------------------------------------------------------------------
/EthicalHackingCourseNotes.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 15 16:03:49 2017

@author: Shabaka
"""

# Pen test methodology
# 1. Vulnerability
# 2. Exploit - end of pen testing
# 3. Trace removal
##########################################################
# elevate privileges
# copy or move data
# log out without being noticed

#####################################################
# Attack vectors
# 3 major areas:
# Network attacks
# Host OS attacks
# Application attacks
######################################################
# Vulnerability management
# 6 steps:
# discover
# categorise and prioritise
# scan for vulnerabilities
# report and classify
# remediate
# verify checks
##############################
# Incident management - quickly resolve incidents with minimal
# impact to the process or business
# improve monitoring
# elimination of loss of requests
# availability of information
# accurate CMDB information
# improve user and customer satisfaction
#########################
# Incident management plan
# Identify
# Analyse
# Gather information
# Contain
# Mitigate
# Eradicate
######################################################
# Risk assessment
# Vulnerability assessments
--------------------------------------------------------------------------------
/ExtractHist_Image.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 4 13:24:24 2017

@author: Shabaka
"""

import numpy as np  # added: np.interp is used in the equalisation section
import matplotlib.pyplot as plt

# (note: the `normed` keyword used throughout this file was removed in newer
# matplotlib releases, where `density=True` is the replacement)


# Load the image into an array: image
image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg')

# Display image in top subplot using color map 'gray'
plt.subplot(2, 1, 1)
plt.title('Original image')
plt.axis('off')
plt.imshow(image, cmap='gray')

# Flatten the image into 1 dimension: pixels
pixels = image.flatten()

# Display a histogram of the pixels in the bottom subplot
plt.subplot(2, 1, 2)
plt.xlim((0, 255))
plt.title('Normalized histogram')
plt.hist(pixels, bins=64, range=(0, 256), normed=True, color='red', alpha=0.4)
# _ = plt.imshow(image)  # (stray line: this would paint the image over the
#                        # histogram subplot, so it is left disabled)
# Display the plot
plt.show()

# '''''''''''''''' Cumulative Distribution Func on Image Hist ''#

# Load the image into an array: image
image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg')

# Display image in top subplot using color map 'gray'
plt.subplot(2, 1, 1)
plt.imshow(image, cmap='gray')
plt.title('Original image')
plt.axis('off')

# Flatten the image into 1 dimension: pixels
pixels = image.flatten()

# Display a histogram of the pixels in the bottom subplot
plt.subplot(2, 1, 2)
pdf = plt.hist(pixels, bins=64, range=(0, 256), normed=False,
               color='red', alpha=0.4)
plt.grid('off')

# Use plt.twinx() to overlay the CDF in the bottom subplot
_ = plt.twinx()

# Display a cumulative histogram of the pixels
cdf = plt.hist(pixels, bins=64, range=(0, 256),
               normed=True, cumulative=True,
               color='blue', alpha=0.4)

# Specify x-axis range, hide axes, add title and display plot
plt.xlim((0, 256))
plt.grid('off')
plt.title('PDF & CDF (original image)')
plt.show()

# ''''''''''''' Equalise Image Histogram ''''''#

# Load the image into an array: image
image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg')

# Flatten the image into 1 dimension: pixels
pixels = image.flatten()

# Generate a cumulative histogram
cdf, bins, patches = plt.hist(pixels, bins=256, range=(0, 256), normed=True,
                              cumulative=True)
new_pixels = np.interp(pixels, bins[:-1], cdf*255)

# Reshape new_pixels as a 2-D array: new_image
new_image = new_pixels.reshape(image.shape)

# Display the new image with 'gray' color map
plt.subplot(2, 1, 1)
plt.title('Equalized image')
plt.axis('off')
plt.imshow(new_image, cmap='gray')

# Generate a histogram of the new pixels
plt.subplot(2, 1, 2)
pdf = plt.hist(new_pixels, bins=64, range=(0, 256), normed=False,
               color='red', alpha=0.4)
plt.grid('off')

# Use plt.twinx() to overlay the CDF in the bottom subplot
_ = plt.twinx()
plt.xlim((0, 256))
plt.grid('off')

# Add title
plt.title('PDF & CDF (equalized image)')

# Generate a cumulative histogram of the new pixels
cdf = plt.hist(new_pixels, bins=64, range=(0, 256),
               cumulative=True, normed=True,
               color='blue', alpha=0.4)
plt.show()
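
# ----- Optional aside (added sketch): the same mapping without pyplot -----
# The equalisation above builds its CDF through plt.hist, which also draws.
# The identical pixel mapping can be computed with numpy alone (no figure
# side effects); `pixels` is assumed to be the flattened image from above:

counts, bin_edges = np.histogram(pixels, bins=256, range=(0, 256))
cdf_vals = np.cumsum(counts) / counts.sum()          # normalised CDF in [0, 1]
equalised = np.interp(pixels, bin_edges[:-1], cdf_vals * 255)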

# ''''''''''''' Extract Histograms from a colour Image ''''''''''#

# Load the image into an array: image
image = plt.imread('hs-2004-32-b-small_web.jpg')

# Display image in top subplot
plt.subplot(2, 1, 1)
plt.title('Original image')
plt.axis('off')
plt.imshow(image)

# Extract 2-D arrays of the RGB channels: red, green, blue
red, green, blue = image[:, :, 0], image[:, :, 1], image[:, :, 2]

# Flatten the 2-D arrays of the RGB channels into 1-D
red_pixels = red.flatten()
blue_pixels = blue.flatten()
green_pixels = green.flatten()

# Overlay histograms of the pixels of each color in the bottom subplot
plt.subplot(2, 1, 2)
plt.title('Histograms from color image')
plt.xlim((0, 256))
plt.hist(red_pixels, bins=64, normed=True, color='red', alpha=0.2)
plt.hist(blue_pixels, bins=64, normed=True, color='blue', alpha=0.2)
plt.hist(green_pixels, bins=64, normed=True, color='green', alpha=0.2)

# Display the plot
plt.show()

# ''''''Extracting Bivariate Histograms from a Colour Image '''''''#

# Load the image into an array: image
image = plt.imread('hs-2004-32-b-small_web.jpg')

# Extract RGB channels and flatten into 1-D arrays
# (channel order in the array is R, G, B; the original unpacked
# `red, blue, green = ...`, silently swapping the blue and green channels)
red, green, blue = image[:, :, 0], image[:, :, 1], image[:, :, 2]
red_pixels = red.flatten()
blue_pixels = blue.flatten()
green_pixels = green.flatten()

# Generate a 2-D histogram of the red and green pixels
plt.subplot(2, 2, 1)
plt.grid('off')
plt.xticks(rotation=60)
plt.xlabel('red')
plt.ylabel('green')
_ = plt.hist2d(red_pixels, green_pixels, bins=(32, 32))

# Generate a 2-D histogram of the green and blue pixels
plt.subplot(2, 2, 2)
plt.grid('off')
plt.xticks(rotation=60)
plt.xlabel('green')
plt.ylabel('blue')
_ = plt.hist2d(green_pixels, blue_pixels, bins=(32, 32))

# Generate a 2-D histogram of the blue and red pixels
plt.subplot(2, 2, 3)
plt.grid('off')
plt.xticks(rotation=60)
plt.xlabel('blue')
plt.ylabel('red')
_ = plt.hist2d(blue_pixels, red_pixels, bins=(32, 32))

# Display the plot
plt.show()
--------------------------------------------------------------------------------
/Extract_Data_from_HDF5.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:30:20 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Get the HDF5 group: group 9 | group = data['strain'] 10 | 11 | # Check out keys of group 12 | for key in group.keys(): 13 | print(key) 14 | 15 | # Set variable equal to time series data: strain 16 | strain = data['strain']['Strain'].value 17 | 18 | # Set number of time points to sample: num_samples 19 | num_samples = 10000 20 | 21 | # Set time vector 22 | time = np.arange(0, 1, 1/num_samples) 23 | 24 | # Plot data 25 | plt.plot(time, strain[:num_samples]) 26 | plt.xlabel('GPS Time (s)') 27 | plt.ylabel('strain') 28 | plt.show() 29 | -------------------------------------------------------------------------------- /File_Import_Multi_DataType.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 9 13:05:02 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Assign filename: file 9 | file = 'seaslug.txt' 10 | 11 | # Import file: data 12 | data = np.loadtxt(file, delimiter='\t', dtype=str) 13 | 14 | # Print the first element of data 15 | print(data[0]) 16 | 17 | # Import data as floats and skip the first row: data_float 18 | data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1) 19 | 20 | # Print the 10th element of data_float 21 | print(data_float[9]) 22 | 23 | # Plot a scatterplot of the data 24 | plt.scatter(data_float[:, 0], data_float[:, 1]) 25 | plt.xlabel('time (min.)') 26 | plt.ylabel('percentage of larvae') 27 | plt.show() 28 | -------------------------------------------------------------------------------- /FilterData_Selected_from_Table_SQLAlchemy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 17 13:15:13 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # import and_ 9 | from sqlalchemy import and_ 10 | 11 | # BUild a query for the census table: stmt 12 | stmt = select([census]) 13 | 14 | # Append a where clause to select only non-male records from California using and_ 15 | 16 | stmt = stmt.where( 17 | # The state of California with a non-male sex 18 | and_(census.columns.state == 'California', census.columns.sex != 'M') 19 | 20 | ) 21 | 22 | # Loop over the ResultProxy printing the age and sex 23 | for result in connection.execute(stmt): 24 | print(result.age, result.sex) 25 | 26 | -------------------------------------------------------------------------------- /FilterSQL_Database_Table_Col_Row.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 15:01:21 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | """ 11 | Let's say, for example that you wanted to get all records from the Customer 12 | table of the Chinook database for which the Country is 'Canada'. 13 | You can do this very easily in SQL 14 | using a SELECT statement followed by a WHERE clause as follows: 15 | 16 | SELECT * FROM Customer WHERE Country = 'Canada' 17 | 18 | In fact, you can filter any SELECT statement by any condition using a WHERE 19 | clause. This is called filtering your records. 
20 | Below, you'll select all records of the Employee table for which 'EmployeeId' 21 | is greater than or equal to 6 22 | """ 23 | 24 | 25 | # Create engine: engine 26 | engine = create_engine('sqlite:///Chinnok.sqlite') 27 | 28 | # Open engine in context manager 29 | # Perform query and save results to DataFrame: df 30 | with engine.connect() as con: 31 | rs = con.execute("SELECT * FROM Employee WHERE EmployeeId >= 6") 32 | df = pd.DataFrame(rs.fetchall()) 33 | df.columns = rs.keys() 34 | 35 | # Print the head of the DataFrame df 36 | print(df.head()) 37 | -------------------------------------------------------------------------------- /FilterSQL_Database_Table_WHERE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 15:01:21 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | """ 11 | Let's say, for example that you wanted to get all records from the Customer 12 | table of the Chinook database for which the Country is 'Canada'. 13 | You can do this very easily in SQL 14 | using a SELECT statement followed by a WHERE clause as follows: 15 | 16 | SELECT * FROM Customer WHERE Country = 'Canada' 17 | 18 | In fact, you can filter any SELECT statement by any condition using a WHERE 19 | clause. This is called filtering your records. 20 | Below, you'll select all records of the Employee table for which 'EmployeeId' 21 | is greater than or equal to 6 22 | """ 23 | 24 | # Create engine: engine 25 | engine = create_engine('sqlite:///Chinnok.sqlite') 26 | 27 | # Open engine in context manager 28 | # Perform query and save results to DataFrame: df 29 | with engine.connect() as con: 30 | rs = con.execute("SELECT * FROM Employee WHERE EmployeeId >= 6") 31 | df = pd.DataFrame(rs.fetchall()) 32 | df.columns = rs.keys() 33 | 34 | # Print the head of the DataFrame df 35 | print(df.head()) 36 | -------------------------------------------------------------------------------- /General Multi_Column DataFrame Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 7 18:14:49 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | # Define count_entries() 11 | def count_entries(df, *args): 12 | """Return a dictionary with counts of 13 | occurrences as value for each key.""" 14 | #Initialize an empty dictionary: cols_count 15 | cols_count = {} 16 | # Iterate over column names in args 17 | for col_name in args: 18 | 19 | # Extract column from DataFrame: col 20 | col = df[col_name] 21 | 22 | # Iterate over the column in dataframe 23 | for entry in col: 24 | 25 | # If entry is in cols_count, add 1 26 | if entry in cols_count.keys(): 27 | cols_count[entry] += 1 28 | 29 | # Else add the entry to cols_count, set the value to 1 30 | else: 31 | cols_count[entry] = 1 32 | 33 | # Return the cols_count dictionary 34 | return cols_count 35 | 36 | # Call count_entries(): result1 37 | result1 = count_entries(tweets_df, 'lang') 38 | 39 | # Call count_entries(): result2 40 | result2 = count_entries(tweets_df, 'lang', 'source') 41 | 42 | # Print result1 and result2 43 | print(result1) 44 | print(result2) 45 | 46 | -------------------------------------------------------------------------------- /General Twitter Language Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 
""" 3 | Created on Sat Jan 7 17:31:07 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | # Define count_entries() 12 | def count_entries(df, col_name = 'lang'): 13 | """Return a dictionary with counts of 14 | occurrences as value for each key.""" 15 | 16 | # Initialize an empty dictionary: cols_count 17 | cols_count = {} 18 | 19 | # Extract column from DataFrame: col 20 | col = df[col_name] 21 | 22 | # Iterate over the column in dataframe 23 | for entry in col: 24 | 25 | # If entry is in cols_count, add 1 26 | if entry in cols_count.keys(): 27 | cols_count[entry] += 1 28 | 29 | # Else add the entry to cols_count, set the value to 1 30 | else: 31 | cols_count[entry] = 1 32 | 33 | # Return the cols_count dictionary 34 | return cols_count 35 | 36 | # Call count_entries(): result1 37 | result1 = count_entries(tweets_df, 'lang') 38 | 39 | # Call count_entries(): result2 40 | result2 = count_entries(tweets_df, 'source') 41 | 42 | # Print result1 and result2 43 | print(result1) 44 | print(result2) 45 | 46 | -------------------------------------------------------------------------------- /Generate from MultiType Data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 10 00:24:52 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | data = np.genfromtxt('gaze_positions.csv', delimiter=',', names=True, dtype=None) 12 | 13 | np.shape(data) 14 | 15 | 16 | data[0] 17 | 18 | 19 | #More mixed datatypes 20 | 21 | # Assign the filename: file 22 | file = 'titanic.csv' 23 | 24 | # Import file using np.recfromcsv: d 25 | d = np.recfromcsv(file) 26 | 27 | #np.recfrocsv already contains the default 28 | #delimiter as a comma and dtype is none 29 | 30 | # Print out first three entries of d 31 | print(d[:3]) -------------------------------------------------------------------------------- /HTML_with_BeautifulSoup_GetHypLinktData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 16 00:08:42 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import requests 10 | from bs4 import BeautifulSoup 11 | 12 | # Specify url 13 | url = 'https://www.python.org/~guido/' 14 | 15 | # Package the request, send the request and catch the response: r 16 | r = requests.get(url) 17 | 18 | # Extracts the response as html: html_doc 19 | html_doc = r.text 20 | 21 | # create a BeautifulSoup object from the HTML: soup 22 | soup = BeautifulSoup(html_doc) 23 | 24 | # Print the title of Guido's webpage 25 | print(soup.title) 26 | 27 | # Find all 'a' tags (which define hyperlinks): a_tags 28 | a_tags = soup.find_all('a') 29 | 30 | # Print the URLs to the shell 31 | for link in a_tags: 32 | print(link.get('href')) -------------------------------------------------------------------------------- /HTML_with_BeautifulSoup_GetTextData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 16 00:01:32 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import requests 10 | from bs4 import BeautifulSoup 11 | 12 | # Specify url: url 13 | url = 'https://www.python.org/~guido/' 14 | 15 | # Package the request, send the request and catch the response: r 16 | r = requests.get(url) 17 | 18 | # Extract the response as html: html_doc 19 | html_doc = r.text 20 | 21 | 
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc)  # presents page in a readable manner
soup.body.text

bold = soup.findAll('b')  # find all bold text and return a list

print(bold)
print(soup.prettify())
# Get the title of Guido's webpage: guido_title
guido_title = (soup.title)

# Print the title of Guido's webpage to the shell
print(guido_title)

# Get Guido's text: guido_text
guido_text = (soup.get_text())

# Print Guido's text to the shell
print(guido_text)

soup.findAll(id="para2")[0].text
soup.findAll(['b', 'p'])

soup.findAll({'b': True, 'p': True})

# find all links in the document

links = soup.find('a')  # returns the 1st match it gets - use findAll for all

print(links['href'] + " is the url and " + links.text + " is the text")


# Use find in various ways

# findParents, findNextSiblings, findPreviousSiblings
# findNext, findPrevious and findAllNext and findAllPrevious
--------------------------------------------------------------------------------
/HTTP_Request_Urllib_Response.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 23:10:17 2017

@author: Shabaka
"""

# Import packages
from urllib.request import urlopen, Request

# Specify the url
url = "http://www.datacamp.com/teach/documentation"

# This packages the request: request
request = Request(url)

# Sends the request and catches the response: response
response = urlopen(request)

# Print the datatype of response
print(type(response))

# Be polite and close the response!
response.close()
--------------------------------------------------------------------------------
/HTTP_Request_Urllib_Response_Read.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 23:15:37 2017

@author: Shabaka
"""

# Import packages
from urllib.request import urlopen, Request

# Specify the url
url = "http://docs.datacamp.com/teach/"

# This packages the request
request = Request(url)

# Sends the request and catches the response: response
response = urlopen(request)

# Extract the response: html
html = response.read()

# Print the html
print(html)

# Be polite and close the response!
response.close()
--------------------------------------------------------------------------------
/HTTP_Request_using_Requests.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 23:32:44 2017

@author: Shabaka
"""


"""

Import the package requests.
Assign the URL of interest to the variable url.
Package the request to the URL, send the request and catch the response
with a single function requests.get(), assigning the response to the
variable r.
Use the text attribute of the object r to return the HTML of the webpage
as a string; store the result in a variable text.
Hit submit to print the HTML of the webpage.
19 | """ 20 | 21 | # Import package 22 | import requests 23 | 24 | # Specify the url: url 25 | url = "http://docs.datacamp.com/teach/" 26 | 27 | # Packages the request, send the request and catch the response: r 28 | r = requests.get(url) 29 | 30 | # Extract the response: text 31 | text = r.text 32 | 33 | # Print the html 34 | print(text) -------------------------------------------------------------------------------- /Hack_Bern_nprandom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 21:39:58 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def perform_bernoulli_trials(n, p): 12 | """Perform n Bernoulli trials with success probability p 13 | and return number of successes.""" 14 | # Initialize number of successes: n_success 15 | n_success = 0 16 | 17 | # Perform trials 18 | for i in range(n): 19 | # Choose random number between zero and one: random_number 20 | random_number = np.random.random() 21 | 22 | # If less than p, it's a success so add one to n_success 23 | if random_number < p: 24 | n_success += 1 25 | 26 | return n_success 27 | 28 | # Seed random number generator 29 | np.random.seed(42) 30 | 31 | # Initialize the number of defaults: n_defaults 32 | n_defaults = np.empty(1000) 33 | 34 | # Compute the number of defaults 35 | for i in range(1000): 36 | n_defaults[i] = perform_bernoulli_trials(100, 0.05) 37 | 38 | 39 | # Plot the histogram with default number of bins; label your axes 40 | _ = plt.hist(n_defaults, normed=True) 41 | _ = plt.xlabel('number of defaults out of 100 loans') 42 | _ = plt.ylabel('probability') 43 | 44 | # Show the plot 45 | plt.show() 46 | 47 | # Compute bin edges: bins 48 | bins = np.arange(-0.5, max(n_defaults + 1.5) - 0.5) 49 | 50 | # Generate histogram 51 | _ = plt.hist(n_defaults, normed=True, bins=bins) 52 | 53 | # Set margins 54 | plt.margins(0.02) 55 | 56 | # Label axes 57 | _ = plt.xlabel('number of defaults out of 100 loans') 58 | _ = plt.ylabel('Binomial PMF') 59 | 60 | 61 | # Show the plot 62 | plt.show() 63 | 64 | 65 | # Draw 10,000 samples out of Poisson distribution: samples_poisson 66 | samples_poisson = np.random.poisson(10, size=10000) 67 | 68 | # Print the mean and standard deviation 69 | print('Poisson: ', np.mean(samples_poisson), 70 | np.std(samples_poisson)) 71 | 72 | # Specify values of n and p to consider for Binomial: n, p 73 | n = [20, 100, 1000] 74 | p = [0.5, 0.1, 0.01] 75 | 76 | 77 | # Draw 10,000 samples for each n,p pair: samples_binomial 78 | for i in range(3): 79 | samples_binomial = np.random.binomial(n[i], p[i], size=10000) 80 | 81 | # Print results 82 | print('n =', n[i], 'Binom:', np.mean(samples_binomial), 83 | np.std(samples_binomial)) 84 | 85 | -------------------------------------------------------------------------------- /Hack_Stats_BasicRandGen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 21:05:24 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | # Seed the random number generator 12 | np.random.seed(42) 13 | 14 | # Initialize random numbers: random_numbers 15 | random_numbers = np.empty(100000) 16 | 17 | # Generate random numbers by looping over range(100000) 18 | for i in range(100000): 19 | random_numbers[i] = np.random.random() 20 | 21 | # Plot a histogram 22 | _ = plt.hist(random_numbers) 23 | 
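# Aside (added note, not in the original): the loop above is the course's
# teaching device; the idiomatic NumPy equivalent is a single vectorised
# call:
# random_numbers = np.random.random(size=100000)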
24 | # Show the plot 25 | plt.show() 26 | -------------------------------------------------------------------------------- /Import_Excel_Pandas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 01:28:11 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import pandas 9 | import pandas as pd 10 | 11 | # Assign spreadsheet filename: file 12 | file = 'battledeath.xlsx' 13 | 14 | # Load spreadsheet: xl 15 | xl = pd.ExcelFile(file) 16 | 17 | # Print sheet names 18 | print(xl.sheet_names) 19 | 20 | """ 21 | Import Excel Sheets Specifically 22 | """ 23 | 24 | # Load a sheet into a DataFrame by name: df1 25 | df1 = xl.parse('2004') 26 | 27 | # Print the head of the DataFrame df1 28 | print(df1.head()) 29 | 30 | # Load a sheet into a DataFrame by index: df2 31 | df2 = xl.parse(0) 32 | 33 | # Print the head of the DataFrame df2 34 | print(df2.head()) -------------------------------------------------------------------------------- /Import_Excel_Parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 02:22:12 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | """ 10 | The spreadsheet 'battledeath.xlsx' is already loaded as xl. 11 | 12 | As before, you'll use the method parse(). This time, however, you'll add the 13 | additional arguments skiprows, names and parse_cols. These skip rows, name the 14 | columns and designate which columns to parse, respectively. All these arguments 15 | can be assigned to lists containing the specific row numbers, strings and 16 | column numbers, respectively. 17 | """ 18 | 19 | # Parse the first sheet and rename the columns: df1 20 | df1 = xl.parse(0, skiprows=[0], names=['Country', 'AAM due to War (2002)']) 21 | 22 | # Print the head of the DataFrame df1 23 | print(df1.head()) 24 | 25 | # Parse the first column of the second sheet and rename the column: df2 26 | df2 = xl.parse(1, parse_cols=[0], skiprows=[0], names=['Country']) 27 | 28 | # Print the head of the DataFrame df2 29 | print(df2.head()) 30 | -------------------------------------------------------------------------------- /Import_FlatFile_Web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 15 21:27:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import the function urlretrieve from the subpackage urllib.request. 9 | # Assign the URL of the file to the variable url. 10 | # Use the function urlretrieve() to save the file locally as 11 | # 'winequality-red.csv'. 
12 | # Execute the remaining code to load 'winequality-red.csv' in a pandas 13 | # DataFrame and to print its head to the shell 14 | 15 | 16 | # Import package 17 | from urllib.request import urlretrieve 18 | 19 | # Import pandas 20 | import pandas as pd 21 | 22 | # Assign url of file: url 23 | url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/data\ 24 | sets/winequality-red.csv' 25 | 26 | # Save file locally 27 | urlretrieve(url, 'winequality-red.csv') 28 | 29 | # Read file into a DataFrame and print its head 30 | df = pd.read_csv('winequality-red.csv', sep=';') 31 | print(df.head()) -------------------------------------------------------------------------------- /Import_HDF5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:24:25 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import numpy as np 10 | import h5py 11 | 12 | # Assign filename: file 13 | file = 'LIGO_data.hdf5' 14 | 15 | # Load file: data 16 | data = h5py.File('LIGO_data.hdf5', 'r') 17 | 18 | # Print the datatype of the loaded file 19 | print(type(data)) 20 | 21 | # Print the keys of the file 22 | for key in data.keys(): 23 | print(key) -------------------------------------------------------------------------------- /Import_MatLab_WorkSpace.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 18:35:11 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import scipy.io 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | # Load MATLAB file: mat 14 | mat = scipy.io.loadmat('albeck_gene_expression.mat') 15 | 16 | # Print the datatype type of mat 17 | print(type(mat)) 18 | 19 | """ 20 | discover what is in the MATLAB dictionary that you loaded 21 | """ 22 | 23 | # Print the keys of the MATLAB dictionary 24 | print(mat.keys()) 25 | 26 | # Print the type of the value corresponding to the key 'CYratioCyt' 27 | print(type(mat['CYratioCyt'])) 28 | 29 | # Print the shape of the value corresponding to the key 'CYratioCyt' 30 | print(np.shape(mat['CYratioCyt'])) 31 | 32 | # Subset the array and plot it 33 | data = mat['CYratioCyt'][25, 5:] 34 | fig = plt.figure() 35 | plt.plot(data) 36 | plt.xlabel('time (min.)') 37 | plt.ylabel('normalized fluorescence (measure of expression)') 38 | plt.show() 39 | -------------------------------------------------------------------------------- /Import_Pickled-Data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 01:22:54 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import pickle package 9 | import pickle 10 | 11 | # Open pickle file and load data: d 12 | with open('data.pkl', 'rb') as file: 13 | d = pickle.load(file) 14 | 15 | # Print d 16 | print(d) 17 | 18 | # Print datatype of d 19 | print(type(d)) -------------------------------------------------------------------------------- /Import_Plot_Web_Flatfile_NonLocal_Save.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 15 21:47:37 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import matplotlib.pyplot as plt 10 | import pandas as pd 11 | 12 | # Assign url of file: url 13 | url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606\ 14 | 
/datasets/winequality-red.csv' 15 | 16 | # Read file into a DataFrame: df 17 | df = pd.read_csv(url, sep= ';') 18 | 19 | # Print the head of the DataFrame 20 | print(df.head()) 21 | 22 | # Plot first column of df 23 | pd.DataFrame.hist(df.ix[:, 0:1]) 24 | plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)') 25 | plt.ylabel('count') 26 | plt.show() 27 | -------------------------------------------------------------------------------- /Import_SAS7BDAT_.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:00:42 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | 13 | # Import sas7bdat package 14 | from sas7bdat import SAS7BDAT 15 | 16 | # Save file to a DataFrame: df_sas 17 | with SAS7BDAT('sales.sas7bdat') as file: 18 | df_sas = file.to_data_frame() 19 | 20 | 21 | # Print head of DataFrame 22 | print(df_sas.head()) 23 | 24 | # Plot histogram of DataFrame features (pandas and pyplot already imported) 25 | pd.DataFrame.hist(df_sas[['P']]) 26 | plt.ylabel('count') 27 | plt.show() -------------------------------------------------------------------------------- /Import_Stata_File.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:10:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | 12 | # Import pandas 13 | import pandas as pd 14 | 15 | # Load Stata file into a pandas DataFrame: df 16 | df = pd.read_stata('disarea.dta') 17 | 18 | # Print the head of the DataFrame df 19 | print(df.head()) 20 | 21 | # Plot histogram of one column of the DataFrame 22 | pd.DataFrame.hist(df[['disa10']]) 23 | plt.xlabel('Extent of disease') 24 | plt.ylabel('Number of coutries') 25 | plt.show() 26 | -------------------------------------------------------------------------------- /Inserting_Multiple_Rows.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 26 02:43:07 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | 10 | # Build a list of dictionaries: values_list 11 | values_list = [ 12 | {'name': 'Anna', 'count': 1, 'amount': 1000.00, 'valid':True}, 13 | {'name' : 'Taylor', 'count':1, 'amount':750.00, 'valid':False} 14 | ] 15 | 16 | # Build an insert statement for the data table: stmt 17 | stmt = insert(data) 18 | 19 | # Execute stmt with the values_list: results 20 | results = connection.execute(stmt, values_list) 21 | 22 | # Print rowcount 23 | print(results.rowcount) -------------------------------------------------------------------------------- /Iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 27 17:44:35 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | # ## ITERATING ITERABLES ### # 10 | 11 | # Create a list of strings: flash 12 | flash = ['jay garrick', 'barry allen', 'wally west', 'bart allen'] 13 | 14 | # Print each list item in flash using a for loop 15 | for person in flash: 16 | print(person) 17 | 18 | 19 | # Create an iterator for flash: superspeed 20 | superspeed = iter(flash) 21 | 22 | # Print each item from the iterator 23 | print(next(superspeed)) 24 | print(next(superspeed)) 25 | print(next(superspeed)) 26 | print(next(superspeed)) 27 | 28 | # ########## Iteration 2 ######## # 29 | 30 | 
# Create an iterator for range(3): small_value 31 | small_value = iter(range(3)) 32 | 33 | # Print the values in small_value 34 | print(next(small_value)) 35 | print(next(small_value)) 36 | print(next(small_value)) 37 | 38 | # Loop over range(3) and print the values 39 | for num in range(3): 40 | print(num) 41 | 42 | 43 | # Create an iterator for range(10 ** 100): googol 44 | googol = iter(range(10**100)) 45 | 46 | # Print the first 5 values from googol 47 | print(next(googol)) 48 | print(next(googol)) 49 | print(next(googol)) 50 | print(next(googol)) 51 | print(next(googol)) 52 | 53 | 54 | # ######## ## # Iterator as a function argument ### # 55 | # Create a range object: values 56 | values = range(10, 21) 57 | 58 | # Print the range object 59 | print(values) 60 | 61 | # Create a list of integers: values_list 62 | values_list = list(values) 63 | 64 | # Print values_list 65 | print(values_list) 66 | 67 | # Get the sum of values: values_sum 68 | values_sum = sum(values) 69 | 70 | # Print values_sum 71 | print(values_sum) 72 | 73 | # ############# Enumerate ############ # 74 | # Create a list of strings: mutants 75 | mutants = ['charles xavier', 76 | 'bobby drake', 77 | 'kurt wagner', 78 | 'max eisenhardt', 79 | 'kitty pride'] 80 | 81 | # Create a list of tuples: mutant_list 82 | mutant_list = list(enumerate(mutants)) 83 | 84 | # Print the list of tuples 85 | print(mutant_list) 86 | 87 | # Unpack and print the tuple pairs 88 | for index1, value1 in enumerate(mutants): 89 | print(index1, value1) 90 | 91 | # Change the start index 92 | for index2, value2 in enumerate(mutants, start=1): 93 | print(index2, value2) 94 | 95 | # ##### Using zip ################# # 96 | 97 | # Create a list of tuples: mutant_data 98 | mutant_data = list(zip(mutants, aliases, powers)) 99 | 100 | # Print the list of tuples 101 | print(mutant_data) 102 | 103 | # Create a zip object using the three lists: mutant_zip 104 | mutant_zip = zip(mutants, aliases, powers) 105 | 106 | # Print the zip object 107 | print(mutant_zip) 108 | 109 | # Unpack the zip object and print the tuple values 110 | for value1, value2, value3 in mutant_zip: 111 | print(value1, value2, value3) 112 | 113 | 114 | # ########### Unzip with * and zip (*iterable) ############# # 115 | 116 | # Create a zip object from mutants and powers: z1 117 | z1 = zip(mutants, powers) 118 | 119 | # Print the tuples in z1 by unpacking with * 120 | print(*z1) 121 | 122 | # Re-create a zip object from mutants and powers: z1 123 | z1 = zip(mutants, powers) 124 | 125 | # 'Unzip' the tuples in z1 by unpacking with * and zip(): result1, result2 126 | result1, result2 = zip(*z1) 127 | 128 | # Check if unpacked tuples are equivalent to original tuples 129 | print(result1 == mutants) 130 | print(result2 == powers) 131 | -------------------------------------------------------------------------------- /Lambda_List_Filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 8 14:10:46 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Create a list of strings: fellowship 9 | fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli'] 10 | 11 | # Use filter() to apply a lambda function over fellowship: result 12 | result = filter(lambda member:len(member) > 6, fellowship) 13 | 14 | # Convert result to a list: result_list 15 | result_list = list(result) 16 | 17 | # Convert result into a list and print it 18 | print(result_list) 19 | 20 | 21 | # random gibberish code test 

# random gibberish code test

# Define gibberish
def gibberish(*args):
    """Concatenate strings in *args together."""
    hodgepodge = ''
    for word in args:
        hodgepodge += word
    return hodgepodge


# Import reduce from functools
from functools import reduce

# Create a list of strings: stark
stark = ['robb', 'sansa', 'arya', 'eddard', 'jon']

# Use reduce() to apply a lambda function over stark: result
result = reduce(lambda item1, item2: item1 + item2, stark)

# Print the result
print(result)
-------------------------------------------------------------------------------- /LinReg_BS_Pairs_func.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 14 00:52:07 2017

@author: Shabaka
"""

import numpy as np


def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression."""

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)

    return bs_slope_reps, bs_intercept_reps
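# Hedged usage sketch (not part of the original file): synthetic data stands in
# for the course's preloaded arrays; report a 95% CI for the bootstrap slopes.
if __name__ == '__main__':
    demo_x = np.random.normal(size=50)
    demo_y = 2.0 * demo_x + np.random.normal(scale=0.5, size=50)
    bs_slopes, bs_intercepts = draw_bs_pairs_linreg(demo_x, demo_y, size=1000)
    print(np.percentile(bs_slopes, [2.5, 97.5]))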
-------------------------------------------------------------------------------- /Linear_Regression_Anscombe.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 12 20:24:53 2017

@author: Shabaka
"""

import numpy as np
import matplotlib.pyplot as plt

# ecdf helper, imported as elsewhere in this repo (e.g. Binomial_Dist_plot.py)
from ecdf_func import ecdf

# x and y are assumed preloaded (Anscombe's first data set in the
# original exercise)

# Perform linear regression: a, b
a, b = np.polyfit(x, y, 1)

# Print the slope and intercept
print(a, b)

# Generate theoretical x and y data: x_theor, y_theor
x_theor = np.array([3, 15])
y_theor = a * x_theor + b

# Plot the Anscombe data and theoretical line
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.plot(x_theor, y_theor)

# Label the axes
plt.xlabel('x')
plt.ylabel('y')

# Show the plot
plt.show()

# ########### LINEAR REGRESSION ON ALL DATA ####### ######### #

# Iterate through x, y pairs (anscombe_x and anscombe_y are assumed
# preloaded lists of the four Anscombe data sets)
for x, y in zip(anscombe_x, anscombe_y):
    # Compute the slope and intercept: a, b
    a, b = np.polyfit(x, y, 1)

    # Print the result
    print('slope:', a, 'intercept:', b)

# ####### BOOTSTRAP VISUALISATION ### ################# #

# rainfall is assumed preloaded (yearly rainfall data in the original exercise)
for _ in range(50):
    # Generate bootstrap sample: bs_sample
    bs_sample = np.random.choice(rainfall, size=len(rainfall))

    # Compute and plot ECDF from bootstrap sample
    x, y = ecdf(bs_sample)
    _ = plt.plot(x, y, marker='.', linestyle='none',
                 color='gray', alpha=0.1)

# Compute and plot ECDF from original data
x, y = ecdf(rainfall)
_ = plt.plot(x, y, marker='.')

# Make margins and label axes
plt.margins(0.02)
_ = plt.xlabel('yearly rainfall (mm)')
_ = plt.ylabel('ECDF')

# Show the plot
plt.show()

# ########### BOOTSTRAP REPLICATE FUNCTION ####### ############ #


def bootstrap_replicate_1d(data, func):
    """Generate a bootstrap replicate of 1D data."""
    return func(np.random.choice(data, size=len(data)))


# ######## MULTIPLE BOOTSTRAP REPLICATES ######### ######### #

# data is assumed preloaded; draw 10,000 bootstrap replicates of its mean
bs_replicates = np.empty(10000)

for i in range(10000):
    bs_replicates[i] = bootstrap_replicate_1d(data, np.mean)
-------------------------------------------------------------------------------- /ListComp_Gen.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 28 19:13:25 2017

@author: Shabaka
"""

# List of strings
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# List comprehension
fellow1 = [member for member in fellowship if len(member) >= 7]

print(fellow1)

# Generator expression
fellow2 = (member for member in fellowship if len(member) >= 7)

print(fellow2)

# '''''''''' Basic Generator Expression '''''''''' #

# Create generator object: result
result = (num for num in range(31))

# Print the first 5 values
print(next(result))
print(next(result))
print(next(result))
print(next(result))
print(next(result))

# Print the rest of the values
for value in result:
    print(value)


# '''''''''' Output Change in Generator Expression '''''''''' #

# Create a list of strings: lannister
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

print(lannister)

# Create a generator object: lengths
lengths = (len(person) for person in lannister)

# Iterate over and print the values in lengths
print('Lannister values, i.e. name lengths')

for value in lengths:
    print(value)


# '''''''''' Generator Build - Basic '''''''''' #

# Create a list of strings
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']


# Define generator function get_lengths
def get_lengths(input_list):
    """Generator function that yields the
    length of the strings in input_list."""

    # Yield the length of a string
    for person in input_list:
        yield len(person)


# Print the values generated by get_lengths()
for value in get_lengths(lannister):
    print(value)
-------------------------------------------------------------------------------- /ListComp_timestamped.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 28 23:30:21 2017

@author: Shabaka
"""
import pandas as pd

# df is assumed preloaded (a DataFrame of tweets in the original exercise)

# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time: tweet_clock_time
tweet_clock_time = [entry[11:19] for entry in tweet_time]

# Print the extracted times
print(tweet_clock_time)


# '''''''''' Conditional List Comprehension - Time-Stamped Data '''''''''' #

# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time for entries whose seconds read '19'
tweet_clock_time = [entry[11:19] for entry in tweet_time
                    if entry[17:19] == '19']

# Print the extracted times
print(tweet_clock_time)
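# Hedged, self-contained illustration (not in the original script): a two-row
# stand-in for the preloaded tweets DataFrame, showing why entry[11:19] is the
# clock time and entry[17:19] the seconds in Twitter's created_at format.
demo_df = pd.DataFrame({'created_at': ['Tue Mar 29 23:40:17 +0000 2016',
                                       'Tue Mar 29 23:40:19 +0000 2016']})
print([entry[11:19] for entry in demo_df['created_at']])
# ['23:40:17', '23:40:19']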
-------------------------------------------------------------------------------- /List_Dictionary_Full.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 28 23:48:08 2017

@author: Shabaka
"""

# '''''''''' Working with World Bank World Indicator Dataset '''''''''' #
# Import the pandas package

import pandas as pd

# feature_names, row_vals and row_lists are assumed preloaded
# (World Bank indicator data in the original exercise)

# Zip lists: zipped_lists
zipped_lists = zip(feature_names, row_vals)

# Create a dictionary: rs_dict
rs_dict = dict(zipped_lists)

# Print the dictionary
print(rs_dict)


# '''''''''' List to Dictionary Function '''''''''' #

# Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides
    the keys and list2 provides the values."""

    # Zip lists: zipped_lists
    zipped_lists = zip(list1, list2)

    # Create a dictionary: rs_dict
    rs_dict = dict(zipped_lists)

    # Return the dictionary
    return rs_dict


# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)

# Print the first two lists in row_lists
print(row_lists[0])
print(row_lists[1])

# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])

# '''''''''' Turn the list of dicts into a pandas DataFrame '''''''''' #

# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)

# Print the head of the DataFrame
print(df.head())
-------------------------------------------------------------------------------- /Load_Explore_Twitter_Data.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:50:27 2017

@author: Shabaka
"""

# Import package
import json

# String of path to file: tweets_data_path
tweets_data_path = 'tweets.txt'

# Initialize empty list to store tweets: tweets_data
tweets_data = []

# Open connection to file
tweets_file = open(tweets_data_path, "r")

# Read in tweets and store in list: tweets_data
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)

# Close connection to file
tweets_file.close()

# Print the keys of the first tweet dict
print(tweets_data[0].keys())
-------------------------------------------------------------------------------- /Local_JSon_Load_Explore.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 00:59:00 2017

@author: Shabaka
"""

# import module
import json

# Load JSON: json_data
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)

# Print each key-value pair in json_data
for k in json_data.keys():
    print(k + ': ', json_data[k])
-------------------------------------------------------------------------------- /Multidata_tweeter_count_function.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a sample script layout file
"""


# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""

    # Initialize an empty dictionary: cols_count
    cols_count = {}

    # Extract column from DataFrame: col
    col = df[col_name]

    # Iterate over the column in the DataFrame
    for entry in col:

        # If entry is in cols_count, add 1
        if entry in cols_count:
            cols_count[entry] += 1

        # Else add the entry to cols_count, set the value to 1
        else:
            cols_count[entry] = 1

    # Return the cols_count dictionary
    return cols_count


# tweets_df is assumed preloaded (a DataFrame of tweets)

# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Call count_entries(): result2
result2 = count_entries(tweets_df, 'source')

# Print result1 and result2
print(result1)
print(result2)
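# Hedged aside (not in the original script): pandas computes the same counts
# directly via value_counts(), demonstrated here on a tiny stand-in DataFrame.
import pandas as pd

demo_tweets = pd.DataFrame({'lang': ['en', 'et', 'en', 'en', 'und']})
print(demo_tweets['lang'].value_counts().to_dict())
# {'en': 3, 'et': 1, 'und': 1} (ordering may vary)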
-------------------------------------------------------------------------------- /Nested_List_Comp.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 27 22:45:07 2017

@author: Shabaka
"""

# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]

# Print the matrix
for row in matrix:
    print(row)


# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member for member in fellowship if len(member) >= 7]

# Print the new list
print(new_fellowship)

# ############### Conditional in List Comprehension ######## #

# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member if len(member) >= 7 else '' for member in fellowship]

# Print the new list
print(new_fellowship)


# '''''''''' Dictionary Comprehension '''''''''' #

fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# Create dict comprehension: new_fellowship
new_fellowship = {member: len(member) for member in fellowship}

# Print the new dictionary
print(new_fellowship)
-------------------------------------------------------------------------------- /Non_Flat_File_Import_Web-Excel.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 22:36:30 2017

@author: Shabaka
"""

# Import package
import pandas as pd

# Assign url of file: url
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/\
importing_data_into_r/latitude.xls'

# Read in all sheets of Excel file: xl
# (the keyword is sheet_name in current pandas; very old versions
# spelled it sheetname)
xl = pd.read_excel(url, sheet_name=None)

# Print the sheet names to the shell
print(xl.keys())

# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())
-------------------------------------------------------------------------------- /Numpy_Import_LoadTxt.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 9 12:51:35 2017

@author: Shabaka
"""

# Import numpy
import numpy as np

# Assign the filename: file
file = 'digits_header.txt'

# Load the data: data
data = np.loadtxt(file, delimiter='\t', skiprows=1, usecols=[0, 2])

# Print data
print(data)
-------------------------------------------------------------------------------- /Numpy_LoadData_and_Plot.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 9 03:57:27 2017

@author: Shabaka
"""

# Import packages
import numpy as np
import matplotlib.pyplot as plt

# Assign filename to variable: file
file = 'digits.csv'

# Load file as array: digits
digits = np.loadtxt(file, delimiter=',')

# Print datatype of digits
print(type(digits))

# Select and reshape a row
im = digits[21, 1:]
im_sq = np.reshape(im, (28, 28))

# Plot reshaped data
plt.imshow(im_sq, cmap='Greys', interpolation='nearest')
plt.show()
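# Hedged extension (not in the original script): view the first few digits in a
# grid, reusing the same reshape logic; assumes each row is label + 784 pixels.
fig, axes = plt.subplots(1, 4)
for ax, row in zip(axes, digits[:4]):
    ax.imshow(np.reshape(row[1:], (28, 28)), cmap='Greys',
              interpolation='nearest')
    ax.axis('off')
plt.show()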
Most of these are written in
Python 3 (some are in Python 2.7, written with gedit on Linux) and I will endeavour to flag these differences.
They have all been written for my specific environments as above and for the Data Science domain.
I welcome feedback on how they work for you and whether you find them useful.
The Anaconda IDE gives quite a lot of support for debugging and I endeavour to do as much of it as I can.

To use these scripts, you will need to know how to make them work for your specific use case - ASSUMING that you know what you are doing.
And if you really want to understand the underlying methods, the courses on DataCamp, Udemy, Coursera and many other great content platforms are as good as any out there - perhaps that's a stretch. The point is, they are everywhere.

In classes I teach, I can surely try to explain a concept based on my understanding and possibly some implementation.


Note that the data sets are not available here; use your own data. You will, however, find datasets online if you google them - there are quite a few out there, and nothing beats a bit of legwork. Beware of those pesky rabbit holes though - it is very easy to get lost when you are having fun.. debugging...

These scripts have proven useful in their adaptability for other projects I am working on, but for posterity, this page has been created.

You may find some functions or pieces of this code elsewhere on the web. My commercial programming
experience is still ongoing and, just like everyone else, I tend to look up how to do a specific thing and sometimes borrow that. Yes, devs live in the stackoverflow-verse - some google-fu helps also.

Please do not quote me if your implementation doesn't work. #justsaying

But do quote me if it does :)

Having said all that, we all know that once in a while you find something that's written extremely well (such as on StackOverflow or other blogs), and rightly so - there's no use reinventing the wheel.

My hope is that this repo can help make your building/dev work a lot easier.

See what you think.

I'll attempt to credit anything of the sort as I post it, and apologies if anyone is missed - if you see such a case, please let me know and I will rectify it asap.

Happy Hacking..
-------------------------------------------------------------------------------- /Random_NLTK.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 14:17:02 2017

@author: Shabaka
"""

from string import punctuation

from nltk.corpus import stopwords


class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):

        self.min_cut = min_cut
        self.max_cut = max_cut
        # The original file breaks off mid-expression here; the trailing
        # [u"'s", '"'] is an assumed completion
        self._stopwords = set(stopwords.words('english') +
                              list(punctuation) +
                              [u"'s", '"'])

# Process
# 1 - Download article from url


# 2 - Eliminate stop words etc. that add no meaning
-------------------------------------------------------------------------------- /SQL_Arbitrary_Insert_Row.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 02:19:03 2017

@author: Shabaka
"""

# Import insert and select from sqlalchemy
from sqlalchemy import insert, select

# data (the table) and connection are assumed set up,
# as in SQL_Arbitrary_Table_Create.py

# Build an insert statement to insert a record into the data table: stmt
stmt = insert(data).values(name='Anna', count=1, amount=1000.00, valid=True)

# Execute the statement via the connection: results
results = connection.execute(stmt)

# Print result rowcount
print(results.rowcount)

# Build a select statement to validate the insert
stmt = select([data]).where(data.columns.name == 'Anna')

# Print the result of executing the query.
print(connection.execute(stmt).first())
-------------------------------------------------------------------------------- /SQL_Arbitrary_Table_Create.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 01:03:19 2017

@author: Shabaka
"""

# Import Table, Column, String, Integer, Float, Boolean from sqlalchemy
from sqlalchemy import Table, Column, String, Integer, Float, Boolean

# metadata and engine are assumed set up beforehand
# (see Create_DataBase_Engine.py)

# Define a new table with a name, count, amount, and valid column: data
data = Table('data', metadata,
             Column('name', String(255)),
             Column('count', Integer()),
             Column('amount', Float()),
             Column('valid', Boolean())
             )

# Use the metadata to create the table
metadata.create_all(engine)

# Print table repr
print(repr(data))
-------------------------------------------------------------------------------- /SQL_Automatic_Join_Est_Rel.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 01:14:58 2017

@author: Shabaka
"""

from sqlalchemy import select

# census, state_fact and connection are assumed set up (reflected tables
# and an open connection)

# Build a statement to join census and state_fact tables: stmt
stmt = select([census.columns.pop2000, state_fact.columns.abbreviation])

# Execute the statement and get the first result: result
result = connection.execute(stmt).first()

# Loop over the keys in the result object and print the key and value
for key in result.keys():
    print(key, getattr(result, key))
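# Hedged setup sketch (not part of the original file): most SQL scripts in this
# repo assume boilerplate like the following - an engine, an open connection,
# and reflected tables. The database URL here is a placeholder.
from sqlalchemy import create_engine, MetaData, Table

engine = create_engine('sqlite:///census.sqlite')
connection = engine.connect()
metadata = MetaData()
census = Table('census', metadata, autoload=True, autoload_with=engine)
state_fact = Table('state_fact', metadata, autoload=True, autoload_with=engine)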
-------------------------------------------------------------------------------- /SQL_CaseStudy_Basic.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 7 00:59:45 2017

@author: Shabaka
"""

import sqlalchemy
# Import create_engine, MetaData
from sqlalchemy import create_engine, MetaData

# Import Table, Column, String, and Integer
from sqlalchemy import (Table, Column, String, Integer)
from sqlalchemy import select
from sqlalchemy import insert
from sqlalchemy import case, cast, Float
from sqlalchemy import func, desc

# Define an engine to connect to chapter5.sqlite: engine
engine = create_engine('sqlite:///chapter5.sqlite')

# Open a connection on the engine: connection
connection = engine.connect()

# Initialize MetaData: metadata
metadata = MetaData()


# '''''''''' Create Table in Database '''''''''' #

# Build a census table: census
census = Table('census', metadata,
               Column('state', String(30)),
               Column('sex', String(1)),
               Column('age', Integer()),
               Column('pop2000', Integer()),
               Column('pop2008', Integer()))

# Create the table in the database
metadata.create_all(engine)

# '''''''''' Read the Data from a CSV - Leverage Python csv module '''''''' #

# csv_reader is assumed set up in the original exercise, e.g.:
# import csv
# csv_file = open('census.csv', newline='')   # filename is a placeholder
# csv_reader = csv.reader(csv_file)

# Create an empty list: values_list
values_list = []

# Iterate over the rows
for row in csv_reader:
    # Create a dictionary with the values
    data = {'state': row[0], 'sex': row[1], 'age': row[2], 'pop2000': row[3],
            'pop2008': row[4]}
    # Append the dictionary to the values list
    values_list.append(data)


# '''''''''' Load Data from a List into the Table '''''''''' #

# Build insert statement: stmt
stmt = insert(census)

# Use values_list to insert data: results
results = connection.execute(stmt, values_list)

# Print rowcount
print(results.rowcount)


# '''''''''' Determine Average Age by Population - Test 1 '''''''''' #

# Calculate weighted average age: stmt
stmt = select([census.columns.sex,
               (func.sum(census.columns.pop2008 * census.columns.age) /
                func.sum(census.columns.pop2008)).label('average_age')
               ])

# Group by sex
stmt = stmt.group_by(census.columns.sex)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the average age by sex
for result in results:
    print(result.sex, result.average_age)


# '''''''''' Query - Percentage of Pop. by Gender and State '''''''''' #

# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([census.columns.state,
               (func.sum(
                   case([
                       (census.columns.sex == 'F', census.columns.pop2000)
                   ], else_=0)) /
                cast(func.sum(census.columns.pop2000),
                     Float) * 100).label('percent_female')
               ])

# Group by state
stmt = stmt.group_by(census.columns.state)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the percentage
for result in results:
    print(result.state, result.percent_female)

# '''''''''' Query to determine pop. difference by state, 2008 vs 2000 '''''' #

# Build query to return state name and population difference from 2008 to 2000
stmt = select([census.columns.state,
               (census.columns.pop2008 -
                census.columns.pop2000).label('pop_change')
               ])

# Group by state
stmt = stmt.group_by(census.columns.state)

# Order by population change
stmt = stmt.order_by(desc('pop_change'))

# Limit to top 10
stmt = stmt.limit(10)

# Use connection to execute the statement and fetch all results
results = connection.execute(stmt).fetchall()

# Print the state and population change for each record
for result in results:
    print('{}-{}'.format(result.state, result.pop_change))
-------------------------------------------------------------------------------- /SQL_Check_Col_Population_Percentage.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 23 02:38:04 2017

@author: Shabaka
"""

# import case, cast and Float from sqlalchemy
from sqlalchemy import case, cast, Float
from sqlalchemy import select
from sqlalchemy.sql import func

# census and connection are assumed set up (a reflected census table and an
# open connection)

# Build an expression to calculate female population in 2000
female_pop2000 = func.sum(
    case([
        (census.columns.sex == 'F', census.columns.pop2000)
    ], else_=0))

# Cast an expression to calculate total population in 2000 to Float
total_pop2000 = cast(func.sum(census.columns.pop2000), Float)

# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([female_pop2000 / total_pop2000 * 100])

# Execute the query and store the scalar result: percent_female
percent_female = connection.execute(stmt).scalar()

# Print the percentage
print(percent_female)
-------------------------------------------------------------------------------- /SQL_Data_Count_Group-By.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 22:15:59 2017

@author: Shabaka
"""

# Import select and func
from sqlalchemy import select
from sqlalchemy.sql import func

# census and connection are assumed set up

# Build a query to select the state and count of ages by state: stmt
stmt = select([census.columns.state, func.count(census.columns.age)])

# Append group by state
stmt = stmt.group_by(census.columns.state)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()

# Print results
print(results)

# Print the keys/column names of the results returned
print(results[0].keys())
-------------------------------------------------------------------------------- /SQL_Data_Count_Keys_Values.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 22:15:59 2017

@author: Shabaka
"""

# Import select and func
from sqlalchemy import select
from sqlalchemy.sql import func

# census and connection are assumed set up

# Build a query to select the state and count of ages by state: stmt
stmt = select([census.columns.state, func.count(census.columns.age)])

# Append group by state
stmt = stmt.group_by(census.columns.state)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()

# Print results
print(results)

# Print the keys/column names of the results returned
print(results[0].keys())
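# Hedged aside (not in the original script): each row is a (state, count) pair,
# so the grouped results convert cleanly to a plain dictionary.
state_counts = dict(results)
print(state_counts)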
-------------------------------------------------------------------------------- /SQL_Delete_Table.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 6 23:59:59 2017

@author: Shabaka
"""
import sqlalchemy
# Import delete, select
from sqlalchemy import delete, select
from sqlalchemy import and_, func

# census, state_fact, metadata, engine and connection are assumed set up

# Build a statement to empty the census table: stmt
stmt = delete(census)

# Execute the statement: results
results = connection.execute(stmt)

# Print affected rowcount
print(results.rowcount)

# Build a statement to select all records from the census table
stmt = select([census])

# Print the results of executing the statement to verify
# there are no rows
print(connection.execute(stmt).fetchall())

# ##################### ################ ################ ########
# '''''''''' Deleting Specific Records '''''''''' #

# Build a statement to count records using
# the sex column for Men ('M') age 36: stmt
stmt = select([func.count(census.columns.sex)]).where(
    and_(census.columns.sex == 'M',
         census.columns.age == 36)
    )

# Execute the select statement and use the scalar() fetch
# method to save the record count
to_delete = connection.execute(stmt).scalar()

# Build a statement to delete records from the census table: stmt_del
stmt_del = delete(census)

# Append a where clause to target Men ('M') age 36
stmt_del = stmt_del.where(
    and_(census.columns.sex == 'M',
         census.columns.age == 36)
    )

# Execute the statement: results
results = connection.execute(stmt_del)

# Print affected rowcount and to_delete record count, make sure they match
print(results.rowcount, to_delete)


# '''''''''' Delete Table Completely '''''''''' #

# Drop the state_fact table
state_fact.drop(engine)

# Check to see if state_fact exists
print(state_fact.exists(engine))

# Drop all tables
metadata.drop_all(engine)

# Check to see if census exists
print(census.exists(engine))
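# Hedged aside (not part of the original script): destructive statements like
# these are often wrapped in a transaction so they commit or roll back as one
# unit; engine.begin() provides such a block in SQLAlchemy.
# with engine.begin() as conn:
#     conn.execute(delete(census).where(census.columns.age == 36))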
-------------------------------------------------------------------------------- /SQL_Det_Pop_Sum_by_Column.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 23:03:45 2017

@author: Shabaka
"""

# import pandas
import pandas as pd
# Import Pyplot as plt from matplotlib
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy import select

# Import func
from sqlalchemy.sql import func

from sqlalchemy import MetaData, Table
metadata = MetaData()

engine = create_engine('sqlite:///census_nyc.sqlite')
connection = engine.connect()

# Reflect census table from the engine: census
census = Table('census', metadata, autoload=True, autoload_with=engine)

# Build an expression to calculate the sum of pop2008 labeled as population
pop2008_sum = func.sum(census.columns.pop2008).label("population")

# Build a query to select the state and sum of pop2008 as population,
# grouped by state: stmt
stmt = select([census.columns.state, pop2008_sum])

# Append group by state
stmt = stmt.group_by(census.columns.state)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()

# Print results
print(results)

# Print the keys/column names of the results returned
print(results[0].keys())

# Create a DataFrame from the results: df
df = pd.DataFrame(results)

# Set column names
df.columns = results[0].keys()

# Print the DataFrame
print(df)

# Plot the DataFrame
df.plot.bar()
plt.show()
-------------------------------------------------------------------------------- /SQL_Join_Columns_Advanced.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 19:14:06 2017

@author: Shabaka
"""

from sqlalchemy import select
from sqlalchemy.sql import func

# census, state_fact and connection are assumed set up

# Build a statement to select the state, sum of 2008 population and census
# division name: stmt
stmt = select([census.columns.state,
               func.sum(census.columns.pop2008),
               state_fact.columns.census_division_name
               ])

# Append select_from to join the census and state_fact tables by the
# census state and state_fact name columns
stmt = stmt.select_from(
    census.join(state_fact, census.columns.state == state_fact.columns.name)
)

# Append a group by for the state_fact name column
stmt = stmt.group_by(state_fact.columns.name)

# Execute the statement and get the results: results
results = connection.execute(stmt).fetchall()

# Loop over the results object and print each record.
for record in results:
    print(record)
-------------------------------------------------------------------------------- /SQL_Join_Table_Columns.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 19:10:55 2017

@author: Shabaka
"""

from sqlalchemy import select

# census, state_fact and connection are assumed set up

# Build a statement to select the census and state_fact tables: stmt
stmt = select([census, state_fact])

# Add a select_from clause that wraps a join for the census and state_fact
# tables where the census state column and state_fact name column match
stmt = stmt.select_from(
    census.join(state_fact, census.columns.state == state_fact.columns.name))

# Execute the statement and get the first result: result
result = connection.execute(stmt).first()

# Loop over the keys in the result object and print the key and value
for key in result.keys():
    print(key, getattr(result, key))
-------------------------------------------------------------------------------- /SQL_Leverage_Heirach_Data_Group_By.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 20:45:56 2017

@author: Shabaka
"""

from sqlalchemy import select
from sqlalchemy.sql import func

# employees (a reflected table) and connection are assumed set up

# Make an alias of the employees table: managers
managers = employees.alias()

# Build a query to select managers and counts of their employees: stmt
stmt = select([managers.columns.name, func.count(employees.columns.id)])

# Append a where clause that ensures the manager id and employee mgr are equal
stmt = stmt.where(managers.columns.id == employees.columns.mgr)

# Group by managers' names
stmt = stmt.group_by(managers.columns.name)

# Execute statement: results
results = connection.execute(stmt).fetchall()

# Print each manager record
for record in results:
    print(record)
-------------------------------------------------------------------------------- /SQL_LoadCSV_csv-reader.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 03:07:47 2017

@author: Shabaka
"""

from sqlalchemy import insert

# census, connection and csv_reader are assumed set up

# Create an insert statement for census: stmt
stmt = insert(census)

# Create an empty list and zeroed row count: values_list, total_rowcount
values_list = []
total_rowcount = 0

# Enumerate the rows of csv_reader
for idx, row in enumerate(csv_reader):
    # Create data and append to values_list
    data = {'state': row[0], 'sex': row[1], 'age': row[2], 'pop2000': row[3],
            'pop2008': row[4]}
    values_list.append(data)

    # Write to the table in batches of 51 rows
    if (idx + 1) % 51 == 0:
        results = connection.execute(stmt, values_list)
        total_rowcount += results.rowcount
        values_list = []

# Flush any remaining rows after the loop
if values_list:
    results = connection.execute(stmt, values_list)
    total_rowcount += results.rowcount

print(total_rowcount)
-------------------------------------------------------------------------------- /SQL_Order_Desc_by_Column.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 17:50:29 2017

@author: Shabaka
"""

# Import desc
from sqlalchemy import desc
from sqlalchemy import select
from sqlalchemy import Table, MetaData

# census and connection are assumed set up

# Build a query to select the state column: stmt
stmt = select([census.columns.state])

# Append order_by descending state: rev_stmt
rev_stmt = stmt.order_by(desc(census.columns.state))

# Execute the query and store the results: rev_results
rev_results = connection.execute(rev_stmt).fetchall()

# Print the first 10 rev_results
print(rev_results[:10])
-------------------------------------------------------------------------------- /SQL_Order_by_Data by Column.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 17:34:53 2017

@author: Shabaka
"""

from sqlalchemy import select

# census and connection are assumed set up

# Build a query to select the state column: stmt
stmt = select([census.columns.state])

# Append an order_by state
stmt = stmt.order_by(census.columns.state)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the first 10 results
print(results[:10])
-------------------------------------------------------------------------------- /SQL_Plot_Results_DataFrame.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 23:28:13 2017

@author: Shabaka
"""

# Import pandas and Pyplot
import pandas as pd
import matplotlib.pyplot as plt

# results is assumed preloaded (fetched rows, as in
# SQL_Det_Pop_Sum_by_Column.py)

# Create a DataFrame from the results: df
df = pd.DataFrame(results)

# Set column names
df.columns = results[0].keys()

# Print the DataFrame
print(df)

# Plot the DataFrame
df.plot.bar()
plt.show()
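# Hedged aside (not in the original script): naming the axes avoids plotting
# against the default integer index; the column names here assume the query
# from SQL_Det_Pop_Sum_by_Column.py.
# df.plot.bar(x='state', y='population')
# plt.show()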
-------------------------------------------------------------------------------- /SQL_Same_Table_Joined_Query.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 20:31:26 2017

@author: Shabaka
"""

from sqlalchemy import select
from sqlalchemy.sql import func

# employees (a reflected table) and connection are assumed set up

# Make an alias of the employees table: managers
managers = employees.alias()

# Build a query to select managers' and their employees' names: stmt
stmt = select(
    [managers.columns.name.label('manager'),
     employees.columns.name.label('employee')]
)

# Append where to match manager ids with employees' managers: stmt
stmt = stmt.where(managers.columns.id == employees.columns.mgr)

# Append order by managers' names: stmt
stmt = stmt.order_by(managers.columns.name)

# Execute statement: results
results = connection.execute(stmt).fetchall()

# Print records
for record in results:
    print(record)

"""
Functions and group-bys using hierarchical data (tables)
"""

# Fresh code starts here

# Make an alias of the employees table: managers
managers = employees.alias()

# Build a query to select managers and counts of their employees: stmt
stmt = select([managers.columns.name, func.count(employees.columns.id)])

# Append a where clause that ensures the manager id and employee mgr are equal
stmt = stmt.where(managers.columns.id == employees.columns.mgr)

# Group by managers' names
stmt = stmt.group_by(managers.columns.name)

# Execute statement: results
results = connection.execute(stmt).fetchall()

# Print each manager record
for record in results:
    print(record)
-------------------------------------------------------------------------------- /Simple_Data_Filter_Select_Where.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 17 01:38:48 2017

@author: Shabaka
"""

from sqlalchemy import create_engine, select

# census and connection are assumed set up

# Create a select query: stmt
stmt = select([census])

# Add a where clause to filter the results to only those for New York
stmt = stmt.where(census.columns.state == 'New York')

# Execute the query to retrieve all the data returned: results
results = connection.execute(stmt).fetchall()

# Loop over the results and print the age, sex, and pop2008
for result in results:
    print(result.age, result.sex, result.pop2008)


# Create a query for the census table: stmt
stmt = select([census])

# Append a where clause to match all the states in_ the list states
# (states is assumed preloaded, e.g. states = ['New York', 'California'])
stmt = stmt.where(census.columns.state.in_(states))

# Loop over the ResultProxy and print the state and its population in 2000
for result in connection.execute(stmt):
    print(result.state, result.pop2000)
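# Hedged aside (not in the original script): where clauses compose, so and_()
# (or chained .where() calls) ANDs several conditions together.
# from sqlalchemy import and_
# stmt = select([census]).where(
#     and_(census.columns.state == 'New York', census.columns.sex == 'F'))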
-------------------------------------------------------------------------------- /TwitterAPI_Authentication_SampleM.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:29:31 2017

@author: Shabaka
"""

# Import package
import tweepy

# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
-------------------------------------------------------------------------------- /Twitter_Data to DataFrame.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:54:50 2017

@author: Shabaka
"""

# Import package
import pandas as pd

# tweets_data is assumed preloaded (see Load_Explore_Twitter_Data.py)

# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text', 'lang'])

# Print head of DataFrame
print(df.head())
-------------------------------------------------------------------------------- /Twitter_Text_dataAnalysis.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:56:45 2017

@author: Shabaka
"""

import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


def word_in_text(word, tweet):
    """Return True if word occurs in tweet (case-insensitive)."""
    word = word.lower()
    text = tweet.lower()
    match = re.search(word, text)

    if match:
        return True
    return False


# tweets_data is assumed preloaded (see Load_Explore_Twitter_Data.py)

# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text', 'lang'])

# Print head of DataFrame
print(df.head())

"""
Iterate over the rows of the DataFrame and calculate how many tweets contain
each of our keywords! The count for each candidate has been
initialized to 0.
"""
# Initialize counts of tweets mentioning each candidate
[clinton, trump, sanders, cruz] = [0, 0, 0, 0]

# Iterate through df, counting the number of tweets in which
# each candidate is mentioned
for index, row in df.iterrows():
    clinton += word_in_text('clinton', row['text'])
    trump += word_in_text('trump', row['text'])
    sanders += word_in_text('sanders', row['text'])
    cruz += word_in_text('cruz', row['text'])


# Construct a barplot of the data using sns.barplot, passing it two
# arguments: (i) a list of labels and (ii) a list containing the variables
# you wish to plot (clinton, trump and so on)

# Set seaborn style
sns.set(color_codes=True)

# Create a list of labels: cd
cd = ['clinton', 'trump', 'sanders', 'cruz']

# Plot histogram
ax = sns.barplot(x=cd, y=[clinton, trump, sanders, cruz])
ax.set(ylabel="count")
plt.show()
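# Hedged refinement (not in the original script): re.search matches substrings,
# so 'trump' would also match 'trumpet'; word boundaries avoid that.
def word_in_text_strict(word, text):
    """Whole-word, case-insensitive match."""
    return bool(re.search(r'\b' + re.escape(word) + r'\b', text,
                          re.IGNORECASE))


print(word_in_text_strict('trump', 'A trumpet is not a candidate'))  # False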
-------------------------------------------------------------------------------- /Vis_Regressions_FixData.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 2 19:34:27 2017

@author: Shabaka
"""

# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# '''''''''' File import to pandas DataFrame '''''''''' #

fixdat = pd.read_csv('C:\\Users\\Shabaka\\ShabakaCodes\\fixations.csv',
                     index_col=0, parse_dates=True)

fix_chunk = pd.read_csv('C:\\Users\\Shabaka\\ShabakaCodes\\fixations.csv',
                        chunksize=50)

# Test any number of variables against each other
# Plot a linear regression between 'duration' and 'confidence'
sns.lmplot(x='duration', y='confidence', data=fixdat)

# Display the plot
plt.show()

# '''''''''' Plotting residuals of a regression '''''''''' #

# Generate a green residual plot of the regression between 'duration'
# and 'confidence'
sns.residplot(x='duration', y='confidence', data=fixdat, color='green')

# Display the plot
plt.show()

# '''''''''' Higher Order Regressions '''''''''' #

# Generate a scatter plot of 'duration' and 'confidence' using red circles
plt.scatter(fixdat['duration'], fixdat['confidence'],
            label='data', color='red', marker='o')

# Plot in blue a linear regression of order 1 between 'duration'
# and 'confidence'
sns.regplot(x='duration', y='confidence', data=fixdat,
            color='blue', label='order 1', scatter=None)

# Plot in green a linear regression of order 2 between 'duration'
# and 'confidence'
sns.regplot(x='duration', y='confidence', data=fixdat,
            color='green', label='order 2', scatter=None, order=2)

# Add a legend and display the plot
plt.legend(loc='lower right')
plt.show()


# '''''''''' Linear Regressions by Hue '''''''''' #

# Plot a linear regression between 'duration' and 'confidence', with a hue
# of 'avg_pupil_size' and palette of 'Set1'
sns.lmplot(x='duration', y='confidence', data=fixdat,
           palette='Set1')  # hue='avg_pupil_size'

# Display the plot
plt.show()

# '''''''''' Strip Plot Construction '''''''''' #

# Make a strip plot of 'duration' grouped by 'confidence'
plt.subplot(2, 1, 1)
sns.stripplot(x='duration', y='confidence', data=fixdat)

# Make the strip plot again using jitter and a smaller point size
plt.subplot(2, 1, 2)
sns.stripplot(x='duration', y='confidence', data=fixdat, jitter=True, size=3)

# Display the plot
plt.show()

# '''''''''' Generating Swarmplots '''''''''' #

# Generate a swarm plot of 'duration' grouped horizontally by 'avg_pupil_size'
plt.subplot(2, 1, 1)
sns.swarmplot(x='avg_pupil_size', y='duration', data=fixdat)

# Generate a swarm plot of 'duration' grouped vertically by 'avg_pupil_size'
# with a hue of 'confidence'
plt.subplot(2, 1, 2)
sns.swarmplot(x='avg_pupil_size', y='duration', data=fixdat,
              orient='v')  # hue='confidence')

# Display the plot
plt.show()

# '''''''''' Constructing Violin Plots '''''''''' #

# Generate a violin plot of 'avg_pupil_size' grouped horizontally
# by 'confidence'
plt.subplot(2, 1, 1)
sns.violinplot(x='confidence', y='avg_pupil_size', data=fixdat)

# Generate the same violin plot with color='lightgray' and without inner
# annotations
plt.subplot(2, 1, 2)
sns.violinplot(x='confidence', y='avg_pupil_size',
               data=fixdat, inner=None, color='lightgray')

# Overlay a strip plot on the violin plot
sns.stripplot(x='confidence', y='avg_pupil_size',
              data=fixdat, size=1.5, jitter=True)

# Display the plot
plt.show()

# '''''''''' Plotting Joint Distributions - 1 '''''''''' #

# Generate a joint plot of 'duration' and 'confidence'
_ = sns.jointplot(x='duration', y='confidence', data=fixdat)

# Display the plot
plt.show()

# Generate a joint plot of 'duration' and 'avg_pupil_size'
_ = sns.jointplot(x='duration', y='avg_pupil_size', data=fixdat)

# Display the plot
plt.show()

# '''''''''' Plotting Joint Distributions - 2 '''''''''' #

# Hex bin plot - jointplot's kind can be scatter/reg/resid/kde/hex (hex below)

# Generate a joint plot of 'duration' and 'confidence' using a hexbin plot
_ = sns.jointplot(x='duration', y='confidence', data=fixdat, kind='hex')

# Display the plot
plt.show()

# '''''''''' Plot the Distributions Pairwise '''''''''' #

# Print the first 5 rows of the DataFrame
print(fixdat.head())

# Plot the pairwise joint distributions from the DataFrame
sns.pairplot(fixdat)

# Display the plot
plt.show()

# '''''''''' Pairwise distributions grouped by hue + regression lines ''''' #

# Print the first 5 rows of the DataFrame
print(fixdat.head())

# Plot the pairwise joint distributions grouped by 'dispersion' along with
# regression lines
sns.pairplot(fixdat, kind='reg', hue='dispersion')

# Display the plot
plt.show()

# '''''''''' Correlation Viz with a Heat Map (Covariance Matrix) '''''''''' #

# Compute the covariance matrix (assumed preloaded in the original exercise)
cov_matrix = fixdat.cov()

# Print the covariance matrix
print(cov_matrix)

# Visualize the covariance matrix using a heatmap
sns.heatmap(cov_matrix)

# Display the heatmap
plt.show()

_ = plt.plot(fixdat)
_ = plt.legend(loc='upper right')
plt.show()
-------------------------------------------------------------------------------- /csv_DataFrame_NumpyArray.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 10 15:03:52 2017

@author: Shabaka
"""

# Import numpy and pandas
import numpy as np
import pandas as pd

# Assign the filename: file
file = 'titanic.csv'

# Read the file into a DataFrame: df
df = pd.read_csv(file)

# View the head of the DataFrame
print(df.head())

# Assign the filename: file
file = 'digits.csv'

# Read the first 5 rows of the file into a DataFrame: data
data = pd.read_csv(file, nrows=5, header=None)

# Build a numpy array from the DataFrame: data_array
data_array = np.array(data.values)

# Print the datatype of data_array to the shell
print(type(data_array))
-------------------------------------------------------------------------------- /draw_bootstrap_reps.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 12 22:07:58 2017

@author: Shabaka
"""

import numpy as np


def bootstrap_replicate_1d(data, func):
    """Generate bootstrap replicate of 1D data."""
    bs_sample = np.random.choice(data, len(data))

    return func(bs_sample)

# One-line alternative:
# def bootstrap_replicate_1d(data, func):
#     return func(np.random.choice(data, size=len(data)))


def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates."""

    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)

    return bs_replicates
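# Hedged usage sketch (not part of the original file): bootstrap a 95%
# confidence interval for the mean of synthetic data.
if __name__ == '__main__':
    samples = np.random.normal(loc=10, scale=2, size=200)
    reps = draw_bs_reps(samples, np.mean, size=10000)
    print(np.percentile(reps, [2.5, 97.5]))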
-------------------------------------------------------------------------------- /draw_bs_pairs.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 26 12:44:52 2017

@author: Shabaka
"""

import numpy as np


def draw_bs_pairs(x, y, func, size=1):
    """Perform pairs bootstrap, applying func to resampled (x, y) pairs."""

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_replicates[i] = func(bs_x, bs_y)

    return bs_replicates
--------------------------------------------------------------------------------