├── .gitattributes ├── Basic_OMDBAPI_Title_Request.py ├── Bernoulli_Trial.py ├── Binomial_Dist_plot.py ├── Binomial_Dist_plot_Update_0317.py ├── Bootstrap Variance.py ├── Bootstrap_Data_Analysis.py ├── Bootstrap_test_One Sample.py ├── CSV Import_Panda_Header.py ├── CSV_Concatenate_All.py ├── CSV_Import-All2.py ├── CSV_Reader_Test2.py ├── Calc_Diff_Btw_Columns.py ├── Connect to PostgresQL Database.py ├── Create_DataBase_Engine.py ├── Customer Segmentation_Code_Spyder.py ├── CustomizeSQL_Query_Col_Row_Size.py ├── DB_Tables_Update.py ├── DatStream.py ├── DatVisPython.py ├── DatVis_Bokeh_1.py ├── DatVis_Bokeh_CaseStudy_App_Build_5.py ├── DatVis_Bokeh_High_Level_Charts_3.py ├── DatVis_Bokeh_Intr_App_Build_4.py ├── DatVis_Bokeh_Layout-Int-Annot_2.py ├── DatVis_Images.py ├── Dat_Clean_Analysis.py ├── Dat_Read_Plot.py ├── DataChunkFunc.py ├── DataClean_GS_Analysis5.py ├── DataCombine_Analysis3.py ├── DataFrame_Lambda_Filter_Read.py ├── DataTidy_Analysis2.py ├── DataTypes_Analysis4.py ├── DataXplore_Analysis1.py ├── Data_Corr_Func.py ├── Datchunk_PopPlot.py ├── Deep_Learning_Basics_1.py ├── Deep_Learning_KerasModel_Build_3.py ├── Deep_Learning_KerasModel_Optimise_4.py ├── Deep_Learning_Ntwrk_Optim_2.py ├── Distribution_Check_Theor_ECDF_Data_CDF.py ├── EDA_Analysis_Comarison.py ├── EDA_Hypothesis_Test.py ├── Entry_Count_Check_Exception.py ├── EthicalHackingCourseNotes.py ├── ExtractHist_Image.py ├── Extract_Data_from_HDF5.py ├── File_Import_Multi_DataType.py ├── FilterData_Selected_from_Table_SQLAlchemy.py ├── FilterSQL_Database_Table_Col_Row.py ├── FilterSQL_Database_Table_WHERE.py ├── General Multi_Column DataFrame Analysis.py ├── General Twitter Language Analysis.py ├── Generate from MultiType Data.py ├── HTML_with_BeautifulSoup_GetHypLinktData.py ├── HTML_with_BeautifulSoup_GetTextData.py ├── HTTP_Request_Urllib_Response.py ├── HTTP_Request_Urllib_Response_Read.py ├── HTTP_Request_using_Requests.py ├── Hack_Bern_nprandom.py ├── Hack_Stats_BasicRandGen.py ├── Import_Excel_Pandas.py ├── Import_Excel_Parse.py ├── Import_FlatFile_Web.py ├── Import_HDF5.py ├── Import_MatLab_WorkSpace.py ├── Import_Pickled-Data.py ├── Import_Plot_Web_Flatfile_NonLocal_Save.py ├── Import_SAS7BDAT_.py ├── Import_Stata_File.py ├── Inserting_Multiple_Rows.py ├── Iteration.py ├── LICENSE ├── Lambda_List_Filter.py ├── LinReg_BS_Pairs_func.py ├── Linear_Regression_Anscombe.py ├── ListComp_Gen.py ├── ListComp_timestamped.py ├── List_Dictionary_Full.py ├── Load_Explore_Twitter_Data.py ├── Local_JSon_Load_Explore.py ├── Multidata_tweeter_count_function.py ├── Nested_List_Comp.py ├── NewsArticleClass.py ├── NewsAutosummarize.py ├── Non_Flat_File_Import_Web-Excel.py ├── Numpy_Import_LoadTxt.py ├── Numpy_LoadData_and_Plot.py ├── README.md ├── Random_NLTK.py ├── SQL_Arbitrary_Insert_Row.py ├── SQL_Arbitrary_Table_Create.py ├── SQL_Automatic_Join_Est_Rel.py ├── SQL_CaseStudy_Basic.py ├── SQL_Check_Col_Population_Percentage.py ├── SQL_Data_Count_Group-By.py ├── SQL_Data_Count_Keys_Values.py ├── SQL_Delete_Table.py ├── SQL_Det_Pop_Sum_by_Column.py ├── SQL_Join_Columns_Advanced.py ├── SQL_Join_Table_Columns.py ├── SQL_Leverage_Heirach_Data_Group_By.py ├── SQL_LoadCSV_csv-reader.py ├── SQL_Order_Desc_by_Column.py ├── SQL_Order_by_Data by Column.py ├── SQL_Plot_Results_DataFrame.py ├── SQL_Same_Table_Joined_Query.py ├── Simple_Data_Filter_Select_Where.py ├── TwitterAPI_Authentication_SampleM.py ├── Twitter_Data to DataFrame.py ├── Twitter_Text_dataAnalysis.py ├── Vis_Regressions_FixData.py ├── csv_DataFrame_NumpyArray.py ├── 
draw_bootstrap_reps.py ├── draw_bs_pairs.py └── fixations.csv /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /Basic_OMDBAPI_Title_Request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 16 01:37:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import requests package 9 | import requests 10 | 11 | # Assign URL to variable: url 12 | url = 'http://www.omdbapi.com/?t=this+is+spinal+tap' 13 | 14 | # Package the request, send the request and catch the response: r 15 | r = requests.get(url) 16 | 17 | # Print the text of the response 18 | print(r.text) -------------------------------------------------------------------------------- /Bernoulli_Trial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 22:50:22 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | 10 | 11 | def perform_bernoulli_trials(n, p): 12 | """Perform n Bernoulli trials with success probability p 13 | and return number of successes.""" 14 | # Initialize number of successes: n_success 15 | n_success = 0 16 | 17 | # Perform trials 18 | for i in range(n): 19 | # Choose random number between zero and one: random_number 20 | random_number = np.random.random() 21 | 22 | # If less than p, it's a success so add one to n_success 23 | if random_number < p: 24 | n_success += 1 25 | 26 | return n_success -------------------------------------------------------------------------------- /Binomial_Dist_plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 23:26:03 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | from Bernoulli_Trial import perform_bernoulli_trials 13 | from ecdf_func import ecdf 14 | 15 | # Seed random number generator 16 | np.random.seed(42) 17 | 18 | # Take 10,000 samples out of the binomial distribution: n_defaults 19 | n_defaults = np.random.binomial(100, 0.05, size=10000) 20 | 21 | # Compute CDF: x, y 22 | x, y = ecdf(n_defaults) 23 | 24 | # Plot the CDF with axis labels 25 | _ = plt.plot(x, y, marker='.', linestyle='none') 26 | plt.margins(0.002) 27 | plt.xlabel('Defaults out of 100 loans') 28 | plt.ylabel('ECDF') 29 | 30 | # Show the plot 31 | plt.show() 32 | 33 | # Seed random number generator 34 | np.random.seed(42) 35 | 36 | # Initialize the number of defaults: n_defaults 37 | n_defaults = np.empty(1000) 38 | 39 | # Compute the number of defaults 40 | for i in range(1000): 41 | n_defaults[i] = perform_bernoulli_trials(100, 0.05) 42 | 43 | 44 | # Plot the histogram with default number of bins; label your axes 45 | _ = plt.hist(n_defaults, density=True)  # density= replaces the removed normed= argument 46 | _ = plt.xlabel('number of defaults out of 100 loans') 47 | _ = plt.ylabel('probability') 48 | 49 | # Show the plot 50 | plt.show() 51 | 52 | # Compute bin edges centred on the integers: bins 53 | bins = np.arange(0, max(n_defaults) + 1.5) - 0.5 54 | 55 | # Generate histogram 56 | _ = plt.hist(n_defaults, density=True, bins=bins) 57 | 58 | # Set margins 59 | plt.margins(0.02) 60 | 61 | # Label axes 62 | _ = plt.xlabel('number of defaults out of 100 loans') 63 | _ = plt.ylabel('Binomial PMF') 64 | # Show the plot 65 | plt.show()
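# NOTE: `ecdf` is imported above from `ecdf_func`, but no ecdf_func.py appears
# in the file tree. A minimal sketch of the conventional helper these scripts
# assume (x = sorted data, y = cumulative fraction of points <= x):
def ecdf(data):
    """Compute x, y values for the empirical CDF of a 1-D array."""
    x = np.sort(data)
    y = np.arange(1, len(data) + 1) / len(data)
    return x, y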
66 | # #################################################################### # 67 | 68 | # Draw 10,000 samples out of Poisson distribution: samples_poisson 69 | samples_poisson = np.random.poisson(10, size=10000) 70 | 71 | # Print the mean and standard deviation 72 | print('Poisson: ', np.mean(samples_poisson), 73 | np.std(samples_poisson)) 74 | 75 | # Specify values of n and p to consider for Binomial: n, p 76 | n = [20, 100, 1000] 77 | p = [0.5, 0.1, 0.01] 78 | 79 | 80 | # Draw 10,000 samples for each n,p pair: samples_binomial 81 | for i in range(3): 82 | samples_binomial = np.random.binomial(n[i], p[i], size=10000) 83 | 84 | # Print results 85 | print('n =', n[i], 'Binom:', np.mean(samples_binomial), 86 | np.std(samples_binomial)) 87 | -------------------------------------------------------------------------------- /Binomial_Dist_plot_Update_0317.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 23:26:03 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | from Bernoulli_Trial import perform_bernoulli_trials 13 | from ecdf_func import ecdf 14 | 15 | # Seed random number generator 16 | np.random.seed(42) 17 | 18 | # Take 10,000 samples out of the binomial distribution: n_defaults 19 | n_defaults = np.random.binomial(100, 0.05, size=10000) 20 | 21 | # Compute CDF: x, y 22 | x, y = ecdf(n_defaults) 23 | 24 | # Plot the CDF with axis labels 25 | _ = plt.plot(x, y, marker='.', linestyle='none') 26 | plt.margins(0.002) 27 | plt.xlabel('Defaults out of 100 loans') 28 | plt.ylabel('ECDF') 29 | 30 | # Show the plot 31 | plt.show() 32 | 33 | # ################################################################## # 34 | 35 | # Seed random number generator 36 | np.random.seed(42) 37 | 38 | # Initialize the number of defaults: n_defaults 39 | n_defaults = np.empty(1000) 40 | 41 | # Compute the number of defaults 42 | for i in range(1000): 43 | n_defaults[i] = perform_bernoulli_trials(100, 0.05) 44 | 45 | 46 | # Plot the histogram with default number of bins; label your axes 47 | _ = plt.hist(n_defaults, density=True) 48 | _ = plt.xlabel('number of defaults out of 100 loans') 49 | _ = plt.ylabel('probability') 50 | 51 | # Show the plot 52 | plt.show() 53 | 54 | # Compute bin edges centred on the integers: bins 55 | bins = np.arange(0, max(n_defaults) + 1.5) - 0.5 56 | 57 | # Generate histogram 58 | _ = plt.hist(n_defaults, density=True, bins=bins) 59 | 60 | # Set margins 61 | plt.margins(0.02) 62 | 63 | # Label axes 64 | _ = plt.xlabel('number of defaults out of 100 loans') 65 | _ = plt.ylabel('Binomial PMF') 66 | plt.show() 67 | 68 | # #################################################################### # 69 | 70 | # Draw 10,000 samples out of Poisson distribution: samples_poisson 71 | samples_poisson = np.random.poisson(10, size=10000) 72 | 73 | # Print the mean and standard deviation 74 | print('Poisson: ', np.mean(samples_poisson), 75 | np.std(samples_poisson)) 76 | 77 | # Specify values of n and p to consider for Binomial: n, p 78 | n = [20, 100, 1000] 79 | p = [0.5, 0.1, 0.01] 80 | 81 | 82 | # Draw 10,000 samples for each n,p pair: samples_binomial 83 | for i in range(3): 84 | samples_binomial = np.random.binomial(n[i], p[i], size=10000) 85 | 86 | # Print results 87 | print('n =', n[i], 'Binom:', np.mean(samples_binomial), 88 | np.std(samples_binomial)) 89 |
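# NOTE: the comparison above works because a Binomial(n, p) with large n and
# small p approaches a Poisson with mean n*p. An illustrative check prints the
# theoretical moments (mean n*p, std sqrt(n*p*(1-p))) next to the sampled ones:
for n_i, p_i in zip(n, p):
    print('n =', n_i, 'theory:', n_i * p_i, np.sqrt(n_i * p_i * (1 - p_i)))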
90 | # ##################################################################### # 91 | 92 | # Plotting the Normal PDFs 93 | 94 | # Draw 100000 samples from Normal distribution with stds of interest: 95 | # samples_std1, samples_std3, samples_std10 96 | samples_std1 = np.random.normal(20, 1, size=100000) 97 | samples_std3 = np.random.normal(20, 3, size=100000) 98 | samples_std10 = np.random.normal(20, 10, size=100000) 99 | 100 | # Make histograms 101 | _ = plt.hist(samples_std1, density=True, histtype='step', bins=100) 102 | _ = plt.hist(samples_std3, density=True, histtype='step', bins=100) 103 | _ = plt.hist(samples_std10, density=True, histtype='step', bins=100) 104 | 105 | # Make a legend, set limits and show plot 106 | _ = plt.legend(('std = 1', 'std = 3', 'std = 10')) 107 | plt.ylim(-0.01, 0.42) 108 | plt.show() 109 | 110 | # ######################################## # 111 | 112 | # Plotting the Normal CDF/ECDF 113 | 114 | # Generate CDFs 115 | x_std1, y_std1 = ecdf(samples_std1) 116 | x_std3, y_std3 = ecdf(samples_std3) 117 | x_std10, y_std10 = ecdf(samples_std10) 118 | 119 | # Plot CDFs 120 | _ = plt.plot(x_std1, y_std1, marker='.', linestyle='none') 121 | _ = plt.plot(x_std3, y_std3, marker='.', linestyle='none') 122 | _ = plt.plot(x_std10, y_std10, marker='.', linestyle='none') 123 | 124 | # Make 2% margin 125 | plt.margins(0.02) 126 | 127 | # Make a legend and show the plot 128 | _ = plt.legend(('std = 1', 'std = 3', 'std = 10'), loc='lower right') 129 | plt.show() 130 | 131 | -------------------------------------------------------------------------------- /Bootstrap Variance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 13 21:36:23 2017 4 | 5 | @author: Shabaka 6 | """ 7 |
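# NOTE: this script uses np, plt and draw_bs_reps() without importing or
# defining them; draw_bootstrap_reps.py in this repo likely provides the
# helper. A minimal sketch of the usual definition, plus the imports this
# file needs (`rainfall` and `nohitter_times` are assumed to be 1-D NumPy
# arrays from the lesson data):
import numpy as np
import matplotlib.pyplot as plt

def draw_bs_reps(data, func, size=1):
    """Draw `size` bootstrap replicates of `func` applied to 1-D `data`."""
    return np.array([func(np.random.choice(data, size=len(data)))
                     for _ in range(size)])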
8 | # Generate 10,000 bootstrap replicates of the variance: bs_replicates 9 | bs_replicates = draw_bs_reps(rainfall, np.var, size=10000) 10 | 11 | # Put the variance in units of square centimeters 12 | bs_replicates /= 100 13 | 14 | # Make a histogram of the results 15 | _ = plt.hist(bs_replicates, bins=50, density=True) 16 | _ = plt.xlabel('variance of annual rainfall (sq. cm)') 17 | _ = plt.ylabel('PDF') 18 | 19 | # Show the plot 20 | plt.show() 21 | 22 | 23 | # Draw bootstrap replicates of the mean no-hitter time (equal to tau): 24 | # bs_replicates 25 | bs_replicates = draw_bs_reps(nohitter_times, np.mean, size=10000) 26 | 27 | # Compute the 95% confidence interval: conf_int 28 | conf_int = np.percentile(bs_replicates, [2.5, 97.5]) 29 | 30 | # Print the confidence interval 31 | print('95% confidence interval =', conf_int, 'games') 32 | 33 | # Plot the histogram of the replicates 34 | _ = plt.hist(bs_replicates, bins=50, density=True) 35 | _ = plt.xlabel(r'$\tau$ (games)') 36 | _ = plt.ylabel('PDF') 37 | 38 | # Show the plot 39 | plt.show() 40 | 41 | 42 | def draw_bs_pairs_linreg(x, y, size=1): 43 | """Perform pairs bootstrap for linear regression.""" 44 | 45 | # Set up array of indices to sample from: inds 46 | inds = np.arange(len(x)) 47 | 48 | # Initialize replicates: bs_slope_reps, bs_intercept_reps 49 | bs_slope_reps = np.empty(size) 50 | bs_intercept_reps = np.empty(size) 51 | 52 | # Generate replicates 53 | for i in range(size): 54 | bs_inds = np.random.choice(inds, size=len(inds)) 55 | bs_x, bs_y = x[bs_inds], y[bs_inds] 56 | bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1) 57 | 58 | return bs_slope_reps, bs_intercept_reps 59 | -------------------------------------------------------------------------------- /Bootstrap_Data_Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 12 21:16:16 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Assumes np, plt, ecdf, draw_bs_reps and the lesson arrays are in scope. 9 | for _ in range(50): 10 | # Generate bootstrap sample: bs_sample 11 | bs_sample = np.random.choice(rainfall, size=len(rainfall)) 12 | 13 | # Compute and plot ECDF from bootstrap sample 14 | x, y = ecdf(bs_sample) 15 | _ = plt.plot(x, y, marker='.', linestyle='none', 16 | color='gray', alpha=0.1) 17 | 18 | # Compute and plot ECDF from original data 19 | x, y = ecdf(rainfall) 20 | _ = plt.plot(x, y, marker='.') 21 | 22 | # Make margins and label axes 23 | plt.margins(0.02) 24 | _ = plt.xlabel('yearly rainfall (mm)') 25 | _ = plt.ylabel('ECDF') 26 | 27 | # Show the plot 28 | plt.show() 29 | 30 | 31 | # # COMPUTE MEAN & SEM OF BOOTSTRAP REPLICATES #### # 32 | 33 | # Take 10,000 bootstrap replicates of the mean: bs_replicates 34 | bs_replicates = draw_bs_reps(rainfall, np.mean, 10000) 35 | 36 | # Compute and print SEM 37 | print(np.std(rainfall) / np.sqrt(len(rainfall))) 38 | 39 | # Compute and print standard deviation of bootstrap replicates 40 | print(np.std(bs_replicates)) 41 | 42 | # Make a histogram of the results 43 | _ = plt.hist(bs_replicates, bins=50, density=True) 44 | _ = plt.xlabel('mean annual rainfall (mm)') 45 | _ = plt.ylabel('PDF') 46 | 47 | # Show the plot 48 | plt.show() 49 | 50 | 51 | # ######### PLOTTING BOOTSTRAP REGRESSIONS ###### # 52 | 53 | # Generate array of x-values for bootstrap lines: x 54 | x = np.array([0, 100]) 55 | 56 | # Plot the bootstrap lines 57 | for i in range(100): 58 | _ = plt.plot(x, bs_slope_reps[i] * x + bs_intercept_reps[i], 59 | linewidth=0.5, alpha=0.2, color='red') 60 | 61 | # Plot the data 62 | _ = plt.plot(illiteracy, fertility, marker='.', linestyle='none') 63 | 64 | # Label axes, set the margins, and show the plot 65 | _ = plt.xlabel('illiteracy') 66 | _ = plt.ylabel('fertility') 67 | plt.margins(0.02) 68 | plt.show() --------------------------------------------------------------------------------
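Note: Bootstrap_Data_Analysis.py above plots bs_slope_reps and bs_intercept_reps without showing where they come from; they are the output of draw_bs_pairs_linreg() defined in Bootstrap Variance.py. An illustrative usage sketch, assuming illiteracy and fertility are the lesson's 1-D arrays:

bs_slope_reps, bs_intercept_reps = draw_bs_pairs_linreg(illiteracy, fertility, size=1000)
print('95% CI for the regression slope:', np.percentile(bs_slope_reps, [2.5, 97.5]))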
/Bootstrap_test_One Sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 20 19:59:42 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | # force_a, force_b, forces_concat, empirical_diff_means and draw_bs_reps come from the lesson context. 10 | # Make an array of translated impact forces: translated_force_b 11 | translated_force_b = force_b - np.mean(force_b) + 0.55 12 | 13 | # bootstrap replicates of Frog B's translated impact forces: bs_replicates 14 | bs_replicates = draw_bs_reps(translated_force_b, np.mean, 10000) 15 | 16 | # Compute the fraction of replicates that are <= the observed Frog B mean force: p 17 | p = np.sum(bs_replicates <= np.mean(force_b)) / 10000 18 | print('p-value =', p) 19 | # ##### two sample bootstrap hypothesis test for diff of means ##### # 20 | # Compute mean of all forces: mean_force 21 | mean_force = np.mean(forces_concat) 22 | 23 | # Generate shifted arrays 24 | force_a_shifted = force_a - np.mean(force_a) + mean_force 25 | force_b_shifted = force_b - np.mean(force_b) + mean_force 26 | 27 | # Compute 10,000 bootstrap replicates from shifted arrays 28 | bs_replicates_a = draw_bs_reps(force_a_shifted, np.mean, 10000) 29 | bs_replicates_b = draw_bs_reps(force_b_shifted, np.mean, 10000) 30 | 31 | # Get replicates of difference of means: bs_replicates 32 | bs_replicates = bs_replicates_a - bs_replicates_b 33 | 34 | # Compute and print p-value: p 35 | p = np.sum(bs_replicates >= empirical_diff_means) / 10000 36 | print('p-value =', p) 37 | --------------------------------------------------------------------------------
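Note: when the null hypothesis is that the two samples come from identical distributions (not just equal means), the usual companion to the bootstrap test above is a permutation test; a minimal sketch reusing the same assumed arrays:

def permutation_diff_means(a, b):
    """Pool the two samples, shuffle, re-split, and return the difference of means."""
    pooled = np.random.permutation(np.concatenate((a, b)))
    return np.mean(pooled[:len(a)]) - np.mean(pooled[len(a):])

perm_reps = np.array([permutation_diff_means(force_a, force_b)
                      for _ in range(10000)])
print('permutation p-value =', np.sum(perm_reps >= empirical_diff_means) / len(perm_reps))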
/CSV Import_Panda_Header.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 10 15:03:52 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import pandas as pd 9 | import pandas as pd 10 | 11 | # Assign the filename: file 12 | file = 'fixations.csv' 13 | file2 = 'gaze_postions.csv' 14 | 15 | # Read the file into a DataFrame: df 16 | df = pd.read_csv(file) 17 | df2 = pd.read_csv(file2) 18 | 19 | # View the head of the DataFrame 20 | 21 | print(df.head()) 22 | print(df2.head()) 23 | -------------------------------------------------------------------------------- /CSV_Concatenate_All.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 12:39:59 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from scipy import stats 15 | from mayavi import mlab  # uncommented: mlab is used by the 3-D density plot below 16 | import multiprocessing 17 | import plotly.plotly as py 18 | import plotly.graph_objs as go 19 | from plotly.graph_objs import Surface 20 | 21 | 22 | path = r'C:\Users\Shabaka\Desktop\Test2 DJI_Corretti\100\TIM' 23 | # path = r'C:\DRO\DCL_rawdata_files' 24 | allFiles = glob.glob(path + "/*.csv") 25 | # frame = pd.DataFrame() 26 | list_TIM = [] 27 | for file_ in allFiles: 28 | df_TIM = pd.read_csv(file_, index_col=None, header=0) 29 | list_TIM.append(df_TIM) 30 | frame = pd.concat(list_TIM) # ignore_index=True) 31 | 32 | print(frame.head()) 33 | 34 | # sns.heatmap(frame.head()) 35 | 36 | plt.show() 37 | 38 | temp = pd.read_csv('C:\\Users\\Shabaka\\Desktop\\Temperatura_Media.csv') 39 | # Plot the temperature time series in blue 40 | print(temp.head()) 41 | plt.plot(temp, color='blue', label='Temp_Median..(yr)') 42 | 43 | plt.show() 44 | 45 | 46 | # Plot the pairwise joint distributions grouped by 'origin' along with 47 | # regression lines 48 | # sns.pairplot(temp, kind='reg', hue='Temp_Med') 49 | # plt.show() 50 | 51 | # urb_pop_reader = pd.read_csv(filename, chunksize=1000) 52 | 53 | """ 54 | files = glob("*.txt") 55 | fig, ax = plt.subplots() 56 | 57 | for f in files: 58 | print("Current file is"+f) 59 | #your csv loading into data 60 | data.plot('time','temp',ax=axes[0]) 61 | 62 | #outside of the for loop 63 | plt.savefig("myplots.png") 64 | 65 | """ 66 | 67 | # ''''''''''''3D Density Map Plot ''''''''''# 68 | 69 | def calc_kde(data): 70 | return kde(data.T)  # uses the global `kde` defined below 71 | 72 | mu, sigma = 0, 0.1 73 | x = 10*np.random.normal(mu, sigma, 5000) 74 | y = 10*np.random.normal(mu, sigma, 5000) 75 | z = 10*np.random.normal(mu, sigma, 5000) 76 | 77 | xyz = np.vstack([x, y, z]) 78 | kde = stats.gaussian_kde(xyz) 79 | 80 | # Evaluate kde on a grid 81 | xmin, ymin, zmin = x.min(), y.min(), z.min() 82 | xmax, ymax, zmax = x.max(), y.max(), z.max() 83 | xi, yi, zi = np.mgrid[xmin:xmax:30j, ymin:ymax:30j, zmin:zmax:30j] 84 | coords = np.vstack([item.ravel() for item in [xi, yi, zi]]) 85 | 86 | # Multiprocessing 87 | cores = multiprocessing.cpu_count() 88 | pool = multiprocessing.Pool(processes=cores) 89 | results = pool.map(calc_kde, np.array_split(coords.T, 2)) 90 | density = np.concatenate(results).reshape(xi.shape) 91 | 92 | # Plot scatter with mayavi 93 | figure = mlab.figure('DensityPlot') 94 | 95 | grid = mlab.pipeline.scalar_field(xi, yi, zi, density) 96 | dmin = density.min()  # renamed from min/max to avoid shadowing the built-ins 97 | dmax = density.max() 98 | mlab.pipeline.volume(grid, vmin=dmin, vmax=dmin + .5*(dmax-dmin)) 99 | 100 | mlab.axes() 101 | mlab.show() 102 | 103 | 104 | # '''''''' Alternative Route'''''''''''''# 105 | filename = 'C:\\Users\\Shabaka\\Desktop\\Temperatura_Media.csv' 106 | raw_data = open(filename, 'rt') 107 | tempdata = pd.read_csv(raw_data, header=0) 108 | print(tempdata.shape) 109 | 110 | print(tempdata.head()) 111 | 112 | plt.plot(tempdata, color='blue', label='Temp_Med') 113 | 114 | plt.show() 115 | 116 | sns.pairplot(tempdata, kind='reg') # hue='Temp_Med') 117 | plt.show() 118 | 119 | surfdata = [go.Surface(tempdata.values)]  # .values replaces the removed .as_matrix() 120 | 121 | layout = go.Layout( 122 | title='Temp_Data Elevation', 123 | autosize=False, 124 | width=500, 125 | height=500, 126 | margin=dict( 127 | l=65, 128 | r=50, 129 | b=65, 130 | t=90 131 | ) 132 | ) 133 | fig = go.Figure(data=surfdata, layout=layout) 134 | py.iplot(fig, filename='elevations-3d-surface', type='surface') 135 | 136 | plt.show() -------------------------------------------------------------------------------- /CSV_Import-All2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 12:51:39 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | import seaborn as sns 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | 15 | # path = r'C:\DRO\DCL_rawdata_files' 16 | 17 | path = r'C:\Users\Shabaka\Desktop\Test2 DJI_Corretti\100\TIM' 18 | allfiles = glob.glob(os.path.join(path, "*.csv"))  # glob resolves the pattern to real file paths 19 | list2 = [] 20 | for file_ in allfiles: 21 | df = pd.read_csv(file_, index_col=None, header=None) 22 | list2.append(df) 23 | frame = pd.concat(list2, ignore_index=True) 24 | 25 | print(frame.head()) 26 | 27 | 28 | # The same concatenation as a single expression 29 | df = pd.concat((pd.read_csv(f) for f in allfiles)) 30 | 31 | print(df.head()) --------------------------------------------------------------------------------
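Note: CSV_Concatenate_All.py and CSV_Import-All2.py above implement the same pattern; a compact reference version under the same assumptions (`path` as defined in those scripts; sorted() makes the row order deterministic):

import glob
import os
import pandas as pd

files = sorted(glob.glob(os.path.join(path, '*.csv')))
combined = pd.concat(map(pd.read_csv, files), ignore_index=True)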
/CSV_Reader_Test2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 12:18:36 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | 12 | 13 | def concatenate(indir='', outfile='', colnames=None): 14 | os.chdir(indir) 15 | fileList = glob.glob('*.csv') 16 | dfList = [] 17 | 18 | for filename in fileList: 19 | print(filename) 20 | df = pd.read_csv(filename, header=None) 21 | dfList.append(df) 22 | concatDF = pd.concat(dfList, axis=0) 23 | if colnames is not None: 24 | concatDF.columns = colnames 25 | concatDF.to_csv(outfile, index=False) 26 | -------------------------------------------------------------------------------- /Calc_Diff_Btw_Columns.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 23 02:16:54 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | from sqlalchemy import desc 9 | # `select`, `census` and `connection` are assumed from the census-database setup (see the sketch after this file). 10 | # Build query to return state names by population difference from 2008 to 2000: 11 | # stmt 12 | stmt = select([census.columns.state, (census.columns.pop2008 - census.columns.pop2000).label('pop_change')]) 13 | 14 | # Append group by for the state: stmt 15 | stmt = stmt.group_by(census.columns.state) 16 | 17 | # Append order by for pop_change in descending order: stmt 18 | stmt = stmt.order_by(desc('pop_change')) 19 | 20 | # Return only 5 results: stmt 21 | stmt = stmt.limit(5) 22 | 23 | # Use connection to execute the statement and fetch all results 24 | results = connection.execute(stmt).fetchall() 25 | 26 | # Print the state and population change for each record 27 | for result in results: 28 | print('{}-{}'.format(result.state, result.pop_change)) 29 | --------------------------------------------------------------------------------
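Note: Calc_Diff_Btw_Columns.py above (and the census/state_fact statements in DB_Tables_Update.py below) assume a reflected census database. A minimal SQLAlchemy 1.x-style setup sketch, with the database path assumed:

from sqlalchemy import create_engine, MetaData, Table, select

engine = create_engine('sqlite:///census.sqlite')
connection = engine.connect()
metadata = MetaData()
census = Table('census', metadata, autoload=True, autoload_with=engine)
state_fact = Table('state_fact', metadata, autoload=True, autoload_with=engine)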
29 | """ 30 | 31 | # 'Retrieve column of table called Album in the chinook database' 32 | 33 | # Open engine connection: con 34 | con = engine.connect() 35 | 36 | # Perform query: rs 37 | rs = con.execute('SELECT * FROM Album') 38 | 39 | # Save results of the query to DataFrame: df 40 | df = pd.DataFrame(rs.fetchall()) 41 | -------------------------------------------------------------------------------- /Customer Segmentation_Code_Spyder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Feb 25 18:00:11 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | 10 | df_offers = pd.read_excel("http://blog.yhathq.com/static/misc/data/WineKMC.xlsx", sheetname=0) 11 | df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"] 12 | df_offers.head() 13 | 14 | df_transactions = pd.read_excel("http://blog.yhathq.com/static/misc/data/WineKMC.xlsx", sheetname=1) 15 | df_transactions.columns = ["customer_name", "offer_id"] 16 | df_transactions['n'] = 1 17 | df_transactions.head() 18 | 19 | # join the offers and transactions table 20 | df = pd.merge(df_offers, df_transactions) 21 | # create a "pivot table" which will give us the number of times each customer responded to a given offer 22 | matrix = df.pivot_table(index=['customer_name'], columns=['offer_id'], values='n') 23 | # a little tidying up. fill NA values with 0 and make the index into a column 24 | matrix = matrix.fillna(0).reset_index() 25 | # save a list of the 0/1 columns. we'll use these a bit later 26 | x_cols = matrix.columns[1:] 27 | 28 | from sklearn.cluster import KMeans 29 | 30 | cluster = KMeans(n_clusters=5) 31 | # slice matrix so we only include the 0/1 indicator columns in the clustering 32 | matrix['cluster'] = cluster.fit_predict(matrix[matrix.columns[2:]]) 33 | matrix.cluster.value_counts() 34 | 35 | from sklearn.decomposition import PCA 36 | 37 | pca = PCA(n_components=2) 38 | matrix['x'] = pca.fit_transform(matrix[x_cols])[:,0] 39 | matrix['y'] = pca.fit_transform(matrix[x_cols])[:,1] 40 | matrix = matrix.reset_index() 41 | 42 | customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']] 43 | customer_clusters.head() 44 | 45 | df = pd.merge(df_transactions, customer_clusters) 46 | df = pd.merge(df_offers, df) 47 | 48 | from ggplot import * 49 | """ 50 | import matplotlib.pyplot as plt 51 | plt.figure() 52 | plt.plot(rigs2) 53 | plt.plot(customer_clusters) 54 | plt.ion() 55 | plt.show() 56 | """ 57 | ggplot(df, aes(x='x', y='y', color='cluster')) + \ 58 | geom_point(size=75) + \ 59 | ggtitle("Customers Grouped by Cluster") 60 | 61 | cluster_centers = pca.transform(cluster.cluster_centers_) 62 | cluster_centers = pd.DataFrame(cluster_centers, columns=['x', 'y']) 63 | cluster_centers['cluster'] = range(0, len(cluster_centers)) 64 | 65 | ggplot(df, aes(x='x', y='y', color='cluster')) + \ 66 | geom_point(size=75) + \ 67 | geom_point(cluster_centers, size=500) +\ 68 | ggtitle("Customers Grouped by Cluster") 69 | 70 | df['is_4'] = df.cluster==4 71 | df.groupby("is_4").varietal.value_counts() 72 | 73 | df.groupby("is_4")[['min_qty', 'discount']].mean() 74 | -------------------------------------------------------------------------------- /CustomizeSQL_Query_Col_Row_Size.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 14:32:12 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | """ 9 | 10 | 
/CustomizeSQL_Query_Col_Row_Size.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 14:32:12 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | """ 9 | 10 | Open the engine connection as con using the method connect() on the engine. 11 | Execute the query that selects ALL columns from the Album table. Store the 12 | results in rs. 13 | Store all of your query results in the DataFrame df by applying the 14 | fetchall() method to the results rs. 15 | Close the connection! - In Query Script 16 | """ 17 | 18 | # 'This script allows us to perform the following things:' 19 | 20 | # Select specified columns from a table; 21 | # Select a specified number of rows; 22 | # Import column names from the database table. 23 | 24 | 25 | from sqlalchemy import create_engine 26 | import pandas as pd 27 | 28 | engine = create_engine('sqlite:///Chinook.sqlite') 29 | 30 | # Open engine in context manager 31 | # Perform query and save results to DataFrame: df 32 | with engine.connect() as con: 33 | rs = con.execute("SELECT LastName, Title FROM Employee") 34 | df = pd.DataFrame(rs.fetchmany(size=3)) 35 | df.columns = rs.keys() 36 | 37 | # Print the length of the DataFrame df 38 | print(len(df)) 39 | 40 | # Print the head of the DataFrame df 41 | print(df.head()) 42 | -------------------------------------------------------------------------------- /DB_Tables_Update.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 4 17:05:28 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # '''''''''''' Panda SQL Query ''''''''''''# 9 | # Import packages 10 | import sqlite3 11 | from sqlalchemy import create_engine 12 | from sqlalchemy import update 13 | # NOTE: SQLAlchemy has no top-level `connection` import; a Connection object is obtained from engine.connect() below 14 | import pandas as pd 15 | # Import insert and select from sqlalchemy 16 | from sqlalchemy import insert, select 17 | # Create engine: engine 18 | engine = create_engine('sqlite:///Chinook.sqlite') 19 | connection = engine.connect() 20 | # Execute query and store records in DataFrame: df 21 | df = pd.read_sql_query("SELECT * FROM Album", engine) 22 | 23 | # Print head of DataFrame 24 | 25 | print(df.head()) 26 | 27 | # Open engine in context manager 28 | # Perform query and save results to DataFrame: df1 29 | 30 | with engine.connect() as con: 31 | rs = con.execute("SELECT * FROM Album") 32 | df1 = pd.DataFrame(rs.fetchall()) 33 | df1.columns = rs.keys() 34 | 35 | # Confirm that both methods yield the same result: does df = df1 ? 36 | 37 | print(df.equals(df1)) 38 |
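# NOTE: the `data`, `census`, `state_fact` and `flat_census` tables used below
# are never defined in this file; in the lesson they belong to a census
# database. A minimal, illustrative sketch for the `data` table (column names
# and types assumed):
from sqlalchemy import Table, Column, String, Integer, Float, Boolean, MetaData
metadata = MetaData()
data = Table('data', metadata,
             Column('name', String(255)),
             Column('count', Integer()),
             Column('amount', Float()),
             Column('valid', Boolean()))
metadata.create_all(engine)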
39 | # ''''''''''''#######'''''''##################''''''''# 40 | 41 | # Build an insert statement to insert a record into the data table: stmt 42 | 43 | stmt = insert(data).values(name='Anna', count=1, amount=1000.00, valid=True) 44 | 45 | # Execute the statement via the connection: results 46 | 47 | results = connection.execute(stmt) 48 | 49 | # Print result rowcount 50 | 51 | print(results.rowcount) 52 | 53 | # Build a select statement to validate the insert 54 | 55 | stmt = select([data]).where(data.columns.name == 'Anna') 56 | 57 | # Print the result of executing the query. 58 | 59 | print(connection.execute(stmt).first()) 60 | 61 | # '''''''''###########'''''''''''''''' # 62 | # ''''''####'''''''''''''##########'''''''''# 63 | 64 | # Create an insert statement for census: stmt 65 | 66 | stmt = insert(census) 67 | 68 | # Create an empty list and zeroed row count: values_list, total_rowcount 69 | 70 | values_list = [] 71 | total_rowcount = 0 72 | 73 | # Enumerate the rows of csv_reader (a csv.reader over the census CSV, assumed open) 74 | for idx, row in enumerate(csv_reader): 75 | # create data and append to values_list 76 | data = {'state': row[0], 'sex': row[1], 'age': row[2], 'pop2000': row[3], 77 | 'pop2008': row[4]} 78 | values_list.append(data) 79 | 80 | # Check to see if divisible by 51 81 | if idx % 51 == 0: 82 | results = connection.execute(stmt, values_list) 83 | total_rowcount += results.rowcount 84 | values_list = [] 85 | 86 | 87 | # Build a select statement: select_stmt 88 | select_stmt = select([state_fact]).where(state_fact.columns.name == 'New York') 89 | 90 | # Print the results of executing the select_stmt 91 | print(connection.execute(select_stmt).fetchall()) 92 | 93 | # Build a statement to update the fips_state to 36: stmt 94 | stmt = update(state_fact).values(fips_state=36) 95 | 96 | # Append a where clause to limit it to records for New York state 97 | stmt = stmt.where(state_fact.columns.name == 'New York') 98 | 99 | # Execute the statement: results 100 | results = connection.execute(stmt) 101 | 102 | # Print rowcount 103 | print(results.rowcount) 104 | 105 | # Execute the select_stmt again to view the changes 106 | print(connection.execute(select_stmt).fetchall()) 107 | 108 | 109 | # ''''''''''''' Update Multiple Records ''''''# 110 | 111 | # Build a statement to update the notes to 'The Wild West': stmt 112 | stmt = update(state_fact).values(notes='The Wild West') 113 | 114 | # Append a where clause to match the West census region records 115 | stmt = stmt.where(state_fact.columns.census_region_name == 'West') 116 | 117 | # Execute the statement: results 118 | results = connection.execute(stmt) 119 | 120 | # Print rowcount 121 | print(results.rowcount) 122 | 123 | # ''''''''''' Making Correlated Updates ''' ######## 124 | 125 | # Build a statement to select name from state_fact: stmt 126 | fips_stmt = select([state_fact.columns.name]) 127 | 128 | # Append a where clause to match the fips_state to flat_census fips_code 129 | fips_stmt = fips_stmt.where( 130 | state_fact.columns.fips_state == flat_census.columns.fips_code) 131 | 132 | # Build an update statement to set the name to fips_stmt: update_stmt 133 | update_stmt = update(flat_census).values(state_name=fips_stmt) 134 | 135 | # Execute update_stmt: results 136 | results = connection.execute(update_stmt) 137 | 138 | # Print rowcount 139 | print(results.rowcount) 140 | 141 | 142 | -------------------------------------------------------------------------------- /DatStream.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 30 21:27:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt  # used by the chunked-plot section at the end of this script 10 | # Open a connection to the file 11 | with open('world_dev_ind.csv') as file: 12 | 13 | # Skip the column names 14 | file.readline() 15 | 16 | # Initialize an empty dictionary: counts_dict 17 | counts_dict = {} 18 | 19 | # Process only the first 1000 rows 20 | for j in range(1000): 21 | 22 | # Split the current line into a list: line 23 | line = file.readline().split(',') 24 | 25 | # Get the value for the first column: first_col 26 | 
first_col = line[0] 27 | 28 | # If the column value is in the dict, increment its value 29 | if first_col in counts_dict.keys(): 30 | counts_dict[first_col] += 1 31 | 32 | # Else, add to the dict and set value to 1 33 | else: 34 | counts_dict[first_col] = 1 35 | 36 | # Print the resulting dictionary 37 | print(counts_dict) 38 | 39 | # ''''''''''''''''' Write Generator to Load Data Chunks ''''''' # 40 | 41 | # Define read_large_file() 42 | def read_large_file(file_object): 43 | """A generator function to read a large file lazily.""" 44 | 45 | # Loop indefinitely until the end of the file 46 | while True: 47 | 48 | # Read a line from the file: data 49 | data = file_object.readline() 50 | 51 | # Break if this is the end of the file 52 | if not data: 53 | break 54 | 55 | # Yield the line of data 56 | yield data 57 | # Open a connection to the file 58 | with open('world_dev_ind.csv') as file: 59 | 60 | # Create a generator object for the file: gen_file 61 | gen_file = read_large_file(file) 62 | 63 | # Print the first three lines of the file 64 | print(next(gen_file)) 65 | print(next(gen_file)) 66 | print(next(gen_file)) 67 | 68 | 69 | # ''''''''''''''' Load Data in Chunks with Generator ''''''''''' '# 70 | # Initialize an empty dictionary: counts_dict 71 | counts_dict = {} 72 | 73 | # Open a connection to the file 74 | with open('world_dev_ind.csv') as file: 75 | 76 | # Iterate over the generator from read_large_file() 77 | for line in read_large_file(file): 78 | 79 | row = line.split(',') 80 | first_col = row[0] 81 | 82 | if first_col in counts_dict.keys(): 83 | counts_dict[first_col] += 1 84 | else: 85 | counts_dict[first_col] = 1 86 | 87 | # Print 88 | print(counts_dict) 89 | 90 | # ''''' Iterator to load data in chunks ''''''''''' # 91 | 92 | # Import the pandas package 93 | 94 | # Initialize reader object: df_reader 95 | df_reader = pd.read_csv('ind_pop.csv', chunksize=10) 96 | 97 | # Print two chunks 98 | print(next(df_reader)) 99 | print(next(df_reader)) 100 | 101 | # ''''''''''''' Iterator to Load Data in Chunks '''''''''''# 102 | 103 | # Initialize reader object: urb_pop_reader 104 | urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000) 105 | 106 | # Get the first dataframe chunk: df_urb_pop 107 | df_urb_pop = next(urb_pop_reader) 108 | 109 | # Check out the head of the dataframe 110 | print(df_urb_pop.head()) 111 | 112 | # Check out specific country: df_pop_ceb 113 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'] 114 | 115 | # Zip dataframe columns of interest: pops 116 | pops = zip(df_pop_ceb['Total Population'], 117 | df_pop_ceb['Urban population (% of total)']) 118 | 119 | # Turn zip object into list: pops_list 120 | pops_list = list(pops) 121 | 122 | # Print pops_list 123 | print(pops_list) 124 | 125 | 126 | # Use list comp to create new dataframe column 'Total Urban Population' 127 | 128 | df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1]) for tup in pops_list] 129 | 130 | # Plot urban population data 131 | 132 | df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population') 133 | plt.show() 134 | 135 | -------------------------------------------------------------------------------- /DatVis_Bokeh_1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 16 17:04:55 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import numpy as np 8 | import pandas as pd 9 | 10 | from bokeh.plotting import figure 11 | from bokeh.io import output_file, show 12 | from 
bokeh.plotting import ColumnDataSource 13 | from bokeh.models import HoverTool 14 | 15 | # Create the figure: p 16 | p = figure(x_axis_label='fertility (children per woman)', 17 | y_axis_label='female_literacy (% population)') 18 | 19 | # Add a circle glyph to the figure p 20 | p.circle(fertility, female_literacy) 21 | 22 | # Call the output_file() function and specify the name of the file 23 | output_file('fert_lit.html') 24 | 25 | # Display the plot 26 | show(p) 27 | 28 | # ''''''''''''' Multiple Dta plots ''''''# 29 | 30 | # Create the figure: p 31 | p = figure(x_axis_label='fertility', 32 | y_axis_label='female_literacy (% population)') 33 | 34 | # Add a circle glyph to the figure p 35 | _ = p.circle(fertility_latinamerica, female_literacy_latinamerica) 36 | 37 | # Add an x glyph to the figure p 38 | _ = p.x(fertility_africa, female_literacy_africa) 39 | 40 | # Specify the name of the file 41 | output_file('fert_lit_separate.html') 42 | 43 | # Display the plot 44 | show(p) 45 | 46 | # '''''Scatter Plot Customisation '''''''# 47 | 48 | # Create the figure: p 49 | p = figure(x_axis_label='fertility (children per woman)', 50 | y_axis_label='female_literacy (% population)') 51 | 52 | # Add a blue circle glyph to the figure p 53 | p.circle(fertility_latinamerica, female_literacy_latinamerica, 54 | color='blue', size=10, alpha=0.8) 55 | 56 | # Add a red circle glyph to the figure p 57 | p.circle(fertility_africa, female_literacy_africa, 58 | color='red', size=10, alpha=0.8) 59 | 60 | # Specify the name of the file 61 | output_file('fert_lit_separate_colors.html') 62 | 63 | # Display the plot 64 | show(p) 65 | 66 | 67 | # ''''Bokeh Line PLot '''''''''''# 68 | 69 | # Import figure from bokeh.plotting - to p of file 70 | 71 | # Create a figure with x_axis_type="datetime": p 72 | p = figure(x_axis_type='datetime', 73 | x_axis_label='Date', y_axis_label='US Dollars') 74 | 75 | # Plot date along the x axis and price along the y axis 76 | p.line(date, price, line_width=3) 77 | 78 | # Specify the name of the output file and show the result 79 | output_file('line.html') 80 | show(p) 81 | 82 | # '''''Line and Marker Plot ''''''''# 83 | 84 | # Import figure from bokeh.plotting - top of file 85 | 86 | # Create a figure with x_axis_type='datetime': p 87 | p = figure(x_axis_type='datetime', x_axis_label='Date', 88 | y_axis_label='US Dollars') 89 | 90 | # Plot date along the x-axis and price along the y-axis 91 | p.line(date, price) 92 | 93 | # With date on the x-axis and price on the y-axis, 94 | # add a white circle glyph of size 4 95 | p.circle(date, price, fill_color='white', size=4) 96 | 97 | # Specify the name of the output file and show the result 98 | output_file('line.html') 99 | show(p) 100 | 101 | # ''''''Bokeh Patch Plots 'Maps' ''# 102 | 103 | # Create a list of az_lons, co_lons, nm_lons and ut_lons: x 104 | x = [az_lons, co_lons, nm_lons, ut_lons] 105 | 106 | # Create a list of az_lats, co_lats, nm_lats and ut_lats: y 107 | y = [az_lats, co_lats, nm_lats, ut_lats] 108 | 109 | # Add patches to figure p with line_color=white for x and y 110 | p.patches(x, y, line_color='white') 111 | 112 | # Specify the name of the output file and show the result 113 | output_file('four_corners.html') 114 | show(p) 115 | 116 | 117 | # ''''''''' Plotting from a numpy array ''''''# 118 | 119 | # Import numpy as np - at top of file 120 | 121 | # Create array using np.linspace: x 122 | x = np.linspace(0, 5, 100) 123 | 124 | # Create array using np.cos: y 125 | y = np.cos(x) 126 | 127 | # Add circles at x and y 128 
| p.circle(x, y) 129 | 130 | # Specify the name of the output file and show the result 131 | output_file('numpy.html') 132 | show(p) 133 | 134 | # '''''''' Plotting from Pandas Dataframe ''''''''# 135 | 136 | # Import pandas as pd - top of file 137 | 138 | # Read in the CSV file: df 139 | df = pd.read_csv('auto.csv') 140 | 141 | # Import figure from bokeh.plotting - top of file 142 | 143 | # Create the figure: p 144 | p = figure(x_axis_label='HP', y_axis_label='MPG') 145 | 146 | # Plot mpg vs hp by color 147 | p.circle(df['hp'], df['mpg'], color=df['color'], size=10) 148 | 149 | # Specify the name of the output file and show the result 150 | output_file('auto-df.html') 151 | show(p) 152 | 153 | # '''''''' Plot from ColumnData Source ''''''''# 154 | 155 | # Import the ColumnDataSource class from bokeh.plotting 156 | 157 | # Create a ColumnDataSource from df: source 158 | source = ColumnDataSource(df) 159 | 160 | # Add circle glyphs to the figure p 161 | p.circle('Year', 'Time', source=source, color='color', size=8) 162 | 163 | # Specify the name of the output file and show the result 164 | output_file('sprint.html') 165 | show(p) 166 | 167 | # '''''''Selection and non-Selection Glyph Specification ''''# 168 | 169 | # Create a figure with the "box_select" tool: p 170 | p = figure(x_axis_label='Year', y_axis_label='Time', tools='box_select') 171 | 172 | # Add circle glyphs to the figure p with the selected 173 | # and non-selected properties 174 | 175 | p.circle('Year', 'Time', source=source, 176 | selection_color='red', nonselection_alpha=0.1) 177 | 178 | # Specify the name of the output file and show the result 179 | output_file('selection_glyph.html') 180 | show(p) 181 | 182 | # ''''''making Hover Glyphs '''''''# 183 | 184 | # import the HoverTool - at top of file 185 | 186 | # Add circle glyphs to figure p 187 | p.circle(x, y, size=10, 188 | fill_color='grey', alpha=0.1, line_color=None, 189 | hover_fill_color='firebrick', hover_alpha=0.5, 190 | hover_line_color='white') 191 | 192 | # Create a HoverTool: hover 193 | hover = HoverTool(tooltips=None, mode='vline') 194 | 195 | # Add the hover tool to the figure p 196 | p.add_tools(hover) 197 | 198 | # Specify the name of the output file and show the result 199 | output_file('hover_glyph.html') 200 | show(p) 201 | 202 | # ''''''''' Color Mapping '''''''''''# 203 | 204 | #Import CategoricalColorMapper from bokeh.models 205 | from bokeh.models import CategoricalColorMapper 206 | 207 | # Convert df to a ColumnDataSource: source 208 | source = ColumnDataSource(df) 209 | 210 | # Make a CategoricalColorMapper object: color_mapper 211 | color_mapper = CategoricalColorMapper(factors=['Europe', 'Asia', 'US'], 212 | palette=['red', 'green', 'blue']) 213 | 214 | # Add a circle glyph to the figure p 215 | p.circle('weight', 'mpg', source=source, 216 | color=dict(field='origin', transform=color_mapper), 217 | legend='origin') 218 | 219 | # Specify the name of the output file and show the result 220 | output_file('colormap.html') 221 | show(p) 222 | -------------------------------------------------------------------------------- /DatVis_Bokeh_CaseStudy_App_Build_5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu May 18 15:54:33 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | 11 | from bokeh.io import output_file, show 12 | 13 | from bokeh.plotting import figure 14 | from bokeh.plotting import ColumnDataSource 15 | 16 | 
from bokeh.layouts import gridplot 17 | from bokeh.layouts import row, column 18 | from bokeh.layouts import widgetbox 19 | 20 | from bokeh.charts import BoxPlot 21 | from bokeh.charts import Scatter 22 | 23 | from bokeh.palettes import Spectral6 24 | 25 | from bokeh.models import Select 26 | from bokeh.models import Slider 27 | from bokeh.models import Button 28 | from bokeh.models import HoverTool 29 | from bokeh.models import CategoricalColorMapper 30 | from bokeh.models import CheckboxGroup, RadioGroup, Toggle 31 | 32 | from bokeh.models.widgets import Panel 33 | from bokeh.models.widgets import Tabs 34 | 35 | # Perform necessary imports 36 | from bokeh.io import curdoc 37 | 38 | data = pd.read_csv('___.csv') 39 | _ = data.head() 40 | _ = data.describe 41 | _ = data.info() 42 | _ = data.shape 43 | 44 | 45 | 46 | # '''' Basic EDA Plot of Gapminder Data set ''''''''# 47 | 48 | # Make the ColumnDataSource: source 49 | source = ColumnDataSource(data={ 50 | 'x' : data.loc[1970].fertility, 51 | 'y' : data.loc[1970].life, 52 | 'country' : data.loc[1970].Country, 53 | }) 54 | 55 | # Create the figure: p 56 | p = figure(title='1970', x_axis_label='Fertility (children per woman)', 57 | y_axis_label='Life Expectancy (years)', 58 | plot_height=400, plot_width=700, 59 | tools=[HoverTool(tooltips='@country')]) 60 | 61 | # Add a circle glyph to the figure p 62 | p.circle(x='x', y='y', source=source) 63 | 64 | # Output the file and show the figure 65 | output_file('gapminder.html') 66 | show(p) 67 | 68 | # ''' Basic Data Plot '''''''# 69 | 70 | # Make the ColumnDataSource: source 71 | source = ColumnDataSource(data={ 72 | 'x' : data.loc[1970].fertility, 73 | 'y' : data.loc[1970].life, 74 | 'country' : data.loc[1970].Country, 75 | 'pop' : (data.loc[1970].population / 20000000) + 2, 76 | 'region' : data.loc[1970].region, 77 | }) 78 | 79 | # Save the minimum and maximum values of the fertility column: xmin, xmax 80 | xmin, xmax = min(data.fertility), max(data.fertility) 81 | 82 | # Save the minimum and maximum values of the life expectancy column: ymin, ymax 83 | ymin, ymax = min(data.life), max(data.life) 84 | 85 | # Create the figure: plot 86 | plot = figure(title='Gapminder Data for 1970', plot_height=400, plot_width=700, 87 | x_range=(xmin, xmax), y_range=(ymin, ymax)) 88 | 89 | # Add circle glyphs to the plot 90 | plot.circle(x='x', y='y', fill_alpha=0.8, source=source) 91 | 92 | # Set the x-axis label 93 | plot.xaxis.axis_label = 'Fertility (children per woman)' 94 | 95 | # Set the y-axis label 96 | plot.yaxis.axis_label = 'Life Expectancy (years)' 97 | 98 | # Add the plot to the current document and add a title 99 | curdoc().add_root(plot) 100 | curdoc().title = 'Gapminder' 101 | 102 | 103 | # ''''' Enhancing the list with some colours ''''# 104 | 105 | # Make a list of the unique values from the region column: regions_list 106 | regions_list = data.region.unique().tolist() 107 | 108 | # Import CategoricalColorMapper from bokeh.models and 109 | # the Spectral6 palette from bokeh.palettes 110 | 111 | # Make a color mapper: color_mapper 112 | color_mapper = CategoricalColorMapper(factors=regions_list, palette=Spectral6) 113 | 114 | # Add the color mapper to the circle glyph 115 | plot.circle(x='x', y='y', fill_alpha=0.8, source=source, 116 | color=dict(field='region', transform=color_mapper), 117 | legend='region') 118 | 119 | # Set the legend.location attribute of the plot to 'top_right' 120 | plot.legend.location = 'top_right' 121 | 122 | # Add the plot to the current document and add the title 
123 | curdoc().add_root(plot) 124 | curdoc().title = 'Gapminder' 125 | 126 | 127 | # '''''' Adding a Slider to vary the year ''''''# 128 | 129 | # Define the callback function: update_plot 130 | def update_plot(attr, old, new): 131 | # set the `yr` name to `slider.value 132 | # and `source.data = new_data` 133 | yr = slider.value 134 | new_data = { 135 | 'x': data.loc[yr].fertility, 136 | 'y': data.loc[yr].life, 137 | 'country': data.loc[yr].Country, 138 | 'pop': (data.loc[yr].population / 20000000) + 2, 139 | 'region': data.loc[yr].region, 140 | } 141 | source.data = new_data 142 | 143 | 144 | # Make a slider object: slider 145 | slider = Slider(start=1970, end=2010, step=1, value=1970, title='Year') 146 | 147 | # Attach the callback to the 'value' property of slider 148 | slider.on_change('value', update_plot) 149 | 150 | # Make a row layout of widgetbox(slider) and plot 151 | # and add it to the current document 152 | layout = row(widgetbox(slider), plot) 153 | curdoc().add_root(layout) 154 | 155 | # ''''' Customised Plot API from user input '''# 156 | 157 | # Define the callback function: update_plot 158 | def update_plot(attr, old, new): 159 | # Assign the value of the slider: yr 160 | yr = slider.value 161 | # Set new_data 162 | new_data = { 163 | 'x' : data.loc[yr].fertility, 164 | 'y' : data.loc[yr].life, 165 | 'country' : data.loc[yr].Country, 166 | 'pop' : (data.loc[yr].population / 20000000) + 2, 167 | 'region' : data.loc[yr].region, 168 | } 169 | # Assign new_data to: source.data 170 | source.data = new_data 171 | 172 | # Add title to figure: plot.title.text 173 | plot.title.text = 'Gapminder data for %d' % yr 174 | 175 | # Make a slider object: slider 176 | slider = Slider(start=1970, end=2010, step=1, value=1970, title='Year') 177 | 178 | # Attach the callback to the 'value' property of slider 179 | slider.on_change('value', update_plot) 180 | 181 | # Make a row layout of widgetbox(slider) and 182 | # plot and add it to the current document 183 | layout = row(widgetbox(slider), plot) 184 | curdoc().add_root(layout) 185 | 186 | # '''' Add Hover info_tool to the API '''''''# 187 | 188 | # Create a HoverTool: hover 189 | hover = HoverTool(tooltips=[('Country', '@country')]) 190 | 191 | # Add the HoverTool to the plot 192 | plot.add_tools(hover) 193 | # Create layout: layout 194 | layout = row(widgetbox(slider), plot) 195 | 196 | # Add layout to current document 197 | curdoc().add_root(layout) 198 | 199 | # '''''''Adding drop-down menu to the App ''''''''''# 200 | 201 | # Define the callback: update_plot 202 | def update_plot(attr, old, new): 203 | # Read the current value off the slider and 2 dropdowns: yr, x, y 204 | yr = slider.value 205 | x = x_select.value 206 | y = y_select.value 207 | # Label axes of plot 208 | plot.xaxis.axis_label = x 209 | plot.yaxis.axis_label = y 210 | # Set new_data 211 | new_data = { 212 | 'x' : data.loc[yr][x], 213 | 'y' : data.loc[yr][y], 214 | 'country' : data.loc[yr].Country, 215 | 'pop' : (data.loc[yr].population / 20000000) + 2, 216 | 'region' : data.loc[yr].region, 217 | } 218 | # Assign new_data to source.data 219 | source.data = new_data 220 | 221 | # Set the range of all axes 222 | plot.x_range.start = min(data[x]) 223 | plot.x_range.end = max(data[x]) 224 | plot.y_range.start = min(data[y]) 225 | plot.y_range.end = max(data[y]) 226 | 227 | # Add title to plot 228 | plot.title.text = 'Gapminder data for %d' % yr 229 | 230 | # Create a dropdown slider widget: slider 231 | slider = Slider(start=1970, end=2010, step=1, value=1970, 
title='Year') 232 | 233 | # Attach the callback to the 'value' property of slider 234 | slider.on_change('value', update_plot) 235 | 236 | # Create a dropdown Select widget for the x data: x_select 237 | x_select = Select( 238 | options=['fertility', 'life', 'child_mortality', 'gdp'], 239 | value='fertility', 240 | title='x-axis data' 241 | ) 242 | 243 | # Attach the update_plot callback to the 244 | # 'value' property of x_select 245 | x_select.on_change('value', update_plot) 246 | 247 | # Create a dropdown Select widget for the y data: y_select 248 | y_select = Select( 249 | options=['fertility', 'life', 'child_mortality', 'gdp'], 250 | value='life', 251 | title='y-axis data' 252 | ) 253 | 254 | # Attach the update_plot callback to 255 | # the 'value' property of y_select 256 | y_select.on_change('value', update_plot) 257 | 258 | # Create layout and add to current document 259 | layout = row(widgetbox(slider, x_select, y_select), plot) 260 | curdoc().add_root(layout) 261 | 262 | -------------------------------------------------------------------------------- /DatVis_Bokeh_High_Level_Charts_3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 17 19:35:15 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | from bokeh.plotting import figure 11 | from bokeh.io import output_file, show 12 | from bokeh.plotting import ColumnDataSource 13 | from bokeh.models import HoverTool 14 | from bokeh.layouts import gridplot 15 | from bokeh.models.widgets import Panel 16 | from bokeh.models.widgets import Tabs 17 | from bokeh.layouts import row, column 18 | from bokeh.charts import BoxPlot 19 | from bokeh.charts import Scatter 20 | 21 | # Import Histogram, output_file, and show from bokeh.charts 22 | from bokeh.charts import Histogram 23 | 24 | 25 | # ''''' Basic bokeh Histogram ''''''''# 26 | df = pd.read_csv('fixations.csv') 27 | 28 | df.head() 29 | 30 | # Create a ColumnDataSource from df: source 31 | source = ColumnDataSource(df) 32 | 33 | # Make a Histogram: p 34 | p = Histogram(df, 'duration', title='Gaze_Time', bins=50) 35 | 36 | # Set the x axis label 37 | p.xaxis.axis_label = 'Gaze_Duration' 38 | 39 | # Set the y axis label 40 | p.yaxis.axis_label = 'Pupil DIa' 41 | # Specify the name of the output_file and show the result 42 | output_file('histogram.html') 43 | show(p) 44 | 45 | """ 46 | # Make a Histogram: p 47 | p = Histogram(df, 'female_literacy', title='Female Literacy', 48 | bins=40) 49 | 50 | # Set the x axis label 51 | p.xaxis.axis_label = 'Female Literacy' 52 | 53 | # Set the y axis label 54 | p.yaxis.axis_label = 'Fertility' 55 | # Specify the name of the output_file and show the result 56 | output_file('histogram.html') 57 | show(p) 58 | 59 | """ 60 | # '''''' Multiple Histograms ''''''''# 61 | 62 | # Make a Histogram: p 63 | p = Histogram(df, 'female_literacy', title='Female Literacy', 64 | color='Continent', legend='top_left') 65 | 66 | # Set axis labels 67 | p.xaxis.axis_label = 'Female Literacy (% population)' 68 | p.yaxis.axis_label = 'Number of Countries' 69 | 70 | # Specify the name of the output_file and show the result 71 | output_file('hist_bins.html') 72 | 73 | """ 74 | # '''''' Basic BoxPlot '''''''''# 75 | 76 | # Make a box plot: p 77 | p = BoxPlot(df, values='duration', label='confidence', 78 | title='Gaze Duration (grouped by Avg_Pupil_Size)', 79 | legend='bottom_right') 80 | 81 | # Set the y axis label 82 | p.yaxis.axis_label = 
'Fixations (% Tot_Gaze_Pop)'
83 |
84 | # Specify the name of the output_file and show the result
85 | output_file('boxplot.html')
86 | show(p)
87 | """
88 |
89 | # ''''''''''''''' ################ '''''''''''''' #
90 | # Make a box plot: p
91 | p = BoxPlot(df, values='female_literacy', label='Continent',
92 |             title='Female Literacy (grouped by Continent)',
93 |             legend='bottom_right')
94 |
95 | # Set the y axis label
96 | p.yaxis.axis_label = 'Female literacy (% population)'
97 |
98 | # Specify the name of the output_file and show the result
99 | output_file('boxplot.html')
100 | show(p)
101 |
102 | # ''''''''''' Multicoloured Boxplots ''''''#
103 |
104 | # Make a box plot: p
105 | p = BoxPlot(df, values='female_literacy',
106 |             label='Continent', color='Continent',
107 |             title='Female Literacy (grouped by Continent)',
108 |             legend='bottom_right')
109 |
110 | # Set y-axis label
111 | p.yaxis.axis_label = 'Female literacy (% population)'
112 |
113 | # Specify the name of the output_file and show the result
114 | output_file('boxplot.html')
115 | show(p)
116 |
117 | # ''''''''' Basic Bokeh Scatter Plot ''''''#
118 |
119 | # Make a scatter plot: p
120 | p = Scatter(df, x='population', y='female_literacy',
121 |             title='Female Literacy vs Population')
122 |
123 | # Set the x-axis label
124 | p.xaxis.axis_label = 'Population'
125 |
126 | # Set the y-axis label
127 | p.yaxis.axis_label = 'Female Literacy'
128 | # Specify the name of the output_file and show the result
129 | output_file('scatterplot.html')
130 | show(p)
131 |
132 | # ''''' scatter plot grouping by colour ''''#
133 |
134 | # Make a scatter plot such that each circle
135 | # is colored by its continent: p
136 | p = Scatter(df, x='population', y='female_literacy',
137 |             color='Continent',
138 |             title='Female Literacy vs Population')
139 |
140 | # Set x-axis and y-axis labels
141 | p.xaxis.axis_label = 'Population (millions)'
142 | p.yaxis.axis_label = 'Female literacy (% population)'
143 |
144 | # Specify the name of the output_file and show the result
145 | output_file('scatterplot.html')
146 |
147 | # ''''' Scatter plot shape(marker) grouping '''''#
148 |
149 | # Make a scatter plot such that each continent has a different marker type: p
150 | p = Scatter(df, x='population', y='female_literacy',
151 |             color='Continent',
152 |             marker='Continent',
153 |             title='Female Literacy vs Population')
154 |
155 | # Set x-axis and y-axis labels
156 | p.xaxis.axis_label = 'Population (millions)'
157 | p.yaxis.axis_label = 'Female literacy (% population)'
158 |
159 | # Specify the name of the output_file and show the result
160 | output_file('scatterplot.html')
161 | show(p)
162 |
163 |
--------------------------------------------------------------------------------
/DatVis_Bokeh_Intr_App_Build_4.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed May 17 23:09:01 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 |
11 | from bokeh.io import output_file, show
12 |
13 | from bokeh.plotting import figure
14 | from bokeh.plotting import ColumnDataSource
15 |
16 | from bokeh.layouts import gridplot
17 | from bokeh.layouts import row, column
18 | from bokeh.layouts import widgetbox
19 |
20 | from bokeh.charts import BoxPlot
21 | from bokeh.charts import Scatter
22 |
23 | from bokeh.models import Select
24 | from bokeh.models import Slider
25 | from bokeh.models import Button
26 | from bokeh.models import HoverTool
27 |
from bokeh.models import CheckboxGroup, RadioGroup, Toggle 28 | 29 | from bokeh.models.widgets import Panel 30 | from bokeh.models.widgets import Tabs 31 | 32 | # Perform necessary imports 33 | from bokeh.io import curdoc 34 | 35 | # Create a new plot: plot 36 | plot = figure() 37 | 38 | # Add a line to the plot 39 | plot.line(x=[1, 2, 3, 4, 5], y=[2, 5, 4, 6, 7]) 40 | 41 | # Add the plot to the current document 42 | curdoc().add_root(plot) 43 | 44 | # ''''''''' Add a slider ''''''# 45 | 46 | # Create a slider: slider 47 | slider = Slider(title='my slider', start=0, end=10, step=0.1, value=2) 48 | 49 | # Create a widgetbox layout: layout 50 | layout = widgetbox(slider) 51 | 52 | # Add the layout to the current document 53 | curdoc().add_root(layout) 54 | 55 | # '''''''' Multiple Sliders ''''''''# 56 | 57 | # Create first slider: slider1 58 | slider1 = Slider(title='slider1', start=0, end=10, step=0.1, value=2) 59 | 60 | # Create second slider: slider2 61 | slider2 = Slider(title='slider2', start=10, end=100, step=1, value=20) 62 | 63 | # Add slider1 and slider2 to a widgetbox 64 | layout = widgetbox(slider1, slider2) 65 | 66 | # Add the layout to the current document 67 | curdoc().add_root(layout) 68 | 69 | 70 | # '''' Combining bokeh models into a layout ''''# 71 | 72 | # Create ColumnDataSource: source 73 | source = ColumnDataSource(data={'x': x, 'y': y}) 74 | 75 | # Add a line to the plot 76 | plot.line('x', 'y', source=source) 77 | 78 | # Create a column layout: layout 79 | layout = column(widgetbox(slider), plot) 80 | 81 | # Add the layout to the current document 82 | curdoc().add_root(layout) 83 | 84 | # '' Basic callback on widget ''''''# 85 | 86 | # Define a callback function: callback 87 | def callback(attr, old, new): 88 | 89 | # Read the current value of the slider: scale 90 | scale = slider.value 91 | 92 | # Compute the updated y using np.sin(scale/x): new_y 93 | new_y = np.sin(scale/x) 94 | 95 | # Update source with the new data values 96 | source.data = {'x': x, 'y': new_y} 97 | 98 | # Attach the callback to the 'value' property of slider 99 | slider.on_change('value', callback) 100 | 101 | # Create layout and add to current document 102 | layout = column(widgetbox(slider), plot) 103 | curdoc().add_root(layout) 104 | 105 | # ''''Updating data sources - Drop down in callback '''# 106 | 107 | # Create ColumnDataSource: source 108 | source = ColumnDataSource(data={ 109 | 'x' : fertility, 110 | 'y' : female_literacy 111 | }) 112 | 113 | # Create a new plot: plot 114 | plot = figure() 115 | 116 | # Add circles to the plot 117 | plot.circle('x', 'y', source=source) 118 | 119 | # Define a callback function: update_plot 120 | def update_plot(attr, old, new): 121 | # If the new Selection is 'female_literacy', update 'y' to female_literacy 122 | if new == 'female_literacy': 123 | source.data = { 124 | 'x': fertility, 125 | 'y': female_literacy 126 | } 127 | # Else, update 'y' to population 128 | else: 129 | source.data = { 130 | 'x' : fertility, 131 | 'y' : population 132 | } 133 | 134 | # Create a dropdown Select widget: select 135 | select = Select(title="distribution", 136 | options=['female_literacy', 'population'], 137 | value='female_literacy') 138 | 139 | # Attach the update_plot callback to the 'value' property of select 140 | select.on_change('value', update_plot) 141 | 142 | # Create layout and add to current document 143 | layout = row(select, plot) 144 | curdoc().add_root(layout) 145 | 146 | # ''''''''' Synchronise two dropdowns '''''''''''# 147 | 148 | # Create two 
dropdown Select widgets: select1, select2 149 | 150 | select1 = Select(title='First', options=['A', 'B'], value='A') 151 | select2 = Select(title='Second', options=['1', '2', '3'], value='1') 152 | 153 | # Define a callback function: callback 154 | def callback(attr, old, new): 155 | # If select1 is 'A' 156 | if select1.value == 'A': 157 | # Set select2 options to ['1', '2', '3'] 158 | select2.options = ['1', '2', '3'] 159 | 160 | # Set select2 value to '1' 161 | select2.value = '1' 162 | else: 163 | # Set select2 options to ['100', '200', '300'] 164 | select2.options = ['100', '200', '300'] 165 | 166 | # Set select2 value to '100' 167 | select2.value = '100' 168 | 169 | # Attach the callback to the 'value' property of select1 170 | select1.on_change('value', callback) 171 | 172 | # Create layout and add to current document 173 | layout = widgetbox(select1, select2) 174 | curdoc().add_root(layout) 175 | 176 | 177 | # ''''''''''Basic button widget '''''''''# 178 | 179 | # Create a Button with label 'Update Data' 180 | button = Button(label='Update Data') 181 | 182 | # Define an update callback with no arguments: update 183 | def update(): 184 | 185 | # Compute new y values: y 186 | y = np.sin(x) + np.random.random(N) 187 | 188 | # Update the ColumnDataSource data dictionary 189 | source.data = {'x': x, 'y': y} 190 | 191 | # Add the update callback to the button 192 | button.on_click(update) 193 | 194 | # Create layout and add to current document 195 | layout = column(widgetbox(button), plot) 196 | curdoc().add_root(layout) 197 | 198 | 199 | # ''''''' Button Styles '''''''# 200 | 201 | # Import CheckboxGroup, RadioGroup, Toggle from bokeh.models 202 | 203 | # Add a Toggle: toggle 204 | toggle = Toggle(button_type='success', label='Toggle button') 205 | 206 | # Add a CheckboxGroup: checkbox 207 | checkbox = CheckboxGroup(labels=['Option 1', 'Option 2', 'Option 3']) 208 | 209 | # Add a RadioGroup: radio 210 | radio = RadioGroup(labels=['Option 1', 'Option 2', 'Option 3']) 211 | 212 | # Add widgetbox(toggle, checkbox, radio) to the current document 213 | curdoc().add_root(widgetbox(toggle, checkbox, radio)) -------------------------------------------------------------------------------- /DatVis_Bokeh_Layout-Int-Annot_2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed May 17 16:30:01 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | from bokeh.plotting import figure 9 | from bokeh.io import output_file, show 10 | from bokeh.plotting import ColumnDataSource 11 | from bokeh.models import HoverTool 12 | from bokeh.layouts import gridplot 13 | from bokeh.models.widgets import Panel 14 | from bokeh.models.widgets import Tabs 15 | from bokeh.layouts import row, column 16 | 17 | # Create a ColumnDataSource from df: source 18 | source = ColumnDataSource(df) 19 | 20 | # '''''' Creating Rows of Plots 21 | 22 | # Create the first figure: p1 23 | p1 = figure(x_axis_label='fertility (children per woman)', 24 | y_axis_label='female_literacy (% population)') 25 | 26 | # Add a circle glyph to p1 27 | p1.circle('fertility', 'female_literacy', source=source) 28 | 29 | # Create the second figure: p2 30 | p2 = figure(x_axis_label='population', 31 | y_axis_label='female_literacy (% population)') 32 | 33 | # Add a circle glyph to p2 34 | p2.circle('population', 'female_literacy', source=source) 35 | 36 | # Put p1 and p2 into a horizontal row: layout 37 | layout = row(p1, p2) 38 | 39 | # Specify the name of the output_file and show 
the result 40 | output_file('fert_row.html') 41 | show(layout) 42 | 43 | # '''''''''''''' Column Plots in Bokeh ''''''# 44 | 45 | # Create a blank figure: p1 46 | p1 = figure(x_axis_label='fertility (children per woman)', 47 | y_axis_label='female_literacy (% population)') 48 | 49 | # Add circle scatter to the figure p1 50 | p1.circle('fertility', 'female_literacy', source=source) 51 | 52 | # Create a new blank figure: p2 53 | p2 = figure(x_axis_label='population', 54 | y_axis_label='female_literacy (% population)') 55 | 56 | # Add circle scatter to the figure p2 57 | p2.circle('population', 'female_literacy', source=source) 58 | 59 | # Put plots p1 and p2 in a column: layout 60 | layout = column(p1, p2) 61 | 62 | # Specify the name of the output_file and show the result 63 | output_file('fert_column.html') 64 | show(layout) 65 | 66 | # ''''''' Nesting Rows & Columns of Plots '''''''# 67 | 68 | # Make a column layout that will be used as the second row: row2 69 | row2 = column([mpg_hp, mpg_weight], sizing_mode='scale_width') 70 | 71 | # Make a row layout that includes the above column layout: layout 72 | layout = row([avg_mpg, row2], sizing_mode='scale_width') 73 | 74 | # Specify the name of the output_file and show the result 75 | output_file('layout_custom.html') 76 | show(layout) 77 | 78 | # '''''Gridded Layouts ''''''''# 79 | 80 | # Create a list containing plots p1 and p2: row1 81 | row1 = [p1, p2] 82 | 83 | # Create a list containing plots p3 and p4: row2 84 | row2 = [p3, p4] 85 | 86 | # Create a gridplot using row1 and row2: layout 87 | layout = gridplot([row1, row2]) 88 | 89 | # Specify the name of the output_file and show the result 90 | output_file('grid.html') 91 | show(layout) 92 | 93 | # ''''''Start Tabbed Layouts ''''#1 Create Panels 94 | 95 | # Create tab1 from plot p1: tab1 96 | tab1 = Panel(child=p1, title='Latin America') 97 | 98 | # Create tab2 from plot p2: tab2 99 | tab2 = Panel(child=p2, title='Africa') 100 | 101 | # Create tab3 from plot p3: tab3 102 | tab3 = Panel(child=p3, title='Asia') 103 | 104 | # Create tab4 from plot p4: tab4 105 | tab4 = Panel(child=p4, title='Europe') 106 | 107 | 108 | # ''''''''''''' Display the tabbed layouts '''''''''''# 109 | 110 | # Create a Tabs layout: layout 111 | layout = Tabs(tabs=[tab1, tab2, tab3, tab4]) 112 | 113 | # Specify the name of the output_file and show the result 114 | output_file('tabs.html') 115 | show(layout) 116 | 117 | # '''''''' Linked Axes Plots '''''''# 118 | 119 | # Link the x_range of p2 to p1: p2.x_range 120 | p2.x_range = p1.x_range 121 | 122 | # Link the y_range of p2 to p1: p2.y_range 123 | p2.y_range = p1.y_range 124 | 125 | # Link the x_range of p3 to p1: p3.x_range 126 | p3.x_range = p1.x_range 127 | 128 | # Link the y_range of p4 to p1: p4.y_range 129 | p4.y_range = p1.y_range 130 | 131 | # Specify the name of the output_file and show the result 132 | output_file('linked_range.html') 133 | show(layout) 134 | 135 | # ' Linked brushed data - brushing ''''''''''''''# 136 | 137 | # Create ColumnDataSource: source 138 | source = ColumnDataSource(data) 139 | 140 | # Create the first figure: p1 141 | p1 = figure(x_axis_label='fertility (children per woman)', 142 | y_axis_label='female literacy (% population)', 143 | tools='box_select,lasso_select') 144 | 145 | # Add a circle glyph to p1 146 | _ = p1.circle('fertility', 'female literacy', source=source) 147 | 148 | # Create the second figure: p2 149 | p2 = figure(x_axis_label='fertility (children per woman)', 150 | y_axis_label='population (millions)', 151 | 
tools='box_select,lasso_select') 152 | 153 | # Add a circle glyph to p2 154 | _ = p2.circle('fertility', 'population', source=source) 155 | 156 | # Create row layout of figures p1 and p2: layout 157 | layout = row(p1, p2) 158 | 159 | # Specify the name of the output_file and show the result 160 | output_file('linked_brush.html') 161 | show(layout) 162 | 163 | # ''''''' Creating Legends '''''''''# 164 | 165 | # Add the first circle glyph to the figure p 166 | p.circle('fertility', 'female_literacy', 167 | source=latin_america, size=10, 168 | color='red', legend='Latin America') 169 | 170 | # Add the second circle glyph to the figure p 171 | p.circle('fertility', 'female_literacy', 172 | source=africa, size=10, 173 | color='blue', legend='Africa') 174 | 175 | # Specify the name of the output_file and show the result 176 | output_file('fert_lit_groups.html') 177 | show(p) 178 | 179 | # '''Legend Position and Style '''''''# 180 | 181 | # Assign the legend to the bottom left: p.legend.location 182 | p.legend.location='bottom_left' 183 | 184 | # Fill the legend background with the color 'lightgray': 185 | # p.legend.background_fill_color 186 | p.legend.background_fill_color='lightgray' 187 | 188 | # Specify the name of the output_file and show the result 189 | output_file('fert_lit_groups.html') 190 | show(p) 191 | 192 | # ''''' Add hover tooltip to plot '''''''# 193 | 194 | # Create a HoverTool object: hover 195 | hover = HoverTool(tooltips=[('Country','@Country')]) 196 | 197 | # Add the HoverTool object to figure p 198 | p.add_tools(hover) 199 | 200 | # Specify the name of the output_file and show the result 201 | output_file('hover.html') 202 | show(p) -------------------------------------------------------------------------------- /DatVis_Images.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 1 19:30:44 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | 11 | # Load the image into an array: img 12 | img = plt.imread('480px-Astronaut-EVA.jpg') 13 | 14 | # Print the shape of the image 15 | print(img.shape) 16 | 17 | # Display the image 18 | plt.imshow(img) 19 | 20 | # Hide the axes 21 | plt.axis('off') 22 | plt.show() 23 | 24 | # ''''''''''' Pseudocolor Plot from Image Data ''''''''''''# 25 | 26 | # Load the image into an array: img 27 | img = plt.imread('480px-Astronaut-EVA.jpg') 28 | 29 | # Print the shape of the image 30 | print(img.shape) 31 | 32 | # Compute the sum of the red, green and blue channels: intensity 33 | intensity = img.sum(axis=2) 34 | 35 | # Print the shape of the intensity 36 | print(intensity.shape) 37 | 38 | # Display the intensity with a colormap of 'gray' 39 | plt.imshow(intensity, cmap='gray') 40 | 41 | # Add a colorbar 42 | plt.colorbar() 43 | 44 | # Hide the axes and show the figure 45 | plt.axis('off') 46 | plt.show() 47 | 48 | # # '''''''''''''Specifying Extents and Aspect Ratio '''''# 49 | 50 | # Load the image into an array: img 51 | img = plt.imread('480px-Astronaut-EVA.jpg') 52 | 53 | # Specify the extent and aspect ratio of the top left subplot 54 | plt.subplot(2, 2, 1) 55 | plt.title('extent=(-1,1,-1,1),\naspect=0.5') 56 | plt.xticks([-1, 0, 1]) 57 | plt.yticks([-1, 0, 1]) 58 | plt.imshow(img, extent=(-1, 1, -1, 1), aspect=0.5) 59 | 60 | # Specify the extent and aspect ratio of the top right subplot 61 | plt.subplot(2, 2, 2) 62 | plt.title('extent=(-1,1,-1,1),\naspect=1') 63 | plt.xticks([-1, 0, 1]) 64 | 
plt.yticks([-1, 0, 1]) 65 | plt.imshow(img, extent=(-1, 1, -1, 1), aspect=1) 66 | 67 | # Specify the extent and aspect ratio of the bottom left subplot 68 | plt.subplot(2, 2, 3) 69 | plt.title('extent=(-1,1,-1,1),\naspect=2') 70 | plt.xticks([-1, 0, 1]) 71 | plt.yticks([-1, 0, 1]) 72 | plt.imshow(img, extent=(-1, 1, -1, 1), aspect=2) 73 | 74 | # Specify the extent and aspect ratio of the bottom right subplot 75 | plt.subplot(2, 2, 4) 76 | plt.title('extent=(-2,2,-1,1),\naspect=2') 77 | plt.xticks([-2, -1, 0, 1, 2]) 78 | plt.yticks([-1, 0, 1]) 79 | plt.imshow(img, extent=(-2, 2, -1, 1), aspect=2) 80 | 81 | # Improve spacing and display the figure 82 | plt.tight_layout() 83 | plt.show() 84 | 85 | # '''''' Rescale Pixel Intensities '''''''''''''# 86 | 87 | # Load the image into an array: image 88 | image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg') 89 | 90 | # Extract minimum and maximum values from the image: pmin, pmax 91 | pmin, pmax = image.min(), image.max() 92 | print("The smallest & largest pixel intensities are %d & %d." % (pmin, pmax)) 93 | 94 | # Rescale the pixels: rescaled_image 95 | rescaled_image = 256*(image - pmin) / (pmax - pmin) 96 | print("The rescaled smallest & largest pixel intensities are %.1f & %.1f." % 97 | (rescaled_image.min(), rescaled_image.max())) 98 | 99 | # Display the original image in the top subplot 100 | plt.subplot(2, 1, 1) 101 | plt.title('original image') 102 | plt.axis('off') 103 | plt.imshow(image, extent=(-2, 2, -1, 1), aspect=2) 104 | 105 | # Display the rescaled image in the bottom subplot 106 | plt.subplot(2, 1, 2) 107 | plt.title('rescaled image') 108 | plt.axis('off') 109 | plt.imshow(rescaled_image, extent=(-2, 2, -1, 1), aspect=2) 110 | 111 | plt.show() -------------------------------------------------------------------------------- /Dat_Clean_Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 7 02:30:02 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # ''Load and View Data ''''''''''# 9 | 10 | # Import pandas 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | # Read the file into a DataFrame: df 16 | # df = pd.read_csv('your_file.csv') . This might also mean filepath. 17 | 18 | df = pd.read_csv('fixations.csv') 19 | df2 = pd.read_csv('flightdata.csv') 20 | 21 | # Print the head of df 22 | print(df.head()) 23 | 24 | # Print the tail of df 25 | print(df.tail()) 26 | 27 | print('AERO DATA OUTPUT') 28 | 29 | 30 | print(df2.head()) 31 | 32 | print(df2.tail()) 33 | 34 | # Print the shape of df 35 | print(df.shape) 36 | 37 | print(df2.shape) 38 | 39 | # Print the columns of df 40 | print(df.columns) 41 | 42 | print(df2.columns) 43 | 44 | # Print the head and tail of df_subset 45 | # print(df.subset.head()) 46 | # print(df.subset.tail()) 47 | 48 | # Print the info of df 49 | print(df.info()) 50 | 51 | print(df2.info()) 52 | 53 | # Print the info of df_subset 54 | # print(df.subset.info()) 55 | 56 | 57 | # '''''''' Frequency counts for Categorical Data 58 | # note that dataframe titles here are actually for 59 | # continuous data. These are simply placeholders. 
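
# A minimal sketch, assuming df as loaded above: value_counts() is most
# informative on truly categorical columns, while a continuous column like
# 'duration' mostly yields counts of unique floats. One option is to bin
# the continuous values with pd.cut first and count observations per bin.

duration_bins = pd.cut(df['duration'], bins=5)
print(duration_bins.value_counts(dropna=False))
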
60 | 61 | # Print the value counts for 'your category - i.e.column titles'' 62 | print(df['duration'].value_counts(dropna=False)) 63 | 64 | print(df['duration'].shape) 65 | 66 | # Print the value_counts for 'next_category' 67 | print(df['confidence'].value_counts(dropna=False)) 68 | 69 | print(df['confidence'].shape) 70 | 71 | # Print the value counts for 'and_another' 72 | print(df['avg_pupil_size'].value_counts(dropna=False)) 73 | 74 | 75 | # ''''''''''' Single Variable Histogram plot ''''''''# 76 | 77 | # Plot the histogram 78 | df['duration'].plot(kind='hist', rot=70, logx=True, logy=True) 79 | 80 | # Display the histogram 81 | plt.show() 82 | 83 | # ''''' Multi Variable Box Plot Visualisation '''''''# 84 | 85 | # Import necessary modules (see top of script) 86 | # doesn't necessarily have to be at the top of the script 87 | # but Spyder likes it this way and it looks 88 | # good too. 89 | 90 | # you want to create the boxplot? 91 | df.boxplot(column='duration', by='avg_pupil_size', rot=90) 92 | 93 | # Display the plot 94 | plt.show() 95 | 96 | # ''''''''''' Multiple variable scatter plot visualisation''''# 97 | 98 | # Import necessary modules -moved to top 99 | # import pandas as pd - at top 100 | # import matplotlib.pyplot as plt - at top 101 | 102 | # Create and display the first scatter plot 103 | df.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70) 104 | plt.show() 105 | 106 | # Create and display the second scatter plot 107 | df_subset.plot(kind='scatter', x='initial_cost', y='total_est_fee', rot=70) 108 | plt.show() 109 | 110 | -------------------------------------------------------------------------------- /Dat_Read_Plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 5 18:57:00 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import os 9 | import glob 10 | import pandas as pd 11 | import mayavi 12 | import seaborn as sns 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | from scipy import stats 16 | from mayavi import mlab 17 | import multiprocessing 18 | import plotly.plotly as py 19 | import plotly.graph_objs as go 20 | from plotly.graph_objs import Surface 21 | 22 | 23 | path = 'C:\\Users\\Shabaka\Desktop\\Test2 DJI_Corretti' 24 | all_files = glob.glob(os.path.join(path, "*Temperatura_Media.csv")) 25 | 26 | df_from_each_file = pd.read_csv(all_files) 27 | conc_df = pd.concat(df_from_each_file, ignore_index=True) 28 | 29 | print(conc_df.head()) 30 | -------------------------------------------------------------------------------- /DataChunkFunc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 1 13:19:38 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | 11 | # Define plot_pop() 12 | def plot_pop(filename, country_code): 13 | 14 | # Initialize reader object: urb_pop_reader 15 | urb_pop_reader = pd.read_csv(filename, chunksize=1000) 16 | 17 | # Initialize empty dataframe: data 18 | data = pd.DataFrame() 19 | 20 | # Iterate over each dataframe chunk 21 | for df_urb_pop in urb_pop_reader: 22 | # Check out specific country: df_pop_ceb 23 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code] 24 | 25 | # Zip dataframe columns of interest: pops 26 | pops = zip(df_pop_ceb['Total Population'], 27 | df_pop_ceb['Urban population (% of total)']) 28 | 29 | # Turn zip object into list: pops_list 30 | 
pops_list = list(pops)
31 |
32 |         # Use list comprehension to create new
33 |         # dataframe column 'Total Urban Population'
34 |         df_pop_ceb['Total Urban Population'] = \
35 |             [int(tup[0] * tup[1]) for tup in pops_list]
36 |
37 |         # Append dataframe chunk to data: data
38 |         data = data.append(df_pop_ceb)
39 |
40 |     # Plot urban population data
41 |     data.plot(kind='scatter', x='Year', y='Total Urban Population')
42 |     plt.show()
--------------------------------------------------------------------------------
/DataClean_GS_Analysis5.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Apr 11 23:54:33 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 | import matplotlib.pyplot as plt
11 |
12 |
13 |
14 | # Load csv files, or scan a file directory for a specific file name
15 |
16 |
17 | # '''''''Basic EDA Instructions to verify Data
18 |
19 |
20 | # ''''''' Carry out some basic Visualisations ''''''''''''#
21 |
22 | # Import matplotlib.pyplot
23 | # import matplotlib.pyplot as plt
24 |
25 | # Create the scatter plot
26 | g1800s.plot(kind='scatter', x='1800', y='1899')
27 |
28 | # Specify axis labels
29 | plt.xlabel('Life Expectancy by Country in 1800')
30 | plt.ylabel('Life Expectancy by Country in 1899')
31 |
32 | # Specify axis limits
33 | plt.xlim(20, 55)
34 | plt.ylim(20, 55)
35 |
36 | # Display the plot
37 | plt.show()
38 |
39 | # Think - Question at hand ''''#
40 |
41 | def check_null_or_valid(row_data):
42 |     """Function that takes a row of data,
43 |     drops all missing values,
44 |     and checks if all remaining values are greater than or equal to 0
45 |     """
46 |     no_na = row_data.dropna()[1:-1]
47 |     numeric = pd.to_numeric(no_na)
48 |     ge0 = numeric >= 0
49 |     return ge0
50 |
51 | # Check whether the first column is 'Life expectancy'
52 | assert g1800s.columns[0] == 'Life expectancy'
53 |
54 | # Check whether the values in the row are valid
55 | assert g1800s.iloc[:, 1:].apply(check_null_or_valid, axis=1).all().all()
56 |
57 | # Check that there is only one instance of each country
58 | assert g1800s['Life expectancy'].value_counts()[0] == 1
59 |
60 |
61 | # ''''''''''' Assemble the Data '''''''''''''#
62 |
63 | # Concatenate the DataFrames row-wise
64 | gapminder = pd.concat([g1800s, g1900s, g2000s])
65 |
66 | # Print the shape of gapminder
67 | print(gapminder.shape)
68 |
69 | # Print the head of gapminder
70 | print(gapminder.head())
71 |
72 |
73 | # ''''Reshape the data to aid easier analysis (if required)''''#
74 |
75 | # Melt gapminder: gapminder_melt
76 | gapminder_melt = pd.melt(gapminder, id_vars='Life expectancy')
77 |
78 | # Rename the columns
79 | gapminder_melt.columns = ['country', 'year', 'life_expectancy']
80 |
81 | # Print the head of gapminder_melt
82 | print(gapminder_melt.head())
83 |
84 | # '''''''''''Check the data types in the dataset ''''''''#
85 |
86 | # Convert the year column to numeric
87 | gapminder.year = pd.to_numeric(gapminder['year'])
88 |
89 | # Test if country is of type object
90 | assert gapminder.country.dtypes == np.object
91 |
92 | # Test if year is of type int64
93 | assert gapminder.year.dtypes == np.int64
94 |
95 | # Test if life_expectancy is of type float64
96 | assert gapminder.life_expectancy.dtypes == np.float64
97 |
98 | # '''''''''''''''''Ex. Country Spellings to Check for Correctness ''''#
99 |
100 | # Create the series of countries: countries
101 | countries = gapminder['country']
102 |
103 | # Drop all the duplicates from countries
104 | countries = countries.drop_duplicates()
105 |
106 | # Write the regular expression: pattern
107 | pattern = r'^[A-Za-z\.\s]*$'
108 |
109 | # Create the Boolean vector: mask
110 | mask = countries.str.contains(pattern)
111 |
112 | # Invert the mask: mask_inverse
113 | mask_inverse = ~mask
114 |
115 | # Subset countries using mask_inverse: invalid_countries
116 | invalid_countries = countries.loc[mask_inverse]
117 |
118 | # Print invalid_countries
119 | print(invalid_countries)
120 |
121 | # '''''''' More Cleaning Ex.''''''''''#
122 |
123 | # Assert that country does not contain any missing values
124 | assert pd.notnull(gapminder.country).all()
125 |
126 | # Assert that year does not contain any missing values
127 | assert pd.notnull(gapminder.year).all()
128 |
129 | # Drop the missing values
130 | gapminder = gapminder.dropna(how='any')
131 |
132 | # Print the shape of gapminder
133 | print(gapminder.shape)
134 |
135 | # Add first subplot
136 | plt.subplot(2, 1, 1)
137 |
138 | # Create a histogram of life_expectancy
139 | gapminder.life_expectancy.plot(kind='hist')
140 |
141 | # Group gapminder: gapminder_agg
142 | gapminder_agg = gapminder.groupby('year')['life_expectancy'].mean()
143 |
144 | # Print the head of gapminder_agg
145 | print(gapminder_agg.head())
146 |
147 | # Print the tail of gapminder_agg
148 | print(gapminder_agg.tail())
149 |
150 | # Add second subplot
151 | plt.subplot(2, 1, 2)
152 |
153 |
154 | # ''''''''' Wrap up with visualisation of cleaned data set'''' Eg.'''#
155 | # Create a line plot of life expectancy per year
156 | gapminder_agg.plot()
157 |
158 | # Add title and specify axis labels
159 | plt.title('Life expectancy over the years')
160 | plt.ylabel('Life expectancy')
161 | plt.xlabel('Year')
162 |
163 | # Display the plots
164 | plt.tight_layout()
165 | plt.show()
166 |
167 | # Save both DataFrames to csv files
168 | gapminder.to_csv('gapminder.csv')
169 | gapminder_agg.to_csv('gapminder_agg.csv')
--------------------------------------------------------------------------------
/DataCombine_Analysis3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 10 14:57:36 2017
4 |
5 | @author: Shabaka
6 | """
7 | import pandas as pd
8 | import matplotlib.pyplot as plt
9 | import glob
10 |
11 |
12 |
13 | # ''''Combining Rows of Data ''''''''''''#
14 |
15 | # Concatenate uber1, uber2, and uber3: row_concat
16 | row_concat = pd.concat([uber1, uber2, uber3])
17 |
18 | # Print the shape of row_concat
19 | print(row_concat.shape)
20 |
21 | # Print the head of row_concat
22 | print(row_concat.head())
23 |
24 | # '''''''''''' Combining Columns of Data '''''''''''#
25 |
26 | # Concatenate ebola_melt and status_country column-wise: ebola_tidy
27 | ebola_tidy = pd.concat([ebola_melt, status_country], axis=1)
28 |
29 | # Print the shape of ebola_tidy
30 | print(ebola_tidy.shape)
31 |
32 | # Print the head of ebola_tidy
33 | print(ebola_tidy.head())
34 |
35 |
36 | # '''Find Files that match a Pattern '''''''' #
37 |
38 | # Import necessary modules (glob is imported at the top)
39 |
40 | # Write the pattern: pattern
41 | pattern = '*.csv'
42 |
43 | # Save all file matches: csv_files
44 | csv_files = glob.glob(pattern)
45 |
46 | # Print the file names
47 | print(csv_files)
48 |
49 | # Load the second file into a DataFrame: csv2
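# Note: glob.glob() returns matches in arbitrary OS order, so csv_files[1]
# below is not guaranteed to be the same file on every run; sorting first,
# e.g. csv_files = sorted(csv_files), makes the indexing reproducible.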
50 | csv2 = pd.read_csv(csv_files[1]) 51 | 52 | # Print the head of csv2 53 | print(csv2.head()) 54 | 55 | # '''''''''''Iterate and Concatenate all Matches ''''''# 56 | 57 | # Create an empty list: frames 58 | frames = [] 59 | 60 | # Iterate over csv_files 61 | for csv in csv_files: 62 | 63 | # Read csv into a DataFrame: df 64 | df = pd.read_csv(csv) 65 | 66 | # Append df to frames 67 | frames.append(df) 68 | 69 | # Concatenate frames into a single DataFrame: uber 70 | uber = pd.concat(frames) 71 | 72 | # Print the shape of uber 73 | print(uber.shape) 74 | 75 | # Print the head of uber 76 | print(uber.head()) 77 | 78 | 79 | # ''''''One to - One Data Merge '# 80 | 81 | # Merge the DataFrames: o2o 82 | o2o = pd.merge(left=site, right=visited, left_on='name', right_on='site') 83 | 84 | # Print o2o 85 | print(o2o) 86 | 87 | # '''''''MAny to One Data MErge ''''# 88 | 89 | # Merge the DataFrames: m2o 90 | m2o = pd.merge(left=site, right=visited, left_on='name', right_on='site') 91 | 92 | # Print m2o 93 | print(m2o) 94 | 95 | # ''''''''''Many To Many Data Merge ''''''''''''# 96 | 97 | # Merge site and visited: m2m 98 | m2m = pd.merge(left=site, right = visited, left_on='name', right_on='site') 99 | 100 | # Merge m2m and survey: m2m 101 | m2m = pd.merge(left=m2m, right=survey, left_on='ident', right_on='taken') 102 | 103 | # Print the first 20 lines of m2m 104 | print(m2m.head(20)) 105 | -------------------------------------------------------------------------------- /DataFrame_Lambda_Filter_Read.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 9 01:21:36 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Select retweets from the Twitter dataframe: result 9 | result = filter(lambda x:x[0:2] == 'RT', tweets_df['text']) 10 | 11 | # Create list from filter object result: res_list 12 | res_list = list(result) 13 | 14 | # Print all retweets in res_list 15 | for tweet in res_list: 16 | print(tweet) -------------------------------------------------------------------------------- /DataTidy_Analysis2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 10 10:02:49 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | 13 | # ' Data Reshape wit melt ''# 14 | 15 | # Print the head of airquality 16 | print(airquality.head()) 17 | 18 | # Melt airquality: airquality_melt 19 | airquality_melt = pd.melt(airquality, id_vars=['Month', 'Day']) 20 | 21 | # Print the head of airquality_melt 22 | print(airquality_melt.head()) 23 | 24 | # ''''Customise melted Data - Change var name & Val'''# 25 | 26 | # Print the head of airquality 27 | print(airquality.head()) 28 | 29 | # Melt airquality: airquality_melt 30 | airquality_melt = pd.melt(airquality, id_vars=['Month', 'Day'], 31 | var_name='measurement', value_name='reading') 32 | 33 | # Print the head of airquality_melt 34 | print(airquality_melt.head()) 35 | 36 | 37 | #''' Pivoting Data''' from melt'''''''''# 38 | 39 | # Print the head of airquality_melt 40 | print(airquality_melt.head()) 41 | 42 | # Pivot airquality_melt: airquality_pivot 43 | airquality_pivot = airquality_melt.pivot_table(index=['Month', 'Day'], columns='measurement', values='reading') 44 | 45 | # Print the head of airquality_pivot 46 | print(airquality_pivot.head()) 47 | 48 | #''''''''''''''''Reset data frame index''''''''''''# 49 | 50 | # 
Print the index of airquality_pivot 51 | print(airquality_pivot.index) 52 | 53 | # Reset the index of airquality_pivot: airquality_pivot 54 | airquality_pivot = airquality_pivot.reset_index() 55 | 56 | # Print the new index of airquality_pivot 57 | print(airquality_pivot.index) 58 | 59 | # Print the head of airquality_pivot 60 | print(airquality_pivot.head()) 61 | 62 | # ''''''' Pivoting Duplicate Values ''''''''''# 63 | 64 | # Pivot airquality_dup: airquality_pivot 65 | airquality_pivot = airquality_dup.pivot_table(index=['Month', 'Day'], 66 | columns='measurement', 67 | values='reading', aggfunc=np.mean) 68 | 69 | # Reset the index of airquality_pivot 70 | airquality_pivot = airquality_pivot.reset_index() 71 | 72 | # Print the head of airquality_pivot 73 | print(airquality_pivot.head()) 74 | 75 | # Print the head of airquality 76 | print(airquality.head()) 77 | 78 | # '''''''''Split column infor using str '''''# 79 | 80 | # Melt tb: tb_melt 81 | tb_melt = pd.melt(frame=tb, id_vars=['country', 'year']) 82 | 83 | # Create the 'gender' column 84 | tb_melt['gender'] = tb_melt.variable.str[0] 85 | 86 | # Create the 'age_group' column 87 | tb_melt['age_group'] = tb_melt.variable.str[1:] 88 | 89 | # '''''' Split a column with .split() and .get() 90 | 91 | # Melt ebola: ebola_melt 92 | ebola_melt = pd.melt(ebola, id_vars=['Date', 'Day'], var_name='type_country', value_name='counts') 93 | 94 | # Create the 'str_split' column 95 | ebola_melt['str_split'] = ebola_melt.type_country.str.split('_') 96 | 97 | # Create the 'type' column 98 | ebola_melt['type'] = ebola_melt.str_split.str.get(0) 99 | 100 | # Create the 'country' column 101 | ebola_melt['country'] = ebola_melt.str_split.str.get(1) 102 | 103 | # Print the head of ebola_melt 104 | print(ebola_melt.head()) 105 | 106 | # ''''Combining Rows of Data ''''''''''''# 107 | 108 | # Concatenate uber1, uber2, and uber3: row_concat 109 | row_concat = pd.concat([uber1, uber2, uber3]) 110 | 111 | # Print the shape of row_concat 112 | print(row_concat.shape) 113 | 114 | # Print the head of row_concat 115 | print(row_concat.head()) 116 | 117 | #'''''''''''' cOMBINING cOLUMNS OF dATA'''''''''''# 118 | 119 | # Concatenate ebola_melt and status_country column-wise: ebola_tidy 120 | ebola_tidy = pd.concat([ebola_melt, status_country], axis=1) 121 | 122 | # Print the shape of ebola_tidy 123 | print(ebola_tidy.shape) 124 | 125 | # Print the head of ebola_tidy 126 | print(ebola_tidy.head()) 127 | -------------------------------------------------------------------------------- /DataTypes_Analysis4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 10 17:29:54 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | import re 12 | 13 | 14 | # Convert the sex column to type 'category' 15 | tips.sex = tips.sex.astype('category') 16 | 17 | # Convert the smoker column to type 'category' 18 | tips.smoker = tips.smoker.astype('category') 19 | 20 | # Print the info of tips 21 | print(tips.info()) 22 | 23 | # '''''Working with Numeric Data - Wrong data types ''''# 24 | 25 | # Convert 'total_bill' to a numeric dtype 26 | tips['total_bill'] = pd.to_numeric(tips['total_bill'], errors='coerce') 27 | 28 | # Convert 'tip' to a numeric dtype 29 | tips['tip'] = pd.to_numeric(tips['tip'], errors='coerce') 30 | 31 | # Print the info of tips 32 | print(tips.info()) 33 | 34 | 35 | # '''' String Parsing with regular 
expression '''# 36 | 37 | # Import the regular expression module 38 | 39 | # Compile the pattern: prog 40 | prog = re.compile('\d{3}-\d{3}-\d{4}') 41 | 42 | # See if the pattern matches 43 | result = prog.match('123-456-7890') 44 | print(bool(result)) 45 | 46 | # See if the pattern matches 47 | result = prog.match('1123-456-7890') 48 | print(bool(result)) 49 | 50 | # ''''''' Find Numeric in sstring '''''''' # 51 | 52 | # Find the numeric values: matches 53 | matches = re.findall('\d+', 'the recipe requires 10 strawberries and 1 banana') 54 | 55 | # Print the matches 56 | print(matches) 57 | 58 | 59 | # ''''' paTTERN maTCHING '''''## 60 | 61 | # Write the first pattern 62 | print(bool(re.match(pattern='\d{3}-\d{3}-\d{4}', string='123-456-7890'))) 63 | 64 | # Write the second pattern 65 | print(bool(re.match(pattern='\$\d*\.\d{2}', string='$123.45'))) 66 | 67 | # Write the third pattern 68 | print(bool(re.match(pattern='[A-Z]\w*', string='Australia'))) 69 | 70 | # '''''''''######## ''''''''''''''''' ##########'''''''''''''''''''# 71 | 72 | # '''''Custom Fxn to clean data in column ( dataframe)''''''''# 73 | 74 | # Define recode_sex() 75 | 76 | 77 | def recode_sex(sex_value): 78 | 79 | # Return 1 if sex_value is 'Male' 80 | if sex_value == 'Male': 81 | return 1 82 | 83 | # Return 0 if sex_value is 'Female' 84 | elif sex_value == 'Female': 85 | return 0 86 | 87 | # Return np.nan 88 | else: 89 | 90 | return np.nan 91 | 92 | 93 | # Apply the function to the sex column 94 | tips['sex_recode'] = tips.sex.apply(recode_sex) 95 | 96 | 97 | #''' Lambda Functions ''''''# 98 | 99 | # Write the lambda function using replace 100 | tips['total_dollar_replace'] = tips.total_dollar.apply(lambda x: x.replace('$', '')) 101 | 102 | # Write the lambda function using regular expressions 103 | tips['total_dollar_re'] = tips.total_dollar.apply(lambda x: re.findall('\d+\.\d+', x)) 104 | 105 | # Print the head of tips 106 | print(tips.head()) 107 | 108 | # '''''''Dropping DUplicate Data '''''''''''''# 109 | 110 | # Create the new DataFrame: tracks 111 | tracks = billboard[['year', 'artist', 'track', 'time']] 112 | 113 | # Print info of tracks 114 | print(tracks.info()) 115 | 116 | # Drop the duplicates: tracks_no_duplicates 117 | tracks_no_duplicates = tracks.drop_duplicates() 118 | 119 | # Print info of tracks 120 | print(tracks_no_duplicates.info()) 121 | 122 | # '''''''''''''''' Fill in MIssing Data ''''''''' # 123 | 124 | # Calculate the mean of the Ozone column: oz_mean 125 | oz_mean = np.mean(airquality.Ozone) 126 | 127 | # Replace all the missing values in the Ozone column with the mean 128 | airquality['Ozone'] = airquality['Ozone'].fillna(oz_mean) 129 | 130 | # Print the info of airquality 131 | print(airquality.info()) 132 | 133 | # ''''''''''''''' Data Test with Assert Statements ''''''# 134 | 135 | # Assert that there are no missing values 136 | assert pd.notnull(ebola).all().all() 137 | 138 | # Assert that all values are >= 0 139 | assert (ebola >= 0).all().all() 140 | 141 | # assert pd.notnull(ebola >= 0).all().all() 142 | 143 | 144 | -------------------------------------------------------------------------------- /DataXplore_Analysis1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 7 02:30:02 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # ''Load and View Data ''''''''''# 9 | 10 | # Import pandas 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | # Read the file into a DataFrame: df 
16 | # df = pd.read_csv('dob_job_application_filings_subset.csv') 17 | 18 | df = pd.read_csv('fixations.csv') 19 | df2 = pd.read_csv('aerodata.csv') 20 | 21 | # Print the head of df 22 | print(df.head()) 23 | 24 | # Print the tail of df 25 | print(df.tail()) 26 | 27 | print('AERO DATA OUTPUT') 28 | 29 | 30 | print(df2.head()) 31 | 32 | print(df2.tail()) 33 | 34 | # Print the shape of df 35 | print(df.shape) 36 | 37 | print(df2.shape) 38 | 39 | # Print the columns of df 40 | print(df.columns) 41 | 42 | print(df2.columns) 43 | 44 | # Print the head and tail of df_subset 45 | # print(df.subset.head()) 46 | # print(df.subset.tail()) 47 | 48 | # Print the info of df 49 | print(df.info()) 50 | 51 | print(df2.info()) 52 | 53 | # Print the info of df_subset 54 | # print(df.subset.info()) 55 | 56 | 57 | # '''''''' Frequency counts for Categorical Data 58 | 59 | # Print the value counts for 'Borough' 60 | print(df['duration'].value_counts(dropna=False)) 61 | 62 | print(df['duration'].shape) 63 | 64 | # Print the value_counts for 'State' 65 | print(df['confidence'].value_counts(dropna=False)) 66 | 67 | print(df['confidence'].shape) 68 | 69 | # Print the value counts for 'Site Fill' 70 | print(df['avg_pupil_size'].value_counts(dropna=False)) 71 | 72 | # ''''''''''' Single Variable Histogram plot ''''''''# 73 | 74 | # Plot the histogram 75 | df['duration'].plot(kind='hist', rot=70, logx=True, logy=True) 76 | 77 | # Display the histogram 78 | plt.show() 79 | 80 | # ''''' Multi Variable Box Plot Visualisation '''''''# 81 | 82 | # Import necessary modules 83 | 84 | # Create the boxplot 85 | df.boxplot(column='duration', by='avg_pupil_size', rot=90) 86 | 87 | # Display the plot 88 | plt.show() 89 | 90 | # ''''''''''' Multiple variable scatter plot visualisation''''# 91 | 92 | # Import necessary modules 93 | # import pandas as pd 94 | # import matplotlib.pyplot as plt 95 | 96 | # Create and display the first scatter plot 97 | df.plot(kind='scatter', x='duration', y='avg_pupil_size', rot=70) 98 | plt.show() 99 | 100 | # Create and display the second scatter plot 101 | df_subset.plot(kind='scatter', x='duration', y='confidence', rot=70) 102 | plt.show() 103 | 104 | -------------------------------------------------------------------------------- /Data_Corr_Func.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 6 18:59:20 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Code for correelate function is copied from pupil labs git @ 9 | # https://github.com/pupil-labs/pupil/wiki/Data-Format 10 | 11 | 12 | def correlate_data(data, timestamps): 13 | ''' 14 | data: list of data : 15 | each datum is a dict with at least: 16 | timestamp: float 17 | 18 | timestamps: timestamps list to correlate data to 19 | 20 | this takes a data list and a timestamps list and makes a new list 21 | with the length of the number of timestamps. 22 | Each slot contains a list that will have 0, 1 or more associated 23 | data points. 24 | 25 | Finally we add an index field to the datum with the associated index 26 | ''' 27 | timestamps = list(timestamps) 28 | data_by_frame = [[] for i in timestamps] 29 | 30 | frame_idx = 0 31 | data_index = 0 32 | 33 | data.sort(key=lambda d: d['timestamp']) 34 | 35 | while True: 36 | try: 37 | datum = data[data_index] 38 | # we can take the midpoint between two frames in time: 39 | # More appropriate for SW timestamps 40 | ts = (timestamps[frame_idx]+timestamps[frame_idx+1]) / 2. 
41 |             # or the time of the next frame:
42 |             # More appropriate for Start of Exposure Timestamps (HW timestamps).
43 |             # ts = timestamps[frame_idx+1]
44 |         except IndexError:
45 |             # we might lose a data point at the end but we don't care
46 |             break
47 |
48 |         if datum['timestamp'] <= ts:
49 |             datum['index'] = frame_idx
50 |             data_by_frame[frame_idx].append(datum)
51 |             data_index += 1
52 |         else:
53 |             frame_idx += 1
54 |
55 |     return data_by_frame
--------------------------------------------------------------------------------
/Datchunk_PopPlot.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Apr 1 13:24:43 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 |
11 | # Define plot_pop()
12 |
13 |
14 | def plot_pop(filename, country_code):
15 |
16 |     # Initialize reader object: urb_pop_reader
17 |     urb_pop_reader = pd.read_csv(filename, chunksize=1000)
18 |
19 |     # Initialize empty dataframe: data
20 |     data = pd.DataFrame()
21 |
22 |     # Iterate over each dataframe chunk
23 |     for df_urb_pop in urb_pop_reader:
24 |         # Check out specific country: df_pop_ceb
25 |         df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code]
26 |
27 |         # Zip dataframe columns of interest: pops
28 |         pops = zip(df_pop_ceb['Total Population'],
29 |                    df_pop_ceb['Urban population (% of total)'])
30 |
31 |         # Turn zip object into list: pops_list
32 |         pops_list = list(pops)
33 |
34 |         # Use list comp to create new dataframe column 'Total Urban Population'
35 |         df_pop_ceb['Total Urban Population'] = \
36 |             [int(tup[0] * tup[1]) for tup in pops_list]
37 |
38 |         # Append dataframe chunk to data: data
39 |         data = data.append(df_pop_ceb)
40 |
41 |     # Plot urban population data
42 |     data.plot(kind='scatter', x='Year', y='Total Urban Population')
43 |     plt.show()
44 |
45 | # Set the filename: fn
46 | fn = 'ind_pop_data.csv'
47 |
48 | # Call plot_pop for country code 'CEB'
49 | plot_pop(fn, 'CEB')
50 |
51 | # Call plot_pop for country code 'ARB'
52 | plot_pop(fn, 'ARB')
--------------------------------------------------------------------------------
/Deep_Learning_Basics_1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Apr 21 11:58:38 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 | import matplotlib.pyplot as plt
11 |
12 |
13 | # ''''''''''Coding the Forward Propagation (FP) Algorithm ''''''''#
14 |
15 | weights = {'node_1': np.array([4, -5]), 'node_0': np.array([2, 4]),
16 |            'output': np.array([2, 7])}
17 |
18 | input_data = [3, 5]
19 |
20 | # Calculate node 0 value: node_0_value
21 | node_0_value = (input_data * weights['node_0']).sum()
22 |
23 | # Calculate node 1 value: node_1_value
24 | node_1_value = (input_data * weights['node_1']).sum()
25 |
26 | # Put node values into array: hidden_layer_outputs
27 | hidden_layer_outputs = np.array([node_0_value, node_1_value])
28 |
29 | # Calculate output: output
30 | output = (hidden_layer_outputs * weights['output']).sum()
31 |
32 | # Print output
33 | print(output, 'is the basic FP output from model')
34 |
35 | # ''''''' Apply the Rectified Linear Activation Function '''''''''''''#
36 |
37 | # NOTE: The activation function is very useful for tuning model weights ''#
38 |
39 |
40 | def relu(input):
41 |     '''Define relu activation function here'''
42 |     # Calculate the value for the output of
the relu function: output 43 | output = max(input, 0) 44 | 45 | # Return the value just calculated 46 | return(output) 47 | 48 | # Calculate node 0 value: node_0_output 49 | node_0_input = (input_data * weights['node_0']).sum() 50 | node_0_output = relu(node_0_input) 51 | 52 | # Calculate node 1 value: node_1_output 53 | node_1_input = (input_data * weights['node_1']).sum() 54 | node_1_output = relu(node_1_input) 55 | 56 | # Put node values into array: hidden_layer_outputs 57 | hidden_layer_outputs = np.array([node_0_output, node_1_output]) 58 | 59 | # Calculate model output (do not apply relu) 60 | model_output = (hidden_layer_outputs * weights['output']).sum() 61 | 62 | # Print model output 63 | print(model_output, 'is the FP_ReLU predicted quantity of transactions') 64 | 65 | 66 | # ''''''''''' Apply Network to many observations/rows of data '''''''# 67 | 68 | # Define predict_with_network() 69 | def predict_with_network(input_data_row, weights): 70 | 71 | # Calculate node 0 value 72 | node_0_input = (input_data_row * weights['node_0']).sum() 73 | node_0_output = relu(node_0_input) 74 | 75 | # Calculate node 1 value 76 | node_1_input = (input_data_row * weights['node_1']).sum() 77 | node_1_output = relu(node_1_input) 78 | 79 | # Put node values into array: hidden_layer_outputs 80 | hidden_layer_outputs = np.array([node_0_output, node_1_output]) 81 | 82 | # Calculate model output 83 | input_to_final_layer = (weights['output'] * hidden_layer_outputs).sum() 84 | model_output = relu(input_to_final_layer) 85 | 86 | # Return model output 87 | return(model_output) 88 | 89 | 90 | # Create empty list to store prediction results 91 | results = [] 92 | for input_data_row in input_data: 93 | # Append prediction to results 94 | results.append(predict_with_network(input_data_row, weights)) 95 | 96 | # Print results 97 | print(results) 98 | 99 | 100 | # ''''''''''''' Behaviour of a Multi Layer Neural Network ''''''''# 101 | 102 | def predict_with_network(input_data): 103 | # Calculate node 0 in the first hidden layer 104 | node_0_0_input = (input_data * weights['node_0_0']).sum() 105 | node_0_0_output = relu(node_0_0_input) 106 | 107 | # Calculate node 1 in the first hidden layer 108 | node_0_1_input = (input_data * weights['node_0_1']).sum() 109 | node_0_1_output = relu(node_0_1_input) 110 | 111 | # Put node values into array: hidden_0_outputs 112 | hidden_0_outputs = np.array([node_0_0_output, node_0_1_output]) 113 | 114 | # Calculate node 0 in the second hidden layer 115 | node_1_0_input = (hidden_0_outputs * weights['node_1_0']).sum() 116 | node_1_0_output = relu(node_1_0_input) 117 | 118 | # Calculate node 1 in the second hidden layer 119 | node_1_1_input = (hidden_0_outputs * weights['node_1_1']).sum() 120 | node_1_1_output = relu(node_1_1_input) 121 | 122 | # Put node values into array: hidden_1_outputs 123 | hidden_1_outputs = np.array([node_1_0_output, node_1_1_output]) 124 | 125 | # Calculate model output: model_output 126 | model_output = (weights['output'] * hidden_1_outputs).sum() 127 | 128 | # Return model_output 129 | return(model_output) 130 | 131 | output = predict_with_network(input_data) 132 | print(output) 133 | 134 | 135 | # ''' Calculating Model Errors - Consideration of weight effects''''### 136 | 137 | # '''''''' Test Case - Bank Transactions Predictions '''''''## 138 | 139 | # ''''''' Coding how weight changes affects accuracy ''''#'''''### 140 | -------------------------------------------------------------------------------- /Deep_Learning_KerasModel_Build_3.py: 
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu May 11 18:46:07 2017
4 |
5 | @author: Shabaka
6 | """
7 |
8 | import numpy as np
9 | import pandas as pd
10 | import matplotlib.pyplot as plt
11 | from sklearn.metrics import mean_squared_error
12 | from keras.layers import Dense
13 | from keras.models import Sequential
14 |
15 | # predictors = np.loadtxt('predictors_data.csv', delimiter=',')
16 |
17 | predictors = np.loadtxt('aerodata.csv', delimiter=',')
18 | target = 3  # placeholder only - fitting below really needs an array of targets
19 | # Import necessary modules (sections below also assume: keras.utils.to_categorical, keras.optimizers.SGD, keras.callbacks.EarlyStopping)
20 |
21 | # Save the number of columns in predictors: n_cols
22 | # n_cols = predictors.shape[1]
23 |
24 | # Set up the model: model
25 | # model = Sequential()
26 |
27 | # Add the first layer
28 | # model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
29 |
30 | # Add the second layer
31 | # model.add(Dense(32, activation='relu'))
32 |
33 | # Add the output layer
34 | # model.add(Dense(1))
35 |
36 | # ''''''''' Compile the Model ''''''''''#
37 |
38 | # Specify the model
39 | n_cols = predictors.shape[1]
40 | model = Sequential()
41 | model.add(Dense(50, activation='relu', input_shape=(n_cols,)))
42 | model.add(Dense(32, activation='relu'))
43 | model.add(Dense(1))
44 |
45 | # Compile the model
46 | model.compile(optimizer='adam', loss='mean_squared_error')
47 |
48 | # Verify that model contains information from compiling
49 | print("Loss function: " + model.loss)
50 |
51 | model.fit(predictors, target)
52 |
53 | # ''''''''''Define Classification Model - Titanic dataset example '''#
54 |
55 | # Convert the target to categorical: target (df is the Titanic DataFrame, assumed loaded)
56 | target = to_categorical(df.survived)
57 |
58 | # Set up the model
59 | model = Sequential()
60 |
61 | # Add the first layer
62 | model.add(Dense(32, activation='relu', input_shape=(n_cols,)))
63 |
64 | # Add the output layer
65 | model.add(Dense(2, activation='softmax'))
66 |
67 | # Compile the model
68 | model.compile(optimizer='sgd', loss='categorical_crossentropy',
69 |               metrics=['accuracy'])
70 |
71 | # Fit the model
72 | model.fit(predictors, target)
73 |
74 |
75 | # '''''''''''' Making predictions ''''''''''#
76 |
77 | # Calculate predictions: predictions (pred_data is a held-out predictor array)
78 | predictions = model.predict(pred_data)
79 |
80 | # Calculate predicted probability of survival: predicted_prob_true
81 | predicted_prob_true = predictions[:, 1]
82 |
83 | # print predicted_prob_true
84 | print(predicted_prob_true)
85 |
86 |
87 | # '''''''''' Model Optimisation - (#4)'''''''''''#
88 |
89 |
90 | # Create list of learning rates: lr_to_test
91 | lr_to_test = [.000001, 0.01, 1]
92 |
93 | # Loop over learning rates
94 | for lr in lr_to_test:
95 |     print('\n\nTesting model with learning rate: %f\n' % lr)
96 |
97 |     # Build new model to test, unaffected by previous models (get_new_model() is assumed defined elsewhere)
98 |     model = get_new_model()
99 |
100 |     # Create SGD optimizer with specified learning rate: my_optimizer
101 |     my_optimizer = SGD(lr=lr)
102 |
103 |     # Compile the model
104 |     model.compile(optimizer=my_optimizer, loss='categorical_crossentropy')
105 |
106 |     # Fit the model
107 |     model.fit(predictors, target)
108 |
109 | # ''''''Evaluate model accuracy on validation dataset ''''''#
110 |
111 | # Save the number of columns in predictors: n_cols
112 | n_cols = predictors.shape[1]
113 | input_shape = (n_cols,)
114 |
115 | # Specify the model
116 | model = Sequential()
117 | model.add(Dense(100, activation='relu', input_shape=input_shape))
118 | model.add(Dense(100, activation='relu'))
119 | model.add(Dense(2,
activation='softmax')) 120 | 121 | # Compile the model 122 | model.compile(optimizer='adam', loss='categorical_crossentropy', 123 | metrics=['accuracy']) 124 | 125 | # Fit the model 126 | hist = model.fit(predictors, target, validation_split=0.3) 127 | 128 | 129 | # '''''' Early Stopping - Optimising the optimisation ''''''''''# 130 | 131 | # Import EarlyStopping - already done above 132 | 133 | # Save the number of columns in predictors: n_cols 134 | n_cols = predictors.shape[1] 135 | input_shape = (n_cols,) 136 | 137 | # Specify the model 138 | model = Sequential() 139 | model.add(Dense(100, activation='relu', input_shape=input_shape)) 140 | model.add(Dense(100, activation='relu')) 141 | model.add(Dense(2, activation='softmax')) 142 | 143 | # Compile the model 144 | model.compile(optimizer='adam', loss='categorical_crossentropy', 145 | metrics=['accuracy']) 146 | 147 | # Define early_stopping_monitor 148 | early_stopping_monitor = EarlyStopping(patience=2) 149 | 150 | # Fit the model 151 | model.fit(predictors, target, epochs=30, validation_split=0.3, 152 | callbacks=[early_stopping_monitor]) 153 | 154 | # ''''''''''''' Experimenting with a wider network ''''''# 155 | 156 | # Define early_stopping_monitor 157 | early_stopping_monitor = EarlyStopping(patience=2) 158 | 159 | # Create the new model: model_2 160 | model_2 = Sequential() 161 | 162 | # Add the first and second layers 163 | model_2.add(Dense(100, activation='relu', input_shape=input_shape)) 164 | model_2.add(Dense(100, activation='relu')) 165 | 166 | # Add the output layer 167 | model_2.add(Dense(2, activation='softmax')) 168 | 169 | # Compile model_2 170 | model_2.compile(optimizer='adam', loss='categorical_crossentropy', 171 | metrics=['accuracy']) 172 | 173 | # Fit model_1 174 | model_1_training = model_1.fit(predictors, target, epochs=15, 175 | validation_split=0.2, 176 | callbacks=[early_stopping_monitor], 177 | verbose=False) 178 | 179 | # Fit model_2 180 | model_2_training = model_2.fit(predictors, target, epochs=15, 181 | validation_split=0.2, 182 | callbacks=[early_stopping_monitor], 183 | verbose=False) 184 | 185 | # Create the plot 186 | plt.plot(model_1_training.history['val_loss'], 'r', 187 | model_2_training.history['val_loss'], 'b') 188 | plt.xlabel('Epochs') 189 | plt.ylabel('Validation score') 190 | plt.show() 191 | 192 | -------------------------------------------------------------------------------- /Deep_Learning_KerasModel_Optimise_4.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 15 09:52:23 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from keras.layers import Dense 10 | from keras.models import Sequential 11 | from keras.callbacks import EarlyStopping 12 | 13 | # Import the SGD optimizer 14 | from keras.optimizers import SGD 15 | 16 | # Create list of learning rates: lr_to_test 17 | lr_to_test = [.000001, 0.01, 1] 18 | 19 | # Loop over learning rates 20 | for lr in lr_to_test: 21 | print('\n\nTesting model with learning rate: % f \n'% lr) 22 | 23 | # Build new model to test, unaffected by previous models 24 | model = get_new_model() 25 | 26 | # Create SGD optimizer with specified learning rate: my_optimizer 27 | my_optimizer = SGD(lr=lr) 28 | 29 | # Compile the model 30 | model.compile(optimizer=my_optimizer, loss='categorical_crossentropy') 31 | 32 | # Fit the model 33 | model.fit(predictors, target) 34 | 35 | 36 | # ''''''Evaluate model accuracy on validation dataset 
''''''# 37 | 38 | # Save the number of columns in predictors: n_cols 39 | n_cols = predictors.shape[1] 40 | input_shape = (n_cols,) 41 | 42 | # Specify the model 43 | model = Sequential() 44 | model.add(Dense(100, activation='relu', input_shape=input_shape)) 45 | model.add(Dense(100, activation='relu')) 46 | model.add(Dense(2, activation='softmax')) 47 | 48 | # Compile the model 49 | model.compile(optimizer='adam', loss='categorical_crossentropy', 50 | metrics=['accuracy']) 51 | 52 | # Fit the model 53 | hist = model.fit(predictors, target, validation_split=0.3) 54 | 55 | 56 | # '''''' Early Stopping - Optimising the optimisation ''''''''''# 57 | 58 | # Import EarlyStopping - already done above 59 | 60 | # Save the number of columns in predictors: n_cols 61 | n_cols = predictors.shape[1] 62 | input_shape = (n_cols,) 63 | 64 | # Specify the model 65 | model = Sequential() 66 | model.add(Dense(100, activation='relu', input_shape=input_shape)) 67 | model.add(Dense(100, activation='relu')) 68 | model.add(Dense(2, activation='softmax')) 69 | 70 | # Compile the model 71 | model.compile(optimizer='adam', loss='categorical_crossentropy', 72 | metrics=['accuracy']) 73 | 74 | # Define early_stopping_monitor 75 | early_stopping_monitor = EarlyStopping(patience=2) 76 | 77 | # Fit the model 78 | model.fit(predictors, target, epochs=30, validation_split=0.3, 79 | callbacks=[early_stopping_monitor]) 80 | 81 | 82 | # ''''''''''''' Experimenting with a wider network ''''''# 83 | 84 | # Define early_stopping_monitor 85 | early_stopping_monitor = EarlyStopping(patience=2) 86 | 87 | # Create the new model: model_2 88 | model_2 = Sequential() 89 | 90 | # Add the first and second layers 91 | model_2.add(Dense(100, activation='relu', input_shape=input_shape)) 92 | model_2.add(Dense(100, activation='relu')) 93 | 94 | # Add the output layer 95 | model_2.add(Dense(2, activation='softmax')) 96 | 97 | # Compile model_2 98 | model_2.compile(optimizer='adam', loss='categorical_crossentropy', 99 | metrics=['accuracy']) 100 | 101 | # Fit model_1 102 | model_1_training = model_1.fit(predictors, target, epochs=15, 103 | validation_split=0.2, 104 | callbacks=[early_stopping_monitor], 105 | verbose=False) 106 | 107 | # Fit model_2 108 | model_2_training = model_2.fit(predictors, target, epochs=15, 109 | validation_split=0.2, 110 | callbacks=[early_stopping_monitor], 111 | verbose=False) 112 | 113 | # Create the plot 114 | plt.plot(model_1_training.history['val_loss'], 'r', 115 | model_2_training.history['val_loss'], 'b') 116 | plt.xlabel('Epochs') 117 | plt.ylabel('Validation score') 118 | plt.show() 119 | 120 | 121 | # ''''''''' Adding layers to the model ''''''''' # 122 | 123 | # The input shape to use in the first hidden layer 124 | input_shape = (n_cols,) 125 | 126 | # Create the new model: model_2 127 | model_2 = Sequential() 128 | 129 | # Add the first, second, and third hidden layers 130 | model_2.add(Dense(50, activation='relu', input_shape=input_shape)) 131 | model_2.add(Dense(50, activation='relu')) 132 | model_2.add(Dense(50, activation='relu')) 133 | 134 | # Add the output layer 135 | model_2.add(Dense(2, activation='softmax')) 136 | 137 | # Compile model_2 138 | model_2.compile(optimizer='adam', loss='categorical_crossentropy', 139 | metrics=['accuracy']) 140 | 141 | # Fit model 1 142 | model_1_training = model_1.fit(predictors, target, epochs=20, 143 | validation_split=0.4, 144 | callbacks=[early_stopping_monitor], 145 | verbose=False) 146 | 147 | # Fit model 2 148 | model_2_training = model_2.fit(predictors, 
target, epochs=20,
                               validation_split=0.4,
                               callbacks=[early_stopping_monitor],
                               verbose=False)

# Create the plot
plt.plot(model_1_training.history['val_loss'], 'r',
         model_2_training.history['val_loss'], 'b')
plt.xlabel('Epochs')
plt.ylabel('Validation score')
plt.show()


# '''''' Digit Recognition Model '''''''#

# X (flattened 28x28-pixel digit images) and y (one-hot labels) are assumed
# preloaded here, as in the course environment.

# Create the model: model
model = Sequential()

# Add the first hidden layer
model.add(Dense(50, activation='relu', input_shape=(784,)))

# Add the second hidden layer
model.add(Dense(50, activation='relu'))

# Add the output layer
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fit the model
model.fit(X, y, validation_split=0.3)
--------------------------------------------------------------------------------
/Deep_Learning_Ntwrk_Optim_2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon May 1 18:28:26 2017

@author: Shabaka
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


# ''''' Rectified Lin Activation Func. ''''''''''' ##


def relu(input):
    '''Rectified linear activation: max(input, 0).'''
    # Calculate the value for the output of the relu function: output
    output = max(input, 0)

    # Return the value just calculated
    return output


# .............###

# Weights for the two-hidden-layer network used by predict_with_network()
# below. NOTE: the original dict only defined 'node_0', 'node_1' and
# 'output', which the function below would reject with a KeyError; the
# per-node keys and values here are an assumption based on the course
# exercise this file follows.
weights = {'node_0_0': np.array([2, 4]), 'node_0_1': np.array([4, -5]),
           'node_1_0': np.array([-1, 2]), 'node_1_1': np.array([1, 2]),
           'output': np.array([2, 7])}

# '''''' Part 1 End ''''''''''' ###

input_data = [3, 5]
# ''''''''''''' Behaviour of a Multi Layer Neural Network ''''''''#


def predict_with_network(input_data):
    # Calculate node 0 in the first hidden layer
    node_0_0_input = (input_data * weights['node_0_0']).sum()
    node_0_0_output = relu(node_0_0_input)

    # Calculate node 1 in the first hidden layer
    node_0_1_input = (input_data * weights['node_0_1']).sum()
    node_0_1_output = relu(node_0_1_input)

    # Put node values into array: hidden_0_outputs
    hidden_0_outputs = np.array([node_0_0_output, node_0_1_output])

    # Calculate node 0 in the second hidden layer
    node_1_0_input = (hidden_0_outputs * weights['node_1_0']).sum()
    node_1_0_output = relu(node_1_0_input)

    # Calculate node 1 in the second hidden layer
    node_1_1_input = (hidden_0_outputs * weights['node_1_1']).sum()
    node_1_1_output = relu(node_1_1_input)

    # Put node values into array: hidden_1_outputs
    hidden_1_outputs = np.array([node_1_0_output, node_1_1_output])

    # Calculate model output: model_output
    model_output = (weights['output'] * hidden_1_outputs).sum()

    # Return model_output
    return model_output


output = predict_with_network(input_data)
print(output)
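
# ----- Sketches (assumption): helpers the Part 2 sections below rely on ----
# Part 2 calls predict_with_network(row, weights) with TWO arguments and
# one-hidden-layer weights ('node_0', 'node_1', 'output'), and later calls
# get_slope() and get_mse(); none of these are defined in the original file.
# The definitions below are minimal sketches of what the course environment
# is assumed to provide; redefining predict_with_network here mirrors how
# the course redefined it between exercises.


def predict_with_network(input_data_row, weights):
    """Forward pass through a one-hidden-layer network."""
    node_0_output = relu((input_data_row * weights['node_0']).sum())
    node_1_output = relu((input_data_row * weights['node_1']).sum())
    hidden_outputs = np.array([node_0_output, node_1_output])
    return (hidden_outputs * weights['output']).sum()


def get_slope(input_data, target, weights):
    """Slope of the squared error for a model with no hidden layer.

    The sign convention matches the update `weights + 0.01 * slope`
    used in the loop further down.
    """
    preds = (weights * input_data).sum()
    error = target - preds
    return 2 * input_data * error


def get_mse(input_data, target, weights):
    """Squared error of the same no-hidden-layer model on one data point."""
    preds = (weights * input_data).sum()
    return (preds - target) ** 2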

# ''''''''''''''''''''' Deep Learning - Part 2 ''''''''''' ##


# ''' Calculating Model Errors - Consideration of weight effects''''###

# '''''''' Test Case - Bank Transactions Predictions '''''''##

# ''''''' Coding how weight changes affect accuracy ''''#'''''###

# The data point you will make a prediction for

input_data = np.array([0, 3])

# Sample weights
weights_0 = {'node_0': [2, 1],
             'node_1': [1, 2],
             'output': [1, 1]
             }

# The actual target value, used to calculate the error
target_actual = 3
target = 2  # (reused by the gradient sections further down)
# Make prediction using original weights
model_output_0 = predict_with_network(input_data, weights_0)

# Calculate error: error_0
error_0 = model_output_0 - target_actual

# Create weights that cause the network to make perfect prediction (3):
# weights_1
weights_1 = {'node_0': [2, 1],
             'node_1': [1, 2],
             'output': [1, 0]
             }

# Make prediction using new weights: model_output_1
model_output_1 = predict_with_network(input_data, weights_1)

# Calculate error: error_1
error_1 = model_output_1 - target_actual

# Print error_0 and error_1
print(error_0)
print(error_1)


# '''''''''' Scaling up - Multiple Data Points ''''''''''''#

# From here on, input_data is assumed to be a sequence of input rows and
# target_actuals the matching array of true values (both preloaded in the
# course environment).

# Create model_output_0
model_output_0 = []
# Create model_output_1
model_output_1 = []

# Loop over input_data
for row in input_data:
    # Append prediction to model_output_0
    model_output_0.append(predict_with_network(row, weights_0))

    # Append prediction to model_output_1
    model_output_1.append(predict_with_network(row, weights_1))

# Calculate the mean squared error for model_output_0: mse_0
mse_0 = mean_squared_error(model_output_0, target_actuals)

# Calculate the mean squared error for model_output_1: mse_1
mse_1 = mean_squared_error(model_output_1, target_actuals)

# Print mse_0 and mse_1
print("Mean squared error with weights_0 : %f" % mse_0)
print("Mean squared error with weights_1 : %f" % mse_1)

# '''''''' Calculating Slopes '''''#

# (weights and input_data for this no-hidden-layer model are assumed to be
# preloaded 1-D arrays here, as in the course environment)

# Calculate the predictions: preds
preds = (weights * input_data).sum()

# Calculate the error: error
error = target - preds

# Calculate the slope: slope
slope = 2 * input_data * error

# Print the slope
print(slope)

# '''''''''' Improving the model weights '''''''' #

# Set the learning rate: learning_rate
learning_rate = 0.01

# Calculate the predictions: preds
preds = (weights * input_data).sum()

# Calculate the error: error
error = target - preds

# Calculate the slope: slope
slope = 2 * input_data * error

# Update the weights: weights_updated
weights_updated = weights + (learning_rate * slope)

# Get updated predictions: preds_updated
preds_updated = (weights_updated * input_data).sum()

# Calculate updated error: error_updated
error_updated = target - preds_updated

# Print the original error
print(error)

# Print the updated error
print(error_updated)

# ''''''' Making multiple updates to weights ''''''' #

n_updates = 20
mse_hist = []

# Iterate over the number of updates
# (get_slope and get_mse are sketched earlier in this file)
for i in range(n_updates):
    # Calculate the slope: slope
    slope = get_slope(input_data, target, weights)

    # Update the weights: weights
    weights = weights + 0.01 * slope

    # Calculate mse with new weights: mse
    mse = get_mse(input_data, target, weights)

    # Append the mse to mse_hist
mse_hist.append(mse) 201 | 202 | # Plot the mse history 203 | plt.plot(mse_hist) 204 | plt.xlabel('Iterations') 205 | plt.ylabel('Mean Squared Error') 206 | plt.show() 207 | 208 | -------------------------------------------------------------------------------- /Distribution_Check_Theor_ECDF_Data_CDF.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Mar 11 16:21:39 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | # import pandas as pd 11 | 12 | from ecdf_func import ecdf 13 | 14 | # Compute mean and standard deviation: mu, sigma 15 | mu = np.mean(belmont_no_outliers) 16 | sigma = np.std(belmont_no_outliers) 17 | 18 | 19 | # Sample out of a normal distribution with this mu and sigma: samples 20 | samples = np.random.normal(mu, sigma, size=10000) 21 | 22 | # Get the CDF of the samples and of the data 23 | x_theor, y_theor = ecdf(samples) 24 | x, y = ecdf(belmont_no_outliers) 25 | 26 | # Plot the CDFs and show the plot 27 | _ = plt.plot(x_theor, y_theor) 28 | _ = plt.plot(x, y, marker='.', linestyle='none') 29 | plt.margins(0.02) 30 | _ = plt.xlabel('Belmont winning time (sec.)') 31 | _ = plt.ylabel('CDF') 32 | plt.show() 33 | 34 | 35 | # Take a million samples out of the Normal distribution: samples 36 | samples = np.random.normal(mu, sigma, size=1000000) 37 | 38 | # Compute the fraction that are faster than 144 seconds: prob 39 | prob = np.sum(samples <= 144)/len(samples) 40 | 41 | # Print the result 42 | print('Probability of besting Secretariat:', prob) 43 | 44 | # #################################### # 45 | 46 | # Determine successive poisson relationship - i.e. total time between 47 | # two poisson processes 48 | 49 | 50 | def successive_poisson(tau1, tau2, size=1): 51 | # Draw samples out of first exponential distribution: t1 52 | t1 = np.random.exponential(tau1, size) 53 | 54 | # Draw samples out of second exponential distribution: t2 55 | t2 = np.random.exponential(tau2, size) 56 | 57 | return t1 + t2 58 | 59 | 60 | # Draw samples of waiting times: waiting_times 61 | waiting_times = successive_poisson(764, 715, size=100000) 62 | 63 | # Make the histogram 64 | _ = plt.hist(waiting_times, normed=True, histtype='step', bins=100) 65 | 66 | 67 | # Label axes 68 | plt.xlabel('waiting_times') 69 | plt.ylabel('successive_poisson') 70 | 71 | 72 | # Show the plot 73 | plt.show() -------------------------------------------------------------------------------- /EDA_Analysis_Comarison.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Mar 12 17:12:24 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # ########## EDA Analysis ######### # 9 | 10 | # Once the dataframe has been created and you can identify the relevant columns of interest 11 | # this example considers the relationship between illiteracy and fertility as per datacamp course example 12 | # Plot the illiteracy rate versus fertility 13 | _ = plt.plot(illiteracy, fertility, marker='.', linestyle='none') 14 | 15 | # Set the margins and label axes 16 | plt.margins(0.02) 17 | _ = plt.xlabel('percent illiterate') 18 | _ = plt.ylabel('fertility') 19 | 20 | # Show the plot 21 | plt.show() 22 | 23 | # Show the Pearson correlation coefficient 24 | print(pearson_r(illiteracy, fertility)) 25 | 26 | # ############ ######### LINEAR REGRESSION ############ # 27 | 28 | # Perform a linear regression using np.polyfit(): a, b 
a, b = np.polyfit(illiteracy, fertility, 1)

# Print the results to the screen
print('slope =', a, 'children per woman / percent illiterate')
print('intercept =', b, 'children per woman')

# Make theoretical line to plot
x = np.array([0, 100])
y = a * x + b

# Add regression line to your plot
_ = plt.plot(x, y)

# Draw the plot
plt.show()

# ############# IS REGRESSION OPTIMAL? ######## #

# Specify slopes to consider: a_vals
a_vals = np.linspace(0, 0.1, 200)

# Initialize sum of square of residuals: rss
rss = np.empty_like(a_vals)

# Compute sum of square of residuals for each value of a_vals
for i, a in enumerate(a_vals):
    rss[i] = np.sum((fertility - a*illiteracy - b)**2)

# Plot the RSS
plt.plot(a_vals, rss, '-')
plt.xlabel('slope (children per woman / percent illiterate)')
plt.ylabel('sum of square of residuals')

plt.show()

# Notice the minimum on the plot: the slope that minimises the sum of the
# squares of the residuals is the same value obtained from the regression.
--------------------------------------------------------------------------------
/EDA_Hypothesis_Test.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 20 19:02:21 2017

@author: Shabaka
"""

# The following analysis looks at an example comparing the impact force of
# frogs - interesting :-)
# (numpy as np, seaborn as sns and matplotlib.pyplot as plt, plus the frog
# DataFrame df, the arrays force_a / force_b and the helpers draw_perm_reps
# and draw_bs_reps, are assumed to be preloaded here, as in the course
# environment; sketches of the two helpers follow the one-sample test below.)


# ########### EDA BEFORE HYPOTHESIS TESTING ### #
# Make bee swarm plot
_ = sns.swarmplot(x='ID', y='impact_force', data=df)

# Label axes
_ = plt.xlabel('frog')
_ = plt.ylabel('impact force (N)')

# Show the plot
plt.show()


# ######## permutation test on the data ### #

# make a difference of means function #### #

def diff_of_means(data_1, data_2):
    """Difference in means of two arrays."""

    # The difference of means of data_1, data_2: diff
    diff = np.mean(data_1) - np.mean(data_2)

    return diff


# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(force_a, force_b)

# Draw 10,000 permutation replicates: perm_replicates
perm_replicates = draw_perm_reps(force_a, force_b,
                                 diff_of_means, size=10000)

# Compute p-value: p
p = np.sum(perm_replicates >= empirical_diff_means) / len(perm_replicates)

# Print the result
print('p-value =', p)


# ######## ONE SAMPLE BOOTSTRAP TEST ########### #

# Make an array of translated impact forces: translated_force_b
translated_force_b = force_b - np.mean(force_b) + 0.55

# Take bootstrap replicates of Frog B's translated impact forces: bs_replicates
bs_replicates = draw_bs_reps(translated_force_b, np.mean, 10000)

# Compute fraction of replicates that are less than the observed Frog B force: p
p = np.sum(bs_replicates <= np.mean(force_b)) / 10000
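
# ----- Sketches (assumption): draw_perm_reps / draw_bs_reps -----
# Neither helper is defined in this file; both come from the course
# environment. Minimal sketches consistent with how they are called above
# and below (in a runnable script they would need to precede their first
# use):


def draw_bs_reps(data, func, size=1):
    """Draw `size` bootstrap replicates of func applied to resampled data."""
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_replicates[i] = func(np.random.choice(data, size=len(data)))
    return bs_replicates


def draw_perm_reps(data_1, data_2, func, size=1):
    """Draw `size` permutation replicates of func(perm_sample_1, perm_sample_2)."""
    perm_replicates = np.empty(size)
    for i in range(size):
        # Permute the pooled data and split it back into two samples
        permuted = np.random.permutation(np.concatenate((data_1, data_2)))
        perm_sample_1 = permuted[:len(data_1)]
        perm_sample_2 = permuted[len(data_1):]
        perm_replicates[i] = func(perm_sample_1, perm_sample_2)
    return perm_replicates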

# BOOTSTRAP TEST FOR AN IDENTICAL DISTRIBUTION

# Compute difference of mean impact force from experiment: empirical_diff_means
empirical_diff_means = diff_of_means(force_a, force_b)

# Concatenate forces: forces_concat
forces_concat = np.concatenate((force_a, force_b))

# Initialize bootstrap replicates: bs_replicates
bs_replicates = np.empty(10000)

for i in range(10000):
    # Generate bootstrap sample
    bs_sample = np.random.choice(forces_concat, size=len(forces_concat))

    # Compute replicate - slice both halves at len(force_a) so the
    # concatenated sample is actually partitioned (the original sliced the
    # second half at len(force_b), which only works if the samples happen
    # to be the same size)
    bs_replicates[i] = diff_of_means(bs_sample[:len(force_a)],
                                     bs_sample[len(force_a):])

# Compute and print p-value: p
p = np.sum(bs_replicates >= empirical_diff_means) / 10000
print('p-value =', p)

--------------------------------------------------------------------------------
/Entry_Count_Check_Exception.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 9 01:41:06 2017

@author: Shabaka
"""


# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""

    # Initialize an empty dictionary: cols_count
    cols_count = {}

    # Add try block
    try:
        # Extract column from DataFrame: col
        col = df[col_name]

        # Iterate over the column in dataframe
        for entry in col:

            # If entry is in cols_count, add 1
            if entry in cols_count.keys():
                cols_count[entry] += 1
            # Else add the entry to cols_count, set the value to 1
            else:
                cols_count[entry] = 1

        # Return the cols_count dictionary
        return cols_count

    # Add except block - catch KeyError specifically rather than using a
    # bare except, so unrelated errors are not silently swallowed
    except KeyError:
        print('The dataframe does not have a ' + col_name + ' column.')


# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Print result1
print(result1)

# Call count_entries(): result2
# (there is no 'lang1' column, so this exercises the except branch)
result2 = count_entries(tweets_df, 'lang1')
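
# ----- Optional aside (added sketch): collections.Counter equivalent -----
# The happy path of count_entries() is one line with the standard library
# (tweets_df is assumed preloaded, as above; note Counter raises its own
# KeyError for a missing column instead of printing a message):
from collections import Counter

print(Counter(tweets_df['lang']))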
--------------------------------------------------------------------------------
/EthicalHackingCourseNotes.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 15 16:03:49 2017

@author: Shabaka
"""

# Pen test methodology
# 1. Vulnerability
# 2. Exploit - end of pen testing
# 3. Trace removal
##########################################################
# elevate privileges
# copy or move data
# log out without being noticed

#####################################################
# Attack vectors
# 3 major areas:
# Network attacks
# Host OS attacks
# Application attacks
######################################################
# Vulnerability management
# 6 steps:
# discover
# categorise and prioritise
# scan for vulnerabilities
# report and classify
# remediate
# verify checks
##############################
# Incident management - quickly resolve incidents with minimal
# impact to the process or business
# improve monitoring
# elimination of loss of requests
# availability of information
# accurate CMDB information
# improve user and customer satisfaction
#########################
# Incident management plan
# Identify
# Analyse
# Gather information
# Contain
# Mitigate
# Eradicate
######################################################
# Risk assessment
# Vulnerability assessments
--------------------------------------------------------------------------------
/ExtractHist_Image.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 4 13:24:24 2017

@author: Shabaka
"""

import numpy as np  # added: np.interp is used in the equalisation section
import matplotlib.pyplot as plt

# (note: the `normed` keyword used throughout this file was removed in newer
# matplotlib releases, where `density=True` is the replacement)


# Load the image into an array: image
image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg')

# Display image in top subplot using color map 'gray'
plt.subplot(2, 1, 1)
plt.title('Original image')
plt.axis('off')
plt.imshow(image, cmap='gray')

# Flatten the image into 1 dimension: pixels
pixels = image.flatten()

# Display a histogram of the pixels in the bottom subplot
plt.subplot(2, 1, 2)
plt.xlim((0, 255))
plt.title('Normalized histogram')
plt.hist(pixels, bins=64, range=(0, 256), normed=True, color='red', alpha=0.4)
# _ = plt.imshow(image)  # (stray line: this would paint the image over the
#                        # histogram subplot, so it is left disabled)
# Display the plot
plt.show()

# '''''''''''''''' Cumulative Distribution Func on Image Hist ''#

# Load the image into an array: image
image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg')

# Display image in top subplot using color map 'gray'
plt.subplot(2, 1, 1)
plt.imshow(image, cmap='gray')
plt.title('Original image')
plt.axis('off')

# Flatten the image into 1 dimension: pixels
pixels = image.flatten()

# Display a histogram of the pixels in the bottom subplot
plt.subplot(2, 1, 2)
pdf = plt.hist(pixels, bins=64, range=(0, 256), normed=False,
               color='red', alpha=0.4)
plt.grid('off')

# Use plt.twinx() to overlay the CDF in the bottom subplot
_ = plt.twinx()

# Display a cumulative histogram of the pixels
cdf = plt.hist(pixels, bins=64, range=(0, 256),
               normed=True, cumulative=True,
               color='blue', alpha=0.4)

# Specify x-axis range, hide axes, add title and display plot
plt.xlim((0, 256))
plt.grid('off')
plt.title('PDF & CDF (original image)')
plt.show()

# ''''''''''''' Equalise Image Histogram ''''''#

# Load the image into an array: image
image = plt.imread('640px-Unequalized_Hawkes_Bay_NZ.jpg')

# Flatten the image into 1 dimension: pixels
pixels = image.flatten()

# Generate a cumulative histogram
cdf, bins, patches = plt.hist(pixels, bins=256, range=(0, 256), normed=True,
                              cumulative=True)
new_pixels = np.interp(pixels, bins[:-1], cdf*255)

# Reshape new_pixels as a 2-D array: new_image
new_image = new_pixels.reshape(image.shape)

# Display the new image with 'gray' color map
plt.subplot(2, 1, 1)
plt.title('Equalized image')
plt.axis('off')
plt.imshow(new_image, cmap='gray')

# Generate a histogram of the new pixels
plt.subplot(2, 1, 2)
pdf = plt.hist(new_pixels, bins=64, range=(0, 256), normed=False,
               color='red', alpha=0.4)
plt.grid('off')

# Use plt.twinx() to overlay the CDF in the bottom subplot
_ = plt.twinx()
plt.xlim((0, 256))
plt.grid('off')

# Add title
plt.title('PDF & CDF (equalized image)')

# Generate a cumulative histogram of the new pixels
cdf = plt.hist(new_pixels, bins=64, range=(0, 256),
               cumulative=True, normed=True,
               color='blue', alpha=0.4)
plt.show()
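
# ----- Optional aside (added sketch): the same mapping without pyplot -----
# The equalisation above builds its CDF through plt.hist, which also draws.
# The identical pixel mapping can be computed with numpy alone (no figure
# side effects); `pixels` is assumed to be the flattened image from above:

counts, bin_edges = np.histogram(pixels, bins=256, range=(0, 256))
cdf_vals = np.cumsum(counts) / counts.sum()          # normalised CDF in [0, 1]
equalised = np.interp(pixels, bin_edges[:-1], cdf_vals * 255)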

# ''''''''''''' Extract Histograms from a colour Image ''''''''''#

# Load the image into an array: image
image = plt.imread('hs-2004-32-b-small_web.jpg')

# Display image in top subplot
plt.subplot(2, 1, 1)
plt.title('Original image')
plt.axis('off')
plt.imshow(image)

# Extract 2-D arrays of the RGB channels: red, green, blue
red, green, blue = image[:, :, 0], image[:, :, 1], image[:, :, 2]

# Flatten the 2-D arrays of the RGB channels into 1-D
red_pixels = red.flatten()
blue_pixels = blue.flatten()
green_pixels = green.flatten()

# Overlay histograms of the pixels of each color in the bottom subplot
plt.subplot(2, 1, 2)
plt.title('Histograms from color image')
plt.xlim((0, 256))
plt.hist(red_pixels, bins=64, normed=True, color='red', alpha=0.2)
plt.hist(blue_pixels, bins=64, normed=True, color='blue', alpha=0.2)
plt.hist(green_pixels, bins=64, normed=True, color='green', alpha=0.2)

# Display the plot
plt.show()

# ''''''Extracting Bivariate Histograms from a Colour Image '''''''#

# Load the image into an array: image
image = plt.imread('hs-2004-32-b-small_web.jpg')

# Extract RGB channels and flatten into 1-D arrays
# (channel order in the array is R, G, B; the original unpacked
# `red, blue, green = ...`, silently swapping the blue and green channels)
red, green, blue = image[:, :, 0], image[:, :, 1], image[:, :, 2]
red_pixels = red.flatten()
blue_pixels = blue.flatten()
green_pixels = green.flatten()

# Generate a 2-D histogram of the red and green pixels
plt.subplot(2, 2, 1)
plt.grid('off')
plt.xticks(rotation=60)
plt.xlabel('red')
plt.ylabel('green')
_ = plt.hist2d(red_pixels, green_pixels, bins=(32, 32))

# Generate a 2-D histogram of the green and blue pixels
plt.subplot(2, 2, 2)
plt.grid('off')
plt.xticks(rotation=60)
plt.xlabel('green')
plt.ylabel('blue')
_ = plt.hist2d(green_pixels, blue_pixels, bins=(32, 32))

# Generate a 2-D histogram of the blue and red pixels
plt.subplot(2, 2, 3)
plt.grid('off')
plt.xticks(rotation=60)
plt.xlabel('blue')
plt.ylabel('red')
_ = plt.hist2d(blue_pixels, red_pixels, bins=(32, 32))

# Display the plot
plt.show()
--------------------------------------------------------------------------------
/Extract_Data_from_HDF5.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:30:20 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Get the HDF5 group: group 9 | group = data['strain'] 10 | 11 | # Check out keys of group 12 | for key in group.keys(): 13 | print(key) 14 | 15 | # Set variable equal to time series data: strain 16 | strain = data['strain']['Strain'].value 17 | 18 | # Set number of time points to sample: num_samples 19 | num_samples = 10000 20 | 21 | # Set time vector 22 | time = np.arange(0, 1, 1/num_samples) 23 | 24 | # Plot data 25 | plt.plot(time, strain[:num_samples]) 26 | plt.xlabel('GPS Time (s)') 27 | plt.ylabel('strain') 28 | plt.show() 29 | -------------------------------------------------------------------------------- /File_Import_Multi_DataType.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 9 13:05:02 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Assign filename: file 9 | file = 'seaslug.txt' 10 | 11 | # Import file: data 12 | data = np.loadtxt(file, delimiter='\t', dtype=str) 13 | 14 | # Print the first element of data 15 | print(data[0]) 16 | 17 | # Import data as floats and skip the first row: data_float 18 | data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1) 19 | 20 | # Print the 10th element of data_float 21 | print(data_float[9]) 22 | 23 | # Plot a scatterplot of the data 24 | plt.scatter(data_float[:, 0], data_float[:, 1]) 25 | plt.xlabel('time (min.)') 26 | plt.ylabel('percentage of larvae') 27 | plt.show() 28 | -------------------------------------------------------------------------------- /FilterData_Selected_from_Table_SQLAlchemy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 17 13:15:13 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # import and_ 9 | from sqlalchemy import and_ 10 | 11 | # BUild a query for the census table: stmt 12 | stmt = select([census]) 13 | 14 | # Append a where clause to select only non-male records from California using and_ 15 | 16 | stmt = stmt.where( 17 | # The state of California with a non-male sex 18 | and_(census.columns.state == 'California', census.columns.sex != 'M') 19 | 20 | ) 21 | 22 | # Loop over the ResultProxy printing the age and sex 23 | for result in connection.execute(stmt): 24 | print(result.age, result.sex) 25 | 26 | -------------------------------------------------------------------------------- /FilterSQL_Database_Table_Col_Row.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 15:01:21 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | """ 11 | Let's say, for example that you wanted to get all records from the Customer 12 | table of the Chinook database for which the Country is 'Canada'. 13 | You can do this very easily in SQL 14 | using a SELECT statement followed by a WHERE clause as follows: 15 | 16 | SELECT * FROM Customer WHERE Country = 'Canada' 17 | 18 | In fact, you can filter any SELECT statement by any condition using a WHERE 19 | clause. This is called filtering your records. 
20 | Below, you'll select all records of the Employee table for which 'EmployeeId' 21 | is greater than or equal to 6 22 | """ 23 | 24 | 25 | # Create engine: engine 26 | engine = create_engine('sqlite:///Chinnok.sqlite') 27 | 28 | # Open engine in context manager 29 | # Perform query and save results to DataFrame: df 30 | with engine.connect() as con: 31 | rs = con.execute("SELECT * FROM Employee WHERE EmployeeId >= 6") 32 | df = pd.DataFrame(rs.fetchall()) 33 | df.columns = rs.keys() 34 | 35 | # Print the head of the DataFrame df 36 | print(df.head()) 37 | -------------------------------------------------------------------------------- /FilterSQL_Database_Table_WHERE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 13 15:01:21 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | """ 11 | Let's say, for example that you wanted to get all records from the Customer 12 | table of the Chinook database for which the Country is 'Canada'. 13 | You can do this very easily in SQL 14 | using a SELECT statement followed by a WHERE clause as follows: 15 | 16 | SELECT * FROM Customer WHERE Country = 'Canada' 17 | 18 | In fact, you can filter any SELECT statement by any condition using a WHERE 19 | clause. This is called filtering your records. 20 | Below, you'll select all records of the Employee table for which 'EmployeeId' 21 | is greater than or equal to 6 22 | """ 23 | 24 | # Create engine: engine 25 | engine = create_engine('sqlite:///Chinnok.sqlite') 26 | 27 | # Open engine in context manager 28 | # Perform query and save results to DataFrame: df 29 | with engine.connect() as con: 30 | rs = con.execute("SELECT * FROM Employee WHERE EmployeeId >= 6") 31 | df = pd.DataFrame(rs.fetchall()) 32 | df.columns = rs.keys() 33 | 34 | # Print the head of the DataFrame df 35 | print(df.head()) 36 | -------------------------------------------------------------------------------- /General Multi_Column DataFrame Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 7 18:14:49 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | # Define count_entries() 11 | def count_entries(df, *args): 12 | """Return a dictionary with counts of 13 | occurrences as value for each key.""" 14 | #Initialize an empty dictionary: cols_count 15 | cols_count = {} 16 | # Iterate over column names in args 17 | for col_name in args: 18 | 19 | # Extract column from DataFrame: col 20 | col = df[col_name] 21 | 22 | # Iterate over the column in dataframe 23 | for entry in col: 24 | 25 | # If entry is in cols_count, add 1 26 | if entry in cols_count.keys(): 27 | cols_count[entry] += 1 28 | 29 | # Else add the entry to cols_count, set the value to 1 30 | else: 31 | cols_count[entry] = 1 32 | 33 | # Return the cols_count dictionary 34 | return cols_count 35 | 36 | # Call count_entries(): result1 37 | result1 = count_entries(tweets_df, 'lang') 38 | 39 | # Call count_entries(): result2 40 | result2 = count_entries(tweets_df, 'lang', 'source') 41 | 42 | # Print result1 and result2 43 | print(result1) 44 | print(result2) 45 | 46 | -------------------------------------------------------------------------------- /General Twitter Language Analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 
""" 3 | Created on Sat Jan 7 17:31:07 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | # Define count_entries() 12 | def count_entries(df, col_name = 'lang'): 13 | """Return a dictionary with counts of 14 | occurrences as value for each key.""" 15 | 16 | # Initialize an empty dictionary: cols_count 17 | cols_count = {} 18 | 19 | # Extract column from DataFrame: col 20 | col = df[col_name] 21 | 22 | # Iterate over the column in dataframe 23 | for entry in col: 24 | 25 | # If entry is in cols_count, add 1 26 | if entry in cols_count.keys(): 27 | cols_count[entry] += 1 28 | 29 | # Else add the entry to cols_count, set the value to 1 30 | else: 31 | cols_count[entry] = 1 32 | 33 | # Return the cols_count dictionary 34 | return cols_count 35 | 36 | # Call count_entries(): result1 37 | result1 = count_entries(tweets_df, 'lang') 38 | 39 | # Call count_entries(): result2 40 | result2 = count_entries(tweets_df, 'source') 41 | 42 | # Print result1 and result2 43 | print(result1) 44 | print(result2) 45 | 46 | -------------------------------------------------------------------------------- /Generate from MultiType Data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 10 00:24:52 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | data = np.genfromtxt('gaze_positions.csv', delimiter=',', names=True, dtype=None) 12 | 13 | np.shape(data) 14 | 15 | 16 | data[0] 17 | 18 | 19 | #More mixed datatypes 20 | 21 | # Assign the filename: file 22 | file = 'titanic.csv' 23 | 24 | # Import file using np.recfromcsv: d 25 | d = np.recfromcsv(file) 26 | 27 | #np.recfrocsv already contains the default 28 | #delimiter as a comma and dtype is none 29 | 30 | # Print out first three entries of d 31 | print(d[:3]) -------------------------------------------------------------------------------- /HTML_with_BeautifulSoup_GetHypLinktData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 16 00:08:42 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import requests 10 | from bs4 import BeautifulSoup 11 | 12 | # Specify url 13 | url = 'https://www.python.org/~guido/' 14 | 15 | # Package the request, send the request and catch the response: r 16 | r = requests.get(url) 17 | 18 | # Extracts the response as html: html_doc 19 | html_doc = r.text 20 | 21 | # create a BeautifulSoup object from the HTML: soup 22 | soup = BeautifulSoup(html_doc) 23 | 24 | # Print the title of Guido's webpage 25 | print(soup.title) 26 | 27 | # Find all 'a' tags (which define hyperlinks): a_tags 28 | a_tags = soup.find_all('a') 29 | 30 | # Print the URLs to the shell 31 | for link in a_tags: 32 | print(link.get('href')) -------------------------------------------------------------------------------- /HTML_with_BeautifulSoup_GetTextData.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 16 00:01:32 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import requests 10 | from bs4 import BeautifulSoup 11 | 12 | # Specify url: url 13 | url = 'https://www.python.org/~guido/' 14 | 15 | # Package the request, send the request and catch the response: r 16 | r = requests.get(url) 17 | 18 | # Extract the response as html: html_doc 19 | html_doc = r.text 20 | 21 | 
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc)  # presents page in a readable manner
soup.body.text

bold = soup.findAll('b')  # find all bold text and return a list

print(bold)
print(soup.prettify())
# Get the title of Guido's webpage: guido_title
guido_title = (soup.title)

# Print the title of Guido's webpage to the shell
print(guido_title)

# Get Guido's text: guido_text
guido_text = (soup.get_text())

# Print Guido's text to the shell
print(guido_text)

soup.findAll(id="para2")[0].text
soup.findAll(['b', 'p'])

soup.findAll({'b': True, 'p': True})

# find all links in the document

links = soup.find('a')  # returns the 1st match it gets - use findAll for all

print(links['href'] + " is the url and " + links.text + " is the text")


# Use find in various ways

# findParents, findNextSiblings, findPreviousSiblings
# findNext, findPrevious and findAllNext and findAllPrevious
--------------------------------------------------------------------------------
/HTTP_Request_Urllib_Response.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 23:10:17 2017

@author: Shabaka
"""

# Import packages
from urllib.request import urlopen, Request

# Specify the url
url = "http://www.datacamp.com/teach/documentation"

# This packages the request: request
request = Request(url)

# Sends the request and catches the response: response
response = urlopen(request)

# Print the datatype of response
print(type(response))

# Be polite and close the response!
response.close()
--------------------------------------------------------------------------------
/HTTP_Request_Urllib_Response_Read.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 23:15:37 2017

@author: Shabaka
"""

# Import packages
from urllib.request import urlopen, Request

# Specify the url
url = "http://docs.datacamp.com/teach/"

# This packages the request
request = Request(url)

# Sends the request and catches the response: response
response = urlopen(request)

# Extract the response: html
html = response.read()

# Print the html
print(html)

# Be polite and close the response!
response.close()
--------------------------------------------------------------------------------
/HTTP_Request_using_Requests.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 23:32:44 2017

@author: Shabaka
"""


"""

Import the package requests.
Assign the URL of interest to the variable url.
Package the request to the URL, send the request and catch the response
with a single function requests.get(), assigning the response to the
variable r.
Use the text attribute of the object r to return the HTML of the webpage
as a string; store the result in a variable text.
Hit submit to print the HTML of the webpage.
19 | """ 20 | 21 | # Import package 22 | import requests 23 | 24 | # Specify the url: url 25 | url = "http://docs.datacamp.com/teach/" 26 | 27 | # Packages the request, send the request and catch the response: r 28 | r = requests.get(url) 29 | 30 | # Extract the response: text 31 | text = r.text 32 | 33 | # Print the html 34 | print(text) -------------------------------------------------------------------------------- /Hack_Bern_nprandom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 21:39:58 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def perform_bernoulli_trials(n, p): 12 | """Perform n Bernoulli trials with success probability p 13 | and return number of successes.""" 14 | # Initialize number of successes: n_success 15 | n_success = 0 16 | 17 | # Perform trials 18 | for i in range(n): 19 | # Choose random number between zero and one: random_number 20 | random_number = np.random.random() 21 | 22 | # If less than p, it's a success so add one to n_success 23 | if random_number < p: 24 | n_success += 1 25 | 26 | return n_success 27 | 28 | # Seed random number generator 29 | np.random.seed(42) 30 | 31 | # Initialize the number of defaults: n_defaults 32 | n_defaults = np.empty(1000) 33 | 34 | # Compute the number of defaults 35 | for i in range(1000): 36 | n_defaults[i] = perform_bernoulli_trials(100, 0.05) 37 | 38 | 39 | # Plot the histogram with default number of bins; label your axes 40 | _ = plt.hist(n_defaults, normed=True) 41 | _ = plt.xlabel('number of defaults out of 100 loans') 42 | _ = plt.ylabel('probability') 43 | 44 | # Show the plot 45 | plt.show() 46 | 47 | # Compute bin edges: bins 48 | bins = np.arange(-0.5, max(n_defaults + 1.5) - 0.5) 49 | 50 | # Generate histogram 51 | _ = plt.hist(n_defaults, normed=True, bins=bins) 52 | 53 | # Set margins 54 | plt.margins(0.02) 55 | 56 | # Label axes 57 | _ = plt.xlabel('number of defaults out of 100 loans') 58 | _ = plt.ylabel('Binomial PMF') 59 | 60 | 61 | # Show the plot 62 | plt.show() 63 | 64 | 65 | # Draw 10,000 samples out of Poisson distribution: samples_poisson 66 | samples_poisson = np.random.poisson(10, size=10000) 67 | 68 | # Print the mean and standard deviation 69 | print('Poisson: ', np.mean(samples_poisson), 70 | np.std(samples_poisson)) 71 | 72 | # Specify values of n and p to consider for Binomial: n, p 73 | n = [20, 100, 1000] 74 | p = [0.5, 0.1, 0.01] 75 | 76 | 77 | # Draw 10,000 samples for each n,p pair: samples_binomial 78 | for i in range(3): 79 | samples_binomial = np.random.binomial(n[i], p[i], size=10000) 80 | 81 | # Print results 82 | print('n =', n[i], 'Binom:', np.mean(samples_binomial), 83 | np.std(samples_binomial)) 84 | 85 | -------------------------------------------------------------------------------- /Hack_Stats_BasicRandGen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Mar 7 21:05:24 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | # Seed the random number generator 12 | np.random.seed(42) 13 | 14 | # Initialize random numbers: random_numbers 15 | random_numbers = np.empty(100000) 16 | 17 | # Generate random numbers by looping over range(100000) 18 | for i in range(100000): 19 | random_numbers[i] = np.random.random() 20 | 21 | # Plot a histogram 22 | _ = plt.hist(random_numbers) 23 | 
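# Aside (added note, not in the original): the loop above is the course's
# teaching device; the idiomatic NumPy equivalent is a single vectorised
# call:
# random_numbers = np.random.random(size=100000)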
24 | # Show the plot 25 | plt.show() 26 | -------------------------------------------------------------------------------- /Import_Excel_Pandas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 01:28:11 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import pandas 9 | import pandas as pd 10 | 11 | # Assign spreadsheet filename: file 12 | file = 'battledeath.xlsx' 13 | 14 | # Load spreadsheet: xl 15 | xl = pd.ExcelFile(file) 16 | 17 | # Print sheet names 18 | print(xl.sheet_names) 19 | 20 | """ 21 | Import Excel Sheets Specifically 22 | """ 23 | 24 | # Load a sheet into a DataFrame by name: df1 25 | df1 = xl.parse('2004') 26 | 27 | # Print the head of the DataFrame df1 28 | print(df1.head()) 29 | 30 | # Load a sheet into a DataFrame by index: df2 31 | df2 = xl.parse(0) 32 | 33 | # Print the head of the DataFrame df2 34 | print(df2.head()) -------------------------------------------------------------------------------- /Import_Excel_Parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 02:22:12 2017 4 | 5 | @author: Shabaka 6 | """ 7 | import pandas as pd 8 | import numpy as np 9 | """ 10 | The spreadsheet 'battledeath.xlsx' is already loaded as xl. 11 | 12 | As before, you'll use the method parse(). This time, however, you'll add the 13 | additional arguments skiprows, names and parse_cols. These skip rows, name the 14 | columns and designate which columns to parse, respectively. All these arguments 15 | can be assigned to lists containing the specific row numbers, strings and 16 | column numbers, respectively. 17 | """ 18 | 19 | # Parse the first sheet and rename the columns: df1 20 | df1 = xl.parse(0, skiprows=[0], names=['Country', 'AAM due to War (2002)']) 21 | 22 | # Print the head of the DataFrame df1 23 | print(df1.head()) 24 | 25 | # Parse the first column of the second sheet and rename the column: df2 26 | df2 = xl.parse(1, parse_cols=[0], skiprows=[0], names=['Country']) 27 | 28 | # Print the head of the DataFrame df2 29 | print(df2.head()) 30 | -------------------------------------------------------------------------------- /Import_FlatFile_Web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 15 21:27:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import the function urlretrieve from the subpackage urllib.request. 9 | # Assign the URL of the file to the variable url. 10 | # Use the function urlretrieve() to save the file locally as 11 | # 'winequality-red.csv'. 
12 | # Execute the remaining code to load 'winequality-red.csv' in a pandas 13 | # DataFrame and to print its head to the shell 14 | 15 | 16 | # Import package 17 | from urllib.request import urlretrieve 18 | 19 | # Import pandas 20 | import pandas as pd 21 | 22 | # Assign url of file: url 23 | url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/data\ 24 | sets/winequality-red.csv' 25 | 26 | # Save file locally 27 | urlretrieve(url, 'winequality-red.csv') 28 | 29 | # Read file into a DataFrame and print its head 30 | df = pd.read_csv('winequality-red.csv', sep=';') 31 | print(df.head()) -------------------------------------------------------------------------------- /Import_HDF5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:24:25 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import numpy as np 10 | import h5py 11 | 12 | # Assign filename: file 13 | file = 'LIGO_data.hdf5' 14 | 15 | # Load file: data 16 | data = h5py.File('LIGO_data.hdf5', 'r') 17 | 18 | # Print the datatype of the loaded file 19 | print(type(data)) 20 | 21 | # Print the keys of the file 22 | for key in data.keys(): 23 | print(key) -------------------------------------------------------------------------------- /Import_MatLab_WorkSpace.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 18:35:11 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import scipy.io 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | # Load MATLAB file: mat 14 | mat = scipy.io.loadmat('albeck_gene_expression.mat') 15 | 16 | # Print the datatype type of mat 17 | print(type(mat)) 18 | 19 | """ 20 | discover what is in the MATLAB dictionary that you loaded 21 | """ 22 | 23 | # Print the keys of the MATLAB dictionary 24 | print(mat.keys()) 25 | 26 | # Print the type of the value corresponding to the key 'CYratioCyt' 27 | print(type(mat['CYratioCyt'])) 28 | 29 | # Print the shape of the value corresponding to the key 'CYratioCyt' 30 | print(np.shape(mat['CYratioCyt'])) 31 | 32 | # Subset the array and plot it 33 | data = mat['CYratioCyt'][25, 5:] 34 | fig = plt.figure() 35 | plt.plot(data) 36 | plt.xlabel('time (min.)') 37 | plt.ylabel('normalized fluorescence (measure of expression)') 38 | plt.show() 39 | -------------------------------------------------------------------------------- /Import_Pickled-Data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 01:22:54 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import pickle package 9 | import pickle 10 | 11 | # Open pickle file and load data: d 12 | with open('data.pkl', 'rb') as file: 13 | d = pickle.load(file) 14 | 15 | # Print d 16 | print(d) 17 | 18 | # Print datatype of d 19 | print(type(d)) -------------------------------------------------------------------------------- /Import_Plot_Web_Flatfile_NonLocal_Save.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 15 21:47:37 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Import packages 9 | import matplotlib.pyplot as plt 10 | import pandas as pd 11 | 12 | # Assign url of file: url 13 | url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606\ 14 | 
/datasets/winequality-red.csv' 15 | 16 | # Read file into a DataFrame: df 17 | df = pd.read_csv(url, sep= ';') 18 | 19 | # Print the head of the DataFrame 20 | print(df.head()) 21 | 22 | # Plot first column of df 23 | pd.DataFrame.hist(df.ix[:, 0:1]) 24 | plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)') 25 | plt.ylabel('count') 26 | plt.show() 27 | -------------------------------------------------------------------------------- /Import_SAS7BDAT_.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:00:42 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | 13 | # Import sas7bdat package 14 | from sas7bdat import SAS7BDAT 15 | 16 | # Save file to a DataFrame: df_sas 17 | with SAS7BDAT('sales.sas7bdat') as file: 18 | df_sas = file.to_data_frame() 19 | 20 | 21 | # Print head of DataFrame 22 | print(df_sas.head()) 23 | 24 | # Plot histogram of DataFrame features (pandas and pyplot already imported) 25 | pd.DataFrame.hist(df_sas[['P']]) 26 | plt.ylabel('count') 27 | plt.show() -------------------------------------------------------------------------------- /Import_Stata_File.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 11 03:10:31 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | 12 | # Import pandas 13 | import pandas as pd 14 | 15 | # Load Stata file into a pandas DataFrame: df 16 | df = pd.read_stata('disarea.dta') 17 | 18 | # Print the head of the DataFrame df 19 | print(df.head()) 20 | 21 | # Plot histogram of one column of the DataFrame 22 | pd.DataFrame.hist(df[['disa10']]) 23 | plt.xlabel('Extent of disease') 24 | plt.ylabel('Number of coutries') 25 | plt.show() 26 | -------------------------------------------------------------------------------- /Inserting_Multiple_Rows.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 26 02:43:07 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | 10 | # Build a list of dictionaries: values_list 11 | values_list = [ 12 | {'name': 'Anna', 'count': 1, 'amount': 1000.00, 'valid':True}, 13 | {'name' : 'Taylor', 'count':1, 'amount':750.00, 'valid':False} 14 | ] 15 | 16 | # Build an insert statement for the data table: stmt 17 | stmt = insert(data) 18 | 19 | # Execute stmt with the values_list: results 20 | results = connection.execute(stmt, values_list) 21 | 22 | # Print rowcount 23 | print(results.rowcount) -------------------------------------------------------------------------------- /Iteration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 27 17:44:35 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | 9 | # ## ITERATING ITERABLES ### # 10 | 11 | # Create a list of strings: flash 12 | flash = ['jay garrick', 'barry allen', 'wally west', 'bart allen'] 13 | 14 | # Print each list item in flash using a for loop 15 | for person in flash: 16 | print(person) 17 | 18 | 19 | # Create an iterator for flash: superspeed 20 | superspeed = iter(flash) 21 | 22 | # Print each item from the iterator 23 | print(next(superspeed)) 24 | print(next(superspeed)) 25 | print(next(superspeed)) 26 | print(next(superspeed)) 27 | 28 | # ########## Iteration 2 ######## # 29 | 30 | 
# Create an iterator for range(3): small_value 31 | small_value = iter(range(3)) 32 | 33 | # Print the values in small_value 34 | print(next(small_value)) 35 | print(next(small_value)) 36 | print(next(small_value)) 37 | 38 | # Loop over range(3) and print the values 39 | for num in range(3): 40 | print(num) 41 | 42 | 43 | # Create an iterator for range(10 ** 100): googol 44 | googol = iter(range(10**100)) 45 | 46 | # Print the first 5 values from googol 47 | print(next(googol)) 48 | print(next(googol)) 49 | print(next(googol)) 50 | print(next(googol)) 51 | print(next(googol)) 52 | 53 | 54 | # ######## ## # Iterator as a function argument ### # 55 | # Create a range object: values 56 | values = range(10, 21) 57 | 58 | # Print the range object 59 | print(values) 60 | 61 | # Create a list of integers: values_list 62 | values_list = list(values) 63 | 64 | # Print values_list 65 | print(values_list) 66 | 67 | # Get the sum of values: values_sum 68 | values_sum = sum(values) 69 | 70 | # Print values_sum 71 | print(values_sum) 72 | 73 | # ############# Enumerate ############ # 74 | # Create a list of strings: mutants 75 | mutants = ['charles xavier', 76 | 'bobby drake', 77 | 'kurt wagner', 78 | 'max eisenhardt', 79 | 'kitty pride'] 80 | 81 | # Create a list of tuples: mutant_list 82 | mutant_list = list(enumerate(mutants)) 83 | 84 | # Print the list of tuples 85 | print(mutant_list) 86 | 87 | # Unpack and print the tuple pairs 88 | for index1, value1 in enumerate(mutants): 89 | print(index1, value1) 90 | 91 | # Change the start index 92 | for index2, value2 in enumerate(mutants, start=1): 93 | print(index2, value2) 94 | 95 | # ##### Using zip ################# # 96 | 97 | # Create a list of tuples: mutant_data 98 | mutant_data = list(zip(mutants, aliases, powers)) 99 | 100 | # Print the list of tuples 101 | print(mutant_data) 102 | 103 | # Create a zip object using the three lists: mutant_zip 104 | mutant_zip = zip(mutants, aliases, powers) 105 | 106 | # Print the zip object 107 | print(mutant_zip) 108 | 109 | # Unpack the zip object and print the tuple values 110 | for value1, value2, value3 in mutant_zip: 111 | print(value1, value2, value3) 112 | 113 | 114 | # ########### Unzip with * and zip (*iterable) ############# # 115 | 116 | # Create a zip object from mutants and powers: z1 117 | z1 = zip(mutants, powers) 118 | 119 | # Print the tuples in z1 by unpacking with * 120 | print(*z1) 121 | 122 | # Re-create a zip object from mutants and powers: z1 123 | z1 = zip(mutants, powers) 124 | 125 | # 'Unzip' the tuples in z1 by unpacking with * and zip(): result1, result2 126 | result1, result2 = zip(*z1) 127 | 128 | # Check if unpacked tuples are equivalent to original tuples 129 | print(result1 == mutants) 130 | print(result2 == powers) 131 | -------------------------------------------------------------------------------- /Lambda_List_Filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 8 14:10:46 2017 4 | 5 | @author: Shabaka 6 | """ 7 | 8 | # Create a list of strings: fellowship 9 | fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli'] 10 | 11 | # Use filter() to apply a lambda function over fellowship: result 12 | result = filter(lambda member:len(member) > 6, fellowship) 13 | 14 | # Convert result to a list: result_list 15 | result_list = list(result) 16 | 17 | # Convert result into a list and print it 18 | print(result_list) 19 | 20 | 21 | # random gibberish code test 

# random gibberish code test

# Define gibberish
def gibberish(*args):
    """Concatenate strings in *args together."""
    hodgepodge = ''
    for word in args:
        hodgepodge += word
    return hodgepodge


# Import reduce from functools
from functools import reduce

# Create a list of strings: stark
stark = ['robb', 'sansa', 'arya', 'eddard', 'jon']

# Use reduce() to apply a lambda function over stark: result
result = reduce(lambda item1, item2: item1 + item2, stark)

# Print the result
print(result)
-------------------------------------------------------------------------------- /LinReg_BS_Pairs_func.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 14 00:52:07 2017

@author: Shabaka
"""

import numpy as np


def draw_bs_pairs_linreg(x, y, size=1):
    """Perform pairs bootstrap for linear regression."""

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates: bs_slope_reps, bs_intercept_reps
    bs_slope_reps = np.empty(size)
    bs_intercept_reps = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, size=len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_slope_reps[i], bs_intercept_reps[i] = np.polyfit(bs_x, bs_y, 1)

    return bs_slope_reps, bs_intercept_reps
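# Hedged usage sketch (not part of the original file): synthetic data stands in
# for the course's preloaded arrays; report a 95% CI for the bootstrap slopes.
if __name__ == '__main__':
    demo_x = np.random.normal(size=50)
    demo_y = 2.0 * demo_x + np.random.normal(scale=0.5, size=50)
    bs_slopes, bs_intercepts = draw_bs_pairs_linreg(demo_x, demo_y, size=1000)
    print(np.percentile(bs_slopes, [2.5, 97.5]))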
-------------------------------------------------------------------------------- /Linear_Regression_Anscombe.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 12 20:24:53 2017

@author: Shabaka
"""

import numpy as np
import matplotlib.pyplot as plt

# ecdf helper, imported as elsewhere in this repo (e.g. Binomial_Dist_plot.py)
from ecdf_func import ecdf

# x and y are assumed preloaded (Anscombe's first data set in the
# original exercise)

# Perform linear regression: a, b
a, b = np.polyfit(x, y, 1)

# Print the slope and intercept
print(a, b)

# Generate theoretical x and y data: x_theor, y_theor
x_theor = np.array([3, 15])
y_theor = a * x_theor + b

# Plot the Anscombe data and theoretical line
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.plot(x_theor, y_theor)

# Label the axes
plt.xlabel('x')
plt.ylabel('y')

# Show the plot
plt.show()

# ########### LINEAR REGRESSION ON ALL DATA ####### ######### #

# Iterate through x, y pairs (anscombe_x and anscombe_y are assumed
# preloaded lists of the four Anscombe data sets)
for x, y in zip(anscombe_x, anscombe_y):
    # Compute the slope and intercept: a, b
    a, b = np.polyfit(x, y, 1)

    # Print the result
    print('slope:', a, 'intercept:', b)

# ####### BOOTSTRAP VISUALISATION ### ################# #

# rainfall is assumed preloaded (yearly rainfall data in the original exercise)
for _ in range(50):
    # Generate bootstrap sample: bs_sample
    bs_sample = np.random.choice(rainfall, size=len(rainfall))

    # Compute and plot ECDF from bootstrap sample
    x, y = ecdf(bs_sample)
    _ = plt.plot(x, y, marker='.', linestyle='none',
                 color='gray', alpha=0.1)

# Compute and plot ECDF from original data
x, y = ecdf(rainfall)
_ = plt.plot(x, y, marker='.')

# Make margins and label axes
plt.margins(0.02)
_ = plt.xlabel('yearly rainfall (mm)')
_ = plt.ylabel('ECDF')

# Show the plot
plt.show()

# ########### BOOTSTRAP REPLICATE FUNCTION ####### ############ #


def bootstrap_replicate_1d(data, func):
    """Generate a bootstrap replicate of 1D data."""
    return func(np.random.choice(data, size=len(data)))


# ######## MULTIPLE BOOTSTRAP REPLICATES ######### ######### #

# data is assumed preloaded; draw 10,000 bootstrap replicates of its mean
bs_replicates = np.empty(10000)

for i in range(10000):
    bs_replicates[i] = bootstrap_replicate_1d(data, np.mean)
-------------------------------------------------------------------------------- /ListComp_Gen.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 28 19:13:25 2017

@author: Shabaka
"""

# List of strings
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# List comprehension
fellow1 = [member for member in fellowship if len(member) >= 7]

print(fellow1)

# Generator expression
fellow2 = (member for member in fellowship if len(member) >= 7)

print(fellow2)

# '''''''''' Basic Generator Expression '''''''''' #

# Create generator object: result
result = (num for num in range(31))

# Print the first 5 values
print(next(result))
print(next(result))
print(next(result))
print(next(result))
print(next(result))

# Print the rest of the values
for value in result:
    print(value)


# '''''''''' Output Change in Generator Expression '''''''''' #

# Create a list of strings: lannister
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

print(lannister)

# Create a generator object: lengths
lengths = (len(person) for person in lannister)

# Iterate over and print the values in lengths
print('Lannister values, i.e. name lengths')

for value in lengths:
    print(value)


# '''''''''' Generator Build - Basic '''''''''' #

# Create a list of strings
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']


# Define generator function get_lengths
def get_lengths(input_list):
    """Generator function that yields the
    length of the strings in input_list."""

    # Yield the length of a string
    for person in input_list:
        yield len(person)


# Print the values generated by get_lengths()
for value in get_lengths(lannister):
    print(value)
-------------------------------------------------------------------------------- /ListComp_timestamped.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 28 23:30:21 2017

@author: Shabaka
"""
import pandas as pd

# df is assumed preloaded (a DataFrame of tweets in the original exercise)

# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time: tweet_clock_time
tweet_clock_time = [entry[11:19] for entry in tweet_time]

# Print the extracted times
print(tweet_clock_time)


# '''''''''' Conditional List Comprehension - Time-Stamped Data '''''''''' #

# Extract the created_at column from df: tweet_time
tweet_time = df['created_at']

# Extract the clock time for entries whose seconds read '19'
tweet_clock_time = [entry[11:19] for entry in tweet_time
                    if entry[17:19] == '19']

# Print the extracted times
print(tweet_clock_time)
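# Hedged, self-contained illustration (not in the original script): a two-row
# stand-in for the preloaded tweets DataFrame, showing why entry[11:19] is the
# clock time and entry[17:19] the seconds in Twitter's created_at format.
demo_df = pd.DataFrame({'created_at': ['Tue Mar 29 23:40:17 +0000 2016',
                                       'Tue Mar 29 23:40:19 +0000 2016']})
print([entry[11:19] for entry in demo_df['created_at']])
# ['23:40:17', '23:40:19']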
-------------------------------------------------------------------------------- /List_Dictionary_Full.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 28 23:48:08 2017

@author: Shabaka
"""

# '''''''''' Working with World Bank World Indicator Dataset '''''''''' #
# Import the pandas package

import pandas as pd

# feature_names, row_vals and row_lists are assumed preloaded
# (World Bank indicator data in the original exercise)

# Zip lists: zipped_lists
zipped_lists = zip(feature_names, row_vals)

# Create a dictionary: rs_dict
rs_dict = dict(zipped_lists)

# Print the dictionary
print(rs_dict)


# '''''''''' List to Dictionary Function '''''''''' #

# Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides
    the keys and list2 provides the values."""

    # Zip lists: zipped_lists
    zipped_lists = zip(list1, list2)

    # Create a dictionary: rs_dict
    rs_dict = dict(zipped_lists)

    # Return the dictionary
    return rs_dict


# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)

# Print the first two lists in row_lists
print(row_lists[0])
print(row_lists[1])

# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])

# '''''''''' Turn the list of dicts into a pandas DataFrame '''''''''' #

# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]

# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)

# Print the head of the DataFrame
print(df.head())
-------------------------------------------------------------------------------- /Load_Explore_Twitter_Data.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:50:27 2017

@author: Shabaka
"""

# Import package
import json

# String of path to file: tweets_data_path
tweets_data_path = 'tweets.txt'

# Initialize empty list to store tweets: tweets_data
tweets_data = []

# Open connection to file
tweets_file = open(tweets_data_path, "r")

# Read in tweets and store in list: tweets_data
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)

# Close connection to file
tweets_file.close()

# Print the keys of the first tweet dict
print(tweets_data[0].keys())
-------------------------------------------------------------------------------- /Local_JSon_Load_Explore.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 00:59:00 2017

@author: Shabaka
"""

# import module
import json

# Load JSON: json_data
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)

# Print each key-value pair in json_data
for k in json_data.keys():
    print(k + ': ', json_data[k])
-------------------------------------------------------------------------------- /Multidata_tweeter_count_function.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a sample script layout file
"""


# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""

    # Initialize an empty dictionary: cols_count
    cols_count = {}

    # Extract column from DataFrame: col
    col = df[col_name]

    # Iterate over the column in the DataFrame
    for entry in col:

        # If entry is in cols_count, add 1
        if entry in cols_count:
            cols_count[entry] += 1

        # Else add the entry to cols_count, set the value to 1
        else:
            cols_count[entry] = 1

    # Return the cols_count dictionary
    return cols_count


# tweets_df is assumed preloaded (a DataFrame of tweets)

# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Call count_entries(): result2
result2 = count_entries(tweets_df, 'source')

# Print result1 and result2
print(result1)
print(result2)
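# Hedged aside (not in the original script): pandas computes the same counts
# directly via value_counts(), demonstrated here on a tiny stand-in DataFrame.
import pandas as pd

demo_tweets = pd.DataFrame({'lang': ['en', 'et', 'en', 'en', 'und']})
print(demo_tweets['lang'].value_counts().to_dict())
# {'en': 3, 'et': 1, 'und': 1} (ordering may vary)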
-------------------------------------------------------------------------------- /Nested_List_Comp.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 27 22:45:07 2017

@author: Shabaka
"""

# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]

# Print the matrix
for row in matrix:
    print(row)


# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member for member in fellowship if len(member) >= 7]

# Print the new list
print(new_fellowship)

# ############### Conditional in List Comprehension ######## #

# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member if len(member) >= 7 else '' for member in fellowship]

# Print the new list
print(new_fellowship)


# '''''''''' Dictionary Comprehension '''''''''' #

fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas',
              'boromir', 'gimli']

# Create dict comprehension: new_fellowship
new_fellowship = {member: len(member) for member in fellowship}

# Print the new dictionary
print(new_fellowship)
-------------------------------------------------------------------------------- /Non_Flat_File_Import_Web-Excel.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 15 22:36:30 2017

@author: Shabaka
"""

# Import package
import pandas as pd

# Assign url of file: url
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/\
importing_data_into_r/latitude.xls'

# Read in all sheets of Excel file: xl
# (the keyword is sheet_name in current pandas; very old versions
# spelled it sheetname)
xl = pd.read_excel(url, sheet_name=None)

# Print the sheet names to the shell
print(xl.keys())

# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())
-------------------------------------------------------------------------------- /Numpy_Import_LoadTxt.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 9 12:51:35 2017

@author: Shabaka
"""

# Import numpy
import numpy as np

# Assign the filename: file
file = 'digits_header.txt'

# Load the data: data
data = np.loadtxt(file, delimiter='\t', skiprows=1, usecols=[0, 2])

# Print data
print(data)
-------------------------------------------------------------------------------- /Numpy_LoadData_and_Plot.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 9 03:57:27 2017

@author: Shabaka
"""

# Import packages
import numpy as np
import matplotlib.pyplot as plt

# Assign filename to variable: file
file = 'digits.csv'

# Load file as array: digits
digits = np.loadtxt(file, delimiter=',')

# Print datatype of digits
print(type(digits))

# Select and reshape a row
im = digits[21, 1:]
im_sq = np.reshape(im, (28, 28))

# Plot reshaped data
plt.imshow(im_sq, cmap='Greys', interpolation='nearest')
plt.show()
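# Hedged extension (not in the original script): view the first few digits in a
# grid, reusing the same reshape logic; assumes each row is label + 784 pixels.
fig, axes = plt.subplots(1, 4)
for ax, row in zip(axes, digits[:4]):
    ax.imshow(np.reshape(row[1:], (28, 28)), cmap='Greys',
              interpolation='nearest')
    ax.axis('off')
plt.show()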
Most of these are written in
Python 3 (some are in Python 2.7, written with gedit on Linux) and I will endeavour to flag these differences.
They have all been written for my specific environments as above and for the Data Science domain.
I welcome feedback on how they work for you and whether you find them useful.
The Anaconda IDE gives quite a lot of support for debugging and I endeavour to do as much of it as I can.

To use these scripts, you will need to know how to make them work for your specific use case - ASSUMING that you know what you are doing.
And if you really want to understand the underlying methods, the courses on DataCamp, Udemy, Coursera and many other great content platforms are as good as any out there - perhaps that's a stretch. The point is, they are everywhere.

In classes I teach, I can surely try to explain a concept based on my understanding and possibly some implementation.


Note that the data sets are not available here; use your own data. You will, however, find datasets online if you google them - there are quite a few out there, and nothing beats a bit of legwork. Beware of those pesky rabbit holes though - it is very easy to get lost when you are having fun.. debugging...

These scripts have proven useful in their adaptability for other projects I am working on, but for posterity, this page has been created.

You may find some functions or pieces of this code elsewhere on the web. My commercial programming
experience is still ongoing and, just like everyone else, I tend to look up how to do a specific thing and sometimes borrow that. Yes, devs live in the stackoverflow-verse - some google-fu helps also.

Please do not quote me if your implementation doesn't work. #justsaying

But do quote me if it does :)

Having said all that, we all know that once in a while you find something that's written extremely well (such as on StackOverflow or other blogs), and rightly so - there's no use reinventing the wheel.

My hope is that this repo can help make your building/dev work a lot easier.

See what you think.

I'll attempt to credit anything of the sort as I post it, and apologies if anyone is missed - if you see such a case, please let me know and I will rectify it asap.

Happy Hacking..
-------------------------------------------------------------------------------- /Random_NLTK.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 14:17:02 2017

@author: Shabaka
"""

from string import punctuation

from nltk.corpus import stopwords


class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):

        self.min_cut = min_cut
        self.max_cut = max_cut
        # The original file breaks off mid-expression here; the trailing
        # [u"'s", '"'] is an assumed completion
        self._stopwords = set(stopwords.words('english') +
                              list(punctuation) +
                              [u"'s", '"'])

# Process
# 1 - Download article from url


# 2 - Eliminate stop words etc. that add no meaning
-------------------------------------------------------------------------------- /SQL_Arbitrary_Insert_Row.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 02:19:03 2017

@author: Shabaka
"""

# Import insert and select from sqlalchemy
from sqlalchemy import insert, select

# data (the table) and connection are assumed set up,
# as in SQL_Arbitrary_Table_Create.py

# Build an insert statement to insert a record into the data table: stmt
stmt = insert(data).values(name='Anna', count=1, amount=1000.00, valid=True)

# Execute the statement via the connection: results
results = connection.execute(stmt)

# Print result rowcount
print(results.rowcount)

# Build a select statement to validate the insert
stmt = select([data]).where(data.columns.name == 'Anna')

# Print the result of executing the query.
print(connection.execute(stmt).first())
-------------------------------------------------------------------------------- /SQL_Arbitrary_Table_Create.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 01:03:19 2017

@author: Shabaka
"""

# Import Table, Column, String, Integer, Float, Boolean from sqlalchemy
from sqlalchemy import Table, Column, String, Integer, Float, Boolean

# metadata and engine are assumed set up beforehand
# (see Create_DataBase_Engine.py)

# Define a new table with a name, count, amount, and valid column: data
data = Table('data', metadata,
             Column('name', String(255)),
             Column('count', Integer()),
             Column('amount', Float()),
             Column('valid', Boolean())
             )

# Use the metadata to create the table
metadata.create_all(engine)

# Print table repr
print(repr(data))
-------------------------------------------------------------------------------- /SQL_Automatic_Join_Est_Rel.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 01:14:58 2017

@author: Shabaka
"""

from sqlalchemy import select

# census, state_fact and connection are assumed set up (reflected tables
# and an open connection)

# Build a statement to join census and state_fact tables: stmt
stmt = select([census.columns.pop2000, state_fact.columns.abbreviation])

# Execute the statement and get the first result: result
result = connection.execute(stmt).first()

# Loop over the keys in the result object and print the key and value
for key in result.keys():
    print(key, getattr(result, key))
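# Hedged setup sketch (not part of the original file): most SQL scripts in this
# repo assume boilerplate like the following - an engine, an open connection,
# and reflected tables. The database URL here is a placeholder.
from sqlalchemy import create_engine, MetaData, Table

engine = create_engine('sqlite:///census.sqlite')
connection = engine.connect()
metadata = MetaData()
census = Table('census', metadata, autoload=True, autoload_with=engine)
state_fact = Table('state_fact', metadata, autoload=True, autoload_with=engine)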
-------------------------------------------------------------------------------- /SQL_CaseStudy_Basic.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 7 00:59:45 2017

@author: Shabaka
"""

import sqlalchemy
# Import create_engine, MetaData
from sqlalchemy import create_engine, MetaData

# Import Table, Column, String, and Integer
from sqlalchemy import (Table, Column, String, Integer)
from sqlalchemy import select
from sqlalchemy import insert
from sqlalchemy import case, cast, Float
from sqlalchemy import func, desc

# Define an engine to connect to chapter5.sqlite: engine
engine = create_engine('sqlite:///chapter5.sqlite')

# Open a connection on the engine: connection
connection = engine.connect()

# Initialize MetaData: metadata
metadata = MetaData()


# '''''''''' Create Table in Database '''''''''' #

# Build a census table: census
census = Table('census', metadata,
               Column('state', String(30)),
               Column('sex', String(1)),
               Column('age', Integer()),
               Column('pop2000', Integer()),
               Column('pop2008', Integer()))

# Create the table in the database
metadata.create_all(engine)

# '''''''''' Read the Data from a CSV - Leverage Python csv module '''''''' #

# csv_reader is assumed set up in the original exercise, e.g.:
# import csv
# csv_file = open('census.csv', newline='')   # filename is a placeholder
# csv_reader = csv.reader(csv_file)

# Create an empty list: values_list
values_list = []

# Iterate over the rows
for row in csv_reader:
    # Create a dictionary with the values
    data = {'state': row[0], 'sex': row[1], 'age': row[2], 'pop2000': row[3],
            'pop2008': row[4]}
    # Append the dictionary to the values list
    values_list.append(data)


# '''''''''' Load Data from a List into the Table '''''''''' #

# Build insert statement: stmt
stmt = insert(census)

# Use values_list to insert data: results
results = connection.execute(stmt, values_list)

# Print rowcount
print(results.rowcount)


# '''''''''' Determine Average Age by Population - Test 1 '''''''''' #

# Calculate weighted average age: stmt
stmt = select([census.columns.sex,
               (func.sum(census.columns.pop2008 * census.columns.age) /
                func.sum(census.columns.pop2008)).label('average_age')
               ])

# Group by sex
stmt = stmt.group_by(census.columns.sex)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the average age by sex
for result in results:
    print(result.sex, result.average_age)


# '''''''''' Query - Percentage of Pop. by Gender and State '''''''''' #

# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([census.columns.state,
               (func.sum(
                   case([
                       (census.columns.sex == 'F', census.columns.pop2000)
                   ], else_=0)) /
                cast(func.sum(census.columns.pop2000),
                     Float) * 100).label('percent_female')
               ])

# Group by state
stmt = stmt.group_by(census.columns.state)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the percentage
for result in results:
    print(result.state, result.percent_female)

# '''''''''' Query to determine pop. difference by state, 2008 vs 2000 '''''' #

# Build query to return state name and population difference from 2008 to 2000
stmt = select([census.columns.state,
               (census.columns.pop2008 -
                census.columns.pop2000).label('pop_change')
               ])

# Group by state
stmt = stmt.group_by(census.columns.state)

# Order by population change
stmt = stmt.order_by(desc('pop_change'))

# Limit to top 10
stmt = stmt.limit(10)

# Use connection to execute the statement and fetch all results
results = connection.execute(stmt).fetchall()

# Print the state and population change for each record
for result in results:
    print('{}-{}'.format(result.state, result.pop_change))
-------------------------------------------------------------------------------- /SQL_Check_Col_Population_Percentage.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 23 02:38:04 2017

@author: Shabaka
"""

# import case, cast and Float from sqlalchemy
from sqlalchemy import case, cast, Float
from sqlalchemy import select
from sqlalchemy.sql import func

# census and connection are assumed set up (a reflected census table and an
# open connection)

# Build an expression to calculate female population in 2000
female_pop2000 = func.sum(
    case([
        (census.columns.sex == 'F', census.columns.pop2000)
    ], else_=0))

# Cast an expression to calculate total population in 2000 to Float
total_pop2000 = cast(func.sum(census.columns.pop2000), Float)

# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([female_pop2000 / total_pop2000 * 100])

# Execute the query and store the scalar result: percent_female
percent_female = connection.execute(stmt).scalar()

# Print the percentage
print(percent_female)
-------------------------------------------------------------------------------- /SQL_Data_Count_Group-By.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 22:15:59 2017

@author: Shabaka
"""

# Import select and func
from sqlalchemy import select
from sqlalchemy.sql import func

# census and connection are assumed set up

# Build a query to select the state and count of ages by state: stmt
stmt = select([census.columns.state, func.count(census.columns.age)])

# Append group by state
stmt = stmt.group_by(census.columns.state)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()

# Print results
print(results)

# Print the keys/column names of the results returned
print(results[0].keys())
-------------------------------------------------------------------------------- /SQL_Data_Count_Keys_Values.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 22:15:59 2017

@author: Shabaka
"""

# Import select and func
from sqlalchemy import select
from sqlalchemy.sql import func

# census and connection are assumed set up

# Build a query to select the state and count of ages by state: stmt
stmt = select([census.columns.state, func.count(census.columns.age)])

# Append group by state
stmt = stmt.group_by(census.columns.state)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()

# Print results
print(results)

# Print the keys/column names of the results returned
print(results[0].keys())
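# Hedged aside (not in the original script): each row is a (state, count) pair,
# so the grouped results convert cleanly to a plain dictionary.
state_counts = dict(results)
print(state_counts)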
-------------------------------------------------------------------------------- /SQL_Delete_Table.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 6 23:59:59 2017

@author: Shabaka
"""
import sqlalchemy
# Import delete, select
from sqlalchemy import delete, select
from sqlalchemy import and_, func

# census, state_fact, metadata, engine and connection are assumed set up

# Build a statement to empty the census table: stmt
stmt = delete(census)

# Execute the statement: results
results = connection.execute(stmt)

# Print affected rowcount
print(results.rowcount)

# Build a statement to select all records from the census table
stmt = select([census])

# Print the results of executing the statement to verify
# there are no rows
print(connection.execute(stmt).fetchall())

# ##################### ################ ################ ########
# '''''''''' Deleting Specific Records '''''''''' #

# Build a statement to count records using
# the sex column for Men ('M') age 36: stmt
stmt = select([func.count(census.columns.sex)]).where(
    and_(census.columns.sex == 'M',
         census.columns.age == 36)
    )

# Execute the select statement and use the scalar() fetch
# method to save the record count
to_delete = connection.execute(stmt).scalar()

# Build a statement to delete records from the census table: stmt_del
stmt_del = delete(census)

# Append a where clause to target Men ('M') age 36
stmt_del = stmt_del.where(
    and_(census.columns.sex == 'M',
         census.columns.age == 36)
    )

# Execute the statement: results
results = connection.execute(stmt_del)

# Print affected rowcount and to_delete record count, make sure they match
print(results.rowcount, to_delete)


# '''''''''' Delete Table Completely '''''''''' #

# Drop the state_fact table
state_fact.drop(engine)

# Check to see if state_fact exists
print(state_fact.exists(engine))

# Drop all tables
metadata.drop_all(engine)

# Check to see if census exists
print(census.exists(engine))
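# Hedged aside (not part of the original script): destructive statements like
# these are often wrapped in a transaction so they commit or roll back as one
# unit; engine.begin() provides such a block in SQLAlchemy.
# with engine.begin() as conn:
#     conn.execute(delete(census).where(census.columns.age == 36))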
-------------------------------------------------------------------------------- /SQL_Det_Pop_Sum_by_Column.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 23:03:45 2017

@author: Shabaka
"""

# import pandas
import pandas as pd
# Import Pyplot as plt from matplotlib
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from sqlalchemy import select

# Import func
from sqlalchemy.sql import func

from sqlalchemy import MetaData, Table
metadata = MetaData()

engine = create_engine('sqlite:///census_nyc.sqlite')
connection = engine.connect()

# Reflect census table from the engine: census
census = Table('census', metadata, autoload=True, autoload_with=engine)

# Build an expression to calculate the sum of pop2008 labeled as population
pop2008_sum = func.sum(census.columns.pop2008).label("population")

# Build a query to select the state and sum of pop2008 as population,
# grouped by state: stmt
stmt = select([census.columns.state, pop2008_sum])

# Append group by state
stmt = stmt.group_by(census.columns.state)

# Execute the statement and store all the records: results
results = connection.execute(stmt).fetchall()

# Print results
print(results)

# Print the keys/column names of the results returned
print(results[0].keys())

# Create a DataFrame from the results: df
df = pd.DataFrame(results)

# Set column names
df.columns = results[0].keys()

# Print the DataFrame
print(df)

# Plot the DataFrame
df.plot.bar()
plt.show()
-------------------------------------------------------------------------------- /SQL_Join_Columns_Advanced.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 19:14:06 2017

@author: Shabaka
"""

from sqlalchemy import select
from sqlalchemy.sql import func

# census, state_fact and connection are assumed set up

# Build a statement to select the state, sum of 2008 population and census
# division name: stmt
stmt = select([census.columns.state,
               func.sum(census.columns.pop2008),
               state_fact.columns.census_division_name
               ])

# Append select_from to join the census and state_fact tables by the
# census state and state_fact name columns
stmt = stmt.select_from(
    census.join(state_fact, census.columns.state == state_fact.columns.name)
)

# Append a group by for the state_fact name column
stmt = stmt.group_by(state_fact.columns.name)

# Execute the statement and get the results: results
results = connection.execute(stmt).fetchall()

# Loop over the results object and print each record.
for record in results:
    print(record)
-------------------------------------------------------------------------------- /SQL_Join_Table_Columns.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 19:10:55 2017

@author: Shabaka
"""

from sqlalchemy import select

# census, state_fact and connection are assumed set up

# Build a statement to select the census and state_fact tables: stmt
stmt = select([census, state_fact])

# Add a select_from clause that wraps a join for the census and state_fact
# tables where the census state column and state_fact name column match
stmt = stmt.select_from(
    census.join(state_fact, census.columns.state == state_fact.columns.name))

# Execute the statement and get the first result: result
result = connection.execute(stmt).first()

# Loop over the keys in the result object and print the key and value
for key in result.keys():
    print(key, getattr(result, key))
-------------------------------------------------------------------------------- /SQL_Leverage_Heirach_Data_Group_By.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 20:45:56 2017

@author: Shabaka
"""

from sqlalchemy import select
from sqlalchemy.sql import func

# employees (a reflected table) and connection are assumed set up

# Make an alias of the employees table: managers
managers = employees.alias()

# Build a query to select managers and counts of their employees: stmt
stmt = select([managers.columns.name, func.count(employees.columns.id)])

# Append a where clause that ensures the manager id and employee mgr are equal
stmt = stmt.where(managers.columns.id == employees.columns.mgr)

# Group by managers' names
stmt = stmt.group_by(managers.columns.name)

# Execute statement: results
results = connection.execute(stmt).fetchall()

# Print each manager record
for record in results:
    print(record)
-------------------------------------------------------------------------------- /SQL_LoadCSV_csv-reader.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 26 03:07:47 2017

@author: Shabaka
"""

from sqlalchemy import insert

# census, connection and csv_reader are assumed set up

# Create an insert statement for census: stmt
stmt = insert(census)

# Create an empty list and zeroed row count: values_list, total_rowcount
values_list = []
total_rowcount = 0

# Enumerate the rows of csv_reader
for idx, row in enumerate(csv_reader):
    # Create data and append to values_list
    data = {'state': row[0], 'sex': row[1], 'age': row[2], 'pop2000': row[3],
            'pop2008': row[4]}
    values_list.append(data)

    # Write to the table in batches of 51 rows
    if (idx + 1) % 51 == 0:
        results = connection.execute(stmt, values_list)
        total_rowcount += results.rowcount
        values_list = []

# Flush any remaining rows after the loop
if values_list:
    results = connection.execute(stmt, values_list)
    total_rowcount += results.rowcount

print(total_rowcount)
-------------------------------------------------------------------------------- /SQL_Order_Desc_by_Column.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 17:50:29 2017

@author: Shabaka
"""

# Import desc
from sqlalchemy import desc
from sqlalchemy import select
from sqlalchemy import Table, MetaData

# census and connection are assumed set up

# Build a query to select the state column: stmt
stmt = select([census.columns.state])

# Append order_by descending state: rev_stmt
rev_stmt = stmt.order_by(desc(census.columns.state))

# Execute the query and store the results: rev_results
rev_results = connection.execute(rev_stmt).fetchall()

# Print the first 10 rev_results
print(rev_results[:10])
-------------------------------------------------------------------------------- /SQL_Order_by_Data by Column.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 17:34:53 2017

@author: Shabaka
"""

from sqlalchemy import select

# census and connection are assumed set up

# Build a query to select the state column: stmt
stmt = select([census.columns.state])

# Append an order_by state
stmt = stmt.order_by(census.columns.state)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the first 10 results
print(results[:10])
-------------------------------------------------------------------------------- /SQL_Plot_Results_DataFrame.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 22 23:28:13 2017

@author: Shabaka
"""

# Import pandas and Pyplot
import pandas as pd
import matplotlib.pyplot as plt

# results is assumed preloaded (fetched rows, as in
# SQL_Det_Pop_Sum_by_Column.py)

# Create a DataFrame from the results: df
df = pd.DataFrame(results)

# Set column names
df.columns = results[0].keys()

# Print the DataFrame
print(df)

# Plot the DataFrame
df.plot.bar()
plt.show()
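# Hedged aside (not in the original script): naming the axes avoids plotting
# against the default integer index; the column names here assume the query
# from SQL_Det_Pop_Sum_by_Column.py.
# df.plot.bar(x='state', y='population')
# plt.show()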
-------------------------------------------------------------------------------- /SQL_Same_Table_Joined_Query.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 25 20:31:26 2017

@author: Shabaka
"""

from sqlalchemy import select
from sqlalchemy.sql import func

# employees (a reflected table) and connection are assumed set up

# Make an alias of the employees table: managers
managers = employees.alias()

# Build a query to select managers' and their employees' names: stmt
stmt = select(
    [managers.columns.name.label('manager'),
     employees.columns.name.label('employee')]
)

# Append where to match manager ids with employees' managers: stmt
stmt = stmt.where(managers.columns.id == employees.columns.mgr)

# Append order by managers' names: stmt
stmt = stmt.order_by(managers.columns.name)

# Execute statement: results
results = connection.execute(stmt).fetchall()

# Print records
for record in results:
    print(record)

"""
Functions and group-bys using hierarchical data (tables)
"""

# Fresh code starts here

# Make an alias of the employees table: managers
managers = employees.alias()

# Build a query to select managers and counts of their employees: stmt
stmt = select([managers.columns.name, func.count(employees.columns.id)])

# Append a where clause that ensures the manager id and employee mgr are equal
stmt = stmt.where(managers.columns.id == employees.columns.mgr)

# Group by managers' names
stmt = stmt.group_by(managers.columns.name)

# Execute statement: results
results = connection.execute(stmt).fetchall()

# Print each manager record
for record in results:
    print(record)
-------------------------------------------------------------------------------- /Simple_Data_Filter_Select_Where.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 17 01:38:48 2017

@author: Shabaka
"""

from sqlalchemy import create_engine, select

# census and connection are assumed set up

# Create a select query: stmt
stmt = select([census])

# Add a where clause to filter the results to only those for New York
stmt = stmt.where(census.columns.state == 'New York')

# Execute the query to retrieve all the data returned: results
results = connection.execute(stmt).fetchall()

# Loop over the results and print the age, sex, and pop2008
for result in results:
    print(result.age, result.sex, result.pop2008)


# Create a query for the census table: stmt
stmt = select([census])

# Append a where clause to match all the states in_ the list states
# (states is assumed preloaded, e.g. states = ['New York', 'California'])
stmt = stmt.where(census.columns.state.in_(states))

# Loop over the ResultProxy and print the state and its population in 2000
for result in connection.execute(stmt):
    print(result.state, result.pop2000)
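# Hedged aside (not in the original script): where clauses compose, so and_()
# (or chained .where() calls) ANDs several conditions together.
# from sqlalchemy import and_
# stmt = select([census]).where(
#     and_(census.columns.state == 'New York', census.columns.sex == 'F'))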
-------------------------------------------------------------------------------- /TwitterAPI_Authentication_SampleM.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:29:31 2017

@author: Shabaka
"""

# Import package
import tweepy

# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
-------------------------------------------------------------------------------- /Twitter_Data to DataFrame.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:54:50 2017

@author: Shabaka
"""

# Import package
import pandas as pd

# tweets_data is assumed preloaded (see Load_Explore_Twitter_Data.py)

# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text', 'lang'])

# Print head of DataFrame
print(df.head())
-------------------------------------------------------------------------------- /Twitter_Text_dataAnalysis.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 16 02:56:45 2017

@author: Shabaka
"""

import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


def word_in_text(word, tweet):
    """Return True if word occurs in tweet (case-insensitive)."""
    word = word.lower()
    text = tweet.lower()
    match = re.search(word, text)

    if match:
        return True
    return False


# tweets_data is assumed preloaded (see Load_Explore_Twitter_Data.py)

# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text', 'lang'])

# Print head of DataFrame
print(df.head())

"""
Iterate over the rows of the DataFrame and calculate how many tweets contain
each of our keywords! The count for each candidate has been
initialized to 0.
"""
# Initialize counts of tweets mentioning each candidate
[clinton, trump, sanders, cruz] = [0, 0, 0, 0]

# Iterate through df, counting the number of tweets in which
# each candidate is mentioned
for index, row in df.iterrows():
    clinton += word_in_text('clinton', row['text'])
    trump += word_in_text('trump', row['text'])
    sanders += word_in_text('sanders', row['text'])
    cruz += word_in_text('cruz', row['text'])


# Construct a barplot of the data using sns.barplot, passing it two
# arguments: (i) a list of labels and (ii) a list containing the variables
# you wish to plot (clinton, trump and so on)

# Set seaborn style
sns.set(color_codes=True)

# Create a list of labels: cd
cd = ['clinton', 'trump', 'sanders', 'cruz']

# Plot histogram
ax = sns.barplot(x=cd, y=[clinton, trump, sanders, cruz])
ax.set(ylabel="count")
plt.show()
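# Hedged refinement (not in the original script): re.search matches substrings,
# so 'trump' would also match 'trumpet'; word boundaries avoid that.
def word_in_text_strict(word, text):
    """Whole-word, case-insensitive match."""
    return bool(re.search(r'\b' + re.escape(word) + r'\b', text,
                          re.IGNORECASE))


print(word_in_text_strict('trump', 'A trumpet is not a candidate'))  # False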
-------------------------------------------------------------------------------- /Vis_Regressions_FixData.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 2 19:34:27 2017

@author: Shabaka
"""

# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# '''''''''' File import to pandas DataFrame '''''''''' #

fixdat = pd.read_csv('C:\\Users\\Shabaka\\ShabakaCodes\\fixations.csv',
                     index_col=0, parse_dates=True)

fix_chunk = pd.read_csv('C:\\Users\\Shabaka\\ShabakaCodes\\fixations.csv',
                        chunksize=50)

# Test any number of variables against each other
# Plot a linear regression between 'duration' and 'confidence'
sns.lmplot(x='duration', y='confidence', data=fixdat)

# Display the plot
plt.show()

# '''''''''' Plotting residuals of a regression '''''''''' #

# Generate a green residual plot of the regression between 'duration'
# and 'confidence'
sns.residplot(x='duration', y='confidence', data=fixdat, color='green')

# Display the plot
plt.show()

# '''''''''' Higher Order Regressions '''''''''' #

# Generate a scatter plot of 'duration' and 'confidence' using red circles
plt.scatter(fixdat['duration'], fixdat['confidence'],
            label='data', color='red', marker='o')

# Plot in blue a linear regression of order 1 between 'duration'
# and 'confidence'
sns.regplot(x='duration', y='confidence', data=fixdat,
            color='blue', label='order 1', scatter=None)

# Plot in green a linear regression of order 2 between 'duration'
# and 'confidence'
sns.regplot(x='duration', y='confidence', data=fixdat,
            color='green', label='order 2', scatter=None, order=2)

# Add a legend and display the plot
plt.legend(loc='lower right')
plt.show()


# '''''''''' Linear Regressions by Hue '''''''''' #

# Plot a linear regression between 'duration' and 'confidence', with a hue
# of 'avg_pupil_size' and palette of 'Set1'
sns.lmplot(x='duration', y='confidence', data=fixdat,
           palette='Set1')  # hue='avg_pupil_size'

# Display the plot
plt.show()

# '''''''''' Strip Plot Construction '''''''''' #

# Make a strip plot of 'duration' grouped by 'confidence'
plt.subplot(2, 1, 1)
sns.stripplot(x='duration', y='confidence', data=fixdat)

# Make the strip plot again using jitter and a smaller point size
plt.subplot(2, 1, 2)
sns.stripplot(x='duration', y='confidence', data=fixdat, jitter=True, size=3)

# Display the plot
plt.show()

# '''''''''' Generating Swarmplots '''''''''' #

# Generate a swarm plot of 'duration' grouped horizontally by 'avg_pupil_size'
plt.subplot(2, 1, 1)
sns.swarmplot(x='avg_pupil_size', y='duration', data=fixdat)

# Generate a swarm plot of 'duration' grouped vertically by 'avg_pupil_size'
# with a hue of 'confidence'
plt.subplot(2, 1, 2)
sns.swarmplot(x='avg_pupil_size', y='duration', data=fixdat,
              orient='v')  # hue='confidence')

# Display the plot
plt.show()

# '''''''''' Constructing Violin Plots '''''''''' #

# Generate a violin plot of 'avg_pupil_size' grouped horizontally
# by 'confidence'
plt.subplot(2, 1, 1)
sns.violinplot(x='confidence', y='avg_pupil_size', data=fixdat)

# Generate the same violin plot with color='lightgray' and without inner
# annotations
plt.subplot(2, 1, 2)
sns.violinplot(x='confidence', y='avg_pupil_size',
               data=fixdat, inner=None, color='lightgray')

# Overlay a strip plot on the violin plot
sns.stripplot(x='confidence', y='avg_pupil_size',
              data=fixdat, size=1.5, jitter=True)

# Display the plot
plt.show()

# '''''''''' Plotting Joint Distributions - 1 '''''''''' #

# Generate a joint plot of 'duration' and 'confidence'
_ = sns.jointplot(x='duration', y='confidence', data=fixdat)

# Display the plot
plt.show()

# Generate a joint plot of 'duration' and 'avg_pupil_size'
_ = sns.jointplot(x='duration', y='avg_pupil_size', data=fixdat)

# Display the plot
plt.show()

# '''''''''' Plotting Joint Distributions - 2 '''''''''' #

# Hex bin plot - jointplot's kind can be scatter/reg/resid/kde/hex (hex below)

# Generate a joint plot of 'duration' and 'confidence' using a hexbin plot
_ = sns.jointplot(x='duration', y='confidence', data=fixdat, kind='hex')

# Display the plot
plt.show()

# '''''''''' Plot the Distributions Pairwise '''''''''' #

# Print the first 5 rows of the DataFrame
print(fixdat.head())

# Plot the pairwise joint distributions from the DataFrame
sns.pairplot(fixdat)

# Display the plot
plt.show()

# '''''''''' Pairwise distributions grouped by hue + regression lines ''''' #

# Print the first 5 rows of the DataFrame
print(fixdat.head())

# Plot the pairwise joint distributions grouped by 'dispersion' along with
# regression lines
sns.pairplot(fixdat, kind='reg', hue='dispersion')

# Display the plot
plt.show()

# '''''''''' Correlation Viz with a Heat Map (Covariance Matrix) '''''''''' #

# Compute the covariance matrix (assumed preloaded in the original exercise)
cov_matrix = fixdat.cov()

# Print the covariance matrix
print(cov_matrix)

# Visualize the covariance matrix using a heatmap
sns.heatmap(cov_matrix)

# Display the heatmap
plt.show()

_ = plt.plot(fixdat)
_ = plt.legend(loc='upper right')
plt.show()
-------------------------------------------------------------------------------- /csv_DataFrame_NumpyArray.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 10 15:03:52 2017

@author: Shabaka
"""

# Import numpy and pandas
import numpy as np
import pandas as pd

# Assign the filename: file
file = 'titanic.csv'

# Read the file into a DataFrame: df
df = pd.read_csv(file)

# View the head of the DataFrame
print(df.head())

# Assign the filename: file
file = 'digits.csv'

# Read the first 5 rows of the file into a DataFrame: data
data = pd.read_csv(file, nrows=5, header=None)

# Build a numpy array from the DataFrame: data_array
data_array = np.array(data.values)

# Print the datatype of data_array to the shell
print(type(data_array))
-------------------------------------------------------------------------------- /draw_bootstrap_reps.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 12 22:07:58 2017

@author: Shabaka
"""

import numpy as np


def bootstrap_replicate_1d(data, func):
    """Generate bootstrap replicate of 1D data."""
    bs_sample = np.random.choice(data, len(data))

    return func(bs_sample)

# One-line alternative:
# def bootstrap_replicate_1d(data, func):
#     return func(np.random.choice(data, size=len(data)))


def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates."""

    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)

    return bs_replicates
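# Hedged usage sketch (not part of the original file): bootstrap a 95%
# confidence interval for the mean of synthetic data.
if __name__ == '__main__':
    samples = np.random.normal(loc=10, scale=2, size=200)
    reps = draw_bs_reps(samples, np.mean, size=10000)
    print(np.percentile(reps, [2.5, 97.5]))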
-------------------------------------------------------------------------------- /draw_bs_pairs.py: --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 26 12:44:52 2017

@author: Shabaka
"""

import numpy as np


def draw_bs_pairs(x, y, func, size=1):
    """Perform pairs bootstrap, applying func to resampled (x, y) pairs."""

    # Set up array of indices to sample from: inds
    inds = np.arange(len(x))

    # Initialize replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_inds = np.random.choice(inds, len(inds))
        bs_x, bs_y = x[bs_inds], y[bs_inds]
        bs_replicates[i] = func(bs_x, bs_y)

    return bs_replicates
--------------------------------------------------------------------------------