├── LICENSE
├── Logistic Regression with StatsModels
│   ├── logistic.py
│   ├── test.csv
│   └── train.csv
├── Logistic-Regression
│   ├── citreo.py
│   ├── citreo_code_v2.py
│   ├── classifier_corrected.py
│   └── logistic_regression_updated.py
├── README.md
├── Twitter-Data-Analysis
│   ├── extract_twitter_data.py
│   └── json2tweets.R
├── basic_commands.py
└── svm_sklearn.py

-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Ujjwal Karn

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-------------------------------------------------------------------------------- /Logistic Regression with StatsModels/logistic.py: --------------------------------------------------------------------------------
"""
Created on Wed Sep 09 12:38:16 2015
@author: ujjwal.karn
"""

import pandas as pd            # for handling datasets
import statsmodels.api as sm   # for statistical modeling
import pylab as pl             # for plotting
import numpy as np             # for numerical computation

# read the data in
dfTrain = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\train.csv")
dfTest = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\test.csv")

# take a look at the dataset
print(dfTrain.head())
#   admit  gre   gpa  prestige
#0      0  380  3.61      good
#1      1  660  3.67      good
#2      1  800  4.00      best
#3      1  640  3.19        ok
#4      0  520  2.93        ok

print(dfTest.head())
#   gre   gpa  prestige
#0  640  3.30  veryGood
#1  660  3.60      good
#2  400  3.15  veryGood
#3  680  3.98  veryGood
#4  220  2.83      good

# summarize the data
print(dfTrain.describe())
#            admit         gre         gpa
#count  300.000000  300.000000  300.000000
#mean     0.306667  590.866667    3.386233
#std      0.461880  117.717630    0.374880
#min      0.000000  300.000000    2.260000
#25%      0.000000  515.000000    3.130000
#50%      0.000000  600.000000    3.390000
#75%      1.000000  680.000000    3.642500
#max      1.000000  800.000000    4.000000

# take a look at the standard deviation of each column
print(dfTrain.std())
#admit      0.46188
#gre      117.71763
#gpa        0.37488

# frequency table cutting prestige and whether or not someone was admitted
print(pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']))
#prestige  best  good  ok  veryGood
#admit
#0           20    73  47        68
#1           25    19   9        39

# explore data
print(dfTrain.groupby('admit').mean())
#              gre       gpa
#admit
#0      573.461538  3.336587
#1      630.217391  3.498478

# plot one column
dfTrain['gpa'].hist()
pl.title('Histogram of GPA')
pl.xlabel('GPA')
pl.ylabel('Frequency')
pl.show()

# barplot of gre score grouped by admission status (True or False)
pd.crosstab(dfTrain.gre, dfTrain.admit.astype(bool)).plot(kind='bar')
pl.title('GRE score by Admission Status')
pl.xlabel('GRE score')
pl.ylabel('Frequency')
pl.show()

# dummify prestige
dummy_ranks = pd.get_dummies(dfTrain['prestige'], prefix='prestige')
print(dummy_ranks.head())
#   prestige_best  prestige_good  prestige_ok  prestige_veryGood
#0              0              1            0                  0
#1              0              1            0                  0
#2              1              0            0                  0
#3              0              0            1                  0
#4              0              0            1                  0

# create a clean data frame for the regression; prestige_best is dropped
# so it becomes the baseline category (avoids the dummy variable trap)
cols_to_keep = ['admit', 'gre', 'gpa']
data = dfTrain[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_good':])
print(data.head())
#   admit  gre   gpa  prestige_good  prestige_ok  prestige_veryGood
#0      0  380  3.61              1            0                  0
#1      1  660  3.67              1            0                  0
#2      1  800  4.00              0            0                  0
#3      1  640  3.19              0            1                  0
#4      0  520  2.93              0            1                  0

# manually add the intercept
data['intercept'] = 1.0

print(data.head())

train_cols = data.columns[1:]
print(data.columns[1:])
# Index([u'gre', u'gpa', u'prestige_good', u'prestige_ok', u'prestige_veryGood', u'intercept'], dtype='object')

# Logistic Regression
logit = sm.Logit(data['admit'], data[train_cols])

# fit the model
result = logit.fit()
print(result.summary())
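# Not part of the original script: the fitted coefficients above are
# log-odds, so exponentiating them gives odds ratios, which are often
# easier to read (assumes `result` from the fit above).
print(np.exp(result.params))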
# recreate the dummy variables for the test set
dummy_ranks_test = pd.get_dummies(dfTest['prestige'], prefix='prestige')
print(dummy_ranks_test)

# create intercept column
dfTest['intercept'] = 1.0

# keep only what we need for making predictions
cols_to_keep = ['gre', 'gpa', 'prestige', 'intercept']
dfTest = dfTest[cols_to_keep].join(dummy_ranks_test.loc[:, 'prestige_good':])

print(dfTest.head())

# make predictions on the enumerated dataset
dfTest['admit_pred'] = result.predict(dfTest[train_cols])

# see probabilities
print(dfTest.head())

# convert probabilities to 'yes'/'no' using a 0.5 cutoff
dfTest['admit_yn'] = np.where(dfTest['admit_pred'] > 0.5, 'yes', 'no')
print(dfTest.head())

cols = ['gre', 'gpa', 'admit_yn']
print(dfTest[cols].groupby('admit_yn').mean())
#                 gre       gpa
#admit_yn
#no        556.585366  3.324268
#yes       676.666667  3.750000

dfTest.to_csv('C:\\Users\\ujjwal.karn\\Desktop\\Python\\output.csv', sep=',')
-------------------------------------------------------------------------------- /Logistic Regression with StatsModels/test.csv: --------------------------------------------------------------------------------
gre,gpa,prestige
640,3.3,veryGood
660,3.6,good
400,3.15,veryGood
680,3.98,veryGood
220,2.83,good
580,3.46,ok
540,3.17,best
580,3.51,veryGood
540,3.13,veryGood
440,2.98,good
560,4,good
660,3.67,veryGood
660,3.77,good
520,3.65,ok
540,3.46,ok
300,2.84,veryGood
340,3,veryGood
780,3.63,ok
480,3.71,ok
540,3.28,best
460,3.14,good
460,3.58,veryGood
500,3.01,ok
420,2.69,veryGood 26 | 520,2.7,good 27 | 680,3.9,best 28 | 680,3.31,veryGood 29 | 560,3.48,veryGood 30 | 580,3.34,veryGood 31 | 500,2.93,ok 32 | 740,4,good 33 | 660,3.59,good 34 | 420,2.96,best 35 | 560,3.43,good 36 | 460,3.64,good 37 | 620,3.71,best 38 | 520,3.15,good 39 | 620,3.09,ok 40 | 540,3.2,best 41 | 660,3.47,good 42 | 500,3.23,ok 43 | 560,2.65,good 44 | 500,3.95,ok 45 | 580,3.06,veryGood 46 | 520,3.35,good 47 | 500,3.03,good 48 | 600,3.35,veryGood 49 | 580,3.8,veryGood 50 | 400,3.36,veryGood 51 | 620,2.85,veryGood 52 | 780,4,veryGood 53 | 620,3.43,good 54 | 580,3.12,good 55 | 700,3.52,veryGood 56 | 540,3.78,veryGood 57 | 760,2.81,best 58 | 700,3.27,veryGood 59 | 720,3.31,best 60 | 560,3.69,good 61 | 720,3.94,good 62 | 520,4,best 63 | 540,3.49,best 64 | 680,3.14,veryGood 65 | 460,3.44,veryGood 66 | 560,3.36,best 67 | 480,2.78,good 68 | 460,2.93,good 69 | 620,3.63,good 70 | 580,4,best 71 | 800,3.89,veryGood 72 | 540,3.77,veryGood 73 | 680,3.76,good 74 | 680,2.42,best 75 | 620,3.37,best 76 | 560,3.78,veryGood 77 | 560,3.49,ok 78 | 620,3.63,veryGood 79 | 800,4,veryGood 80 | 640,3.12,good 81 | 540,2.7,veryGood 82 | 700,3.65,veryGood 83 | 540,3.49,veryGood 84 | 540,3.51,veryGood 85 | 660,4,best 86 | 480,2.62,veryGood 87 | 420,3.02,best 88 | 740,3.86,veryGood 89 | 580,3.36,veryGood 90 | 640,3.17,veryGood 91 | 640,3.51,veryGood 92 | 800,3.05,veryGood 93 | 660,3.88,veryGood 94 | 600,3.38,good 95 | 620,3.75,veryGood 96 | 460,3.99,good 97 | 620,4,veryGood 98 | 560,3.04,good 99 | 460,2.63,veryGood 100 | 700,3.65,veryGood 101 | 600,3.89,good 102 | -------------------------------------------------------------------------------- /Logistic Regression with StatsModels/train.csv: -------------------------------------------------------------------------------- 1 | admit,gre,gpa,prestige 2 | 0,380,3.61,good 3 | 1,660,3.67,good 4 | 1,800,4,best 5 | 1,640,3.19,ok 6 | 0,520,2.93,ok 7 | 1,760,3,veryGood 8 | 1,560,2.98,best 9 | 0,400,3.08,veryGood 10 | 1,540,3.39,good 11 | 0,700,3.92,veryGood 12 | 0,800,4,ok 13 | 0,440,3.22,best 14 | 1,760,4,best 15 | 0,700,3.08,veryGood 16 | 1,700,4,best 17 | 0,480,3.44,good 18 | 0,780,3.87,ok 19 | 0,360,2.56,good 20 | 0,800,3.75,veryGood 21 | 1,540,3.81,best 22 | 0,500,3.17,good 23 | 1,660,3.63,veryGood 24 | 0,600,2.82,ok 25 | 0,680,3.19,ok 26 | 1,760,3.35,veryGood 27 | 1,800,3.66,best 28 | 1,620,3.61,best 29 | 1,520,3.74,ok 30 | 1,780,3.22,veryGood 31 | 0,520,3.29,best 32 | 0,540,3.78,ok 33 | 0,760,3.35,good 34 | 0,600,3.4,good 35 | 1,800,4,good 36 | 0,360,3.14,best 37 | 0,400,3.05,veryGood 38 | 0,580,3.25,best 39 | 0,520,2.9,good 40 | 1,500,3.13,veryGood 41 | 1,520,2.68,good 42 | 0,560,2.42,veryGood 43 | 1,580,3.32,veryGood 44 | 1,600,3.15,veryGood 45 | 0,500,3.31,good 46 | 0,700,2.94,veryGood 47 | 1,460,3.45,good 48 | 1,580,3.46,veryGood 49 | 0,500,2.97,ok 50 | 0,440,2.48,ok 51 | 0,400,3.35,good 52 | 0,640,3.86,good 53 | 0,440,3.13,ok 54 | 0,740,3.37,ok 55 | 1,680,3.27,veryGood 56 | 0,660,3.34,good 57 | 1,740,4,good 58 | 0,560,3.19,good 59 | 0,380,2.94,good 60 | 0,400,3.65,veryGood 61 | 0,600,2.82,ok 62 | 1,620,3.18,veryGood 63 | 0,560,3.32,ok 64 | 0,640,3.67,good 65 | 1,680,3.85,good 66 | 0,580,4,good 67 | 0,600,3.59,veryGood 68 | 0,740,3.62,ok 69 | 0,620,3.3,best 70 | 0,580,3.69,best 71 | 0,800,3.73,best 72 | 0,640,4,good 73 | 0,300,2.92,ok 74 | 0,480,3.39,ok 75 | 0,580,4,veryGood 76 | 0,720,3.45,ok 77 | 0,720,4,good 78 | 0,560,3.36,good 79 | 1,800,4,good 80 | 0,540,3.12,best 81 | 1,620,4,best 82 | 0,700,2.9,ok 83 | 0,620,3.07,veryGood 84 | 0,500,2.71,veryGood 85 | 
0,380,2.91,ok 86 | 1,500,3.6,good 87 | 0,520,2.98,veryGood 88 | 0,600,3.32,veryGood 89 | 0,600,3.48,veryGood 90 | 0,700,3.28,best 91 | 1,660,4,veryGood 92 | 0,700,3.83,veryGood 93 | 1,720,3.64,best 94 | 0,800,3.9,veryGood 95 | 0,580,2.93,veryGood 96 | 1,660,3.44,veryGood 97 | 0,660,3.33,veryGood 98 | 0,640,3.52,ok 99 | 0,480,3.57,veryGood 100 | 0,700,2.88,veryGood 101 | 0,400,3.31,good 102 | 0,340,3.15,good 103 | 0,580,3.57,good 104 | 0,380,3.33,ok 105 | 0,540,3.94,good 106 | 1,660,3.95,veryGood 107 | 1,740,2.97,veryGood 108 | 1,700,3.56,best 109 | 0,480,3.13,veryGood 110 | 0,400,2.93,good 111 | 0,480,3.45,veryGood 112 | 0,680,3.08,ok 113 | 0,420,3.41,ok 114 | 0,360,3,good 115 | 0,600,3.22,best 116 | 0,720,3.84,good 117 | 0,620,3.99,good 118 | 1,440,3.45,veryGood 119 | 0,700,3.72,veryGood 120 | 1,800,3.7,best 121 | 0,340,2.92,good 122 | 1,520,3.74,veryGood 123 | 1,480,2.67,veryGood 124 | 0,520,2.85,good 125 | 0,500,2.98,good 126 | 0,720,3.88,good 127 | 0,540,3.38,ok 128 | 1,600,3.54,best 129 | 0,740,3.74,ok 130 | 0,540,3.19,veryGood 131 | 0,460,3.15,ok 132 | 1,620,3.17,veryGood 133 | 0,640,2.79,veryGood 134 | 0,580,3.4,veryGood 135 | 0,500,3.08,good 136 | 0,560,2.95,veryGood 137 | 0,500,3.57,good 138 | 0,560,3.33,ok 139 | 0,700,4,good 140 | 0,620,3.4,veryGood 141 | 1,600,3.58,best 142 | 0,640,3.93,veryGood 143 | 1,700,3.52,ok 144 | 0,620,3.94,ok 145 | 0,580,3.4,good 146 | 0,580,3.4,ok 147 | 0,380,3.43,good 148 | 0,480,3.4,veryGood 149 | 0,560,2.71,good 150 | 1,480,2.91,best 151 | 0,740,3.31,best 152 | 1,800,3.74,best 153 | 0,400,3.38,veryGood 154 | 1,640,3.94,veryGood 155 | 0,580,3.46,good 156 | 0,620,3.69,good 157 | 1,580,2.86,ok 158 | 0,560,2.52,veryGood 159 | 1,480,3.58,best 160 | 0,660,3.49,veryGood 161 | 0,700,3.82,good 162 | 0,600,3.13,veryGood 163 | 0,640,3.5,veryGood 164 | 1,700,3.56,veryGood 165 | 0,520,2.73,veryGood 166 | 0,580,3.3,veryGood 167 | 0,700,4,best 168 | 0,440,3.24,ok 169 | 0,720,3.77,good 170 | 0,500,4,good 171 | 0,600,3.62,good 172 | 0,400,3.51,good 173 | 0,540,2.81,good 174 | 0,680,3.48,good 175 | 1,800,3.43,veryGood 176 | 0,500,3.53,ok 177 | 1,620,3.37,veryGood 178 | 0,520,2.62,veryGood 179 | 1,620,3.23,good 180 | 0,620,3.33,good 181 | 0,300,3.01,good 182 | 0,620,3.78,good 183 | 0,500,3.88,ok 184 | 0,700,4,veryGood 185 | 1,540,3.84,veryGood 186 | 0,500,2.79,ok 187 | 0,800,3.6,veryGood 188 | 0,560,3.61,good 189 | 0,580,2.88,veryGood 190 | 0,560,3.07,veryGood 191 | 0,500,3.35,veryGood 192 | 1,640,2.94,veryGood 193 | 0,800,3.54,good 194 | 0,640,3.76,good 195 | 0,380,3.59,ok 196 | 1,600,3.47,veryGood 197 | 0,560,3.59,veryGood 198 | 0,660,3.07,good 199 | 1,400,3.23,ok 200 | 0,600,3.63,good 201 | 0,580,3.77,ok 202 | 0,800,3.31,good 203 | 1,580,3.2,veryGood 204 | 1,700,4,best 205 | 0,420,3.92,ok 206 | 1,600,3.89,best 207 | 1,780,3.8,good 208 | 0,740,3.54,best 209 | 1,640,3.63,best 210 | 0,540,3.16,good 211 | 0,580,3.5,veryGood 212 | 0,740,3.34,ok 213 | 0,580,3.02,veryGood 214 | 0,460,2.87,veryGood 215 | 0,640,3.38,good 216 | 1,600,3.56,veryGood 217 | 1,660,2.91,good 218 | 0,340,2.9,best 219 | 1,460,3.64,best 220 | 0,460,2.98,best 221 | 1,560,3.59,veryGood 222 | 0,540,3.28,good 223 | 0,680,3.99,good 224 | 1,480,3.02,best 225 | 0,800,3.47,good 226 | 0,800,2.9,veryGood 227 | 1,720,3.5,good 228 | 0,620,3.58,veryGood 229 | 0,540,3.02,ok 230 | 0,480,3.43,veryGood 231 | 1,720,3.42,veryGood 232 | 0,580,3.29,ok 233 | 0,600,3.28,good 234 | 0,380,3.38,veryGood 235 | 0,420,2.67,good 236 | 1,800,3.53,best 237 | 0,620,3.05,veryGood 238 | 1,660,3.49,veryGood 239 | 0,480,4,veryGood 240 | 
0,500,2.86,ok 241 | 0,700,3.45,good 242 | 0,440,2.76,veryGood 243 | 1,520,3.81,best 244 | 1,680,2.96,good 245 | 0,620,3.22,veryGood 246 | 0,540,3.04,best 247 | 0,800,3.91,good 248 | 0,680,3.34,veryGood 249 | 0,440,3.17,veryGood 250 | 0,680,3.64,good 251 | 0,640,3.73,good 252 | 0,660,3.31,ok 253 | 0,620,3.21,ok 254 | 1,520,4,veryGood 255 | 1,540,3.55,ok 256 | 1,740,3.52,ok 257 | 0,640,3.35,good 258 | 1,520,3.3,veryGood 259 | 1,620,3.95,good 260 | 0,520,3.51,veryGood 261 | 0,640,3.81,veryGood 262 | 0,680,3.11,veryGood 263 | 0,440,3.15,veryGood 264 | 1,520,3.19,good 265 | 1,620,3.95,good 266 | 1,520,3.9,good 267 | 0,380,3.34,good 268 | 0,560,3.24,ok 269 | 1,600,3.64,good 270 | 1,680,3.46,veryGood 271 | 0,500,2.81,good 272 | 1,640,3.95,veryGood 273 | 0,540,3.33,good 274 | 1,680,3.67,veryGood 275 | 0,660,3.32,best 276 | 0,520,3.12,veryGood 277 | 1,600,2.98,veryGood 278 | 0,460,3.77,good 279 | 1,580,3.58,best 280 | 1,680,3,ok 281 | 1,660,3.14,veryGood 282 | 0,660,3.94,veryGood 283 | 0,360,3.27,good 284 | 0,660,3.45,ok 285 | 0,520,3.1,ok 286 | 1,440,3.39,veryGood 287 | 0,600,3.31,ok 288 | 1,800,3.22,best 289 | 1,660,3.7,ok 290 | 0,800,3.15,ok 291 | 0,420,2.26,ok 292 | 1,620,3.45,veryGood 293 | 0,800,2.78,veryGood 294 | 0,680,3.7,veryGood 295 | 0,800,3.97,best 296 | 0,480,2.55,best 297 | 0,520,3.25,good 298 | 0,560,3.16,best 299 | 0,460,3.07,veryGood 300 | 0,540,3.5,veryGood 301 | 0,720,3.4,good 302 | -------------------------------------------------------------------------------- /Logistic-Regression/citreo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | 22 | 23 | # parameters ################################################################# 24 | 25 | train = 'train.csv' # path to training file 26 | test = 'test.csv' # path to testing file 27 | 28 | D = 2 ** 20 # number of weights use for learning 29 | alpha = .1 # learning rate for sgd optimization 30 | 31 | 32 | # function definitions ####################################################### 33 | 34 | # A. Bounded logloss 35 | # INPUT: 36 | # p: our prediction 37 | # y: real answer 38 | # OUTPUT 39 | # logarithmic loss of p given y 40 | def logloss(p, y): 41 | p = max(min(p, 1. - 10e-12), 10e-12) 42 | return -log(p) if y == 1. else -log(1. - p) 43 | 44 | 45 | # B. Apply hash trick of the original csv row 46 | # for simplicity, we treat both integer and categorical features as categorical 47 | # INPUT: 48 | # csv_row: a csv dictionary, ex: {'Lable': '1', 'I1': '357', 'I2': '', ...} 49 | # D: the max index that we can hash to 50 | # OUTPUT: 51 | # x: a list of indices that its value is 1 52 | def get_x(csv_row, D): 53 | x = [0] # 0 is the index of the bias term 54 | for key, value in csv_row.items(): 55 | index = int(value + key[1:], 16) % D # weakest hash ever ;) 56 | x.append(index) 57 | return x # x contains indices of features that have a value of 1 58 | 59 | 60 | # C. 
Get probability estimation on x 61 | # INPUT: 62 | # x: features 63 | # w: weights 64 | # OUTPUT: 65 | # probability of p(y = 1 | x; w) 66 | def get_p(x, w): 67 | wTx = 0. 68 | for i in x: # do wTx 69 | wTx += w[i] * 1. # w[i] * x[i], but if i in x we got x[i] = 1. 70 | return 1. / (1. + exp(-max(min(wTx, 20.), -20.))) # bounded sigmoid 71 | 72 | 73 | # D. Update given model 74 | # INPUT: 75 | # w: weights 76 | # n: a counter that counts the number of times we encounter a feature 77 | # this is used for adaptive learning rate 78 | # x: feature 79 | # p: prediction of our model 80 | # y: answer 81 | # OUTPUT: 82 | # w: updated model 83 | # n: updated count 84 | def update_w(w, n, x, p, y): 85 | for i in x: 86 | # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic 87 | # (p - y) * x[i] is the current gradient 88 | # note that in our case, if i in x then x[i] = 1 89 | w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.) 90 | n[i] += 1. 91 | 92 | return w, n 93 | 94 | 95 | # training and testing ####################################################### 96 | 97 | # initialize our model 98 | w = [0.] * D # weights 99 | n = [0.] * D # number of times we've encountered a feature 100 | 101 | # start training a logistic regression model using on pass sgd 102 | loss = 0. 103 | for t, row in enumerate(DictReader(open(train))): 104 | y = 1. if row['Label'] == '1' else 0. 105 | 106 | del row['Label'] # can't let the model peek the answer 107 | del row['Id'] # we don't need the Id 108 | 109 | # main training procedure 110 | # step 1, get the hashed features 111 | x = get_x(row, D) 112 | 113 | # step 2, get prediction 114 | p = get_p(x, w) 115 | 116 | # for progress validation, useless for learning our model 117 | loss += logloss(p, y) 118 | if t % 1000000 == 0 and t > 1: 119 | print('%s\tencountered: %d\tcurrent logloss: %f' % ( 120 | datetime.now(), t, loss/t)) 121 | 122 | # step 3, update model with answer 123 | w, n = update_w(w, n, x, p, y) 124 | 125 | # testing (build kaggle's submission file) 126 | with open('submission1234.csv', 'w') as submission: 127 | submission.write('Id,Predicted\n') 128 | for t, row in enumerate(DictReader(open(test))): 129 | Id = row['Id'] 130 | del row['Id'] 131 | x = get_x(row, D) 132 | p = get_p(x, w) 133 | submission.write('%s,%f\n' % (Id, p)) -------------------------------------------------------------------------------- /Logistic-Regression/citreo_code_v2.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from csv import DictReader 3 | from math import exp, log, sqrt 4 | 5 | 6 | # parameters ################################################################# 7 | 8 | train = 'train.csv' # path to training file 9 | test = 'test.csv' # path to testing file 10 | 11 | D = 2 ** 20 # number of weights use for learning 12 | alpha = .1 # learning rate for sgd optimization 13 | 14 | 15 | # function definitions ####################################################### 16 | 17 | # A. Bounded logloss 18 | # INPUT: 19 | # p: our prediction 20 | # y: real answer 21 | # OUTPUT 22 | # logarithmic loss of p given y 23 | def logloss(p, y): 24 | p = max(min(p, 1. - 10e-12), 10e-12) 25 | return -log(p) if y == 1. else -log(1. - p) 26 | 27 | 28 | # B. 
# Apply the hash trick to the original csv row
# for simplicity, we treat both integer and categorical features as categorical
# INPUT:
#     csv_row: a csv dictionary, ex: {'Label': '1', 'I1': '357', 'I2': '', ...}
#     D: the max index that we can hash to
# OUTPUT:
#     x: a list of indices whose value is 1
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in csv_row.items():
        index = int(value + key[1:], 16) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1


# C. Get probability estimation on x
# INPUT:
#     x: features
#     w: weights
# OUTPUT:
#     probability of p(y = 1 | x; w)
def get_p(x, w):
    wTx = 0.
    for i in x:  # do wTx
        wTx += w[i] * 1.  # w[i] * x[i], but if i in x we got x[i] = 1.
    return 1. / (1. + exp(-max(min(wTx, 20.), -20.)))  # bounded sigmoid


# D. Update given model
# INPUT:
#     w: weights
#     n: a counter that counts the number of times we encounter a feature
#        this is used for adaptive learning rate
#     x: feature
#     p: prediction of our model
#     y: answer
# OUTPUT:
#     w: updated model
#     n: updated count
def update_w(w, n, x, p, y):
    for i in x:
        # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic
        # (p - y) * x[i] is the current gradient
        # note that in our case, if i in x then x[i] = 1
        w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.)
        n[i] += 1.

    return w, n


# training and testing #######################################################

# initialize our model
w = [0.] * D  # weights
n = [0.] * D  # number of times we've encountered a feature

# start training a logistic regression model using one-pass SGD
loss = 0.
for t, row in enumerate(DictReader(open(train))):
    y = 1. if row['Label'] == '1' else 0.

    del row['Label']  # can't let the model peek at the answer
    del row['Id']     # we don't need the Id

    # main training procedure
    # step 1, get the hashed features
    x = get_x(row, D)

    # step 2, get prediction
    p = get_p(x, w)

    # for progress validation, useless for learning our model
    loss += logloss(p, y)
    if t % 1000000 == 0 and t > 1:
        print('%s\tencountered: %d\tcurrent logloss: %f' % (
            datetime.now(), t, loss/t))

    # step 3, update model with answer
    w, n = update_w(w, n, x, p, y)

# testing (build kaggle's submission file)
with open('submission1234.csv', 'w') as submission:
    submission.write('Id,Predicted\n')
    for t, row in enumerate(DictReader(open(test))):
        Id = row['Id']
        del row['Id']
        x = get_x(row, D)
        p = get_p(x, w)
        submission.write('%s,%f\n' % (Id, p))
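# A minimal standalone sketch of the hashing trick that get_x() implements
# above; the row below is made up for illustration, and D_demo matches the
# script's D = 2 ** 20 setting.
D_demo = 2 ** 20
row_demo = {'I1': '357', 'C2': 'a8e7'}
x_demo = [0]  # index 0 is reserved for the bias term
for key, value in row_demo.items():
    # hash each (column, value) pair into one of D_demo buckets, so no
    # explicit feature dictionary is ever stored
    x_demo.append(int(value + key[1:], 16) % D_demo)
print(x_demo)  # [0, 13681, 691826] -- the weight indices this row activates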
-------------------------------------------------------------------------------- /Logistic-Regression/classifier_corrected.py: --------------------------------------------------------------------------------
#https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4797/starter-code-in-python-with-scikit-learn-auc-885

""" Amazon Access Challenge Starter Code

These files provide some starter code using the scikit-learn library.
They give examples of how to design a simple algorithm: pre-processing,
training a logistic regression classifier on the data, assessing its
performance through cross-validation, and some pointers on where to go
next.

Paul Duan
"""

from __future__ import division

import numpy as np
from sklearn import (metrics, cross_validation, linear_model, preprocessing)

SEED = 42  # always use a seed for randomized procedures


def load_data(filename, use_labels=True):
    """
    Load data from CSV files and return them as numpy arrays.
    The use_labels parameter indicates whether one should
    read the first column (containing class labels). If False,
    return all 0s.
    """

    # load columns 1 to 8 (ignore the last one)
    data = np.loadtxt(open("data/" + filename), delimiter=',',
                      usecols=range(1, 9), skiprows=1)
    if use_labels:
        labels = np.loadtxt(open("data/" + filename), delimiter=',',
                            usecols=[0], skiprows=1)
    else:
        labels = np.zeros(data.shape[0])
    return labels, data


def save_results(predictions, filename):
    """Given a vector of predictions, save results in CSV format."""
    with open(filename, 'w') as f:
        f.write("id,ACTION\n")
        for i, pred in enumerate(predictions):
            f.write("%d,%f\n" % (i + 1, pred))


def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.
    y and X will be numpy array objects.
    """
    model = linear_model.LogisticRegression(C=3)  # the classifier we'll use

    # === load data in memory === #
    print("loading data")
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)
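    # A rough illustration (not from the original) of what the encoder does,
    # assuming the integer-category OneHotEncoder API used above: each column
    # expands into one binary indicator column per distinct category, e.g.
    #     [[0, 1],          [[1, 0, 0, 0, 1],
    #      [1, 0],    -->    [0, 1, 0, 1, 0],
    #      [2, 1]]           [0, 0, 1, 0, 1]]
    # (the first three output columns encode column 0's categories {0, 1, 2},
    #  the last two encode column 1's categories {0, 1})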
    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: %f" % (mean_auc/n))

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    model.fit(X, y)
    preds = model.predict_proba(X_test)[:, 1]
    filename = raw_input("Enter name for submission file: ")
    save_results(preds, filename + ".csv")

if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- /Logistic-Regression/logistic_regression_updated.py: --------------------------------------------------------------------------------
#https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4838/python-code-to-achieve-0-90-auc-with-logistic-regression

__author__ = 'Miroslaw Horbal'
__email__ = 'miroslaw@gmail.com'
__date__ = '14-06-2013'

from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations

import numpy as np
import pandas as pd

SEED = 25

def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Groups the columns of data into all combinations of `degree` columns
    (triples by default) and hashes each combined tuple of values.
    """
    new_data = []
    m, n = data.shape
    for indices in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indices]])
    return array(new_data).T

def OneHotEncoder(data, keymap=None):
    """
    OneHotEncoder takes a data matrix with categorical columns and
    converts it to a sparse binary matrix.

    Returns the sparse binary matrix and a keymap mapping categories to
    indices. If a keymap is supplied on input, it will be used instead of
    creating one, and any categories appearing in the data that are not in
    the keymap are ignored.
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(list(col))
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap

def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' % (i+1, p))
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print('Saved')

# This loop is essentially from Paul's starter code
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                        X, y, test_size=.20,
                        random_state=i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.roc_auc_score(y_cv, preds)
        print("AUC (fold %d/%d): %f" % (i + 1, N, auc))
        mean_auc += auc
    return mean_auc/N

def main(train='train.csv', test='test.csv', submit='logistic_pred.csv'):
    print("Reading dataset...")
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    all_data = np.vstack((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:-1]))

    num_train = np.shape(train_data)[0]

    # Transform data
    print("Transforming data...")
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)
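    # What the two calls above produce, roughly: for a row with columns
    # (a, b, c, d), degree=2 yields hash((a, b)), hash((a, c)), hash((a, d)),
    # hash((b, c)), hash((b, d)) and hash((c, d)) -- one new categorical
    # column per pair of original columns -- and degree=3 does the same for
    # every triple. The hashed tuples act as interaction features between
    # the original categorical variables.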
    y = array(train_data.ACTION)
    X = all_data[:num_train]
    X_2 = dp[:num_train]
    X_3 = dt[:num_train]

    X_test = all_data[num_train:]
    X_test_2 = dp[num_train:]
    X_test_3 = dt[num_train:]

    X_train_all = np.hstack((X, X_2, X_3))
    X_test_all = np.hstack((X_test, X_test_2, X_test_3))
    num_features = X_train_all.shape[1]

    model = linear_model.LogisticRegression()

    # Xts holds one hot encodings for each individual feature in memory
    # speeding up feature selection
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print("Performing greedy feature selection...")
    score_hist = []
    N = 10
    good_features = set([])
    # Greedy feature selection loop: keep adding the single feature that
    # improves mean AUC the most, until the score stops improving
    while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
        scores = []
        for f in range(len(Xts)):
            if f not in good_features:
                feats = list(good_features) + [f]
                Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                score = cv_loop(Xt, y, model, N)
                scores.append((score, f))
                print("Feature: %i Mean AUC: %f" % (f, score))
        good_features.add(sorted(scores)[-1][1])
        score_hist.append(sorted(scores)[-1])
        print("Current features: %s" % sorted(list(good_features)))

    # Remove the last added feature from good_features (it is the one that
    # failed to improve the score and ended the loop)
    good_features.remove(score_hist[-1][1])
    good_features = sorted(list(good_features))
    print("Selected features %s" % good_features)

    print("Performing hyperparameter selection...")
    # Hyperparameter selection loop
    score_hist = []
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    Cvals = np.logspace(-4, 4, 15, base=2)
    for C in Cvals:
        model.C = C
        score = cv_loop(Xt, y, model, N)
        score_hist.append((score, C))
        print("C: %f Mean AUC: %f" % (C, score))
    bestC = sorted(score_hist)[-1][1]
    print("Best C value: %f" % (bestC))

    print("Performing One Hot Encoding on entire dataset...")
    Xt = np.vstack((X_train_all[:, good_features], X_test_all[:, good_features]))
    Xt, keymap = OneHotEncoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print("Training full model...")
    model.C = bestC  # use the best C found during hyperparameter selection
    model.fit(X_train, y)

    print("Making prediction and saving results...")
    preds = model.predict_proba(X_test)[:, 1]
    create_test_submission(submit, preds)

if __name__ == "__main__":
    args = {'train': 'train.csv',
            'test': 'test.csv',
            'submit': 'logistic_regression_pred.csv'}
    main(**args)
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# Python Data Science Tutorials
- This repo contains a curated list of Python tutorials for Data Science, NLP and Machine Learning.

- [**Curated list of R tutorials for Data Science, NLP and Machine Learning**](https://github.com/ujjwalkarn/DataScienceR).

- [Comprehensive topic-wise list of Machine Learning and Deep Learning tutorials, codes, articles and other resources](https://github.com/ujjwalkarn/Machine-Learning-Tutorials/blob/master/README.md).
7 | 8 | ## The Python Language 9 | - [Python 3 in one picture](https://fossbytes.com/wp-content/uploads/2015/09/python-3-in-one-pic.png) 10 | - [**Awesome Python**](https://github.com/vinta/awesome-python) 11 | - [**Jargon from the functional programming world in simple terms!**](https://github.com/hemanth/functional-programming-jargon) 12 | - [**Dive Into Python**](http://www.diveintopython.net/index.html) 13 | - [Learn Python Wiki on Reddit](https://www.reddit.com/r/learnpython/wiki/index) 14 | - [Learn 90% of Python in 90 Minutes](https://www.slideshare.net/MattHarrison4/learn-90) 15 | - [Highest Voted Python Questions](http://stackoverflow.com/questions/tagged/python?sort=votes&pageSize=50) 16 | - [Python Basic Concepts](https://github.com/gumption/Python_for_Data_Science/blob/master/3_Python_Basic_Concepts.ipynb) 17 | - [Quick Reference to Python](http://www.dataschool.io/python-quick-reference/) 18 | - [The Elements of Python Style](https://github.com/amontalenti/elements-of-python-style) 19 | - [**What does the yield keyword do in Python?**](http://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do-in-python) 20 | - [Parsing values from a JSON file in Python](http://stackoverflow.com/questions/2835559/parsing-values-from-a-json-file-in-python) 21 | - [**Python Quora FAQs**](https://www.quora.com/topic/Python-programming-language-1) 22 | - [time-complexity of various operations - list/dict - in current CPython](https://wiki.python.org/moin/TimeComplexity) 23 | - Scripting in Python 24 | - [Python Scripting Tutorial](http://www.dreamsyssoft.com/python-scripting-tutorial/intro-tutorial.php) 25 | - [Scripting with Python](https://www.schrodinger.com//AcrobatFile.php?type=supportdocs&type2=&ident=404) 26 | - [**Can I use Python as a bash replacement?**](http://stackoverflow.com/questions/209470/can-i-use-python-as-a-bash-replacement) 27 | 28 | ## Useful Online Courses 29 | - [Learn Python (Codecademy)](https://www.codecademy.com/learn/python#) 30 | - [Free Interactive Course: Intro to Python for Data Science (DataCamp)](https://www.datacamp.com/courses/intro-to-python-for-data-science) 31 | - [Introduction to Computer Science and Programming Using Python (MIT)](https://www.edx.org/course/introduction-computer-science-mitx-6-00-1x-11) 32 | - [Python for Everybody](https://www.coursera.org/learn/python) 33 | - [Python Programming Essentials](https://www.coursera.org/learn/python-programming) 34 | 35 | ## Data Science with Python 36 | - [**Data Science IPython Notebooks**](https://github.com/donnemartin/data-science-ipython-notebooks) 37 | - [Awesome Python - Data Analysis](https://github.com/vinta/awesome-python#science-and-data-analysis) 38 | - Statistics 39 | - [Statistics and Data Science](https://github.com/svaksha/pythonidae/blob/master/Statistics.md) 40 | - [**An Introduction to Scientific Python (and a Bit of the Maths Behind It) – NumPy**](http://www.kdnuggets.com/2016/06/intro-scientific-python-numpy.html) 41 | - [Data Analysis and IPython Notebooks](https://github.com/kirang89/pycrumbs#data-analysis) 42 | - [Python for Data Science: Basic Concepts](https://github.com/gumption/Python_for_Data_Science/blob/master/2_Data_Science_Basic_Concepts.ipynb) 43 | - [Pycon India 2015 Notes](http://www.analyticsvidhya.com/blog/2015/10/notes-impressions-experience-excitement-pycon-india-2015/) 44 | - [**5 important Python Data Science advancements of 2015**](https://medium.com/@elgehelge/the-5-most-important-python-data-science-advancements-of-2015-a136482da89b#.sp2c1la9z) 45 | 
- [Data Exploration with Numpy cheat sheet](http://www.analyticsvidhya.com/blog/2015/07/11-steps-perform-data-analysis-pandas-python) 46 | - [Querying Craiglist with Python](http://chrisholdgraf.com/querying-craigslist-with-python/?imm_mid=0d8940&cmp=em-data-na-na-newsltr_20150916) 47 | - [**An introduction to Numpy and Scipy**](http://www.engr.ucsb.edu/~shell/che210d/numpy.pdf) 48 | - [Create NBA Shot Charts](http://savvastjortjoglou.com/nba-shot-sharts.html) 49 | - [PythoR- Python meets R](http://nipunbatra.github.io/2016/01/pythor/) 50 | - [**How do I learn data analysis with Python?**](https://www.quora.com/How-do-I-learn-data-analysis-with-Python?redirected_qid=2464720) 51 | - [What are some interesting things to do with Python?](https://www.quora.com/Python-programming-language-What-are-some-interesting-things-to-do-with-Python?redirected_qid=2324227) 52 | - [**Which is better for data analysis: R or Python?**](https://www.quora.com/Which-is-better-for-data-analysis-R-or-Python) 53 | - [**Web scraping in Python**](https://github.com/ujjwalkarn/Web-Scraping) 54 | - [The Guide to Learning Python for Data Science](http://www.datasciencecentral.com/profiles/blogs/the-guide-to-learning-python-for-data-science-2) 55 | - [Python For Data Science - A Cheat Sheet For Beginners](https://www.datacamp.com/community/tutorials/python-data-science-cheat-sheet-basics) 56 | - [Top voted Python data science questions](http://datascience.stackexchange.com/questions/tagged/python) 57 | - [Awesome Python - Data Visualization](https://github.com/vinta/awesome-python#data-visualization) 58 | - [Awesome Python - Map Reduce](https://github.com/vinta/awesome-python#mapreduce) 59 | 60 | ## Pandas Library in Python 61 | - [Intro to pandas data structures](http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/) 62 | - [Useful Pandas Cheatsheet](https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf) 63 | - [An Introduction to Scientific Python – Pandas](http://www.datadependence.com/2016/05/scientific-python-pandas/) 64 | - [10 minutes to Pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) 65 | - [Useful Pandas Snippets](http://www.swegler.com/becky/blog/2014/08/06/useful-pandas-snippets/) 66 | - [Timeseries analysis using Pandas](http://nbviewer.jupyter.org/github/twiecki/financial-analysis-python-tutorial/blob/master/1.%20Pandas%20Basics.ipynb) 67 | - [Pandas Exercises - Practice your Pandas skills](https://github.com/guipsamora/pandas_exercises) 68 | - [Grouping in Pandas](http://blog.yhat.com/posts/grouping-pandas.html) 69 | - [**“Large data” work flows using pandas**](http://stackoverflow.com/questions/14262433/large-data-work-flows-using-pandas) 70 | - [Easier data analysis with pandas (video series)](http://www.dataschool.io/easier-data-analysis-with-pandas/) 71 | - [Pandas Basics Cheat Sheet](https://www.datacamp.com/community/blog/python-pandas-cheat-sheet) 72 | - Quick Operations on a Pandas DataFrame 73 | - [Renaming Columns in Pandas](http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas) ([video](https://www.youtube.com/watch?v=0uBirYFhizE&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=5)) 74 | - [Deleting Columns from pandas DataFrame](http://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe) ([video](https://www.youtube.com/watch?v=gnUKkS964WQ&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=6)) 75 | - [Adding new Column to existing 
DataFrame](http://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas) 76 | - [Add one Row in a pandas.DataFrame](http://stackoverflow.com/questions/10715965/add-one-row-in-a-pandas-dataframe) 77 | - [Changing the order of DataFrame Columns](http://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns) 78 | - [Changing data type of Columns](http://stackoverflow.com/questions/15891038/pandas-change-data-type-of-columns) ([video](https://www.youtube.com/watch?v=V0AWyzVMf54&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=13)) 79 | - [Getting a list of the column headers from a DataFrame](http://stackoverflow.com/questions/19482970/get-list-from-pandas-dataframe-column-headers) 80 | - [Converting list of dictionaries to Dataframe](http://stackoverflow.com/questions/20638006/convert-list-of-dictionaries-to-dataframe) 81 | - [Getting row count of pandas DataFrame](http://stackoverflow.com/questions/15943769/how-to-get-row-count-of-pandas-dataframe) 82 | - [Most efficient way to loop through DataFrames](http://stackoverflow.com/questions/7837722/what-is-the-most-efficient-way-to-loop-through-dataframes-with-pandas) 83 | - [Deleting DataFrame row based on column value](http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value) 84 | - [Dropping a list of rows from Pandas DataFrame](http://stackoverflow.com/questions/14661701/how-to-drop-a-list-of-rows-from-pandas-dataframe) 85 | - [Sorting a DataFrame or a single column](https://www.youtube.com/watch?v=zY4doF6xSxY&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=7) 86 | - [Filtering DataFrame rows by column value](https://www.youtube.com/watch?v=2AFGPdNn4FM&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=8) 87 | - [Filtering DataFrame rows using multiple criteria](https://www.youtube.com/watch?v=YPItfQ87qjM&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=9) 88 | - [Dropping all non-numeric columns from a DataFrame](https://youtu.be/B-r9VuK80dk?t=4m31s) 89 | - [Counting and removing missing values](https://www.youtube.com/watch?v=fCMrO_VzeL8&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=16) 90 | - [Selecting multiple rows and columns from a DataFrame](https://www.youtube.com/watch?v=xvpNA7bC8cs&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=19) 91 | - [Reducing the size of a DataFrame](https://www.youtube.com/watch?v=wDYDYGyN_cw&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=21) 92 | 93 | ## Machine Learning with Python 94 | - [AI, ML Related List](https://github.com/svaksha/pythonidae/blob/master/AI.md) 95 | - [Data Normalization in Python](http://blog.yhat.com/posts/data-normalization-in-python.html) 96 | - [**Python Machine Learning Book**](https://github.com/rasbt/python-machine-learning-book) 97 | - [Table of Contents and Code Notebooks](https://github.com/rasbt/python-machine-learning-book/blob/master/README.md#table-of-contents-and-code-notebooks) 98 | - [Machine Learning with scikit learn](http://www.dataschool.io/machine-learning-with-scikit-learn/) 99 | - [Machine Learning Algorithms Cheatsheet](http://www.analyticsvidhya.com/blog/2015/09/full-cheatsheet-machine-learning-algorithms/) 100 | - [**How to compute precision, recall, accuracy and f1-score for the multiclass case with scikit learn?**](http://stackoverflow.com/questions/31421413/how-to-compute-precision-recall-accuracy-and-f1-score-for-the-multiclass-case) 101 | - [One Hot Encoding for Machine learning in 
Python](http://stackoverflow.com/questions/17469835/one-hot-encoding-for-machine-learning) 102 | - [**Building a (semi) Autonomous Drone with Python**](http://blog.yhat.com/posts/autonomous-droning-with-python.html) 103 | - [Awesome Python - Machine Learning](https://github.com/vinta/awesome-python#machine-learning) 104 | - Computer Vision 105 | - [Awesome Python - Computer Vision](https://github.com/vinta/awesome-python#computer-vision) 106 | 107 | ## Scikit Learn 108 | - [scikit learn on Wikipedia](https://en.wikipedia.org/wiki/Scikit-learn) 109 | - [**Introduction to machine learning with scikit-learn**](https://github.com/justmarkham/scikit-learn-videos), [**Videos!**](http://blog.kaggle.com/author/kevin-markham/) 110 | - [**A Gentle Introduction to Scikit-Learn: A Python Machine Learning Library**](http://machinelearningmastery.com/a-gentle-introduction-to-scikit-learn-a-python-machine-learning-library/) 111 | - [**PyData Seattle 2015 Scikit-learn Tutorial**](https://github.com/jakevdp/sklearn_pydata2015), [sklearn_scipy2013](https://github.com/jakevdp/sklearn_scipy2013) 112 | - [SKLEARN BENCHMARKS: A centralized repository to report scikit-learn model performance across a variety of parameter settings and data sets](https://github.com/rhiever/sklearn-benchmarks), [Report results of sklearn benchmarks at openml.org](http://www.openml.org/) 113 | - [How to get most informative features for scikit-learn classifiers?](http://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers) 114 | - [**Code example to predict prices of Airbnb vacation rentals, using scikit-learn on Spark**](https://github.com/mapr-demos/spark-sklearn-airbnb-predict) 115 | - [**Machine Learning with scikit learn tutorial**](http://amueller.github.io/sklearn_tutorial/) 116 | - [Parallel and Large Scale Machine Learning with scikit-learn](https://speakerdeck.com/ogrisel/parallel-and-large-scale-machine-learning-with-scikit-learn), [Meetup](http://datasciencelondon.org/machine-learning-python-scikit-learn-ipython-dsldn-data-science-london-kaggle/) 117 | - [Saving classifier to disk in scikit-learn](http://stackoverflow.com/questions/10592605/save-classifier-to-disk-in-scikit-learn) 118 | 119 | 120 | ## Linear Regression in Python 121 | - [Linear Regression in Python](http://nbviewer.ipython.org/github/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb), [Blog Post](http://www.dataschool.io/linear-regression-in-python/) 122 | - [Linear Regression using Scikit Learn](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) 123 | - [A friendly introduction to linear regression (using Python)](http://www.dataschool.io/linear-regression-in-python/) 124 | - [Linear Regression Example in Python](http://scipy-cookbook.readthedocs.io/items/LinearRegression.html) 125 | - [Regression analysis using Python StatsModels package](http://www.turingfinance.com/regression-analysis-using-python-statsmodels-and-quandl/) 126 | - [Run an OLS regression with Pandas Data Frame](http://stackoverflow.com/questions/19991445/run-an-ols-regression-with-pandas-data-frame) 127 | 128 | ## Logistic Regression in Python 129 | - [Logistic Regression with scikit learn](http://www.dataschool.io/logistic-regression-in-python-using-scikit-learn/) 130 | - [Logistic Regression in Python](http://blog.yhat.com/posts/logistic-regression-and-python.html) 131 | - [Implementing the softmax function in 
Python](http://stackoverflow.com/questions/34968722/softmax-function-python) 132 | - [**What is the inverse of regularization strength in Logistic Regression? How should it affect my code?**](http://stackoverflow.com/questions/22851316/what-is-the-inverse-of-regularization-strength-in-logistic-regression-how-shoul) 133 | - [The Yhat Blog: Logistic Regression in Python](http://blog.yhat.com/posts/logistic-regression-and-python.html) 134 | - [Example of logistic regression in Python using scikit-learn](http://www.dataschool.io/logistic-regression-in-python-using-scikit-learn/) 135 | - [TUTORIAL ON LOGISTIC REGRESSION AND OPTIMIZATION IN PYTHON](https://learningwithdata.wordpress.com/2015/04/30/tutorial-on-logistic-regression-and-optimization-in-python/) 136 | - [Using Logistic Regression in Python for Data Science](http://www.dummies.com/how-to/content/using-logistic-regression-in-python-for-data-scien.html) 137 | 138 | ## k Nearest Neighbours in Python 139 | - [A good tutorial on implementing K Nearest Neighbors using scikit learn](http://scikit-learn.org/stable/modules/neighbors.html) 140 | - [**Is it possible to specify your own distance function using scikit-learn K-Means Clustering?**](http://stackoverflow.com/questions/5529625/is-it-possible-to-specify-your-own-distance-function-using-scikit-learn-k-means) 141 | - [Tutorial To Implement k-Nearest Neighbors in Python From Scratch](http://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/) 142 | - [Implementing your own k-nearest neighbour algorithm using Python](https://blog.cambridgecoding.com/2016/01/16/machine-learning-under-the-hood-writing-your-own-k-nearest-neighbour-algorithm/) 143 | - [knn Python implementation on StackOverflow](http://stackoverflow.com/questions/5565935/k-nearest-neighbour-in-python) 144 | - [kNN with big sparse matrices in Python](http://stackoverflow.com/questions/20333092/knn-with-big-sparse-matrices-in-python) 145 | - [Sklearn kNN usage with a user defined metric](http://stackoverflow.com/questions/21052509/sklearn-knn-usage-with-a-user-defined-metric) 146 | 147 | 148 | ## Neural Networks in Python 149 | - [Implementing a Neural Network from scratch in Python](http://www.wildml.com/2015/09/implementing-a-neural-network-from-scratch/), [Code](https://github.com/dennybritz/nn-from-scratch) 150 | - [A Neural Network in 11 lines of Python](http://iamtrask.github.io/2015/07/12/basic-python-network/) 151 | - [Speeding up your Neural Network with Theano and the gpu](http://www.wildml.com/2015/09/speeding-up-your-neural-network-with-theano-and-the-gpu/), [Code](https://github.com/dennybritz/nn-theano) 152 | - [What is the best neural network library for Python?](https://www.quora.com/What-is-the-best-neural-network-library-for-Python) 153 | - [Recurrent Neural Net Tutorial in Python Part 1](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-1-introduction-to-rnns/), [Part 2](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/), [Code](https://github.com/dennybritz/rnn-tutorial-rnnlm/) 154 | - [PyBrain: modular Machine Learning Library for Python](http://pybrain.org/) 155 | - [Neural Networks Tutorial – a Pathway to Deep Learning](http://www.adventuresinmachinelearning.com/neural-networks-tutorial/) 156 | 157 | 158 | ## Decision Trees in Python 159 | - [How to extract the decision rules from scikit-learn 
decision-tree?](http://stackoverflow.com/questions/20224526/how-to-extract-the-decision-rules-from-scikit-learn-decision-tree) 160 | - [**How do I find which attributes my tree splits on, when using scikit-learn?**](http://stackoverflow.com/questions/20156951/how-do-i-find-which-attributes-my-tree-splits-on-when-using-scikit-learn) 161 | - [Quora: What is a good Python library for decision trees?](https://www.quora.com/What-is-a-good-Python-library-for-decision-trees), [StackOverflow](http://stackoverflow.com/questions/3127922/what-is-a-good-python-library-for-decision-trees) 162 | - [Building Decision Trees in Python](http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=1) 163 | - [Pure Python Decision Trees](http://kldavenport.com/pure-python-decision-trees/) 164 | - [Building a decision tree from scratch in Python - a beginner's tutorial](http://www.patricklamle.com/Tutorials/Decision%20tree%20python/tuto_decision%20tree.html) 165 | - [Using Python to Build and Use a Simple Decision Tree Classifier](https://github.com/gumption/Python_for_Data_Science/blob/master/4_Python_Simple_Decision_Tree.ipynb) 166 | - [Decision trees in python with scikit-learn and pandas](http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html) 167 | - [Code for simple decision tree in Python](https://github.com/gumption/Python_for_Data_Science/blob/master/simple_decision_tree.py) 168 | - [Lesson notebook: Regression and Classification Trees](http://nbviewer.jupyter.org/github/justmarkham/DAT8/blob/master/notebooks/17_decision_trees.ipynb) 169 | - [Discover structure behind data with decision trees](http://vooban.com/en/tips-articles-geek-stuff/discover-structure-behind-data-with-decision-trees/) 170 | 171 | ## Random Forest with Python 172 | - [Getting Started with Random Forests: Titanic Competition on Kaggle](https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests), [Python sample code](https://www.kaggle.com/c/digit-recognizer/forums/t/2299/getting-started-python-sample-code-random-forest) 173 | - [RandomForestClassifier vs ExtraTreesClassifier in scikit learn](http://stackoverflow.com/questions/22409855/randomforestclassifier-vs-extratreesclassifier-in-scikit-learn) 174 | - [Powerful Guide to learn Random Forest](http://www.analyticsvidhya.com/blog/2015/09/random-forest-algorithm-multiple-challenges/) 175 | - [How are Feature Importances in RandomForestClassifier determined?](http://stackoverflow.com/questions/15810339/how-are-feature-importances-in-randomforestclassifier-determined) 176 | - [Random forest interpretation with scikit-learn](http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/) 177 | - [Random Forests in Python Tutorial](http://blog.yhat.com/posts/random-forests-in-python.html) 178 | - [Unbalanced classification using RandomForestClassifier in sklearn](http://stackoverflow.com/questions/20082674/unbalanced-classification-using-randomforestclassifier-in-sklearn) 179 | - [Random Forest with categorical features in sklearn](http://stackoverflow.com/questions/24715230/random-forest-with-categorical-features-in-sklearn) 180 | - [How to output RandomForest Classifier from python?](http://stackoverflow.com/questions/23000693/how-to-output-randomforest-classifier-from-python) 181 | - [Lesson notebook: Ensembling, Bagging, and Random Forests](http://nbviewer.jupyter.org/github/justmarkham/DAT8/blob/master/notebooks/18_ensembling.ipynb) 182 | 183 | ## Support Vector Machine in Python 184 | - 
[Fastest SVM implementation usable in Python](http://stackoverflow.com/questions/9299346/fastest-svm-implementation-usable-in-python) 185 | - [An example using python bindings for SVM library, LIBSVM](http://stackoverflow.com/questions/4214868/an-example-using-python-bindings-for-svm-library-libsvm) 186 | - [What is the best SVM library usable from Python?](https://www.quora.com/What-is-the-best-SVM-library-usable-from-Python) 187 | - [How does sklearn.svm.svc's function predict_proba() work internally?](http://stackoverflow.com/questions/15111408/how-does-sklearn-svm-svcs-function-predict-proba-work-internally) 188 | - [Support vector machine in Python using libsvm example of features](http://stackoverflow.com/questions/30991592/support-vector-machine-in-python-using-libsvm-example-of-features) 189 | - [Linear SVC Machine learning SVM example with Python](https://pythonprogramming.net/linear-svc-example-scikit-learn-svm-python/) 190 | - [Understanding Support Vector Machine algorithm from examples (along with code)](http://www.analyticsvidhya.com/blog/2015/10/understaing-support-vector-machine-example-code/) 191 | 192 | ## NLP / Text Mining in Python 193 | - [**NLP with Python ORiley Book**](http://www.nltk.org/book_1ed/), [Python 3](http://www.nltk.org/book/) 194 | - [Awesome Python - NLP](https://github.com/vinta/awesome-python#natural-language-processing) 195 | - [Awesome Python - Text Processing](https://github.com/vinta/awesome-python#text-processing) 196 | - [Text Analytics : Intro and Tokenization](http://a4analytics.blogspot.sg/2015/03/text-mining-post-1.html) 197 | - [NLTK BOOK](http://www.nltk.org/book/ch01.html) 198 | - [Elegant N-gram Generation in Python](http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/) 199 | - [**Computing N Grams using Python**](http://stackoverflow.com/questions/13423919/computing-n-grams-using-python) 200 | - [N-grams: Explanation + 2 applications](http://stackoverflow.com/questions/1032288/n-grams-explanation-2-applications) 201 | - [NLP Tutorial with Python](http://www.datasciencecentral.com/profiles/blogs/python-nlp-tools) 202 | 203 | ## Sentiment Analysis with Python 204 | - [A Comprehensive Guide to Sentiment Analysis](https://monkeylearn.com/sentiment-analysis/) 205 | - [Twitter-Sentiment-Analysis](https://github.com/ujjwalkarn/Twitter-Sentiment-Analysis) 206 | - [Basic Sentiment Analysis with Python](http://fjavieralba.com/basic-sentiment-analysis-with-python.html) 207 | - [What is the best way to do Sentiment Analysis with Python?](https://www.quora.com/What-is-the-best-way-to-do-Sentiment-Analysis-with-Python-1) 208 | - [How to Calculate Twitter Sentiment Using AlchemyAPI with Python](http://www.alchemyapi.com/developers/getting-started-guide/twitter-sentiment-analysis) 209 | - [Second Try: Sentiment Analysis in Python](http://andybromberg.com/sentiment-analysis-python/) 210 | - [Sentiment Analysis with Python NLTK Text Classification](http://text-processing.com/demo/sentiment/) 211 | - Codes and Explanation 212 | - [**Sentiment Analysis with bag-of-words**](http://ataspinar.com/2016/01/21/sentiment-analysis-with-bag-of-words/) 213 | - [**Sentiment Analysis with Naive Bayes**](http://ataspinar.com/2016/02/15/sentiment-analysis-with-the-naive-bayes-classifier/) 214 | 215 | ## Pickle: convert a python object into a character stream 216 | - [Python serialization - Why pickle?](http://stackoverflow.com/questions/8968884/python-serialization-why-pickle) 217 | - [**Serializing Python 

## AutoML
- [TPOT: A Python tool for automating data science](http://www.randalolson.com/2016/05/08/tpot-a-python-tool-for-automating-data-science/), [GitHub repo](https://github.com/rhiever/tpot)

## Regex Related
- [RegExr](http://regexr.com/)
- [Regex101](https://regex101.com/)
- [Pythex](http://pythex.org/)
- [How to use Regular Expressions (Regex) in Microsoft Excel both in-cell and loops](http://stackoverflow.com/questions/22542834/how-to-use-regular-expressions-regex-in-microsoft-excel-both-in-cell-and-loops)
- [Advanced Filters: Excel’s Amazing Alternative To Regex](http://searchengineland.com/advanced-filters-excels-amazing-alternative-to-regex-143680)

## Shell Scripting
- [**Calling an external command in Python**](http://stackoverflow.com/questions/89228/calling-an-external-command-in-python)
- [**Running shell command from Python and capturing the output**](http://stackoverflow.com/questions/4760215/running-shell-command-from-python-and-capturing-the-output) (see the sketch after this list)
- [**Can I use Python as a bash replacement?**](http://stackoverflow.com/questions/209470/can-i-use-python-as-a-bash-replacement)
- [Python Scripts as a Replacement for Bash Utility Scripts](http://www.linuxjournal.com/content/python-scripts-replacement-bash-utility-scripts)
- [How to Write a Shell Script using Bash Shell in Ubuntu](https://www.youtube.com/watch?v=He-5BpUGSag)
- Red Hat Magazine: Python for Bash scripters - a well-kept secret
- [Embed bash in python](http://stackoverflow.com/questions/2651874/embed-bash-in-python)
- [Bash2py: A Bash to Python Translator](https://cs.uwaterloo.ca/~ijdavis/bash2py-final.pdf)
- [Beginners/BashScripting](https://help.ubuntu.com/community/Beginners/BashScripting)
- [The Beginner’s Guide to Shell Scripting: The Basics](http://www.howtogeek.com/67469/the-beginners-guide-to-shell-scripting-the-basics/)
- [Linux Shell Scripting Tutorial v1.05r3: A Beginner's Handbook](http://www.freeos.com/guides/lsst/)
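
The first two links above explain how to call an external command from Python and capture its output; here is a minimal sketch with the standard-library `subprocess` module (the `ls -l` command is only an illustration):

```python
import subprocess

# run a command and capture its stdout as text
# (check_output raises CalledProcessError on a non-zero exit status)
out = subprocess.check_output(["ls", "-l"], universal_newlines=True)
print(out)

# for full control over the process and both streams, use Popen directly
p = subprocess.Popen(["ls", "-l"], stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE, universal_newlines=True)
stdout, stderr = p.communicate()
print("return code: %d" % p.returncode)
```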

## Other good lists
- [pycrumbs - bits and bytes of Python from the Internet](https://github.com/kirang89/pycrumbs)
- [python-github-projects - collect and classify Python projects on GitHub](https://github.com/checkcheckzz/python-github-projects)
- [python_reference - useful functions, tutorials, and other Python-related things](https://github.com/rasbt/python_reference)
- [pythonidae - curated decibans of scientific programming resources in Python](https://github.com/svaksha/pythonidae)
--------------------------------------------------------------------------------
/Twitter-Data-Analysis/extract_twitter_data.py:
--------------------------------------------------------------------------------
"""
Created on Sun Oct 04 23:10:41 2015
@author: ujjwal.karn
"""

# First, install pip by following the instructions here:
# http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows
# Then, to install the tweepy library, open the Anaconda command prompt and type: pip install tweepy
# Once tweepy is installed, run the code below.

import tweepy  # this import fails if tweepy is not installed properly
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# provide your access details below
access_token = "xxxxxxxx"
access_token_secret = "xxxxxxxx"
consumer_key = "xxxxxxxx"
consumer_secret = "xxxxxxxx"

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

class MyListener(StreamListener):
    """Listener that appends each incoming tweet (raw JSON) to a text file."""

    def on_data(self, data):
        try:
            with open('location/file_name.txt', 'a') as f:  # change the location here
                f.write(data)
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        print(status)
        return True

twitter_stream = Stream(auth, MyListener())

# change the keyword to track here
twitter_stream.filter(track=['#cricket'])
--------------------------------------------------------------------------------
/Twitter-Data-Analysis/json2tweets.R:
--------------------------------------------------------------------------------
library(jsonlite)
options(encoding = "UTF-8")

# read in individual JSON lines
json_file <- "C:\\Users\\ujjwal.karn\\Desktop\\Tweets\\python.json"

# turn them into a proper JSON array by separating each object with a ","
# and wrapping the result in "[]"
dat <- fromJSON(sprintf("[%s]", paste(readLines(json_file), collapse = ",")))

dim(dat)
## [1] 3959 18

tweets <- dat$text
tweets
--------------------------------------------------------------------------------
/basic_commands.py:
--------------------------------------------------------------------------------
# ENUMERATE: generates an (index, item) pair for each item in a sequence
a = ['a', 'b', 'c', 'd', 'e']
for index, item in enumerate(a):
    print index, item
# 0 a
# 1 b
# 2 c
# 3 d
# 4 e


# CONVERT A LIST TO A STRING
list1 = ['1', '2', '3']
str1 = ''.join(list1)

# or, if the list holds integers, convert the elements before joining them:
list1 = [1, 2, 3]
str1 = ''.join(str(e) for e in list1)


# FIND METHOD
#
# str.find(str2, beg=0, end=len(str))
#
# Parameters:
#   str2 -- the substring to be searched for
#   beg  -- the starting index; defaults to 0
#   end  -- the ending index; defaults to the length of the string
#
# Return value: the index of the first match if found, and -1 otherwise.

str1 = "this is string example....wow!!!"
str2 = "exam"

# find returns the position of the first character of the match, if there is one
print str1.find(str2)        # 15
print str1.find(str2, 10)    # 15
print str1.find(str2, 40)    # -1
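
# a couple of related idioms (illustrative additions, reusing str1/str2 above):
# find returns -1 when the substring is absent, index raises ValueError instead,
# and for a plain membership test the "in" operator is the idiomatic choice
print str2 in str1           # True
print str1.index(str2)       # 15 (would raise ValueError if str2 were absent)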


# 2D LISTS IN PYTHON

# create a list containing 5 lists, each initialized to five 0s
Matrix = [[0 for x in range(5)] for y in range(5)]

# you can now add items to the list:
Matrix[0][0] = 1
Matrix[4][0] = 5

print Matrix[0][0]   # prints 1
print Matrix[4][0]   # prints 5

# if you have a simple two-dimensional list like this:
A = [[1, 2, 3, 4],
     [5, 6, 7, 8]]

# then you can extract a column with a helper function:
def column(matrix, i):
    return [row[i] for row in matrix]

# extracting the second column (index 1):
print column(A, 1)           # [2, 6]

# or, alternatively, simply:
print [row[1] for row in A]  # [2, 6]
--------------------------------------------------------------------------------
/svm_sklearn.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn import svm

# read the data: training features, training labels, and test features
train = np.loadtxt(open("train.csv", "rb"), delimiter=",", skiprows=0)
trainLabels = np.loadtxt(open("trainLabels.csv", "rb"), delimiter=",", skiprows=0)
test = np.loadtxt(open("test.csv", "rb"), delimiter=",", skiprows=0)

# fit a support vector classifier with default settings (RBF kernel)
X, y = train, trainLabels
s = svm.SVC()
s.fit(X, y)

# predict on the test set and write one integer label per row
predictions = s.predict(test)
np.savetxt("fancySVMSubmission.csv", predictions.astype(int), fmt='%d', delimiter=",")
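
# optional refinement (an illustrative sketch, not part of the original script):
# SVC is sensitive to feature scale, so standardizing each column using the
# training-set statistics often improves accuracy
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(train)
s2 = svm.SVC()
s2.fit(scaler.transform(train), trainLabels)
scaledPredictions = s2.predict(scaler.transform(test))
np.savetxt("scaledSVMSubmission.csv", scaledPredictions.astype(int), fmt='%d', delimiter=",")
--------------------------------------------------------------------------------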