├── LICENSE
├── Logistic Regression with StatsModels
│   ├── logistic.py
│   ├── test.csv
│   └── train.csv
├── Logistic-Regression
│   ├── citreo.py
│   ├── citreo_code_v2.py
│   ├── classifier_corrected.py
│   └── logistic_regression_updated.py
├── README.md
├── Twitter-Data-Analysis
│   ├── extract_twitter_data.py
│   └── json2tweets.R
├── basic_commands.py
└── svm_sklearn.py

-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Ujjwal Karn

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-------------------------------------------------------------------------------- /Logistic Regression with StatsModels/logistic.py: --------------------------------------------------------------------------------
"""
Created on Wed Sep 09 12:38:16 2015
@author: ujjwal.karn
"""

import pandas as pd            # for handling datasets
import statsmodels.api as sm   # for statistical modeling
import pylab as pl             # for plotting
import numpy as np             # for numerical computation

# read the data in
dfTrain = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\train.csv")
dfTest = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\test.csv")

# take a look at the dataset
print(dfTrain.head())
#   admit  gre   gpa  prestige
#0      0  380  3.61      good
#1      1  660  3.67      good
#2      1  800  4.00      best
#3      1  640  3.19        ok
#4      0  520  2.93        ok

print(dfTest.head())
#   gre   gpa  prestige
#0  640  3.30  veryGood
#1  660  3.60      good
#2  400  3.15  veryGood
#3  680  3.98  veryGood
#4  220  2.83      good

# summarize the data
print(dfTrain.describe())
#            admit         gre         gpa
#count  300.000000  300.000000  300.000000
#mean     0.306667  590.866667    3.386233
#std      0.461880  117.717630    0.374880
#min      0.000000  300.000000    2.260000
#25%      0.000000  515.000000    3.130000
#50%      0.000000  600.000000    3.390000
#75%      1.000000  680.000000    3.642500
#max      1.000000  800.000000    4.000000

# take a look at the standard deviation of each column
print(dfTrain.std())
#admit      0.46188
#gre      117.71763
#gpa        0.37488

# frequency table cutting prestige and whether or not someone was admitted
print(pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']))
#prestige  best  good  ok  veryGood
#admit
#0           20    73  47        68
#1           25    19   9        39

# explore data
print(dfTrain.groupby('admit').mean())
#              gre       gpa
#admit
#0      573.461538  3.336587
#1      630.217391  3.498478

# plot one column
dfTrain['gpa'].hist()
pl.title('Histogram of GPA')
pl.xlabel('GPA')
pl.ylabel('Frequency')
pl.show()

# barplot of gre score grouped by admission status (True or False)
pd.crosstab(dfTrain.gre, dfTrain.admit.astype(bool)).plot(kind='bar')
pl.title('GRE score by Admission Status')
pl.xlabel('GRE score')
pl.ylabel('Frequency')
pl.show()

# dummify prestige
dummy_ranks = pd.get_dummies(dfTrain['prestige'], prefix='prestige')
print(dummy_ranks.head())
#   prestige_best  prestige_good  prestige_ok  prestige_veryGood
#0              0              1            0                  0
#1              0              1            0                  0
#2              1              0            0                  0
#3              0              0            1                  0
#4              0              0            1                  0

# create a clean data frame for the regression; prestige_best is dropped
# so it becomes the baseline category (avoids the dummy variable trap)
cols_to_keep = ['admit', 'gre', 'gpa']
data = dfTrain[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_good':])
print(data.head())
#   admit  gre   gpa  prestige_good  prestige_ok  prestige_veryGood
#0      0  380  3.61              1            0                  0
#1      1  660  3.67              1            0                  0
#2      1  800  4.00              0            0                  0
#3      1  640  3.19              0            1                  0
#4      0  520  2.93              0            1                  0

# manually add the intercept
data['intercept'] = 1.0

print(data.head())

train_cols = data.columns[1:]
print(data.columns[1:])
# Index([u'gre', u'gpa', u'prestige_good', u'prestige_ok', u'prestige_veryGood', u'intercept'], dtype='object')

# Logistic Regression
logit = sm.Logit(data['admit'], data[train_cols])

# fit the model
result = logit.fit()
print(result.summary())
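# Not part of the original script: the fitted coefficients above are
# log-odds, so exponentiating them gives odds ratios, which are often
# easier to read (assumes `result` from the fit above).
print(np.exp(result.params))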
# recreate the dummy variables for the test set
dummy_ranks_test = pd.get_dummies(dfTest['prestige'], prefix='prestige')
print(dummy_ranks_test)

# create intercept column
dfTest['intercept'] = 1.0

# keep only what we need for making predictions
cols_to_keep = ['gre', 'gpa', 'prestige', 'intercept']
dfTest = dfTest[cols_to_keep].join(dummy_ranks_test.loc[:, 'prestige_good':])

print(dfTest.head())

# make predictions on the enumerated dataset
dfTest['admit_pred'] = result.predict(dfTest[train_cols])

# see probabilities
print(dfTest.head())

# convert probabilities to 'yes'/'no' using a 0.5 cutoff
dfTest['admit_yn'] = np.where(dfTest['admit_pred'] > 0.5, 'yes', 'no')
print(dfTest.head())

cols = ['gre', 'gpa', 'admit_yn']
print(dfTest[cols].groupby('admit_yn').mean())
#                 gre       gpa
#admit_yn
#no        556.585366  3.324268
#yes       676.666667  3.750000

dfTest.to_csv('C:\\Users\\ujjwal.karn\\Desktop\\Python\\output.csv', sep=',')
-------------------------------------------------------------------------------- /Logistic Regression with StatsModels/test.csv: --------------------------------------------------------------------------------
gre,gpa,prestige
640,3.3,veryGood
660,3.6,good
400,3.15,veryGood
680,3.98,veryGood
220,2.83,good
580,3.46,ok
540,3.17,best
580,3.51,veryGood
540,3.13,veryGood
440,2.98,good
560,4,good
660,3.67,veryGood
660,3.77,good
520,3.65,ok
540,3.46,ok
300,2.84,veryGood
340,3,veryGood
780,3.63,ok
480,3.71,ok
540,3.28,best
460,3.14,good
460,3.58,veryGood
500,3.01,ok
420,2.69,veryGood 26 | 520,2.7,good 27 | 680,3.9,best 28 | 680,3.31,veryGood 29 | 560,3.48,veryGood 30 | 580,3.34,veryGood 31 | 500,2.93,ok 32 | 740,4,good 33 | 660,3.59,good 34 | 420,2.96,best 35 | 560,3.43,good 36 | 460,3.64,good 37 | 620,3.71,best 38 | 520,3.15,good 39 | 620,3.09,ok 40 | 540,3.2,best 41 | 660,3.47,good 42 | 500,3.23,ok 43 | 560,2.65,good 44 | 500,3.95,ok 45 | 580,3.06,veryGood 46 | 520,3.35,good 47 | 500,3.03,good 48 | 600,3.35,veryGood 49 | 580,3.8,veryGood 50 | 400,3.36,veryGood 51 | 620,2.85,veryGood 52 | 780,4,veryGood 53 | 620,3.43,good 54 | 580,3.12,good 55 | 700,3.52,veryGood 56 | 540,3.78,veryGood 57 | 760,2.81,best 58 | 700,3.27,veryGood 59 | 720,3.31,best 60 | 560,3.69,good 61 | 720,3.94,good 62 | 520,4,best 63 | 540,3.49,best 64 | 680,3.14,veryGood 65 | 460,3.44,veryGood 66 | 560,3.36,best 67 | 480,2.78,good 68 | 460,2.93,good 69 | 620,3.63,good 70 | 580,4,best 71 | 800,3.89,veryGood 72 | 540,3.77,veryGood 73 | 680,3.76,good 74 | 680,2.42,best 75 | 620,3.37,best 76 | 560,3.78,veryGood 77 | 560,3.49,ok 78 | 620,3.63,veryGood 79 | 800,4,veryGood 80 | 640,3.12,good 81 | 540,2.7,veryGood 82 | 700,3.65,veryGood 83 | 540,3.49,veryGood 84 | 540,3.51,veryGood 85 | 660,4,best 86 | 480,2.62,veryGood 87 | 420,3.02,best 88 | 740,3.86,veryGood 89 | 580,3.36,veryGood 90 | 640,3.17,veryGood 91 | 640,3.51,veryGood 92 | 800,3.05,veryGood 93 | 660,3.88,veryGood 94 | 600,3.38,good 95 | 620,3.75,veryGood 96 | 460,3.99,good 97 | 620,4,veryGood 98 | 560,3.04,good 99 | 460,2.63,veryGood 100 | 700,3.65,veryGood 101 | 600,3.89,good 102 | -------------------------------------------------------------------------------- /Logistic Regression with StatsModels/train.csv: -------------------------------------------------------------------------------- 1 | admit,gre,gpa,prestige 2 | 0,380,3.61,good 3 | 1,660,3.67,good 4 | 1,800,4,best 5 | 1,640,3.19,ok 6 | 0,520,2.93,ok 7 | 1,760,3,veryGood 8 | 1,560,2.98,best 9 | 0,400,3.08,veryGood 10 | 1,540,3.39,good 11 | 0,700,3.92,veryGood 12 | 0,800,4,ok 13 | 0,440,3.22,best 14 | 1,760,4,best 15 | 0,700,3.08,veryGood 16 | 1,700,4,best 17 | 0,480,3.44,good 18 | 0,780,3.87,ok 19 | 0,360,2.56,good 20 | 0,800,3.75,veryGood 21 | 1,540,3.81,best 22 | 0,500,3.17,good 23 | 1,660,3.63,veryGood 24 | 0,600,2.82,ok 25 | 0,680,3.19,ok 26 | 1,760,3.35,veryGood 27 | 1,800,3.66,best 28 | 1,620,3.61,best 29 | 1,520,3.74,ok 30 | 1,780,3.22,veryGood 31 | 0,520,3.29,best 32 | 0,540,3.78,ok 33 | 0,760,3.35,good 34 | 0,600,3.4,good 35 | 1,800,4,good 36 | 0,360,3.14,best 37 | 0,400,3.05,veryGood 38 | 0,580,3.25,best 39 | 0,520,2.9,good 40 | 1,500,3.13,veryGood 41 | 1,520,2.68,good 42 | 0,560,2.42,veryGood 43 | 1,580,3.32,veryGood 44 | 1,600,3.15,veryGood 45 | 0,500,3.31,good 46 | 0,700,2.94,veryGood 47 | 1,460,3.45,good 48 | 1,580,3.46,veryGood 49 | 0,500,2.97,ok 50 | 0,440,2.48,ok 51 | 0,400,3.35,good 52 | 0,640,3.86,good 53 | 0,440,3.13,ok 54 | 0,740,3.37,ok 55 | 1,680,3.27,veryGood 56 | 0,660,3.34,good 57 | 1,740,4,good 58 | 0,560,3.19,good 59 | 0,380,2.94,good 60 | 0,400,3.65,veryGood 61 | 0,600,2.82,ok 62 | 1,620,3.18,veryGood 63 | 0,560,3.32,ok 64 | 0,640,3.67,good 65 | 1,680,3.85,good 66 | 0,580,4,good 67 | 0,600,3.59,veryGood 68 | 0,740,3.62,ok 69 | 0,620,3.3,best 70 | 0,580,3.69,best 71 | 0,800,3.73,best 72 | 0,640,4,good 73 | 0,300,2.92,ok 74 | 0,480,3.39,ok 75 | 0,580,4,veryGood 76 | 0,720,3.45,ok 77 | 0,720,4,good 78 | 0,560,3.36,good 79 | 1,800,4,good 80 | 0,540,3.12,best 81 | 1,620,4,best 82 | 0,700,2.9,ok 83 | 0,620,3.07,veryGood 84 | 0,500,2.71,veryGood 85 | 
0,380,2.91,ok 86 | 1,500,3.6,good 87 | 0,520,2.98,veryGood 88 | 0,600,3.32,veryGood 89 | 0,600,3.48,veryGood 90 | 0,700,3.28,best 91 | 1,660,4,veryGood 92 | 0,700,3.83,veryGood 93 | 1,720,3.64,best 94 | 0,800,3.9,veryGood 95 | 0,580,2.93,veryGood 96 | 1,660,3.44,veryGood 97 | 0,660,3.33,veryGood 98 | 0,640,3.52,ok 99 | 0,480,3.57,veryGood 100 | 0,700,2.88,veryGood 101 | 0,400,3.31,good 102 | 0,340,3.15,good 103 | 0,580,3.57,good 104 | 0,380,3.33,ok 105 | 0,540,3.94,good 106 | 1,660,3.95,veryGood 107 | 1,740,2.97,veryGood 108 | 1,700,3.56,best 109 | 0,480,3.13,veryGood 110 | 0,400,2.93,good 111 | 0,480,3.45,veryGood 112 | 0,680,3.08,ok 113 | 0,420,3.41,ok 114 | 0,360,3,good 115 | 0,600,3.22,best 116 | 0,720,3.84,good 117 | 0,620,3.99,good 118 | 1,440,3.45,veryGood 119 | 0,700,3.72,veryGood 120 | 1,800,3.7,best 121 | 0,340,2.92,good 122 | 1,520,3.74,veryGood 123 | 1,480,2.67,veryGood 124 | 0,520,2.85,good 125 | 0,500,2.98,good 126 | 0,720,3.88,good 127 | 0,540,3.38,ok 128 | 1,600,3.54,best 129 | 0,740,3.74,ok 130 | 0,540,3.19,veryGood 131 | 0,460,3.15,ok 132 | 1,620,3.17,veryGood 133 | 0,640,2.79,veryGood 134 | 0,580,3.4,veryGood 135 | 0,500,3.08,good 136 | 0,560,2.95,veryGood 137 | 0,500,3.57,good 138 | 0,560,3.33,ok 139 | 0,700,4,good 140 | 0,620,3.4,veryGood 141 | 1,600,3.58,best 142 | 0,640,3.93,veryGood 143 | 1,700,3.52,ok 144 | 0,620,3.94,ok 145 | 0,580,3.4,good 146 | 0,580,3.4,ok 147 | 0,380,3.43,good 148 | 0,480,3.4,veryGood 149 | 0,560,2.71,good 150 | 1,480,2.91,best 151 | 0,740,3.31,best 152 | 1,800,3.74,best 153 | 0,400,3.38,veryGood 154 | 1,640,3.94,veryGood 155 | 0,580,3.46,good 156 | 0,620,3.69,good 157 | 1,580,2.86,ok 158 | 0,560,2.52,veryGood 159 | 1,480,3.58,best 160 | 0,660,3.49,veryGood 161 | 0,700,3.82,good 162 | 0,600,3.13,veryGood 163 | 0,640,3.5,veryGood 164 | 1,700,3.56,veryGood 165 | 0,520,2.73,veryGood 166 | 0,580,3.3,veryGood 167 | 0,700,4,best 168 | 0,440,3.24,ok 169 | 0,720,3.77,good 170 | 0,500,4,good 171 | 0,600,3.62,good 172 | 0,400,3.51,good 173 | 0,540,2.81,good 174 | 0,680,3.48,good 175 | 1,800,3.43,veryGood 176 | 0,500,3.53,ok 177 | 1,620,3.37,veryGood 178 | 0,520,2.62,veryGood 179 | 1,620,3.23,good 180 | 0,620,3.33,good 181 | 0,300,3.01,good 182 | 0,620,3.78,good 183 | 0,500,3.88,ok 184 | 0,700,4,veryGood 185 | 1,540,3.84,veryGood 186 | 0,500,2.79,ok 187 | 0,800,3.6,veryGood 188 | 0,560,3.61,good 189 | 0,580,2.88,veryGood 190 | 0,560,3.07,veryGood 191 | 0,500,3.35,veryGood 192 | 1,640,2.94,veryGood 193 | 0,800,3.54,good 194 | 0,640,3.76,good 195 | 0,380,3.59,ok 196 | 1,600,3.47,veryGood 197 | 0,560,3.59,veryGood 198 | 0,660,3.07,good 199 | 1,400,3.23,ok 200 | 0,600,3.63,good 201 | 0,580,3.77,ok 202 | 0,800,3.31,good 203 | 1,580,3.2,veryGood 204 | 1,700,4,best 205 | 0,420,3.92,ok 206 | 1,600,3.89,best 207 | 1,780,3.8,good 208 | 0,740,3.54,best 209 | 1,640,3.63,best 210 | 0,540,3.16,good 211 | 0,580,3.5,veryGood 212 | 0,740,3.34,ok 213 | 0,580,3.02,veryGood 214 | 0,460,2.87,veryGood 215 | 0,640,3.38,good 216 | 1,600,3.56,veryGood 217 | 1,660,2.91,good 218 | 0,340,2.9,best 219 | 1,460,3.64,best 220 | 0,460,2.98,best 221 | 1,560,3.59,veryGood 222 | 0,540,3.28,good 223 | 0,680,3.99,good 224 | 1,480,3.02,best 225 | 0,800,3.47,good 226 | 0,800,2.9,veryGood 227 | 1,720,3.5,good 228 | 0,620,3.58,veryGood 229 | 0,540,3.02,ok 230 | 0,480,3.43,veryGood 231 | 1,720,3.42,veryGood 232 | 0,580,3.29,ok 233 | 0,600,3.28,good 234 | 0,380,3.38,veryGood 235 | 0,420,2.67,good 236 | 1,800,3.53,best 237 | 0,620,3.05,veryGood 238 | 1,660,3.49,veryGood 239 | 0,480,4,veryGood 240 | 
0,500,2.86,ok 241 | 0,700,3.45,good 242 | 0,440,2.76,veryGood 243 | 1,520,3.81,best 244 | 1,680,2.96,good 245 | 0,620,3.22,veryGood 246 | 0,540,3.04,best 247 | 0,800,3.91,good 248 | 0,680,3.34,veryGood 249 | 0,440,3.17,veryGood 250 | 0,680,3.64,good 251 | 0,640,3.73,good 252 | 0,660,3.31,ok 253 | 0,620,3.21,ok 254 | 1,520,4,veryGood 255 | 1,540,3.55,ok 256 | 1,740,3.52,ok 257 | 0,640,3.35,good 258 | 1,520,3.3,veryGood 259 | 1,620,3.95,good 260 | 0,520,3.51,veryGood 261 | 0,640,3.81,veryGood 262 | 0,680,3.11,veryGood 263 | 0,440,3.15,veryGood 264 | 1,520,3.19,good 265 | 1,620,3.95,good 266 | 1,520,3.9,good 267 | 0,380,3.34,good 268 | 0,560,3.24,ok 269 | 1,600,3.64,good 270 | 1,680,3.46,veryGood 271 | 0,500,2.81,good 272 | 1,640,3.95,veryGood 273 | 0,540,3.33,good 274 | 1,680,3.67,veryGood 275 | 0,660,3.32,best 276 | 0,520,3.12,veryGood 277 | 1,600,2.98,veryGood 278 | 0,460,3.77,good 279 | 1,580,3.58,best 280 | 1,680,3,ok 281 | 1,660,3.14,veryGood 282 | 0,660,3.94,veryGood 283 | 0,360,3.27,good 284 | 0,660,3.45,ok 285 | 0,520,3.1,ok 286 | 1,440,3.39,veryGood 287 | 0,600,3.31,ok 288 | 1,800,3.22,best 289 | 1,660,3.7,ok 290 | 0,800,3.15,ok 291 | 0,420,2.26,ok 292 | 1,620,3.45,veryGood 293 | 0,800,2.78,veryGood 294 | 0,680,3.7,veryGood 295 | 0,800,3.97,best 296 | 0,480,2.55,best 297 | 0,520,3.25,good 298 | 0,560,3.16,best 299 | 0,460,3.07,veryGood 300 | 0,540,3.5,veryGood 301 | 0,720,3.4,good 302 | -------------------------------------------------------------------------------- /Logistic-Regression/citreo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 3 | Version 2, December 2004 4 | 5 | Copyright (C) 2004 Sam Hocevar 6 | 7 | Everyone is permitted to copy and distribute verbatim or modified 8 | copies of this license document, and changing it is allowed as long 9 | as the name is changed. 10 | 11 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 12 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 13 | 14 | 0. You just DO WHAT THE FUCK YOU WANT TO. 15 | ''' 16 | 17 | 18 | from datetime import datetime 19 | from csv import DictReader 20 | from math import exp, log, sqrt 21 | 22 | 23 | # parameters ################################################################# 24 | 25 | train = 'train.csv' # path to training file 26 | test = 'test.csv' # path to testing file 27 | 28 | D = 2 ** 20 # number of weights use for learning 29 | alpha = .1 # learning rate for sgd optimization 30 | 31 | 32 | # function definitions ####################################################### 33 | 34 | # A. Bounded logloss 35 | # INPUT: 36 | # p: our prediction 37 | # y: real answer 38 | # OUTPUT 39 | # logarithmic loss of p given y 40 | def logloss(p, y): 41 | p = max(min(p, 1. - 10e-12), 10e-12) 42 | return -log(p) if y == 1. else -log(1. - p) 43 | 44 | 45 | # B. Apply hash trick of the original csv row 46 | # for simplicity, we treat both integer and categorical features as categorical 47 | # INPUT: 48 | # csv_row: a csv dictionary, ex: {'Lable': '1', 'I1': '357', 'I2': '', ...} 49 | # D: the max index that we can hash to 50 | # OUTPUT: 51 | # x: a list of indices that its value is 1 52 | def get_x(csv_row, D): 53 | x = [0] # 0 is the index of the bias term 54 | for key, value in csv_row.items(): 55 | index = int(value + key[1:], 16) % D # weakest hash ever ;) 56 | x.append(index) 57 | return x # x contains indices of features that have a value of 1 58 | 59 | 60 | # C. 
Get probability estimation on x 61 | # INPUT: 62 | # x: features 63 | # w: weights 64 | # OUTPUT: 65 | # probability of p(y = 1 | x; w) 66 | def get_p(x, w): 67 | wTx = 0. 68 | for i in x: # do wTx 69 | wTx += w[i] * 1. # w[i] * x[i], but if i in x we got x[i] = 1. 70 | return 1. / (1. + exp(-max(min(wTx, 20.), -20.))) # bounded sigmoid 71 | 72 | 73 | # D. Update given model 74 | # INPUT: 75 | # w: weights 76 | # n: a counter that counts the number of times we encounter a feature 77 | # this is used for adaptive learning rate 78 | # x: feature 79 | # p: prediction of our model 80 | # y: answer 81 | # OUTPUT: 82 | # w: updated model 83 | # n: updated count 84 | def update_w(w, n, x, p, y): 85 | for i in x: 86 | # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic 87 | # (p - y) * x[i] is the current gradient 88 | # note that in our case, if i in x then x[i] = 1 89 | w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.) 90 | n[i] += 1. 91 | 92 | return w, n 93 | 94 | 95 | # training and testing ####################################################### 96 | 97 | # initialize our model 98 | w = [0.] * D # weights 99 | n = [0.] * D # number of times we've encountered a feature 100 | 101 | # start training a logistic regression model using on pass sgd 102 | loss = 0. 103 | for t, row in enumerate(DictReader(open(train))): 104 | y = 1. if row['Label'] == '1' else 0. 105 | 106 | del row['Label'] # can't let the model peek the answer 107 | del row['Id'] # we don't need the Id 108 | 109 | # main training procedure 110 | # step 1, get the hashed features 111 | x = get_x(row, D) 112 | 113 | # step 2, get prediction 114 | p = get_p(x, w) 115 | 116 | # for progress validation, useless for learning our model 117 | loss += logloss(p, y) 118 | if t % 1000000 == 0 and t > 1: 119 | print('%s\tencountered: %d\tcurrent logloss: %f' % ( 120 | datetime.now(), t, loss/t)) 121 | 122 | # step 3, update model with answer 123 | w, n = update_w(w, n, x, p, y) 124 | 125 | # testing (build kaggle's submission file) 126 | with open('submission1234.csv', 'w') as submission: 127 | submission.write('Id,Predicted\n') 128 | for t, row in enumerate(DictReader(open(test))): 129 | Id = row['Id'] 130 | del row['Id'] 131 | x = get_x(row, D) 132 | p = get_p(x, w) 133 | submission.write('%s,%f\n' % (Id, p)) -------------------------------------------------------------------------------- /Logistic-Regression/citreo_code_v2.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from csv import DictReader 3 | from math import exp, log, sqrt 4 | 5 | 6 | # parameters ################################################################# 7 | 8 | train = 'train.csv' # path to training file 9 | test = 'test.csv' # path to testing file 10 | 11 | D = 2 ** 20 # number of weights use for learning 12 | alpha = .1 # learning rate for sgd optimization 13 | 14 | 15 | # function definitions ####################################################### 16 | 17 | # A. Bounded logloss 18 | # INPUT: 19 | # p: our prediction 20 | # y: real answer 21 | # OUTPUT 22 | # logarithmic loss of p given y 23 | def logloss(p, y): 24 | p = max(min(p, 1. - 10e-12), 10e-12) 25 | return -log(p) if y == 1. else -log(1. - p) 26 | 27 | 28 | # B. 
# Apply the hash trick to the original csv row
# for simplicity, we treat both integer and categorical features as categorical
# INPUT:
#     csv_row: a csv dictionary, ex: {'Label': '1', 'I1': '357', 'I2': '', ...}
#     D: the max index that we can hash to
# OUTPUT:
#     x: a list of indices whose value is 1
def get_x(csv_row, D):
    x = [0]  # 0 is the index of the bias term
    for key, value in csv_row.items():
        index = int(value + key[1:], 16) % D  # weakest hash ever ;)
        x.append(index)
    return x  # x contains indices of features that have a value of 1


# C. Get probability estimation on x
# INPUT:
#     x: features
#     w: weights
# OUTPUT:
#     probability of p(y = 1 | x; w)
def get_p(x, w):
    wTx = 0.
    for i in x:  # do wTx
        wTx += w[i] * 1.  # w[i] * x[i], but if i in x we got x[i] = 1.
    return 1. / (1. + exp(-max(min(wTx, 20.), -20.)))  # bounded sigmoid


# D. Update given model
# INPUT:
#     w: weights
#     n: a counter that counts the number of times we encounter a feature
#        this is used for adaptive learning rate
#     x: feature
#     p: prediction of our model
#     y: answer
# OUTPUT:
#     w: updated model
#     n: updated count
def update_w(w, n, x, p, y):
    for i in x:
        # alpha / (sqrt(n) + 1) is the adaptive learning rate heuristic
        # (p - y) * x[i] is the current gradient
        # note that in our case, if i in x then x[i] = 1
        w[i] -= (p - y) * alpha / (sqrt(n[i]) + 1.)
        n[i] += 1.

    return w, n


# training and testing #######################################################

# initialize our model
w = [0.] * D  # weights
n = [0.] * D  # number of times we've encountered a feature

# start training a logistic regression model using one-pass SGD
loss = 0.
for t, row in enumerate(DictReader(open(train))):
    y = 1. if row['Label'] == '1' else 0.

    del row['Label']  # can't let the model peek at the answer
    del row['Id']     # we don't need the Id

    # main training procedure
    # step 1, get the hashed features
    x = get_x(row, D)

    # step 2, get prediction
    p = get_p(x, w)

    # for progress validation, useless for learning our model
    loss += logloss(p, y)
    if t % 1000000 == 0 and t > 1:
        print('%s\tencountered: %d\tcurrent logloss: %f' % (
            datetime.now(), t, loss/t))

    # step 3, update model with answer
    w, n = update_w(w, n, x, p, y)

# testing (build kaggle's submission file)
with open('submission1234.csv', 'w') as submission:
    submission.write('Id,Predicted\n')
    for t, row in enumerate(DictReader(open(test))):
        Id = row['Id']
        del row['Id']
        x = get_x(row, D)
        p = get_p(x, w)
        submission.write('%s,%f\n' % (Id, p))
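# A minimal standalone sketch of the hashing trick that get_x() implements
# above; the row below is made up for illustration, and D_demo matches the
# script's D = 2 ** 20 setting.
D_demo = 2 ** 20
row_demo = {'I1': '357', 'C2': 'a8e7'}
x_demo = [0]  # index 0 is reserved for the bias term
for key, value in row_demo.items():
    # hash each (column, value) pair into one of D_demo buckets, so no
    # explicit feature dictionary is ever stored
    x_demo.append(int(value + key[1:], 16) % D_demo)
print(x_demo)  # [0, 13681, 691826] -- the weight indices this row activates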
-------------------------------------------------------------------------------- /Logistic-Regression/classifier_corrected.py: --------------------------------------------------------------------------------
#https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4797/starter-code-in-python-with-scikit-learn-auc-885

""" Amazon Access Challenge Starter Code

These files provide some starter code using the scikit-learn library.
They give examples of how to design a simple algorithm: pre-processing,
training a logistic regression classifier on the data, assessing its
performance through cross-validation, and some pointers on where to go
next.

Paul Duan
"""

from __future__ import division

import numpy as np
from sklearn import (metrics, cross_validation, linear_model, preprocessing)

SEED = 42  # always use a seed for randomized procedures


def load_data(filename, use_labels=True):
    """
    Load data from CSV files and return them as numpy arrays.
    The use_labels parameter indicates whether one should
    read the first column (containing class labels). If False,
    return all 0s.
    """

    # load columns 1 to 8 (ignore the last one)
    data = np.loadtxt(open("data/" + filename), delimiter=',',
                      usecols=range(1, 9), skiprows=1)
    if use_labels:
        labels = np.loadtxt(open("data/" + filename), delimiter=',',
                            usecols=[0], skiprows=1)
    else:
        labels = np.zeros(data.shape[0])
    return labels, data


def save_results(predictions, filename):
    """Given a vector of predictions, save results in CSV format."""
    with open(filename, 'w') as f:
        f.write("id,ACTION\n")
        for i, pred in enumerate(predictions):
            f.write("%d,%f\n" % (i + 1, pred))


def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.
    y and X will be numpy array objects.
    """
    model = linear_model.LogisticRegression(C=3)  # the classifier we'll use

    # === load data in memory === #
    print("loading data")
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)
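    # A rough illustration (not from the original) of what the encoder does,
    # assuming the integer-category OneHotEncoder API used above: each column
    # expands into one binary indicator column per distinct category, e.g.
    #     [[0, 1],          [[1, 0, 0, 0, 1],
    #      [1, 0],    -->    [0, 1, 0, 1, 0],
    #      [2, 1]]           [0, 0, 1, 0, 1]]
    # (the first three output columns encode column 0's categories {0, 1, 2},
    #  the last two encode column 1's categories {0, 1})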
    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print("AUC (fold %d/%d): %f" % (i + 1, n, roc_auc))
        mean_auc += roc_auc

    print("Mean AUC: %f" % (mean_auc/n))

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    model.fit(X, y)
    preds = model.predict_proba(X_test)[:, 1]
    filename = raw_input("Enter name for submission file: ")
    save_results(preds, filename + ".csv")

if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- /Logistic-Regression/logistic_regression_updated.py: --------------------------------------------------------------------------------
#https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4838/python-code-to-achieve-0-90-auc-with-logistic-regression

__author__ = 'Miroslaw Horbal'
__email__ = 'miroslaw@gmail.com'
__date__ = '14-06-2013'

from numpy import array, hstack
from sklearn import metrics, cross_validation, linear_model
from scipy import sparse
from itertools import combinations

import numpy as np
import pandas as pd

SEED = 25

def group_data(data, degree=3, hash=hash):
    """
    numpy.array -> numpy.array

    Groups the columns of data into all combinations of `degree` columns
    (triples by default) and hashes each combined tuple of values.
    """
    new_data = []
    m, n = data.shape
    for indices in combinations(range(n), degree):
        new_data.append([hash(tuple(v)) for v in data[:, indices]])
    return array(new_data).T

def OneHotEncoder(data, keymap=None):
    """
    OneHotEncoder takes a data matrix with categorical columns and
    converts it to a sparse binary matrix.

    Returns the sparse binary matrix and a keymap mapping categories to
    indices. If a keymap is supplied on input, it will be used instead of
    creating one, and any categories appearing in the data that are not in
    the keymap are ignored.
    """
    if keymap is None:
        keymap = []
        for col in data.T:
            uniques = set(list(col))
            keymap.append(dict((key, i) for i, key in enumerate(uniques)))
    total_pts = data.shape[0]
    outdat = []
    for i, col in enumerate(data.T):
        km = keymap[i]
        num_labels = len(km)
        spmat = sparse.lil_matrix((total_pts, num_labels))
        for j, val in enumerate(col):
            if val in km:
                spmat[j, km[val]] = 1
        outdat.append(spmat)
    outdat = sparse.hstack(outdat).tocsr()
    return outdat, keymap

def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' % (i+1, p))
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print('Saved')

# This loop is essentially from Paul's starter code
def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
                        X, y, test_size=.20,
                        random_state=i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]
        auc = metrics.roc_auc_score(y_cv, preds)
        print("AUC (fold %d/%d): %f" % (i + 1, N, auc))
        mean_auc += auc
    return mean_auc/N

def main(train='train.csv', test='test.csv', submit='logistic_pred.csv'):
    print("Reading dataset...")
    train_data = pd.read_csv(train)
    test_data = pd.read_csv(test)
    all_data = np.vstack((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:-1]))

    num_train = np.shape(train_data)[0]

    # Transform data
    print("Transforming data...")
    dp = group_data(all_data, degree=2)
    dt = group_data(all_data, degree=3)
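    # What the two calls above produce, roughly: for a row with columns
    # (a, b, c, d), degree=2 yields hash((a, b)), hash((a, c)), hash((a, d)),
    # hash((b, c)), hash((b, d)) and hash((c, d)) -- one new categorical
    # column per pair of original columns -- and degree=3 does the same for
    # every triple. The hashed tuples act as interaction features between
    # the original categorical variables.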
    y = array(train_data.ACTION)
    X = all_data[:num_train]
    X_2 = dp[:num_train]
    X_3 = dt[:num_train]

    X_test = all_data[num_train:]
    X_test_2 = dp[num_train:]
    X_test_3 = dt[num_train:]

    X_train_all = np.hstack((X, X_2, X_3))
    X_test_all = np.hstack((X_test, X_test_2, X_test_3))
    num_features = X_train_all.shape[1]

    model = linear_model.LogisticRegression()

    # Xts holds one hot encodings for each individual feature in memory
    # speeding up feature selection
    Xts = [OneHotEncoder(X_train_all[:, [i]])[0] for i in range(num_features)]

    print("Performing greedy feature selection...")
    score_hist = []
    N = 10
    good_features = set([])
    # Greedy feature selection loop: keep adding the single feature that
    # improves mean AUC the most, until the score stops improving
    while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
        scores = []
        for f in range(len(Xts)):
            if f not in good_features:
                feats = list(good_features) + [f]
                Xt = sparse.hstack([Xts[j] for j in feats]).tocsr()
                score = cv_loop(Xt, y, model, N)
                scores.append((score, f))
                print("Feature: %i Mean AUC: %f" % (f, score))
        good_features.add(sorted(scores)[-1][1])
        score_hist.append(sorted(scores)[-1])
        print("Current features: %s" % sorted(list(good_features)))

    # Remove the last added feature from good_features (it is the one that
    # failed to improve the score and ended the loop)
    good_features.remove(score_hist[-1][1])
    good_features = sorted(list(good_features))
    print("Selected features %s" % good_features)

    print("Performing hyperparameter selection...")
    # Hyperparameter selection loop
    score_hist = []
    Xt = sparse.hstack([Xts[j] for j in good_features]).tocsr()
    Cvals = np.logspace(-4, 4, 15, base=2)
    for C in Cvals:
        model.C = C
        score = cv_loop(Xt, y, model, N)
        score_hist.append((score, C))
        print("C: %f Mean AUC: %f" % (C, score))
    bestC = sorted(score_hist)[-1][1]
    print("Best C value: %f" % (bestC))

    print("Performing One Hot Encoding on entire dataset...")
    Xt = np.vstack((X_train_all[:, good_features], X_test_all[:, good_features]))
    Xt, keymap = OneHotEncoder(Xt)
    X_train = Xt[:num_train]
    X_test = Xt[num_train:]

    print("Training full model...")
    model.C = bestC  # use the best C found during hyperparameter selection
    model.fit(X_train, y)

    print("Making prediction and saving results...")
    preds = model.predict_proba(X_test)[:, 1]
    create_test_submission(submit, preds)

if __name__ == "__main__":
    args = {'train': 'train.csv',
            'test': 'test.csv',
            'submit': 'logistic_regression_pred.csv'}
    main(**args)
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# Python Data Science Tutorials
- This repo contains a curated list of Python tutorials for Data Science, NLP and Machine Learning.

- [**Curated list of R tutorials for Data Science, NLP and Machine Learning**](https://github.com/ujjwalkarn/DataScienceR).

- [Comprehensive topic-wise list of Machine Learning and Deep Learning tutorials, codes, articles and other resources](https://github.com/ujjwalkarn/Machine-Learning-Tutorials/blob/master/README.md).
7 | 8 | ## The Python Language 9 | - [Python 3 in one picture](https://fossbytes.com/wp-content/uploads/2015/09/python-3-in-one-pic.png) 10 | - [**Awesome Python**](https://github.com/vinta/awesome-python) 11 | - [**Jargon from the functional programming world in simple terms!**](https://github.com/hemanth/functional-programming-jargon) 12 | - [**Dive Into Python**](http://www.diveintopython.net/index.html) 13 | - [Learn Python Wiki on Reddit](https://www.reddit.com/r/learnpython/wiki/index) 14 | - [Learn 90% of Python in 90 Minutes](https://www.slideshare.net/MattHarrison4/learn-90) 15 | - [Highest Voted Python Questions](http://stackoverflow.com/questions/tagged/python?sort=votes&pageSize=50) 16 | - [Python Basic Concepts](https://github.com/gumption/Python_for_Data_Science/blob/master/3_Python_Basic_Concepts.ipynb) 17 | - [Quick Reference to Python](http://www.dataschool.io/python-quick-reference/) 18 | - [The Elements of Python Style](https://github.com/amontalenti/elements-of-python-style) 19 | - [**What does the yield keyword do in Python?**](http://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do-in-python) 20 | - [Parsing values from a JSON file in Python](http://stackoverflow.com/questions/2835559/parsing-values-from-a-json-file-in-python) 21 | - [**Python Quora FAQs**](https://www.quora.com/topic/Python-programming-language-1) 22 | - [time-complexity of various operations - list/dict - in current CPython](https://wiki.python.org/moin/TimeComplexity) 23 | - Scripting in Python 24 | - [Python Scripting Tutorial](http://www.dreamsyssoft.com/python-scripting-tutorial/intro-tutorial.php) 25 | - [Scripting with Python](https://www.schrodinger.com//AcrobatFile.php?type=supportdocs&type2=&ident=404) 26 | - [**Can I use Python as a bash replacement?**](http://stackoverflow.com/questions/209470/can-i-use-python-as-a-bash-replacement) 27 | 28 | ## Useful Online Courses 29 | - [Learn Python (Codecademy)](https://www.codecademy.com/learn/python#) 30 | - [Free Interactive Course: Intro to Python for Data Science (DataCamp)](https://www.datacamp.com/courses/intro-to-python-for-data-science) 31 | - [Introduction to Computer Science and Programming Using Python (MIT)](https://www.edx.org/course/introduction-computer-science-mitx-6-00-1x-11) 32 | - [Python for Everybody](https://www.coursera.org/learn/python) 33 | - [Python Programming Essentials](https://www.coursera.org/learn/python-programming) 34 | 35 | ## Data Science with Python 36 | - [**Data Science IPython Notebooks**](https://github.com/donnemartin/data-science-ipython-notebooks) 37 | - [Awesome Python - Data Analysis](https://github.com/vinta/awesome-python#science-and-data-analysis) 38 | - Statistics 39 | - [Statistics and Data Science](https://github.com/svaksha/pythonidae/blob/master/Statistics.md) 40 | - [**An Introduction to Scientific Python (and a Bit of the Maths Behind It) – NumPy**](http://www.kdnuggets.com/2016/06/intro-scientific-python-numpy.html) 41 | - [Data Analysis and IPython Notebooks](https://github.com/kirang89/pycrumbs#data-analysis) 42 | - [Python for Data Science: Basic Concepts](https://github.com/gumption/Python_for_Data_Science/blob/master/2_Data_Science_Basic_Concepts.ipynb) 43 | - [Pycon India 2015 Notes](http://www.analyticsvidhya.com/blog/2015/10/notes-impressions-experience-excitement-pycon-india-2015/) 44 | - [**5 important Python Data Science advancements of 2015**](https://medium.com/@elgehelge/the-5-most-important-python-data-science-advancements-of-2015-a136482da89b#.sp2c1la9z) 45 | 
- [Data Exploration with Numpy cheat sheet](http://www.analyticsvidhya.com/blog/2015/07/11-steps-perform-data-analysis-pandas-python) 46 | - [Querying Craiglist with Python](http://chrisholdgraf.com/querying-craigslist-with-python/?imm_mid=0d8940&cmp=em-data-na-na-newsltr_20150916) 47 | - [**An introduction to Numpy and Scipy**](http://www.engr.ucsb.edu/~shell/che210d/numpy.pdf) 48 | - [Create NBA Shot Charts](http://savvastjortjoglou.com/nba-shot-sharts.html) 49 | - [PythoR- Python meets R](http://nipunbatra.github.io/2016/01/pythor/) 50 | - [**How do I learn data analysis with Python?**](https://www.quora.com/How-do-I-learn-data-analysis-with-Python?redirected_qid=2464720) 51 | - [What are some interesting things to do with Python?](https://www.quora.com/Python-programming-language-What-are-some-interesting-things-to-do-with-Python?redirected_qid=2324227) 52 | - [**Which is better for data analysis: R or Python?**](https://www.quora.com/Which-is-better-for-data-analysis-R-or-Python) 53 | - [**Web scraping in Python**](https://github.com/ujjwalkarn/Web-Scraping) 54 | - [The Guide to Learning Python for Data Science](http://www.datasciencecentral.com/profiles/blogs/the-guide-to-learning-python-for-data-science-2) 55 | - [Python For Data Science - A Cheat Sheet For Beginners](https://www.datacamp.com/community/tutorials/python-data-science-cheat-sheet-basics) 56 | - [Top voted Python data science questions](http://datascience.stackexchange.com/questions/tagged/python) 57 | - [Awesome Python - Data Visualization](https://github.com/vinta/awesome-python#data-visualization) 58 | - [Awesome Python - Map Reduce](https://github.com/vinta/awesome-python#mapreduce) 59 | 60 | ## Pandas Library in Python 61 | - [Intro to pandas data structures](http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/) 62 | - [Useful Pandas Cheatsheet](https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf) 63 | - [An Introduction to Scientific Python – Pandas](http://www.datadependence.com/2016/05/scientific-python-pandas/) 64 | - [10 minutes to Pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html) 65 | - [Useful Pandas Snippets](http://www.swegler.com/becky/blog/2014/08/06/useful-pandas-snippets/) 66 | - [Timeseries analysis using Pandas](http://nbviewer.jupyter.org/github/twiecki/financial-analysis-python-tutorial/blob/master/1.%20Pandas%20Basics.ipynb) 67 | - [Pandas Exercises - Practice your Pandas skills](https://github.com/guipsamora/pandas_exercises) 68 | - [Grouping in Pandas](http://blog.yhat.com/posts/grouping-pandas.html) 69 | - [**“Large data” work flows using pandas**](http://stackoverflow.com/questions/14262433/large-data-work-flows-using-pandas) 70 | - [Easier data analysis with pandas (video series)](http://www.dataschool.io/easier-data-analysis-with-pandas/) 71 | - [Pandas Basics Cheat Sheet](https://www.datacamp.com/community/blog/python-pandas-cheat-sheet) 72 | - Quick Operations on a Pandas DataFrame 73 | - [Renaming Columns in Pandas](http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas) ([video](https://www.youtube.com/watch?v=0uBirYFhizE&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=5)) 74 | - [Deleting Columns from pandas DataFrame](http://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe) ([video](https://www.youtube.com/watch?v=gnUKkS964WQ&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=6)) 75 | - [Adding new Column to existing 
DataFrame](http://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas) 76 | - [Add one Row in a pandas.DataFrame](http://stackoverflow.com/questions/10715965/add-one-row-in-a-pandas-dataframe) 77 | - [Changing the order of DataFrame Columns](http://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns) 78 | - [Changing data type of Columns](http://stackoverflow.com/questions/15891038/pandas-change-data-type-of-columns) ([video](https://www.youtube.com/watch?v=V0AWyzVMf54&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=13)) 79 | - [Getting a list of the column headers from a DataFrame](http://stackoverflow.com/questions/19482970/get-list-from-pandas-dataframe-column-headers) 80 | - [Converting list of dictionaries to Dataframe](http://stackoverflow.com/questions/20638006/convert-list-of-dictionaries-to-dataframe) 81 | - [Getting row count of pandas DataFrame](http://stackoverflow.com/questions/15943769/how-to-get-row-count-of-pandas-dataframe) 82 | - [Most efficient way to loop through DataFrames](http://stackoverflow.com/questions/7837722/what-is-the-most-efficient-way-to-loop-through-dataframes-with-pandas) 83 | - [Deleting DataFrame row based on column value](http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value) 84 | - [Dropping a list of rows from Pandas DataFrame](http://stackoverflow.com/questions/14661701/how-to-drop-a-list-of-rows-from-pandas-dataframe) 85 | - [Sorting a DataFrame or a single column](https://www.youtube.com/watch?v=zY4doF6xSxY&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=7) 86 | - [Filtering DataFrame rows by column value](https://www.youtube.com/watch?v=2AFGPdNn4FM&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=8) 87 | - [Filtering DataFrame rows using multiple criteria](https://www.youtube.com/watch?v=YPItfQ87qjM&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=9) 88 | - [Dropping all non-numeric columns from a DataFrame](https://youtu.be/B-r9VuK80dk?t=4m31s) 89 | - [Counting and removing missing values](https://www.youtube.com/watch?v=fCMrO_VzeL8&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=16) 90 | - [Selecting multiple rows and columns from a DataFrame](https://www.youtube.com/watch?v=xvpNA7bC8cs&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=19) 91 | - [Reducing the size of a DataFrame](https://www.youtube.com/watch?v=wDYDYGyN_cw&list=PL5-da3qGB5ICCsgW1MxlZ0Hq8LL5U3u9y&index=21) 92 | 93 | ## Machine Learning with Python 94 | - [AI, ML Related List](https://github.com/svaksha/pythonidae/blob/master/AI.md) 95 | - [Data Normalization in Python](http://blog.yhat.com/posts/data-normalization-in-python.html) 96 | - [**Python Machine Learning Book**](https://github.com/rasbt/python-machine-learning-book) 97 | - [Table of Contents and Code Notebooks](https://github.com/rasbt/python-machine-learning-book/blob/master/README.md#table-of-contents-and-code-notebooks) 98 | - [Machine Learning with scikit learn](http://www.dataschool.io/machine-learning-with-scikit-learn/) 99 | - [Machine Learning Algorithms Cheatsheet](http://www.analyticsvidhya.com/blog/2015/09/full-cheatsheet-machine-learning-algorithms/) 100 | - [**How to compute precision, recall, accuracy and f1-score for the multiclass case with scikit learn?**](http://stackoverflow.com/questions/31421413/how-to-compute-precision-recall-accuracy-and-f1-score-for-the-multiclass-case) 101 | - [One Hot Encoding for Machine learning in 
Python](http://stackoverflow.com/questions/17469835/one-hot-encoding-for-machine-learning) 102 | - [**Building a (semi) Autonomous Drone with Python**](http://blog.yhat.com/posts/autonomous-droning-with-python.html) 103 | - [Awesome Python - Machine Learning](https://github.com/vinta/awesome-python#machine-learning) 104 | - Computer Vision 105 | - [Awesome Python - Computer Vision](https://github.com/vinta/awesome-python#computer-vision) 106 | 107 | ## Scikit Learn 108 | - [scikit learn on Wikipedia](https://en.wikipedia.org/wiki/Scikit-learn) 109 | - [**Introduction to machine learning with scikit-learn**](https://github.com/justmarkham/scikit-learn-videos), [**Videos!**](http://blog.kaggle.com/author/kevin-markham/) 110 | - [**A Gentle Introduction to Scikit-Learn: A Python Machine Learning Library**](http://machinelearningmastery.com/a-gentle-introduction-to-scikit-learn-a-python-machine-learning-library/) 111 | - [**PyData Seattle 2015 Scikit-learn Tutorial**](https://github.com/jakevdp/sklearn_pydata2015), [sklearn_scipy2013](https://github.com/jakevdp/sklearn_scipy2013) 112 | - [SKLEARN BENCHMARKS: A centralized repository to report scikit-learn model performance across a variety of parameter settings and data sets](https://github.com/rhiever/sklearn-benchmarks), [Report results of sklearn benchmarks at openml.org](http://www.openml.org/) 113 | - [How to get most informative features for scikit-learn classifiers?](http://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers) 114 | - [**Code example to predict prices of Airbnb vacation rentals, using scikit-learn on Spark**](https://github.com/mapr-demos/spark-sklearn-airbnb-predict) 115 | - [**Machine Learning with scikit learn tutorial**](http://amueller.github.io/sklearn_tutorial/) 116 | - [Parallel and Large Scale Machine Learning with scikit-learn](https://speakerdeck.com/ogrisel/parallel-and-large-scale-machine-learning-with-scikit-learn), [Meetup](http://datasciencelondon.org/machine-learning-python-scikit-learn-ipython-dsldn-data-science-london-kaggle/) 117 | - [Saving classifier to disk in scikit-learn](http://stackoverflow.com/questions/10592605/save-classifier-to-disk-in-scikit-learn) 118 | 119 | 120 | ## Linear Regression in Python 121 | - [Linear Regression in Python](http://nbviewer.ipython.org/github/justmarkham/DAT4/blob/master/notebooks/08_linear_regression.ipynb), [Blog Post](http://www.dataschool.io/linear-regression-in-python/) 122 | - [Linear Regression using Scikit Learn](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html) 123 | - [A friendly introduction to linear regression (using Python)](http://www.dataschool.io/linear-regression-in-python/) 124 | - [Linear Regression Example in Python](http://scipy-cookbook.readthedocs.io/items/LinearRegression.html) 125 | - [Regression analysis using Python StatsModels package](http://www.turingfinance.com/regression-analysis-using-python-statsmodels-and-quandl/) 126 | - [Run an OLS regression with Pandas Data Frame](http://stackoverflow.com/questions/19991445/run-an-ols-regression-with-pandas-data-frame) 127 | 128 | ## Logistic Regression in Python 129 | - [Logistic Regression with scikit learn](http://www.dataschool.io/logistic-regression-in-python-using-scikit-learn/) 130 | - [Logistic Regression in Python](http://blog.yhat.com/posts/logistic-regression-and-python.html) 131 | - [Implementing the softmax function in 
Python](http://stackoverflow.com/questions/34968722/softmax-function-python) 132 | - [**What is the inverse of regularization strength in Logistic Regression? How should it affect my code?**](http://stackoverflow.com/questions/22851316/what-is-the-inverse-of-regularization-strength-in-logistic-regression-how-shoul) 133 | - [The Yhat Blog: Logistic Regression in Python](http://blog.yhat.com/posts/logistic-regression-and-python.html) 134 | - [Example of logistic regression in Python using scikit-learn](http://www.dataschool.io/logistic-regression-in-python-using-scikit-learn/) 135 | - [TUTORIAL ON LOGISTIC REGRESSION AND OPTIMIZATION IN PYTHON](https://learningwithdata.wordpress.com/2015/04/30/tutorial-on-logistic-regression-and-optimization-in-python/) 136 | - [Using Logistic Regression in Python for Data Science](http://www.dummies.com/how-to/content/using-logistic-regression-in-python-for-data-scien.html) 137 | 138 | ## k Nearest Neighbours in Python 139 | - [A good tutorial on implementing K Nearest Neighbors using scikit learn](http://scikit-learn.org/stable/modules/neighbors.html) 140 | - [**Is it possible to specify your own distance function using scikit-learn K-Means Clustering?**](http://stackoverflow.com/questions/5529625/is-it-possible-to-specify-your-own-distance-function-using-scikit-learn-k-means) 141 | - [Tutorial To Implement k-Nearest Neighbors in Python From Scratch](http://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/) 142 | - [Implementing your own k-nearest neighbour algorithm using Python](https://blog.cambridgecoding.com/2016/01/16/machine-learning-under-the-hood-writing-your-own-k-nearest-neighbour-algorithm/) 143 | - [knn Python implementation on StackOverflow](http://stackoverflow.com/questions/5565935/k-nearest-neighbour-in-python) 144 | - [kNN with big sparse matrices in Python](http://stackoverflow.com/questions/20333092/knn-with-big-sparse-matrices-in-python) 145 | - [Sklearn kNN usage with a user defined metric](http://stackoverflow.com/questions/21052509/sklearn-knn-usage-with-a-user-defined-metric) 146 | 147 | 148 | ## Neural Networks in Python 149 | - [Implementing a Neural Network from scratch in Python](http://www.wildml.com/2015/09/implementing-a-neural-network-from-scratch/), [Code](https://github.com/dennybritz/nn-from-scratch) 150 | - [A Neural Network in 11 lines of Python](http://iamtrask.github.io/2015/07/12/basic-python-network/) 151 | - [Speeding up your Neural Network with Theano and the gpu](http://www.wildml.com/2015/09/speeding-up-your-neural-network-with-theano-and-the-gpu/), [Code](https://github.com/dennybritz/nn-theano) 152 | - [What is the best neural network library for Python?](https://www.quora.com/What-is-the-best-neural-network-library-for-Python) 153 | - [Recurrent Neural Net Tutorial in Python Part 1](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-1-introduction-to-rnns/), [Part 2](http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/), [Code](https://github.com/dennybritz/rnn-tutorial-rnnlm/) 154 | - [PyBrain: modular Machine Learning Library for Python](http://pybrain.org/) 155 | - [Neural Networks Tutorial – a Pathway to Deep Learning](http://www.adventuresinmachinelearning.com/neural-networks-tutorial/) 156 | 157 | 158 | ## Decision Trees in Python 159 | - [How to extract the decision rules from scikit-learn 
decision-tree?](http://stackoverflow.com/questions/20224526/how-to-extract-the-decision-rules-from-scikit-learn-decision-tree) 160 | - [**How do I find which attributes my tree splits on, when using scikit-learn?**](http://stackoverflow.com/questions/20156951/how-do-i-find-which-attributes-my-tree-splits-on-when-using-scikit-learn) 161 | - [Quora: What is a good Python library for decision trees?](https://www.quora.com/What-is-a-good-Python-library-for-decision-trees), [StackOverflow](http://stackoverflow.com/questions/3127922/what-is-a-good-python-library-for-decision-trees) 162 | - [Building Decision Trees in Python](http://www.onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html?page=1) 163 | - [Pure Python Decision Trees](http://kldavenport.com/pure-python-decision-trees/) 164 | - [Building a decision tree from scratch in Python - a beginner's tutorial](http://www.patricklamle.com/Tutorials/Decision%20tree%20python/tuto_decision%20tree.html) 165 | - [Using Python to Build and Use a Simple Decision Tree Classifier](https://github.com/gumption/Python_for_Data_Science/blob/master/4_Python_Simple_Decision_Tree.ipynb) 166 | - [Decision trees in python with scikit-learn and pandas](http://chrisstrelioff.ws/sandbox/2015/06/08/decision_trees_in_python_with_scikit_learn_and_pandas.html) 167 | - [Code for simple decision tree in Python](https://github.com/gumption/Python_for_Data_Science/blob/master/simple_decision_tree.py) 168 | - [Lesson notebook: Regression and Classification Trees](http://nbviewer.jupyter.org/github/justmarkham/DAT8/blob/master/notebooks/17_decision_trees.ipynb) 169 | - [Discover structure behind data with decision trees](http://vooban.com/en/tips-articles-geek-stuff/discover-structure-behind-data-with-decision-trees/) 170 | 171 | ## Random Forest with Python 172 | - [Getting Started with Random Forests: Titanic Competition on Kaggle](https://www.kaggle.com/c/titanic/details/getting-started-with-random-forests), [Python sample code](https://www.kaggle.com/c/digit-recognizer/forums/t/2299/getting-started-python-sample-code-random-forest) 173 | - [RandomForestClassifier vs ExtraTreesClassifier in scikit learn](http://stackoverflow.com/questions/22409855/randomforestclassifier-vs-extratreesclassifier-in-scikit-learn) 174 | - [Powerful Guide to learn Random Forest](http://www.analyticsvidhya.com/blog/2015/09/random-forest-algorithm-multiple-challenges/) 175 | - [How are Feature Importances in RandomForestClassifier determined?](http://stackoverflow.com/questions/15810339/how-are-feature-importances-in-randomforestclassifier-determined) 176 | - [Random forest interpretation with scikit-learn](http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/) 177 | - [Random Forests in Python Tutorial](http://blog.yhat.com/posts/random-forests-in-python.html) 178 | - [Unbalanced classification using RandomForestClassifier in sklearn](http://stackoverflow.com/questions/20082674/unbalanced-classification-using-randomforestclassifier-in-sklearn) 179 | - [Random Forest with categorical features in sklearn](http://stackoverflow.com/questions/24715230/random-forest-with-categorical-features-in-sklearn) 180 | - [How to output RandomForest Classifier from python?](http://stackoverflow.com/questions/23000693/how-to-output-randomforest-classifier-from-python) 181 | - [Lesson notebook: Ensembling, Bagging, and Random Forests](http://nbviewer.jupyter.org/github/justmarkham/DAT8/blob/master/notebooks/18_ensembling.ipynb) 182 | 183 | ## Support Vector Machine in Python 184 | - 
[Fastest SVM implementation usable in Python](http://stackoverflow.com/questions/9299346/fastest-svm-implementation-usable-in-python) 185 | - [An example using python bindings for SVM library, LIBSVM](http://stackoverflow.com/questions/4214868/an-example-using-python-bindings-for-svm-library-libsvm) 186 | - [What is the best SVM library usable from Python?](https://www.quora.com/What-is-the-best-SVM-library-usable-from-Python) 187 | - [How does sklearn.svm.svc's function predict_proba() work internally?](http://stackoverflow.com/questions/15111408/how-does-sklearn-svm-svcs-function-predict-proba-work-internally) 188 | - [Support vector machine in Python using libsvm example of features](http://stackoverflow.com/questions/30991592/support-vector-machine-in-python-using-libsvm-example-of-features) 189 | - [Linear SVC Machine learning SVM example with Python](https://pythonprogramming.net/linear-svc-example-scikit-learn-svm-python/) 190 | - [Understanding Support Vector Machine algorithm from examples (along with code)](http://www.analyticsvidhya.com/blog/2015/10/understaing-support-vector-machine-example-code/) 191 | 192 | ## NLP / Text Mining in Python 193 | - [**NLP with Python ORiley Book**](http://www.nltk.org/book_1ed/), [Python 3](http://www.nltk.org/book/) 194 | - [Awesome Python - NLP](https://github.com/vinta/awesome-python#natural-language-processing) 195 | - [Awesome Python - Text Processing](https://github.com/vinta/awesome-python#text-processing) 196 | - [Text Analytics : Intro and Tokenization](http://a4analytics.blogspot.sg/2015/03/text-mining-post-1.html) 197 | - [NLTK BOOK](http://www.nltk.org/book/ch01.html) 198 | - [Elegant N-gram Generation in Python](http://locallyoptimal.com/blog/2013/01/20/elegant-n-gram-generation-in-python/) 199 | - [**Computing N Grams using Python**](http://stackoverflow.com/questions/13423919/computing-n-grams-using-python) 200 | - [N-grams: Explanation + 2 applications](http://stackoverflow.com/questions/1032288/n-grams-explanation-2-applications) 201 | - [NLP Tutorial with Python](http://www.datasciencecentral.com/profiles/blogs/python-nlp-tools) 202 | 203 | ## Sentiment Analysis with Python 204 | - [A Comprehensive Guide to Sentiment Analysis](https://monkeylearn.com/sentiment-analysis/) 205 | - [Twitter-Sentiment-Analysis](https://github.com/ujjwalkarn/Twitter-Sentiment-Analysis) 206 | - [Basic Sentiment Analysis with Python](http://fjavieralba.com/basic-sentiment-analysis-with-python.html) 207 | - [What is the best way to do Sentiment Analysis with Python?](https://www.quora.com/What-is-the-best-way-to-do-Sentiment-Analysis-with-Python-1) 208 | - [How to Calculate Twitter Sentiment Using AlchemyAPI with Python](http://www.alchemyapi.com/developers/getting-started-guide/twitter-sentiment-analysis) 209 | - [Second Try: Sentiment Analysis in Python](http://andybromberg.com/sentiment-analysis-python/) 210 | - [Sentiment Analysis with Python NLTK Text Classification](http://text-processing.com/demo/sentiment/) 211 | - Codes and Explanation 212 | - [**Sentiment Analysis with bag-of-words**](http://ataspinar.com/2016/01/21/sentiment-analysis-with-bag-of-words/) 213 | - [**Sentiment Analysis with Naive Bayes**](http://ataspinar.com/2016/02/15/sentiment-analysis-with-the-naive-bayes-classifier/) 214 | 215 | ## Pickle: convert a python object into a character stream 216 | - [Python serialization - Why pickle?](http://stackoverflow.com/questions/8968884/python-serialization-why-pickle) 217 | - [**Serializing Python 

## AutoML
- [TPOT: A Python tool for automating data science](http://www.randalolson.com/2016/05/08/tpot-a-python-tool-for-automating-data-science/), [GitHub repo](https://github.com/rhiever/tpot)

## Regex Related
- [RegExr](http://regexr.com/)
- [Regex101](https://regex101.com/)
- [Pythex](http://pythex.org/)
- [How to use Regular Expressions (Regex) in Microsoft Excel both in-cell and loops](http://stackoverflow.com/questions/22542834/how-to-use-regular-expressions-regex-in-microsoft-excel-both-in-cell-and-loops)
- [Advanced Filters: Excel’s Amazing Alternative To Regex](http://searchengineland.com/advanced-filters-excels-amazing-alternative-to-regex-143680)

## Shell Scripting
- [**Calling an external command in Python**](http://stackoverflow.com/questions/89228/calling-an-external-command-in-python)
- [**Running shell command from Python and capturing the output**](http://stackoverflow.com/questions/4760215/running-shell-command-from-python-and-capturing-the-output) (see the sketch after this list)
- [**Can I use Python as a bash replacement?**](http://stackoverflow.com/questions/209470/can-i-use-python-as-a-bash-replacement)
- [Python Scripts as a Replacement for Bash Utility Scripts](http://www.linuxjournal.com/content/python-scripts-replacement-bash-utility-scripts)
- [How to Write a Shell Script using Bash Shell in Ubuntu](https://www.youtube.com/watch?v=He-5BpUGSag)
- Red Hat Magazine: Python for Bash scripters - a well-kept secret
- [Embed bash in python](http://stackoverflow.com/questions/2651874/embed-bash-in-python)
- [Bash2py: A Bash to Python Translator](https://cs.uwaterloo.ca/~ijdavis/bash2py-final.pdf)
- [Beginners/BashScripting](https://help.ubuntu.com/community/Beginners/BashScripting)
- [The Beginner’s Guide to Shell Scripting: The Basics](http://www.howtogeek.com/67469/the-beginners-guide-to-shell-scripting-the-basics/)
- [Linux Shell Scripting Tutorial v1.05r3: A Beginner's Handbook](http://www.freeos.com/guides/lsst/)
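
The first two links above explain how to call an external command from Python and capture its output; here is a minimal sketch with the standard-library `subprocess` module (the `ls -l` command is only an illustration):

```python
import subprocess

# run a command and capture its stdout as text
# (check_output raises CalledProcessError on a non-zero exit status)
out = subprocess.check_output(["ls", "-l"], universal_newlines=True)
print(out)

# for full control over the process and both streams, use Popen directly
p = subprocess.Popen(["ls", "-l"], stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE, universal_newlines=True)
stdout, stderr = p.communicate()
print("return code: %d" % p.returncode)
```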

## Other good lists
- [pycrumbs - bits and bytes of Python from the Internet](https://github.com/kirang89/pycrumbs)
- [python-github-projects - collect and classify Python projects on GitHub](https://github.com/checkcheckzz/python-github-projects)
- [python_reference - useful functions, tutorials, and other Python-related things](https://github.com/rasbt/python_reference)
- [pythonidae - curated decibans of scientific programming resources in Python](https://github.com/svaksha/pythonidae)
--------------------------------------------------------------------------------
/Twitter-Data-Analysis/extract_twitter_data.py:
--------------------------------------------------------------------------------
"""
Created on Sun Oct 04 23:10:41 2015
@author: ujjwal.karn
"""

# First, install pip by following the instructions here:
# http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows
# Then, to install the tweepy library, open the Anaconda command prompt and type: pip install tweepy
# Once tweepy is installed, run the code below.

import tweepy  # this import fails if tweepy is not installed properly
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# provide your access details below
access_token = "xxxxxxxx"
access_token_secret = "xxxxxxxx"
consumer_key = "xxxxxxxx"
consumer_secret = "xxxxxxxx"

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

class MyListener(StreamListener):
    """Listener that appends each incoming tweet (raw JSON) to a text file."""

    def on_data(self, data):
        try:
            with open('location/file_name.txt', 'a') as f:  # change the location here
                f.write(data)
            return True
        except BaseException as e:
            print("Error on_data: %s" % str(e))
        return True

    def on_error(self, status):
        print(status)
        return True

twitter_stream = Stream(auth, MyListener())

# change the keyword to track here
twitter_stream.filter(track=['#cricket'])
--------------------------------------------------------------------------------
/Twitter-Data-Analysis/json2tweets.R:
--------------------------------------------------------------------------------
library(jsonlite)
options(encoding = "UTF-8")

# read in individual JSON lines
json_file <- "C:\\Users\\ujjwal.karn\\Desktop\\Tweets\\python.json"

# turn them into a proper JSON array by separating each object with a ","
# and wrapping the result in "[]"
dat <- fromJSON(sprintf("[%s]", paste(readLines(json_file), collapse = ",")))

dim(dat)
## [1] 3959 18

tweets <- dat$text
tweets
--------------------------------------------------------------------------------
/basic_commands.py:
--------------------------------------------------------------------------------
# ENUMERATE: generates an (index, item) pair for each item in a sequence
a = ['a', 'b', 'c', 'd', 'e']
for index, item in enumerate(a):
    print index, item
# 0 a
# 1 b
# 2 c
# 3 d
# 4 e


# CONVERT A LIST TO A STRING
list1 = ['1', '2', '3']
str1 = ''.join(list1)

# or, if the list holds integers, convert the elements before joining them:
list1 = [1, 2, 3]
str1 = ''.join(str(e) for e in list1)


# FIND METHOD
#
# str.find(str2, beg=0, end=len(str))
#
# Parameters:
#   str2 -- the substring to be searched for
#   beg  -- the starting index; defaults to 0
#   end  -- the ending index; defaults to the length of the string
#
# Return value: the index of the first match if found, and -1 otherwise.

str1 = "this is string example....wow!!!"
str2 = "exam"

# find returns the position of the first character of the match, if there is one
print str1.find(str2)        # 15
print str1.find(str2, 10)    # 15
print str1.find(str2, 40)    # -1
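
# a couple of related idioms (illustrative additions, reusing str1/str2 above):
# find returns -1 when the substring is absent, index raises ValueError instead,
# and for a plain membership test the "in" operator is the idiomatic choice
print str2 in str1           # True
print str1.index(str2)       # 15 (would raise ValueError if str2 were absent)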


# 2D LISTS IN PYTHON

# create a list containing 5 lists, each initialized to five 0s
Matrix = [[0 for x in range(5)] for y in range(5)]

# you can now add items to the list:
Matrix[0][0] = 1
Matrix[4][0] = 5

print Matrix[0][0]   # prints 1
print Matrix[4][0]   # prints 5

# if you have a simple two-dimensional list like this:
A = [[1, 2, 3, 4],
     [5, 6, 7, 8]]

# then you can extract a column with a helper function:
def column(matrix, i):
    return [row[i] for row in matrix]

# extracting the second column (index 1):
print column(A, 1)           # [2, 6]

# or, alternatively, simply:
print [row[1] for row in A]  # [2, 6]
--------------------------------------------------------------------------------
/svm_sklearn.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn import svm

# read the data: training features, training labels, and test features
train = np.loadtxt(open("train.csv", "rb"), delimiter=",", skiprows=0)
trainLabels = np.loadtxt(open("trainLabels.csv", "rb"), delimiter=",", skiprows=0)
test = np.loadtxt(open("test.csv", "rb"), delimiter=",", skiprows=0)

# fit a support vector classifier with default settings (RBF kernel)
X, y = train, trainLabels
s = svm.SVC()
s.fit(X, y)

# predict on the test set and write one integer label per row
predictions = s.predict(test)
np.savetxt("fancySVMSubmission.csv", predictions.astype(int), fmt='%d', delimiter=",")
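
# optional refinement (an illustrative sketch, not part of the original script):
# SVC is sensitive to feature scale, so standardizing each column using the
# training-set statistics often improves accuracy
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(train)
s2 = svm.SVC()
s2.fit(scaler.transform(train), trainLabels)
scaledPredictions = s2.predict(scaler.transform(test))
np.savetxt("scaledSVMSubmission.csv", scaledPredictions.astype(int), fmt='%d', delimiter=",")
--------------------------------------------------------------------------------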