├── README.md
├── kaggle.py
└── leaderboard.png

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Kaggle's PyCon 2015 competition

At [PyCon 2015](https://us.pycon.org/2015/), Kaggle hosted a small competition during their tutorial: [Winning Machine Learning Competitions with scikit-learn](https://us.pycon.org/2015/schedule/presentation/321/). There were 28 teams, and we had less than three hours to work on the problem.

- [Code](kaggle.py) from my first place submission
- [Tutorial repo](https://github.com/dchudz/pycon2015-kaggle-tutorial)
- [Competition page](https://inclass.kaggle.com/c/pycon-2015-tutorial)
- [Data files](https://inclass.kaggle.com/c/pycon-2015-tutorial/data)
- [Final leaderboard](https://inclass.kaggle.com/c/pycon-2015-tutorial/leaderboard)

Feel free to [contact me](http://www.dataschool.io/about/) with any questions! If you'd like to **learn how to use scikit-learn for machine learning**, I have a new [series of video tutorials](http://blog.kaggle.com/author/kevin-markham/) covering that topic, hosted on Kaggle's blog.

-----

![Leaderboard](leaderboard.png)

--------------------------------------------------------------------------------
/kaggle.py:
--------------------------------------------------------------------------------
'''
Code from my first place submission in Kaggle's PyCon 2015 competition
https://github.com/justmarkham/kaggle-pycon-2015

Kevin Markham
kevin@dataschool.io
http://dataschool.io
'''

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


## FILE READING AND FEATURE ENGINEERING

def make_features(filename):

    # read in a CSV file and define the first column as the index
    df = pd.read_csv(filename, index_col=0)

    # create a feature that represents the length of the post title
    df['TitleLength'] = df.Title.apply(len)

    # create a feature that represents the length of the body text
    df['BodyLength'] = df.BodyMarkdown.apply(len)

    # create indicator features for each of the five tag slots, then count the tags
    df['has1'] = df.Tag1.notnull().astype(int)
    df['has2'] = df.Tag2.notnull().astype(int)
    df['has3'] = df.Tag3.notnull().astype(int)
    df['has4'] = df.Tag4.notnull().astype(int)
    df['has5'] = df.Tag5.notnull().astype(int)
    df['NumTags'] = df.has1 + df.has2 + df.has3 + df.has4 + df.has5

    # convert date fields from strings to datetime objects
    df['OwnerCreationDate'] = pd.to_datetime(df.OwnerCreationDate)
    df['PostCreationDate'] = pd.to_datetime(df.PostCreationDate)

    # create a feature that represents the age of the account (in days) at the time of posting
    df['OwnerAge'] = (df.PostCreationDate - df.OwnerCreationDate).dt.days

    # return the DataFrame with the new features added
    return df

# add the same features to the training and testing data
train = make_features('train.csv')
test = make_features('test.csv')


## MODEL 1: Logistic Regression using 5 features

# create a list of features
cols = ['TitleLength', 'BodyLength', 'ReputationAtPostCreation', 'NumTags', 'OwnerAge']

# create X (feature matrix) and y (response vector)
X = train[cols]
y = train.OpenStatus

# instantiate model and fit with training data
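# editor's note, not part of the original 2015 submission: scikit-learn's
# LogisticRegression defaults have changed since this code was written (for
# example, the default solver switched from 'liblinear' to 'lbfgs' in 0.22),
# so reproducing the exact leaderboard score on a current release may require
# passing solver='liblinear' explicitly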
lr = LogisticRegression()
lr.fit(X, y)

# calculate predicted probabilities on testing data
test_probs_lr = lr.predict_proba(test[cols])[:, 1]


## MODEL 2: Naive Bayes using vectorized post title as features

# instantiate vectorizer with default settings
vect = CountVectorizer()

# create document-term matrix from the training data
train_dtm = vect.fit_transform(train.Title)

# use vocabulary learned from training data to create document-term matrix from the testing data
test_dtm = vect.transform(test.Title)

# instantiate model and fit with training document-term matrix
nb = MultinomialNB()
nb.fit(train_dtm, train.OpenStatus)

# calculate predicted probabilities on the testing document-term matrix
test_probs_nb = nb.predict_proba(test_dtm)[:, 1]


## MODEL 3: Naive Bayes using vectorized body text as features

# instantiate vectorizer with optional arguments
vect = CountVectorizer(stop_words='english', max_features=20000)

# use the same pattern as model 2
train_dtm = vect.fit_transform(train.BodyMarkdown)
test_dtm = vect.transform(test.BodyMarkdown)
nb = MultinomialNB()
nb.fit(train_dtm, train.OpenStatus)
test_probs_nb2 = nb.predict_proba(test_dtm)[:, 1]


## ENSEMBLE THE MODELS AND SUBMIT

# calculate a weighted average of the predicted probabilities (weights of 1, 4, and 1)
test_probs = (test_probs_lr + test_probs_nb*4 + test_probs_nb2)/6

# create a DataFrame to store my submission
sub = pd.DataFrame({'id':test.index, 'OpenStatus':test_probs}).set_index('id')

# write the submission to a CSV file
sub.to_csv('sub.csv')


## OTHER ASSORTED CODE (NOT USED IN BEST SUBMISSION)

# validate locally using 5-fold cross-validation
# (in scikit-learn 0.20+ this import lives in sklearn.model_selection and the scoring string is 'neg_log_loss')
from sklearn.cross_validation import cross_val_score
cross_val_score(lr, X, y, cv=5, scoring='log_loss').mean()

# Random Forest model
from sklearn.ensemble import RandomForestClassifier
rfclf = RandomForestClassifier(n_estimators=100, max_features='auto')
rfclf.fit(X, y)
test_probs_rf = rfclf.predict_proba(test[cols])[:, 1]

--------------------------------------------------------------------------------
/leaderboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-pycon-2015/d3eb029a6a7953d2184c2babf34790962e26b60f/leaderboard.png
--------------------------------------------------------------------------------