├── .gitignore ├── README.md ├── old ├── BagOfPopcorn │ ├── README.md │ ├── bag_of_words.py │ ├── kaggle_utility.py │ ├── word2vec_average_vectors.py │ └── word2vec_bag_of_centroids.py ├── BikeSharing │ ├── README.md │ └── bikes.py ├── Expedia │ ├── README.md │ └── script.py ├── ForestCover │ ├── README.md │ └── forest.py ├── HiggsBoson │ ├── README.md │ ├── Resources │ │ ├── ATLAS.png │ │ └── documentation.pdf │ ├── auto_l1.yaml │ ├── auto_l2.yaml │ ├── auto_mlp.yaml │ ├── higgs.py │ ├── higgs_adv.py │ ├── higgs_nn.py │ ├── metric.py │ └── mlp.yaml ├── NerveSegmentation │ ├── README.md │ ├── data.py │ ├── submission.py │ └── train.py ├── OttoGroup │ ├── README.md │ ├── Resources │ │ └── Grafik.jpg │ ├── find_ensemble_weights.py │ ├── graphlab_starter.py │ ├── keras_starter.py │ ├── keras_wrapper.py │ ├── otto.py │ ├── simple_svm.py │ └── xgboost_walkthrough.py └── PropertyInspection │ ├── README.md │ ├── Resources │ └── houses.png │ └── property.py └── scripts ├── __init__.py ├── pyro_basics.py ├── pytorch_basics.py ├── pytorch_embedding.py ├── pytorch_mnist.py ├── tf_basics.py └── tf_mnist.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # PyCharm 60 | .idea/ 61 | 62 | # VS Code 63 | .vscode/ 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Competition Code 2 | 3 | Repository for code used in Kaggle competitions. Browse the folders from the main directory to see details about each competition. 4 | -------------------------------------------------------------------------------- /old/BagOfPopcorn/README.md: -------------------------------------------------------------------------------- 1 | # Bag Of Popcorn (Word2Vec Tutorial) 2 | 3 | View the competition details here.
4 | 5 | This directory includes all of the code used for the competition. Since this challenge is really a learning exercise for natural language processing using gensim and word2vec there is not much unique or interesting as I'm mostly following the tutorial. -------------------------------------------------------------------------------- /old/BagOfPopcorn/bag_of_words.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('/home/john/git/kaggle/BagOfPopcorn/') 3 | 4 | import pandas as pd 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.ensemble import RandomForestClassifier 7 | from old.Word2Vec.kaggle_utility import KaggleUtility 8 | 9 | 10 | def main(): 11 | data_dir = '/home/john/data/bag-of-popcorn/' 12 | 13 | train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3) 14 | test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3) 15 | 16 | print 'The first review is:' 17 | print train['review'][0] 18 | 19 | raw_input('Press Enter to continue...') 20 | 21 | # print 'Downloading text data sets...' 22 | # nltk.download() 23 | 24 | # Initialize an empty list to hold the clean reviews 25 | clean_train_reviews = [] 26 | 27 | # Loop over each review; create an index i that goes from 0 to the length 28 | # of the movie review list 29 | 30 | print 'Cleaning and parsing the training set movie reviews...\n' 31 | for i in xrange(0, len(train['review'])): 32 | clean_train_reviews.append(' '.join(KaggleUtility.review_to_wordlist(train['review'][i], True))) 33 | 34 | # Create a bag of words from the training set 35 | 36 | print 'Creating the bag of words...\n' 37 | 38 | # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool 39 | vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, 40 | stop_words=None, max_features=5000) 41 | 42 | # fit_transform() does two functions: First, it fits the model 43 | # and learns the vocabulary; second, it transforms our training data 44 | # into feature vectors. The input to fit_transform should be a list of 45 | # strings 46 | train_data_features = vectorizer.fit_transform(clean_train_reviews) 47 | 48 | # Numpy arrays are easy to work with, so convert the result to an array 49 | train_data_features = train_data_features.toarray() 50 | 51 | # Train a random forest using the bag of words 52 | 53 | print 'Training the random forest (this may take a while)...' 
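    # Optional sanity check (an illustrative aside, not part of the original
    # tutorial flow): confirm the vocabulary size and the dense feature-matrix
    # shape produced by CountVectorizer before spending time on the forest.
    vocab = vectorizer.get_feature_names()
    print 'Vocabulary size: %d' % len(vocab)
    print 'Feature matrix shape: %d x %d' % train_data_features.shape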
54 | 55 | # Initialize a Random Forest classifier with 100 trees 56 | forest = RandomForestClassifier(n_estimators=100) 57 | 58 | # Fit the forest to the training set, using the bag of words as 59 | # features and the sentiment labels as the response variable 60 | 61 | # This may take a few minutes to run 62 | forest = forest.fit(train_data_features, train['sentiment']) 63 | 64 | # Create an empty list and append the clean reviews one by one 65 | clean_test_reviews = [] 66 | 67 | print 'Cleaning and parsing the test set movie reviews...\n' 68 | for i in xrange(0, len(test['review'])): 69 | clean_test_reviews.append(' '.join(KaggleUtility.review_to_wordlist(test['review'][i], True))) 70 | 71 | # Get a bag of words for the test set, and convert to a numpy array 72 | test_data_features = vectorizer.transform(clean_test_reviews) 73 | test_data_features = test_data_features.toarray() 74 | 75 | # Use the random forest to make sentiment label predictions 76 | print 'Predicting test labels...\n' 77 | result = forest.predict(test_data_features) 78 | 79 | # Copy the results to a pandas dataframe with an "id" column and 80 | # a "sentiment" column 81 | output = pd.DataFrame(data={'id': test['id'], 'sentiment': result}) 82 | 83 | # Use pandas to write the comma-separated output file 84 | output.to_csv(data_dir + 'Bag_of_Words_model.csv', index=False, quoting=3) 85 | print 'Wrote results to Bag_of_Words_model.csv' 86 | 87 | 88 | if __name__ == "__main__": 89 | main() -------------------------------------------------------------------------------- /old/BagOfPopcorn/kaggle_utility.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from nltk.corpus import stopwords 4 | 5 | 6 | class KaggleUtility(object): 7 | @staticmethod 8 | def review_to_wordlist(review, remove_stopwords=False): 9 | # Function to convert a document to a sequence of words, 10 | # optionally removing stop words. Returns a list of words. 11 | 12 | # 1. Remove HTML 13 | review_text = BeautifulSoup(review).get_text() 14 | 15 | # 2. Remove non-letters 16 | review_text = re.sub('[^a-zA-Z]', ' ', review_text) 17 | 18 | # 3. Convert words to lower case and split them 19 | words = review_text.lower().split() 20 | 21 | # 4. Optionally remove stop words (false by default) 22 | if remove_stopwords: 23 | stops = set(stopwords.words('english')) 24 | words = [w for w in words if w not in stops] 25 | 26 | # 5. Return a list of words 27 | return words 28 | 29 | @staticmethod 30 | def review_to_sentences(review, tokenizer, remove_stopwords=False): 31 | # Function to split a review into parsed sentences. Returns a 32 | # list of sentences, where each sentence is a list of words 33 | 34 | # 1. Use the NLTK tokenizer to split the paragraph into sentences 35 | raw_sentences = tokenizer.tokenize(review.decode('utf8').strip()) 36 | 37 | # 2. 
Loop over each sentence 38 | sentences = [] 39 | for raw_sentence in raw_sentences: 40 | # If a sentence is empty, skip it 41 | if len(raw_sentence) > 0: 42 | # Otherwise, call review_to_wordlist to get a list of words 43 | sentences.append(KaggleUtility.review_to_wordlist(raw_sentence, remove_stopwords)) 44 | 45 | # Return the list of sentences (each sentence is a list of words, 46 | # so this returns a list of lists 47 | return sentences -------------------------------------------------------------------------------- /old/BagOfPopcorn/word2vec_average_vectors.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('/home/john/git/kaggle/BagOfPopcorn/') 3 | 4 | import logging 5 | import nltk.data 6 | import numpy as np 7 | import pandas as pd 8 | from gensim.models import Word2Vec 9 | from sklearn.ensemble import RandomForestClassifier 10 | from old.Word2Vec.kaggle_utility import KaggleUtility 11 | 12 | 13 | def make_feature_vec(words, model, num_features): 14 | # Function to average all of the word vectors in a given 15 | # paragraph 16 | 17 | # Pre-initialize an empty numpy array (for speed) 18 | feature_vec = np.zeros(num_features, dtype='float32') 19 | 20 | nwords = 0 21 | 22 | # Index2word is a list that contains the names of the words in 23 | # the model's vocabulary. Convert it to a set, for speed 24 | index2word_set = set(model.index2word) 25 | 26 | # Loop over each word in the review and, if it is in the model's 27 | # vocabulary, add its feature vector to the total 28 | for word in words: 29 | if word in index2word_set: 30 | nwords += 1 31 | feature_vec = np.add(feature_vec, model[word]) 32 | 33 | # Divide the result by the number of words to get the average 34 | feature_vec = np.divide(feature_vec, nwords) 35 | return feature_vec 36 | 37 | 38 | def get_avg_feature_vecs(reviews, model, num_features): 39 | # Given a set of reviews (each one a list of words), calculate 40 | # the average feature vector for each one and return a 2D numpy array 41 | 42 | # Initialize a counter 43 | counter = 0 44 | 45 | # Preallocate a 2D numpy array, for speed 46 | review_feature_vecs = np.zeros((len(reviews), num_features), dtype='float32') 47 | 48 | # Loop through the reviews 49 | for review in reviews: 50 | # Print a status message every 1000th review 51 | if counter % 1000 == 0: 52 | print 'Review %d of %d' % (counter, len(reviews)) 53 | 54 | # Call the function (defined above) that makes average feature vectors 55 | review_feature_vecs[counter] = make_feature_vec(review, model, num_features) 56 | 57 | # Increment the counter 58 | counter += 1 59 | return review_feature_vecs 60 | 61 | 62 | def get_clean_reviews(reviews): 63 | clean_reviews = [] 64 | for review in reviews['review']: 65 | clean_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True)) 66 | return clean_reviews 67 | 68 | 69 | def main(): 70 | data_dir = '/home/john/data/bag-of-popcorn/' 71 | 72 | # Read data from files 73 | train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3) 74 | test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3) 75 | unlabeled_train = pd.read_csv(data_dir + 'unlabeledTrainData.tsv', header=0, delimiter='\t', quoting=3) 76 | 77 | # Verify the number of reviews that were read (100,000 in total) 78 | print 'Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n' % \ 79 | (train['review'].size, test['review'].size, 
unlabeled_train['review'].size) 80 | 81 | # Load the punkt tokenizer 82 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 83 | 84 | # Split the labeled and unlabeled training sets into clean sentences 85 | 86 | # Initialize an empty list of sentences 87 | sentences = [] 88 | 89 | print 'Parsing sentences from training set' 90 | for review in train['review']: 91 | sentences += KaggleUtility.review_to_sentences(review, tokenizer) 92 | 93 | print 'Parsing sentences from unlabeled set' 94 | for review in unlabeled_train['review']: 95 | sentences += KaggleUtility.review_to_sentences(review, tokenizer) 96 | 97 | # Set parameters and train the word2vec model 98 | 99 | # Import the built-in logging module and configure it so that BagOfPopcorn 100 | # creates nice output messages 101 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 102 | 103 | # Set values for various parameters 104 | num_features = 300 # Word vector dimensionality 105 | min_word_count = 40 # Minimum word count 106 | num_workers = 4 # Number of threads to run in parallel 107 | context = 10 # Context window size 108 | downsampling = 1e-3 # Downsample setting for frequent words 109 | 110 | # Initialize and train the model (this will take some time) 111 | print 'Training BagOfPopcorn model...' 112 | model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count, 113 | window=context, sample=downsampling, seed=1) 114 | 115 | # If you don't plan to train the model any further, calling 116 | # init_sims will make the model much more memory-efficient. 117 | model.init_sims(replace=True) 118 | 119 | # It can be helpful to create a meaningful model name and 120 | # save the model for later use. You can load it later using BagOfPopcorn.load() 121 | model_name = '300features_40minwords_10context' 122 | model.save(data_dir + model_name) 123 | 124 | model.doesnt_match('man woman child kitchen'.split()) 125 | model.doesnt_match('france england germany berlin'.split()) 126 | model.doesnt_match('paris berlin london austria'.split()) 127 | model.most_similar('man') 128 | model.most_similar('queen') 129 | model.most_similar('awful') 130 | 131 | # Create average vectors for the training and test sets 132 | 133 | print 'Creating average feature vecs for training reviews' 134 | 135 | train_data_vecs = get_avg_feature_vecs(get_clean_reviews(train), model, num_features) 136 | 137 | print 'Creating average feature vecs for test reviews' 138 | 139 | test_data_vecs = get_avg_feature_vecs(get_clean_reviews(test), model, num_features) 140 | 141 | # Fit a random forest to the training set, then make predictions 142 | 143 | # Fit a random forest to the training data, using 100 trees 144 | forest = RandomForestClassifier(n_estimators=100) 145 | 146 | print 'Fitting a random forest to labeled training data...' 
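    # Optional safeguard (an illustrative aside, not part of the original
    # tutorial flow): a review whose words all fall below min_word_count divides
    # by zero in make_feature_vec and produces a NaN row, which the random forest
    # will refuse to fit. Zeroing out any NaN/inf values avoids that edge case.
    train_data_vecs = np.nan_to_num(train_data_vecs)
    test_data_vecs = np.nan_to_num(test_data_vecs)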
147 | forest = forest.fit(train_data_vecs, train['sentiment']) 148 | 149 | # Test & extract results 150 | result = forest.predict(test_data_vecs) 151 | 152 | # Write the test results 153 | output = pd.DataFrame(data={'id': test['id'], 'sentiment': result}) 154 | output.to_csv(data_dir + 'Word2Vec_AverageVectors.csv', index=False, quoting=3) 155 | print 'Wrote Word2Vec_AverageVectors.csv' 156 | 157 | 158 | if __name__ == "__main__": 159 | main() -------------------------------------------------------------------------------- /old/BagOfPopcorn/word2vec_bag_of_centroids.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('/home/john/git/kaggle/BagOfPopcorn/') 3 | 4 | import time 5 | import numpy as np 6 | import pandas as pd 7 | from gensim.models import Word2Vec 8 | from sklearn.cluster import KMeans 9 | from sklearn.ensemble import RandomForestClassifier 10 | from old.Word2Vec.kaggle_utility import KaggleUtility 11 | 12 | 13 | def create_bag_of_centroids(wordlist, word_centroid_map): 14 | # The number of clusters is equal to the highest cluster index 15 | # in the word / centroid map 16 | num_centroids = max(word_centroid_map.values()) + 1 17 | 18 | # Pre-allocate the bag of centroids vector (for speed) 19 | bag_of_centroids = np.zeros(num_centroids, dtype='float32') 20 | 21 | # Loop over the words in the review. If the word is in the vocabulary, 22 | # find which cluster it belongs to, and increment that cluster count 23 | # by one 24 | for word in wordlist: 25 | if word in word_centroid_map: 26 | index = word_centroid_map[word] 27 | bag_of_centroids[index] += 1 28 | 29 | # Return the 'bag of centroids' 30 | return bag_of_centroids 31 | 32 | 33 | def main(): 34 | data_dir = '/home/john/data/bag-of-popcorn/' 35 | model = Word2Vec.load(data_dir + '300features_40minwords_10context') 36 | 37 | # Run k-means on the word vectors and print a few clusters 38 | 39 | # Start time 40 | start = time.time() 41 | 42 | # Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an 43 | # average of 5 words per cluster 44 | word_vectors = model.syn0 45 | num_clusters = word_vectors.shape[0] / 5 46 | 47 | # Initialize a k-means object and use it to extract centroids 48 | print 'Running K means' 49 | kmeans_clustering = KMeans(n_clusters=num_clusters) 50 | idx = kmeans_clustering.fit_predict(word_vectors) 51 | 52 | # Get the end time and print how long the process took 53 | end = time.time() 54 | elapsed = end - start 55 | print 'Time taken for K Means clustering: ', elapsed, 'seconds.' 
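    # Faster alternative (an illustrative aside, left commented out because idx
    # is already computed above): with a large word2vec vocabulary and several
    # thousand clusters, MiniBatchKMeans is a much quicker approximation of the
    # full-batch K means step if the run above becomes impractical.
    # from sklearn.cluster import MiniBatchKMeans
    # idx = MiniBatchKMeans(n_clusters=num_clusters, batch_size=1000).fit_predict(word_vectors)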
56 | 57 | # Create a Word / Index dictionary, mapping each vocabulary word to 58 | # a cluster number 59 | word_centroid_map = dict(zip(model.index2word, idx)) 60 | 61 | # Print the first ten clusters 62 | for cluster in xrange(0, 10): 63 | 64 | # Print the cluster number 65 | print '\nCluster %d' % cluster 66 | 67 | # Find all of the words for that cluster number, and print them out 68 | words = [] 69 | for i in xrange(0, len(word_centroid_map.values())): 70 | if word_centroid_map.values()[i] == cluster: 71 | words.append(word_centroid_map.keys()[i]) 72 | print words 73 | 74 | # Create clean_train_reviews and clean_test_reviews as we did before 75 | 76 | # Read data from files 77 | train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3) 78 | test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3) 79 | 80 | print 'Cleaning training reviews' 81 | clean_train_reviews = [] 82 | for review in train['review']: 83 | clean_train_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True)) 84 | 85 | print 'Cleaning test reviews' 86 | clean_test_reviews = [] 87 | for review in test['review']: 88 | clean_test_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True)) 89 | 90 | # Create bags of centroids 91 | 92 | # Pre-allocate an array for the training set bags of centroids (for speed) 93 | train_centroids = np.zeros((train['review'].size, num_clusters), dtype='float32') 94 | 95 | # Transform the training set reviews into bags of centroids 96 | counter = 0 97 | for review in clean_train_reviews: 98 | train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map) 99 | counter += 1 100 | 101 | # Repeat for test reviews 102 | test_centroids = np.zeros((test['review'].size, num_clusters), dtype='float32') 103 | 104 | counter = 0 105 | for review in clean_test_reviews: 106 | test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map) 107 | counter += 1 108 | 109 | # Fit a random forest and extract predictions 110 | forest = RandomForestClassifier(n_estimators=100) 111 | 112 | # Fitting the forest may take a few minutes 113 | print 'Fitting a random forest to labeled training data...' 114 | forest = forest.fit(train_centroids, train['sentiment']) 115 | result = forest.predict(test_centroids) 116 | 117 | # Write the test results 118 | output = pd.DataFrame(data={'id': test['id'], 'sentiment': result}) 119 | output.to_csv(data_dir + 'BagOfCentroids.csv', index=False, quoting=3) 120 | print 'Wrote BagOfCentroids.csv' 121 | 122 | 123 | if __name__ == '__main__': 124 | main() -------------------------------------------------------------------------------- /old/BikeSharing/README.md: -------------------------------------------------------------------------------- 1 | # Bike Sharing Demand 2 | 3 | View the competition details here.
4 | 5 | This directory includes all of the code used for the competition. Since the bike sharing challenge is a fairly easy problem designed for beginners, I used this competition to refine my forecasting/regression script a bit and learn some scikit-learn APIs that I wasn't overly familiar with. -------------------------------------------------------------------------------- /old/BikeSharing/bikes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pickle 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | import seaborn as sb 8 | from sklearn.cross_validation import * 9 | from sklearn.decomposition import * 10 | from sklearn.ensemble import * 11 | from sklearn.feature_selection import * 12 | from sklearn.grid_search import * 13 | from sklearn.learning_curve import * 14 | from sklearn.linear_model import * 15 | from sklearn.manifold import * 16 | from sklearn.naive_bayes import * 17 | from sklearn.preprocessing import * 18 | from sklearn.svm import * 19 | 20 | 21 | def performance_test(x): 22 | """ 23 | Test NumPy performance. Use to compare computation speed across machines. 24 | """ 25 | A = np.random.random((x, x)) 26 | B = np.random.random((x, x)) 27 | t = time.time() 28 | np.dot(A, B) 29 | print(time.time() - t) 30 | 31 | 32 | def load_csv_data(directory, filename, dtype=None, index=None, convert_to_date=False): 33 | """ 34 | Test NumPy performance. Use to compare computation speed across machines. 35 | """ 36 | data = pd.read_csv(directory + filename, sep=',', dtype=dtype) 37 | 38 | if index is not None: 39 | if convert_to_date: 40 | if type(index) is str: 41 | data[index] = data[index].convert_objects(convert_dates='coerce') 42 | else: 43 | for key in index: 44 | data[key] = data[key].convert_objects(convert_dates='coerce') 45 | 46 | data = data.set_index(index) 47 | 48 | print('Data file ' + filename + ' loaded successfully.') 49 | 50 | return data 51 | 52 | 53 | def load_model(filename): 54 | """ 55 | Load a previously training model from disk. 56 | """ 57 | model_file = open(filename, 'rb') 58 | model = pickle.load(model_file) 59 | model_file.close() 60 | 61 | return model 62 | 63 | 64 | def save_model(model, filename): 65 | """ 66 | Persist a trained model to disk. 67 | """ 68 | model_file = open(filename, 'wb') 69 | pickle.dump(model, model_file) 70 | model_file.close() 71 | 72 | 73 | def predict(X, model, transforms): 74 | """ 75 | Predicts the class label. 76 | """ 77 | X = apply_transforms(X, transforms) 78 | y_est = model.predict(X) 79 | 80 | return y_est 81 | 82 | 83 | def predict_probability(X, model, transforms): 84 | """ 85 | Predicts the class probabilities. 86 | """ 87 | X = apply_transforms(X, transforms) 88 | y_prob = model.predict_proba(X)[:, 1] 89 | 90 | return y_prob 91 | 92 | 93 | def score(X, y, model, transforms): 94 | """ 95 | Scores the model's performance and returns the result. 96 | """ 97 | X = apply_transforms(X, transforms) 98 | 99 | return model.score(X, y) 100 | 101 | 102 | def generate_features(data): 103 | """ 104 | Generates new derived features to add to the data set for model training. 105 | """ 106 | data['DayOfWeek'] = data.index.map(lambda x: x.dayofweek) 107 | data['TimeOfDay'] = data.index.map(lambda x: x.hour) 108 | 109 | return data 110 | 111 | 112 | def process_training_data(directory, filename, ex_generate_features): 113 | """ 114 | Reads in training data and prepares numpy arrays. 
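    Example (hedged sketch, assuming the Kaggle bike-sharing train.csv with a
    'datetime' column and 'casual', 'registered' and 'count' labels, as used
    in main()):

        frame, X, y1, y2 = process_training_data('/home/john/data/bike-sharing/',
                                                 'train.csv', True)
        # X -> feature matrix, y1 -> 'casual' counts, y2 -> 'registered' counts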
115 | """ 116 | training_data = load_csv_data(directory, filename, index='datetime', convert_to_date=True) 117 | num_features = len(training_data.columns) - 3 118 | 119 | # drop the total count label and move the registered/casual counts to the front 120 | cols = training_data.columns.tolist() 121 | cols = cols[-3:-1] + cols[0:num_features] 122 | training_data = training_data[cols] 123 | 124 | if ex_generate_features: 125 | training_data = generate_features(training_data) 126 | 127 | num_features = len(training_data.columns) 128 | X = training_data.iloc[:, 2:num_features].values 129 | y1 = training_data.iloc[:, 0].values 130 | y2 = training_data.iloc[:, 1].values 131 | 132 | return training_data, X, y1, y2 133 | 134 | 135 | def process_test_data(directory, filename, ex_generate_features): 136 | """ 137 | Reads in the test data set and prepares it for prediction by the model. 138 | """ 139 | test_data = load_csv_data(directory, filename, index='datetime', convert_to_date=True) 140 | 141 | if ex_generate_features: 142 | test_data = generate_features(test_data) 143 | 144 | num_features = len(test_data.columns) 145 | X_test = test_data.iloc[:, 0:num_features].values 146 | 147 | return test_data, X_test 148 | 149 | 150 | def create_transforms(X, transforms, missing='NaN', impute_strategy='mean', categories=None): 151 | """ 152 | Creates transform objects to apply before training or scoring. 153 | """ 154 | for i, (key, transform) in enumerate(transforms): 155 | if key == 'imputer': 156 | # impute missing values 157 | transform = Imputer(missing_values=missing, strategy=impute_strategy) 158 | X = transform.fit_transform(X) 159 | elif key == 'onehot': 160 | # create a category encoder 161 | transform = OneHotEncoder(categorical_features=categories, sparse=False) 162 | X = transform.fit_transform(X) 163 | elif key == 'selector': 164 | # create a feature selection transform 165 | transform = VarianceThreshold(threshold=0.0) 166 | X = transform.fit_transform(X) 167 | elif key == 'scaler': 168 | # create a standardization transform 169 | transform = StandardScaler() 170 | X = transform.fit_transform(X) 171 | elif key == 'pca': 172 | # create a PCA transform 173 | transform = PCA(whiten=True) 174 | transform.fit(X) 175 | elif key == 'isomap': 176 | # create an isomap transform 177 | transform = Isomap() 178 | transform.fit(X) 179 | elif key == 'lle': 180 | # create a modified LLE transform 181 | transform = LocallyLinearEmbedding(method='modified') 182 | transform.fit(X) 183 | elif key == 'mds': 184 | # create a multi-dimensional scaling transform 185 | transform = MDS() 186 | transform.fit(X) 187 | elif key == 't-sne': 188 | # create a t-SNE transform 189 | transform = TSNE() 190 | transform.fit(X) 191 | 192 | transforms[i] = (key, transform) 193 | 194 | return transforms 195 | 196 | 197 | def apply_transforms(X, transforms): 198 | """ 199 | Applies pre-computed transformations to a data set. 200 | """ 201 | for key, transform in transforms: 202 | if transform is not None: 203 | X = transform.transform(X) 204 | 205 | return X 206 | 207 | 208 | def visualize_variable_relationships(training_data, viz_type, category_vars, quantitative_vars): 209 | """ 210 | Generates plots showing the relationship between several variables. 
211 | """ 212 | # compare the continuous variable distributions using a violin plot 213 | sub_data = training_data[quantitative_vars] 214 | fig, ax = plt.subplots(1, 1, figsize=(16, 12)) 215 | sb.violinplot(sub_data, ax=ax) 216 | fig.tight_layout() 217 | 218 | # if categorical variables were provided, visualize the quantitative distributions by category 219 | if len(category_vars) > 0: 220 | fig, ax = plt.subplots(len(quantitative_vars), len(category_vars), figsize=(16, 12)) 221 | for i, var in enumerate(quantitative_vars): 222 | for j, cat in enumerate(category_vars): 223 | sb.violinplot(training_data[var], training_data[cat], ax=ax[i, j]) 224 | fig.tight_layout() 225 | 226 | # generate plots to directly compare the variables 227 | if len(category_vars) == 0: 228 | if len(quantitative_vars) == 2: 229 | sb.jointplot(quantitative_vars[0], quantitative_vars[1], training_data, kind=viz_type, size=16) 230 | else: 231 | sb.pairplot(training_data, vars=quantitative_vars, kind='scatter', 232 | diag_kind='kde', size=16 / len(quantitative_vars)) 233 | else: 234 | if len(quantitative_vars) == 1: 235 | if len(category_vars) == 1: 236 | sb.factorplot(category_vars[0], quantitative_vars[0], None, 237 | training_data, kind='auto', size=16) 238 | else: 239 | sb.factorplot(category_vars[0], quantitative_vars[0], category_vars[1], 240 | training_data, kind='auto', size=16) 241 | if len(quantitative_vars) == 2: 242 | if len(category_vars) == 1: 243 | sb.lmplot(quantitative_vars[0], quantitative_vars[1], training_data, 244 | col=None, row=category_vars[0], size=16) 245 | else: 246 | sb.lmplot(quantitative_vars[0], quantitative_vars[1], training_data, 247 | col=category_vars[0], row=category_vars[1], size=16) 248 | else: 249 | sb.pairplot(training_data, hue=category_vars[0], vars=quantitative_vars, kind='scatter', 250 | diag_kind='kde', size=16 / len(quantitative_vars)) 251 | 252 | 253 | def visualize_feature_distributions(training_data, viz_type, plot_size): 254 | """ 255 | Generates feature distribution plots (histogram or kde) for each feature. 256 | """ 257 | if viz_type == 'hist': 258 | hist = True 259 | kde = False 260 | else: 261 | hist = False 262 | kde = True 263 | 264 | num_features = plot_size if plot_size < len(training_data.columns) else len(training_data.columns) 265 | num_plots = num_features / 16 if num_features % 16 == 0 else num_features / 16 + 1 266 | 267 | for i in range(num_plots): 268 | fig, ax = plt.subplots(4, 4, figsize=(20, 10)) 269 | for j in range(16): 270 | index = (i * 16) + j 271 | if index < num_features: 272 | if index != 3: # this column is all 0s in the bike set 273 | sb.distplot(training_data.iloc[:, index], hist=hist, kde=kde, label=training_data.columns[index], 274 | ax=ax[j / 4, j % 4], kde_kws={"shade": True}) 275 | fig.tight_layout() 276 | 277 | 278 | def visualize_correlations(training_data): 279 | """ 280 | Generates a correlation matrix heat map. 281 | """ 282 | fig, ax = plt.subplots(figsize=(16, 10)) 283 | colormap = sb.blend_palette(sb.color_palette('coolwarm'), as_cmap=True) 284 | if len(training_data.columns) < 30: 285 | sb.corrplot(training_data, annot=True, sig_stars=False, diag_names=True, cmap=colormap, ax=ax) 286 | else: 287 | sb.corrplot(training_data, annot=False, sig_stars=False, diag_names=False, cmap=colormap, ax=ax) 288 | fig.tight_layout() 289 | 290 | 291 | def visualize_sequential_relationships(training_data, plot_size, smooth=None, window=1): 292 | """ 293 | Generates line plots to visualize sequential data. 
Assumes the data frame index is time series. 294 | """ 295 | training_data.index.name = None 296 | num_features = plot_size if plot_size < len(training_data.columns) else len(training_data.columns) 297 | num_plots = num_features / 16 if num_features % 16 == 0 else num_features / 16 + 1 298 | 299 | for i in range(num_plots): 300 | fig, ax = plt.subplots(4, 4, sharex=True, figsize=(20, 10)) 301 | for j in range(16): 302 | index = (i * 16) + j 303 | if index < num_features: 304 | if index != 3: # this column is all 0s in the bike set 305 | if smooth == 'mean': 306 | training_data.iloc[:, index] = pd.rolling_mean(training_data.iloc[:, index], window) 307 | elif smooth == 'var': 308 | training_data.iloc[:, index] = pd.rolling_var(training_data.iloc[:, index], window) 309 | elif smooth == 'skew': 310 | training_data.iloc[:, index] = pd.rolling_skew(training_data.iloc[:, index], window) 311 | elif smooth == 'kurt': 312 | training_data.iloc[:, index] = pd.rolling_kurt(training_data.iloc[:, index], window) 313 | 314 | training_data.iloc[:, index].plot(ax=ax[j / 4, j % 4], kind='line', legend=False, 315 | title=training_data.columns[index]) 316 | fig.tight_layout() 317 | 318 | 319 | def visualize_principal_components(X, y1, y2, model_type, num_components, transforms): 320 | """ 321 | Generates scatter plots to visualize the principal components of the data set. 322 | """ 323 | X = apply_transforms(X, transforms) 324 | for y in (y1, y2): 325 | if model_type == 'classification': 326 | class_count = np.count_nonzero(np.unique(y)) 327 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'] 328 | 329 | for i in range(num_components): 330 | fig, ax = plt.subplots(figsize=(16, 10)) 331 | for j in range(class_count): 332 | ax.scatter(X[y == j, i], X[y == j, i + 1], s=30, c=colors[j], label=j) 333 | ax.set_title('Principal Components ' + str(i) + ' and ' + str(i + 1)) 334 | ax.legend() 335 | fig.tight_layout() 336 | else: 337 | for i in range(num_components): 338 | fig, ax = plt.subplots(figsize=(16, 10)) 339 | sc = ax.scatter(X[:, i], X[:, i + 1], s=30, c=y, cmap='Blues') 340 | ax.set_title('Principal Components ' + str(i) + ' and ' + str(i + 1)) 341 | ax.legend() 342 | fig.colorbar(sc) 343 | fig.tight_layout() 344 | 345 | 346 | def define_model(model_type, algorithm): 347 | """ 348 | Defines and returns a model object of the designated type. 
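    Example (hedged sketch; the supported model_type/algorithm pairs are the
    ones enumerated in the body below):

        model = define_model('regression', 'forest')          # RandomForestRegressor
        clf = define_model('classification', 'logistic')      # LogisticRegression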
349 | """ 350 | model = None 351 | 352 | if model_type == 'classification': 353 | if algorithm == 'bayes': 354 | model = GaussianNB() 355 | elif algorithm == 'logistic': 356 | model = LogisticRegression(penalty='l2', C=1.0) 357 | elif algorithm == 'svm': 358 | model = SVC(C=1.0, kernel='rbf', shrinking=True, probability=False, cache_size=200) 359 | elif algorithm == 'sgd': 360 | model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, n_iter=1000, shuffle=False, n_jobs=-1) 361 | elif algorithm == 'forest': 362 | model = RandomForestClassifier(n_estimators=10, criterion='gini', max_features='auto', max_depth=None, 363 | min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, n_jobs=-1) 364 | elif algorithm == 'boost': 365 | model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, 366 | min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features=None, 367 | max_leaf_nodes=None) 368 | else: 369 | print('No model defined for ' + algorithm) 370 | exit() 371 | else: 372 | if algorithm == 'ridge': 373 | model = Ridge(alpha=1.0) 374 | elif algorithm == 'svm': 375 | model = SVR(C=1.0, kernel='rbf', shrinking=True, probability=False, cache_size=200) 376 | elif algorithm == 'sgd': 377 | model = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, n_iter=1000, shuffle=False) 378 | elif algorithm == 'forest': 379 | model = RandomForestRegressor(n_estimators=10, criterion='mse', max_features='auto', max_depth=None, 380 | min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, n_jobs=-1) 381 | elif algorithm == 'boost': 382 | model = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, 383 | min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features=None, 384 | max_leaf_nodes=None) 385 | else: 386 | print('No model defined for ' + algorithm) 387 | exit() 388 | 389 | return model 390 | 391 | 392 | def train_model(X, y, model_type, algorithm, transforms): 393 | """ 394 | Trains a new model using the training data. 395 | """ 396 | t0 = time.time() 397 | model = define_model(model_type, algorithm) 398 | X = apply_transforms(X, transforms) 399 | model.fit(X, y) 400 | t1 = time.time() 401 | print('Model trained in {0:3f} s.'.format(t1 - t0)) 402 | 403 | return model 404 | 405 | 406 | def visualize_feature_importance(training_data, model, column_offset): 407 | """ 408 | Generates a feature importance plot. Requires a trained random forest or gradient boosting model. 409 | Does not work properly if transformations are applied to training data that expands the number 410 | of features. 411 | """ 412 | importance = model.feature_importances_ 413 | importance = 100.0 * (importance / importance.max()) 414 | importance = importance[0:30] if len(training_data.columns) > 30 else importance 415 | sorted_idx = np.argsort(importance) 416 | pos = np.arange(sorted_idx.shape[0]) 417 | 418 | fig, ax = plt.subplots(figsize=(16, 10)) 419 | ax.set_title('Variable Importance') 420 | ax.barh(pos, importance[sorted_idx], align='center') 421 | ax.set_yticks(pos) 422 | ax.set_yticklabels(training_data.columns[sorted_idx + column_offset]) 423 | ax.set_xlabel('Relative Importance') 424 | 425 | fig.tight_layout() 426 | 427 | 428 | def cross_validate(X, y, model_type, algorithm, metric, transforms, folds=3): 429 | """ 430 | Performs cross-validation to estimate the true performance of the model. 
431 | """ 432 | model = define_model(model_type, algorithm) 433 | X = apply_transforms(X, transforms) 434 | 435 | t0 = time.time() 436 | scores = cross_val_score(model, X, y, scoring=metric, cv=folds, n_jobs=-1) 437 | t1 = time.time() 438 | print('Cross-validation completed in {0:3f} s.'.format(t1 - t0)) 439 | 440 | return np.mean(scores) 441 | 442 | 443 | def time_series_cross_validate(X, y, model_type, algorithm, transforms, strategy='traditional', folds=3, 444 | window_type='cumulative', min_window=0, forecast_range=1, plot=False): 445 | """ 446 | Performs time series cross-validation to estimate the true performance of the model. 447 | """ 448 | model = define_model(model_type, algorithm) 449 | X = apply_transforms(X, transforms) 450 | 451 | scores = [] 452 | train_count = len(X) 453 | 454 | if strategy == 'walk-forward': 455 | folds = train_count - min_window - forecast_range 456 | fold_size = 1 457 | else: 458 | fold_size = train_count / folds 459 | 460 | t0 = time.time() 461 | for i in range(folds): 462 | if window_type == 'fixed': 463 | fold_start = i * fold_size 464 | else: 465 | fold_start = 0 466 | 467 | fold_end = (i + 1) * fold_size + min_window 468 | fold_train_end = fold_end - forecast_range 469 | 470 | X_train, X_val = X[fold_start:fold_train_end, :], X[fold_train_end:fold_end, :] 471 | y_train, y_val = y[fold_start:fold_train_end], y[fold_train_end:fold_end] 472 | 473 | model.fit(X_train, y_train) 474 | scores.append(model.score(X_val, y_val)) 475 | 476 | if plot is True: 477 | y_est = model.predict(X_val) 478 | fig, ax = plt.subplots(figsize=(16, 10)) 479 | ax.set_title('Estimation Error') 480 | ax.plot(y_est - y_val) 481 | fig.tight_layout() 482 | 483 | t1 = time.time() 484 | print('Cross-validation completed in {0:3f} s.'.format(t1 - t0)) 485 | 486 | return np.mean(scores) 487 | 488 | 489 | def plot_learning_curve(X, y, model_type, algorithm, metric, transforms): 490 | """ 491 | Plots a learning curve showing model performance against both training and 492 | validation data sets as a function of the number of training samples. 493 | """ 494 | model = define_model(model_type, algorithm) 495 | X = apply_transforms(X, transforms) 496 | 497 | t0 = time.time() 498 | train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric, cv=3, n_jobs=-1) 499 | train_scores_mean = np.mean(train_scores, axis=1) 500 | train_scores_std = np.std(train_scores, axis=1) 501 | test_scores_mean = np.mean(test_scores, axis=1) 502 | test_scores_std = np.std(test_scores, axis=1) 503 | 504 | fig, ax = plt.subplots(figsize=(16, 10)) 505 | ax.set_title('Learning Curve') 506 | ax.set_xlabel('Training Examples') 507 | ax.set_ylabel('Score') 508 | ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 509 | alpha=0.1, color='r') 510 | ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 511 | alpha=0.1, color='r') 512 | ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score') 513 | ax.plot(train_sizes, test_scores_mean, 'o-', color='r', label='Cross-validation score') 514 | ax.legend(loc='best') 515 | fig.tight_layout() 516 | t1 = time.time() 517 | print('Learning curve generated in {0:3f} s.'.format(t1 - t0)) 518 | 519 | 520 | def parameter_search(X, y, model_type, algorithm, metric, transforms): 521 | """ 522 | Performs an exhaustive search over the specified model parameters. 
523 | """ 524 | model = define_model(model_type, algorithm) 525 | X = apply_transforms(X, transforms) 526 | 527 | param_grid = None 528 | if algorithm == 'logistic': 529 | param_grid = [{'penalty': ['l1', 'l2'], 'C': [0.1, 0.3, 1.0, 3.0]}] 530 | elif algorithm == 'ridge': 531 | param_grid = [{'alpha': [0.1, 0.3, 1.0, 3.0, 10.0]}] 532 | elif algorithm == 'svm': 533 | param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, 534 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}] 535 | elif algorithm == 'sgd': 536 | param_grid = [{'loss': ['hinge', 'log', 'modified_huber'], 'penalty': ['l1', 'l2'], 537 | 'alpha': [0.0001, 0.001, 0.01], 'iter': [100, 1000, 10000]}] 538 | elif algorithm == 'forest': 539 | param_grid = [{'n_estimators': [10, 30, 100, 300], 'criterion': ['gini', 'entropy'], 540 | 'max_features': ['auto', 'log2', None], 'max_depth': [3, 5, 7, None], 541 | 'min_samples_split': [2, 10, 30, 100], 'min_samples_leaf': [1, 3, 10, 30, 100]}] 542 | elif algorithm == 'boost': 543 | param_grid = [{'learning_rate': [0.1, 0.3, 1.0], 'subsample': [1.0, 0.9, 0.7, 0.5], 544 | 'n_estimators': [100, 300, 1000], 'max_features': ['auto', 'log2', None], 545 | 'max_depth': [3, 5, 7, None], 'min_samples_split': [2, 10, 30, 100], 546 | 'min_samples_leaf': [1, 3, 10, 30, 100]}] 547 | 548 | t0 = time.time() 549 | grid_estimator = GridSearchCV(model, param_grid, scoring=metric, cv=3, n_jobs=-1) 550 | grid_estimator.fit(X, y) 551 | t1 = time.time() 552 | print('Grid search completed in {0:3f} s.'.format(t1 - t0)) 553 | 554 | return grid_estimator.best_estimator_, grid_estimator.best_params_, grid_estimator.best_score_ 555 | 556 | 557 | def train_ensemble(X, y, model_type, algorithm, transforms): 558 | """ 559 | Creates an ensemble of many models together. 560 | """ 561 | model = define_model(model_type, algorithm) 562 | X = apply_transforms(X, transforms) 563 | 564 | t0 = time.time() 565 | ensemble_model = BaggingClassifier(base_estimator=model, n_estimators=10, max_samples=1.0, max_features=1.0, 566 | bootstrap=True, bootstrap_features=False) 567 | ensemble_model.fit(X, y) 568 | t1 = time.time() 569 | print('Ensemble training completed in {0:3f} s.'.format(t1 - t0)) 570 | 571 | return ensemble_model 572 | 573 | 574 | def create_submission(test_data, y_est, data_dir, submit_file): 575 | """ 576 | Create a new submission file with test data and predictions generated by the model. 577 | """ 578 | submit = pd.DataFrame(columns=['datetime', 'count']) 579 | submit['datetime'] = test_data.index 580 | submit['count'] = y_est 581 | submit.to_csv(data_dir + submit_file, sep=',', index=False, index_label=False) 582 | 583 | 584 | def experiments(): 585 | """ 586 | Testing area for miscellaneous experiments. 
587 | """ 588 | 589 | 590 | def main(): 591 | ex_process_training_data = True 592 | ex_generate_features = True 593 | ex_create_transforms = True 594 | ex_load_model = False 595 | ex_save_model = False 596 | ex_visualize_variable_relationships = False 597 | ex_visualize_feature_distributions = False 598 | ex_visualize_correlations = False 599 | ex_visualize_sequential_relationships = False 600 | ex_visualize_principal_components = False 601 | ex_train_model = True 602 | ex_visualize_feature_importance = False 603 | ex_cross_validate = True 604 | ex_plot_learning_curve = False 605 | ex_parameter_search = False 606 | ex_train_ensemble = False 607 | ex_create_submission = False 608 | 609 | code_dir = '/home/john/git/kaggle/BikeSharing/' 610 | data_dir = '/home/john/data/bike-sharing/' 611 | training_file = 'train.csv' 612 | test_file = 'test.csv' 613 | submit_file = 'submission.csv' 614 | model_file = 'model.pkl' 615 | 616 | model_type = 'regression' # classification, regression 617 | algorithm = 'forest' # bayes, logistic, ridge, svm, sgd, forest, boost 618 | metric = None # accuracy, f1, rcc_auc, mean_absolute_error, mean_squared_error, r2_score 619 | transforms = [('imputer', None), ('onehot', None), ('selector', None), ('scaler', None)] 620 | categories = [0, 1, 2, 3] 621 | column_offset = 2 622 | plot_size = 16 623 | num_components = 3 624 | 625 | training_data = None 626 | X = None 627 | y1 = None 628 | y2 = None 629 | model = None 630 | model2 = None 631 | 632 | os.chdir(code_dir) 633 | 634 | print('Starting process...') 635 | print('Algorithm = {0}'.format(algorithm)) 636 | print('Scoring Metric = {0}'.format(metric)) 637 | print('Generate Features = {0}'.format(ex_generate_features)) 638 | print('Transforms = {0}'.format(transforms)) 639 | 640 | if ex_process_training_data: 641 | print('Reading in training data...') 642 | training_data, X, y1, y2 = process_training_data(data_dir, training_file, ex_generate_features) 643 | 644 | if ex_create_transforms: 645 | transforms = create_transforms(X, transforms, categories=categories) 646 | 647 | if ex_visualize_variable_relationships: 648 | print('Visualizing pairwise relationships...') 649 | # scatter, reg, resid, kde, hex 650 | visualize_variable_relationships(training_data, 'scatter', ['season', 'weather'], ['casual', 'registered']) 651 | 652 | if ex_visualize_feature_distributions: 653 | print('Visualizing feature distributions...') 654 | # hist, kde 655 | visualize_feature_distributions(training_data, 'hist', plot_size) 656 | 657 | if ex_visualize_correlations: 658 | print('Visualizing feature correlations...') 659 | visualize_correlations(training_data) 660 | 661 | if ex_visualize_sequential_relationships: 662 | print('Visualizing sequential relationships...') 663 | visualize_sequential_relationships(training_data, plot_size) 664 | 665 | if ex_visualize_principal_components: 666 | print('Visualizing principal components...') 667 | visualize_principal_components(X, y1, y2, model_type, num_components, transforms) 668 | 669 | if ex_load_model: 670 | print('Loading model from disk...') 671 | model = load_model(data_dir + model_file) 672 | 673 | if ex_train_model: 674 | print('Training model on full data set...') 675 | model = train_model(X, y1, model_type, algorithm, transforms) 676 | model2 = train_model(X, y2, model_type, algorithm, transforms) 677 | 678 | print('Calculating training score...') 679 | model_score = score(X, y1, model, transforms) 680 | print('Casual training score ='), model_score 681 | model_score2 = score(X, y2, model2, 
transforms) 682 | print('Registered training score ='), model_score2 683 | 684 | if ex_visualize_feature_importance and (algorithm == 'forest' or algorithm == 'boost'): 685 | print('Generating feature importance plot...') 686 | visualize_feature_importance(training_data, model, column_offset) 687 | visualize_feature_importance(training_data, model2, column_offset) 688 | 689 | if ex_cross_validate: 690 | print('Performing cross-validation...') 691 | cross_validation_score = time_series_cross_validate(X, y1, model_type, algorithm, transforms, 692 | forecast_range=258, plot=True) 693 | print('Casual cross-validation score ='), cross_validation_score 694 | cross_validation_score2 = time_series_cross_validate(X, y2, model_type, algorithm, transforms, 695 | forecast_range=258, plot=True) 696 | print('Registered cross-validation score ='), cross_validation_score2 697 | 698 | if ex_plot_learning_curve: 699 | print('Generating learning curve...') 700 | plot_learning_curve(X, y1, model_type, algorithm, metric, transforms) 701 | 702 | if ex_parameter_search: 703 | print('Performing hyper-parameter grid search...') 704 | best_model, best_params, best_score = parameter_search(X, y1, model_type, algorithm, metric, transforms) 705 | print('Best model = ', best_model) 706 | print('Best params = ', best_params) 707 | print('Best score = ', best_score) 708 | 709 | if ex_train_ensemble: 710 | print('Creating an ensemble of models...') 711 | model = train_ensemble(X, y1, model_type, algorithm, transforms) 712 | 713 | print('Calculating ensemble training score...') 714 | ensemble_score = score(X, y1, model, transforms) 715 | print('Ensemble Training score ='), ensemble_score 716 | 717 | if ex_save_model: 718 | print('Saving model to disk...') 719 | save_model(model, data_dir + model_file) 720 | 721 | if ex_create_submission: 722 | print('Reading in test data...') 723 | test_data, X_test = process_test_data(data_dir, test_file, ex_generate_features) 724 | 725 | print('Predicting test data...') 726 | y_est_1 = predict(X_test, model, transforms) 727 | y_est_2 = predict(X_test, model2, transforms) 728 | y_est = y_est_1 + y_est_2 729 | 730 | print('Creating submission file...') 731 | create_submission(test_data, y_est, data_dir, submit_file) 732 | 733 | print('Process complete.') 734 | 735 | 736 | if __name__ == "__main__": 737 | main() -------------------------------------------------------------------------------- /old/Expedia/README.md: -------------------------------------------------------------------------------- 1 | # Expedia Hotel Recommendations 2 | 3 | View the competition details here.
4 | 5 | I started looking at this but then didn't really end up working on it due to time constraints. It looked like an interesting problem that seemed to require some creativity to progress on. The included script is not original work, it was mostly copied from one of the "starter" scripts for the competition. -------------------------------------------------------------------------------- /old/Expedia/script.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import random 4 | import operator 5 | 6 | 7 | def apk(actual, predicted, k=10): 8 | if len(predicted) > k: 9 | predicted = predicted[:k] 10 | 11 | score = 0.0 12 | num_hits = 0.0 13 | 14 | for i, p in enumerate(predicted): 15 | if p in actual and p not in predicted[:i]: 16 | num_hits += 1.0 17 | score += num_hits / (i + 1.0) 18 | 19 | if not actual: 20 | return 0.0 21 | 22 | return score / min(len(actual), k) 23 | 24 | 25 | def mapk(actual, predicted, k=10): 26 | return np.mean([apk(a, p ,k) for a, p in zip(actual, predicted)]) 27 | 28 | 29 | def make_key(items): 30 | return '_'.join([str(i) for i in items]) 31 | 32 | 33 | def generate_exact_matches(row, match_cols, groups): 34 | index = tuple([row[t] for t in match_cols]) 35 | try: 36 | group = groups.get_group(index) 37 | except Exception: 38 | return [] 39 | 40 | clus = list(set(group.hotel_cluster)) 41 | 42 | return clus 43 | 44 | 45 | def f5(seq, idfun=None): 46 | if idfun is None: 47 | def idfun(x): return x 48 | 49 | seen = {} 50 | result = [] 51 | for item in seq: 52 | marker = idfun(item) 53 | if marker in seen: continue 54 | seen[marker] = 1 55 | result.append(item) 56 | 57 | return result 58 | 59 | 60 | def generate_submission(data_dir, preds, test): 61 | write_p = [' '.join([str(l) for l in p]) for p in preds] 62 | write_frame = ['{0},{1}'.format(test['id'].iloc[i], write_p[i]) for i in range(len(preds))] 63 | write_frame = ['id,hotel_cluster'] + write_frame 64 | 65 | with open(data_dir + 'predictions.csv', 'w+') as f: 66 | f.write('\n'.join(write_frame)) 67 | 68 | 69 | print('Loading data sets...') 70 | data_dir = '/home/john/data/expedia/' 71 | 72 | destinations = pd.read_csv(data_dir + 'destinations.csv') 73 | 74 | train = pd.read_csv(data_dir + 'train.csv', 75 | usecols=['date_time', 'user_location_country', 'user_location_region', 'user_location_city', 76 | 'user_id', 'is_booking', 'orig_destination_distance', 77 | 'hotel_cluster', 'srch_ci', 'srch_co', 'srch_destination_id', 78 | 'hotel_continent', 'hotel_country', 'hotel_market'], 79 | dtype={'date_time': np.str_, 'user_location_country': np.int8, 80 | 'user_location_region': np.int8, 'user_location_city': np.int8, 81 | 'user_id': np.int32, 'is_booking': np.int8, 82 | 'orig_destination_distance': np.float64, 83 | 'hotel_cluster': np.int8, 84 | 'srch_ci': np.str_, 'srch_co': np.str_, 85 | 'srch_destination_id': np.int32, 86 | 'hotel_continent': np.int8, 87 | 'hotel_country': np.int8, 88 | 'hotel_market': np.int8}) 89 | 90 | test = pd.read_csv(data_dir + 'test.csv', 91 | usecols=['id', 'date_time', 'user_location_country', 'user_location_region', 92 | 'user_location_city', 93 | 'user_id', 'orig_destination_distance', 94 | 'srch_ci', 'srch_co', 'srch_destination_id', 95 | 'hotel_continent', 'hotel_country', 'hotel_market'], 96 | dtype={'id': np.int32, 'date_time': np.str_, 'user_location_country': np.int8, 97 | 'user_location_region': np.int8, 'user_location_city': np.int8, 98 | 'user_id': np.int32, 99 | 'orig_destination_distance': 
np.float64, 'srch_ci': np.str_, 'srch_co': np.str_, 100 | 'srch_destination_id': np.int32, 101 | 'hotel_continent': np.int8, 102 | 'hotel_country': np.int8, 103 | 'hotel_market': np.int8}) 104 | 105 | print('Generating first set of predictions...') 106 | 107 | # add year and month features to the training data 108 | train['date_time'] = pd.to_datetime(train['date_time']) 109 | train['year'] = train['date_time'].dt.year 110 | train['month'] = train['date_time'].dt.month 111 | 112 | # generate a list of randomly selected unique user ids 113 | unique_users = train.user_id.unique() 114 | sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 10000))] 115 | sel_train = train[train.user_id.isin(sel_user_ids)] 116 | 117 | # create sampled training and validation data sets 118 | t1 = sel_train[((sel_train.year == 2012) | (sel_train.year == 2013))] 119 | t2 = sel_train[(sel_train.year == 2014)] 120 | t2 = t2[t2.is_booking == True] 121 | 122 | # skip sampling and use full data set 123 | # t1 = train 124 | # t2 = test 125 | 126 | # identify the most common clusters 127 | most_common_clusters = list(train.hotel_cluster.value_counts().head().index) 128 | 129 | # match clusters to search destination 130 | match_cols = ['srch_destination_id'] 131 | cluster_cols = match_cols + ['hotel_cluster'] 132 | groups = t1.groupby(cluster_cols) 133 | 134 | top_clusters = {} 135 | for name, group in groups: 136 | clicks = len(group.is_booking[group.is_booking == False]) 137 | bookings = len(group.is_booking[group.is_booking == True]) 138 | 139 | score = bookings + .15 * clicks 140 | 141 | clus_name = make_key(name[:len(match_cols)]) 142 | 143 | if clus_name not in top_clusters: 144 | top_clusters[clus_name] = {} 145 | 146 | top_clusters[clus_name][name[-1]] = score 147 | 148 | # find the top 5 for each search destination 149 | cluster_dict = {} 150 | for n in top_clusters: 151 | tc = top_clusters[n] 152 | top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]] 153 | cluster_dict[n] = top 154 | 155 | # generate predictions based on the top clusters per search destination 156 | preds = [] 157 | for index, row in t2.iterrows(): 158 | key = make_key([row[m] for m in match_cols]) 159 | 160 | if key in cluster_dict: 161 | preds.append(cluster_dict[key]) 162 | else: 163 | preds.append([]) 164 | 165 | print('Generating second set of predictions...') 166 | 167 | # use data leak to match users between train and test data 168 | match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance'] 169 | groups = t1.groupby(match_cols) 170 | 171 | exact_matches = [] 172 | for i in range(t2.shape[0]): 173 | exact_matches.append(generate_exact_matches(t2.iloc[i], match_cols, groups)) 174 | 175 | # generate predictions 176 | full_preds = [f5(exact_matches[p] + preds[p] + most_common_clusters)[:5] for p in range(len(preds))] 177 | 178 | # evaluate the accuracy of this solution 179 | print('Score = ' + str(mapk([[l] for l in t2['hotel_cluster']], full_preds, k=5))) 180 | 181 | # print('Writing submission file...') 182 | # generate_submission(data_dir, full_preds, t2) 183 | 184 | print('Script complete.') 185 | -------------------------------------------------------------------------------- /old/ForestCover/README.md: -------------------------------------------------------------------------------- 1 | # Forest Cover Type Prediction 2 | 3 | View the competition details here.
4 | 5 | This directory includes all of the code used for the competition. Since the forest cover challenge is a fairly easy problem designed for beginners, I used this competition to refine my classification script a bit and learn some scikit-learn APIs that I wasn't overly familiar with. -------------------------------------------------------------------------------- /old/ForestCover/forest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pickle 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | import seaborn as sb 8 | from sklearn import * 9 | from sklearn.ensemble import * 10 | from sklearn.grid_search import * 11 | from sklearn.feature_selection import * 12 | from sklearn.learning_curve import * 13 | 14 | 15 | def performance_test(): 16 | """ 17 | Test NumPy performance. Should run in less than a second on most machines. 18 | """ 19 | A = np.random.random((2000, 2000)) 20 | B = np.random.random((2000, 2000)) 21 | t = time.time() 22 | np.dot(A, B) 23 | print(time.time()-t) 24 | 25 | 26 | def load(filename): 27 | """ 28 | Load a previously training model from disk. 29 | """ 30 | model_file = open(filename, 'rb') 31 | model = pickle.load(model_file) 32 | model_file.close() 33 | return model 34 | 35 | 36 | def save(model, filename): 37 | """ 38 | Persist a trained model to disk. 39 | """ 40 | model_file = open(filename, 'wb') 41 | pickle.dump(model, model_file) 42 | model_file.close() 43 | 44 | 45 | def generate_features(data): 46 | """ 47 | Generates new derived features to add to the data set for model training. 48 | """ 49 | data['Aspect_Shifted'] = data['Aspect'].map(lambda x: x - 180 if x + 180 < 360 else x + 180) 50 | data['High_Water'] = data['Vertical_Distance_To_Hydrology'] < 0 51 | data['EVDtH'] = data['Elevation'] - data['Vertical_Distance_To_Hydrology'] 52 | data['EHDtH'] = data['Elevation'] - data['Horizontal_Distance_To_Hydrology'] * 0.2 53 | data['DTH'] = (data['Horizontal_Distance_To_Hydrology'] ** 2 + data['Vertical_Distance_To_Hydrology'] ** 2) ** 0.5 54 | data['Hydro_Fire_1'] = data['Horizontal_Distance_To_Hydrology'] + data['Horizontal_Distance_To_Fire_Points'] 55 | data['Hydro_Fire_2'] = abs(data['Horizontal_Distance_To_Hydrology'] - data['Horizontal_Distance_To_Fire_Points']) 56 | data['Hydro_Road_1'] = abs(data['Horizontal_Distance_To_Hydrology'] + data['Horizontal_Distance_To_Roadways']) 57 | data['Hydro_Road_2'] = abs(data['Horizontal_Distance_To_Hydrology'] - data['Horizontal_Distance_To_Roadways']) 58 | data['Fire_Road_1'] = abs(data['Horizontal_Distance_To_Fire_Points'] + data['Horizontal_Distance_To_Roadways']) 59 | data['Fire_Road_2'] = abs(data['Horizontal_Distance_To_Fire_Points'] - data['Horizontal_Distance_To_Roadways']) 60 | 61 | return data 62 | 63 | 64 | def process_training_data(filename, create_features): 65 | """ 66 | Reads in training data and prepares numpy arrays. 
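    Example (hedged sketch, assuming the Kaggle forest-cover train.csv with an
    'Id' column first and the 'Cover_Type' label last; data_dir stands for
    wherever the competition files were downloaded):

        training_data, X, y = process_training_data(data_dir + 'train.csv', True)
        # y holds the Cover_Type labels, X the (optionally expanded) features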
67 | """ 68 | training_data = pd.read_csv(filename, sep=',') 69 | num_features = len(training_data.columns) - 1 70 | 71 | # move the label to the first position and drop the ID column 72 | cols = training_data.columns.tolist() 73 | cols = cols[-1:] + cols[1:num_features] 74 | training_data = training_data[cols] 75 | 76 | if create_features: 77 | training_data = generate_features(training_data) 78 | 79 | num_features = len(training_data.columns) 80 | X = training_data.iloc[:, 1:num_features].values 81 | y = training_data.iloc[:, 0].values 82 | 83 | return training_data, X, y 84 | 85 | 86 | def create_transforms(X, standardize, whiten, select): 87 | """ 88 | Creates transform objects to apply before training or scoring. 89 | """ 90 | # create a standardization transform 91 | scaler = None 92 | if standardize: 93 | scaler = preprocessing.StandardScaler() 94 | scaler.fit(X) 95 | 96 | # create a PCA transform 97 | pca = None 98 | if whiten: 99 | pca = decomposition.PCA(whiten=True) 100 | pca.fit(X) 101 | 102 | # create a feature selection transform 103 | selector = None 104 | if select: 105 | selector = VarianceThreshold(threshold=0.0) 106 | selector.fit(X) 107 | 108 | return scaler, pca, selector 109 | 110 | 111 | def apply_transforms(X, scaler, pca, selector): 112 | """ 113 | Applies pre-computed transformations to a data set. 114 | """ 115 | if scaler is not None: 116 | X = scaler.transform(X) 117 | 118 | if pca is not None: 119 | X = pca.transform(X) 120 | 121 | if selector is not None: 122 | X = selector.transform(X) 123 | 124 | return X 125 | 126 | 127 | def visualize(training_data, X, y, pca): 128 | """ 129 | Computes statistics describing the data and creates some visualizations 130 | that attempt to highlight the underlying structure. 131 | 132 | Note: Use '%matplotlib inline' and '%matplotlib qt' at the IPython console 133 | to switch between display modes. 
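Produces per-feature histograms, a feature correlation matrix, and, when a PCA transform is supplied, scatter plots of the first four principal components colored by class label.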
134 | """ 135 | 136 | print('Generating individual feature histograms...') 137 | num_features = len(training_data.columns) 138 | num_plots = num_features / 16 if num_features % 16 == 0 else num_features / 16 + 1 139 | for i in range(num_plots): 140 | fig, ax = plt.subplots(4, 4, figsize=(20, 10)) 141 | for j in range(16): 142 | index = (i * 16) + j 143 | if index == 0: 144 | ax[j / 4, j % 4].hist(y, bins=30) 145 | ax[j / 4, j % 4].set_title(training_data.columns[index]) 146 | ax[j / 4, j % 4].set_xlim((min(y), max(y))) 147 | elif index < num_features: 148 | ax[j / 4, j % 4].hist(X[:, index - 1], bins=30) 149 | ax[j / 4, j % 4].set_title(training_data.columns[index]) 150 | ax[j / 4, j % 4].set_xlim((min(X[:, index - 1]), max(X[:, index - 1]))) 151 | fig.tight_layout() 152 | 153 | print('Generating correlation matrix...') 154 | fig2, ax2 = plt.subplots(figsize=(16, 10)) 155 | colormap = sb.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"], as_cmap=True) 156 | sb.corrplot(training_data, annot=False, sig_stars=False, diag_names=False, cmap=colormap, ax=ax2) 157 | fig2.tight_layout() 158 | 159 | if pca is not None: 160 | print('Generating principal component plots...') 161 | X = pca.transform(X) 162 | class_count = np.count_nonzero(np.unique(y)) 163 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w'] 164 | 165 | fig3, ax3 = plt.subplots(figsize=(16, 10)) 166 | for i in range(class_count): 167 | class_idx = i + 1 # add 1 if class labels start at 1 instead of 0 168 | ax3.scatter(X[y == class_idx, 0], X[y == class_idx, 1], c=colors[i], label=class_idx) 169 | ax3.set_title('First & Second Principal Components') 170 | ax3.legend() 171 | fig3.tight_layout() 172 | 173 | fig4, ax4 = plt.subplots(figsize=(16, 10)) 174 | for i in range(class_count): 175 | class_idx = i + 1 # add 1 if class labels start at 1 instead of 0 176 | ax4.scatter(X[y == class_idx, 1], X[y == class_idx, 2], c=colors[i], label=class_idx) 177 | ax4.set_title('Second & Third Principal Components') 178 | ax4.legend() 179 | fig4.tight_layout() 180 | 181 | fig5, ax5 = plt.subplots(figsize=(16, 10)) 182 | for i in range(class_count): 183 | class_idx = i + 1 # add 1 if class labels start at 1 instead of 0 184 | ax5.scatter(X[y == class_idx, 2], X[y == class_idx, 3], c=colors[i], label=class_idx) 185 | ax5.set_title('Third & Fourth Principal Components') 186 | ax5.legend() 187 | fig5.tight_layout() 188 | 189 | 190 | def define_model(algorithm): 191 | """ 192 | Defines and returns a model object of the designated type. 
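Recognized values for algorithm are 'bayes', 'logistic', 'svm', 'sgd', 'forest', and 'boost'; any other value aborts the script.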
193 | """ 194 | model = None 195 | 196 | if algorithm == 'bayes': 197 | model = naive_bayes.GaussianNB() 198 | elif algorithm == 'logistic': 199 | model = linear_model.LogisticRegression(penalty='l2', C=1.0) 200 | elif algorithm == 'svm': 201 | model = svm.SVC(C=1.0, kernel='rbf', shrinking=True, probability=False, cache_size=200) 202 | elif algorithm == 'sgd': 203 | model = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, 204 | n_iter=1000, shuffle=False, n_jobs=-1) 205 | elif algorithm == 'forest': 206 | model = RandomForestClassifier(n_estimators=10, criterion='gini', max_features='auto', max_depth=None, 207 | min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, n_jobs=-1) 208 | elif algorithm == 'boost': 209 | model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, 210 | min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features=None, 211 | max_leaf_nodes=None) 212 | else: 213 | print('No model defined for ' + algorithm) 214 | exit() 215 | 216 | return model 217 | 218 | 219 | def train(training_data, X, y, algorithm, scaler, pca, selector): 220 | """ 221 | Trains a new model using the training data. 222 | """ 223 | t0 = time.time() 224 | model = define_model(algorithm) 225 | X = apply_transforms(X, scaler, pca, selector) 226 | model.fit(X, y) 227 | t1 = time.time() 228 | print('Model trained in {0:3f} s.'.format(t1 - t0)) 229 | 230 | if algorithm == 'forest' or algorithm == 'boost': 231 | print('Generating feature importance plot...') 232 | fig, ax = plt.subplots(figsize=(16, 10)) 233 | 234 | importance = model.feature_importances_ 235 | importance = 100.0 * (importance / importance.max()) 236 | importance = importance[0:30] 237 | sorted_idx = np.argsort(importance) 238 | pos = np.arange(sorted_idx.shape[0]) 239 | ax.set_title('Variable Importance') 240 | ax.barh(pos, importance[sorted_idx], align='center') 241 | ax.set_yticks(pos) 242 | ax.set_yticklabels(training_data.columns[sorted_idx + 1]) 243 | ax.set_xlabel('Relative Importance') 244 | 245 | fig.tight_layout() 246 | 247 | return model 248 | 249 | 250 | def predict(X, model, scaler, pca, selector): 251 | """ 252 | Predicts the class label. 253 | """ 254 | X = apply_transforms(X, scaler, pca, selector) 255 | y_est = model.predict(X) 256 | 257 | return y_est 258 | 259 | 260 | def predict_probability(X, model, scaler, pca, selector): 261 | """ 262 | Predicts the class probabilities. 263 | """ 264 | X = apply_transforms(X, scaler, pca, selector) 265 | y_prob = model.predict_proba(X)[:, 1] 266 | 267 | return y_prob 268 | 269 | 270 | def score(X, y, model, scaler, pca, selector): 271 | """ 272 | Scores the model's performance and returns the result. 273 | """ 274 | X = apply_transforms(X, scaler, pca, selector) 275 | 276 | return model.score(X, y) 277 | 278 | 279 | def cross_validate(X, y, algorithm, scaler, pca, selector, metric): 280 | """ 281 | Performs cross-validation to estimate the true performance of the model. 
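Uses scikit-learn's cross_val_score with 3 folds and the given scoring metric, and returns the mean score across folds.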
282 | """ 283 | model = define_model(algorithm) 284 | X = apply_transforms(X, scaler, pca, selector) 285 | 286 | t0 = time.time() 287 | scores = cross_validation.cross_val_score(model, X, y, scoring=metric, cv=3, n_jobs=-1) 288 | t1 = time.time() 289 | print('Cross-validation completed in {0:3f} s.'.format(t1 - t0)) 290 | 291 | return np.mean(scores) 292 | 293 | 294 | def plot_learning_curve(X, y, algorithm, scaler, pca, selector, metric): 295 | """ 296 | Plots a learning curve showing model performance against both training and 297 | validation data sets as a function of the number of training samples. 298 | """ 299 | model = define_model(algorithm) 300 | X = apply_transforms(X, scaler, pca, selector) 301 | 302 | t0 = time.time() 303 | train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric, cv=3, n_jobs=-1) 304 | train_scores_mean = np.mean(train_scores, axis=1) 305 | train_scores_std = np.std(train_scores, axis=1) 306 | test_scores_mean = np.mean(test_scores, axis=1) 307 | test_scores_std = np.std(test_scores, axis=1) 308 | 309 | fig, ax = plt.subplots(figsize=(16, 10)) 310 | ax.set_title('Learning Curve') 311 | ax.set_xlabel('Training Examples') 312 | ax.set_ylabel('Score') 313 | ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, 314 | alpha=0.1, color='r') 315 | ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, 316 | alpha=0.1, color='r') 317 | ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score') 318 | ax.plot(train_sizes, test_scores_mean, 'o-', color='r', label='Cross-validation score') 319 | ax.legend(loc='best') 320 | fig.tight_layout() 321 | t1 = time.time() 322 | print('Learning curve generated in {0:3f} s.'.format(t1 - t0)) 323 | 324 | 325 | def parameter_search(X, y, algorithm, scaler, pca, selector, metric): 326 | """ 327 | Performs an exhaustive search over the specified model parameters. 
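Runs GridSearchCV with 3-fold cross-validation over an algorithm-specific parameter grid and returns the best estimator along with its parameters and score.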
328 | """ 329 | model = define_model(algorithm) 330 | X = apply_transforms(X, scaler, pca, selector) 331 | 332 | param_grid = None 333 | if algorithm == 'logistic': 334 | param_grid = [{'penalty': ['l1', 'l2'], 'C': [0.1, 0.3, 1.0, 3.0]}] 335 | elif algorithm == 'svm': 336 | param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, 337 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}] 338 | elif algorithm == 'sgd': 339 | param_grid = [{'loss': ['hinge', 'log', 'modified_huber'], 'penalty': ['l1', 'l2'], 340 | 'alpha': [0.0001, 0.001, 0.01], 'iter': [100, 1000, 10000]}] 341 | elif algorithm == 'forest': 342 | param_grid = [{'n_estimators': [10, 30, 100, 300], 'criterion': ['gini', 'entropy'], 343 | 'max_features': ['auto', 'log2', None], 'max_depth': [3, 5, 7, None], 344 | 'min_samples_split': [2, 10, 30, 100], 'min_samples_leaf': [1, 3, 10, 30, 100]}] 345 | elif algorithm == 'boost': 346 | param_grid = [{'learning_rate': [0.1, 0.3, 1.0], 'subsample': [1.0, 0.9, 0.7, 0.5], 347 | 'n_estimators': [100, 300, 1000], 'max_features': ['auto', 'log2', None], 348 | 'max_depth': [3, 5, 7, None], 'min_samples_split': [2, 10, 30, 100], 349 | 'min_samples_leaf': [1, 3, 10, 30, 100]}] 350 | 351 | t0 = time.time() 352 | grid_estimator = GridSearchCV(model, param_grid, scoring=metric, cv=3, n_jobs=-1) 353 | grid_estimator.fit(X, y) 354 | t1 = time.time() 355 | print('Grid search completed in {0:3f} s.'.format(t1 - t0)) 356 | 357 | return grid_estimator.best_estimator_, grid_estimator.best_params_, grid_estimator.best_score_ 358 | 359 | 360 | def train_ensemble(X, y, algorithm, scaler, pca, selector): 361 | """ 362 | Creates an ensemble of many models together. 363 | """ 364 | model = define_model(algorithm) 365 | X = apply_transforms(X, scaler, pca, selector) 366 | 367 | t0 = time.time() 368 | ensemble_model = BaggingClassifier(base_estimator=model, n_estimators=10, max_samples=1.0, max_features=1.0, 369 | bootstrap=True, bootstrap_features=False) 370 | ensemble_model.fit(X, y) 371 | t1 = time.time() 372 | print('Ensemble training completed in {0:3f} s.'.format(t1 - t0)) 373 | 374 | return ensemble_model 375 | 376 | 377 | def process_test_data(filename, create_features): 378 | """ 379 | Reads in the test data set and prepares it for prediction by the model. 380 | """ 381 | test_data = pd.read_csv(filename, sep=',') 382 | 383 | if create_features: 384 | test_data = generate_features(test_data) 385 | 386 | num_features = len(test_data.columns) 387 | X_test = test_data.iloc[:, 1:num_features].values 388 | 389 | return test_data, X_test 390 | 391 | 392 | def create_submission(test_data, y_est, submit_file): 393 | """ 394 | Create a new submission file with test data and predictions generated by the model. 
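The output CSV contains the test set Id column and the predicted Cover_Type labels.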
395 | """ 396 | submit = pd.DataFrame(columns=['Id', 'Cover_Type']) 397 | submit['Id'] = test_data['Id'] 398 | submit['Cover_Type'] = y_est 399 | submit.to_csv(submit_file, sep=',', index=False, index_label=False) 400 | 401 | 402 | def main(): 403 | load_training_data = True 404 | create_features = False 405 | create_visualizations = False 406 | load_model = False 407 | train_model = True 408 | create_learning_curve = False 409 | perform_grid_search = False 410 | perform_ensemble = False 411 | save_model = False 412 | create_submission_file = False 413 | 414 | code_dir = '/home/john/git/kaggle/ForestCover/' 415 | data_dir = '/home/john/data/forest-cover/' 416 | training_file = 'train.csv' 417 | test_file = 'test.csv' 418 | submit_file = 'submission.csv' 419 | model_file = 'model.pkl' 420 | 421 | algorithm = 'forest' # bayes, logistic, svm, sgd, forest, boost 422 | metric = None # accuracy, f1, rcc_auc, mean_absolute_error, mean_squared_error, r2_score 423 | standardize = False 424 | whiten = False 425 | select = True 426 | 427 | training_data = None 428 | X = None 429 | y = None 430 | scaler = None 431 | pca = None 432 | selector = None 433 | model = None 434 | ensemble_model = None 435 | 436 | os.chdir(code_dir) 437 | 438 | print('Starting process...') 439 | print('Algorithm={0}, Create={1}, Select={2}, Standardize={3}, Whiten={4}'.format( 440 | algorithm, create_features, select, standardize, whiten)) 441 | 442 | if load_training_data: 443 | print('Reading in training data...') 444 | training_data, X, y = process_training_data(data_dir + training_file, create_features) 445 | 446 | if standardize or whiten or select: 447 | print('Creating data transforms...') 448 | scaler, pca, selector = create_transforms(X, standardize, whiten, select) 449 | 450 | if create_visualizations: 451 | print('Creating visualizations...') 452 | visualize(training_data, X, y, pca) 453 | 454 | if load_model: 455 | print('Loading model from disk...') 456 | model = load(data_dir + model_file) 457 | 458 | if train_model: 459 | print('Training model on full data set...') 460 | model = train(training_data, X, y, algorithm, scaler, pca, selector) 461 | 462 | print('Calculating training score...') 463 | model_score = score(X, y, model, scaler, pca, selector) 464 | print('Training score ='), model_score 465 | 466 | if create_learning_curve: 467 | print('Generating learning curve...') 468 | plot_learning_curve(X, y, algorithm, scaler, pca, selector, metric) 469 | else: 470 | print('Performing cross-validation...') 471 | cross_val_score = cross_validate(X, y, algorithm, scaler, pca, selector, metric) 472 | print('Cross-validation score ='), cross_val_score 473 | 474 | if perform_grid_search: 475 | print('Performing hyper-parameter grid search...') 476 | best_model, best_params, best_score = parameter_search(X, y, algorithm, scaler, pca, selector, metric) 477 | print('Best model = ', best_model) 478 | print('Best params = ', best_params) 479 | print('Best score = ', best_score) 480 | 481 | if perform_ensemble: 482 | print('Creating an ensemble of models...') 483 | ensemble_model = train_ensemble(X, y, algorithm, scaler, pca, selector) 484 | 485 | print('Calculating ensemble training score...') 486 | ensemble_model_score = score(X, y, ensemble_model, scaler, pca, selector) 487 | print('Ensemble Training score ='), ensemble_model_score 488 | 489 | if save_model: 490 | print('Saving model to disk...') 491 | save(model, data_dir + model_file) 492 | 493 | if create_submission_file: 494 | print('Reading in test data...') 495 | 
test_data, X_test = process_test_data(data_dir + test_file, create_features) 496 | 497 | print('Predicting test data...') 498 | if perform_ensemble: 499 | y_est = predict(X_test, ensemble_model, scaler, pca, selector) 500 | else: 501 | y_est = predict(X_test, model, scaler, pca, selector) 502 | 503 | print('Creating submission file...') 504 | create_submission(test_data, y_est, data_dir + submit_file) 505 | 506 | print('Process complete.') 507 | 508 | 509 | if __name__ == "__main__": 510 | main() -------------------------------------------------------------------------------- /old/HiggsBoson/README.md: -------------------------------------------------------------------------------- 1 | # Higgs Boson Machine Learning Challenge 2 | 3 | ATLAS 4 | 5 | View the competition details here.
6 | 7 | This directory includes the code I used to run experiments for the competition. Despite starting only a few weeks before the deadline and having very limited time to invest, I managed to place in the top 25%.
8 | 9 | I used the Anaconda distribution of Python with the IPython kernel and the PyCharm IDE to run experiments. I also installed and configured several additional dependencies (xgboost and pylearn2). There are three scripts of interest:
10 | 11 | higgs.py - Initial solutions built primarily with scikit-learn
12 | higgs_adv.py - Switched to a Linux VM and incorporated the xgboost library
13 | higgs_nn.py - Set up pylearn2 and started experimenting with deep learning neural nets (unfortunately I ran out of time before making any significant progress here)
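All three scripts are evaluated locally with the competition's Approximate Median Significance (AMS) metric. The minimal sketch below mirrors the `ams()` helper defined in higgs.py, higgs_adv.py, higgs_nn.py, and metric.py; here `s` and `b` denote the weighted sums of true signal and true background events that the model selected as signal, and `br` is a constant regularization term.

```python
# Minimal sketch of the AMS metric used to score this competition,
# mirroring the ams() helper found in the scripts in this directory.
import math


def ams(s, b, br=10.0):
    # s, b: weighted counts of true signal / true background events selected as signal
    radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s)
    return math.sqrt(radicand)
```

In the scripts, predicted probabilities are turned into binary selections with a percentile cutoff (the `threshold` setting) and the resulting selections are scored with this function; the full helpers additionally guard against a negative radicand.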
14 | 15 | The scripts are fairly basic due to time constraints but well-modularized and easy to follow. Enjoy! 16 | -------------------------------------------------------------------------------- /old/HiggsBoson/Resources/ATLAS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/HiggsBoson/Resources/ATLAS.png -------------------------------------------------------------------------------- /old/HiggsBoson/Resources/documentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/HiggsBoson/Resources/documentation.pdf -------------------------------------------------------------------------------- /old/HiggsBoson/auto_l1.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.csv_dataset.CSVDataset { 3 | path: '%(data_dir)s/combined_nn.csv', 4 | task: 'classification', 5 | one_hot: True, 6 | expect_labels: True, 7 | expect_headers: True, 8 | delimiter: ',', 9 | start: %(train_start)i, 10 | stop: %(train_stop)i 11 | }, 12 | model: !obj:pylearn2.models.autoencoder.DenoisingAutoencoder { 13 | nvis : %(num_features)i, 14 | nhid : %(hid_l1)i, 15 | irange : 0.05, 16 | corruptor: !obj:pylearn2.corruption.BinomialCorruptor { 17 | corruption_level: .2, 18 | }, 19 | act_enc: "tanh", 20 | act_dec: null, # Linear activation on the decoder side 21 | }, 22 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 23 | learning_rate : 1e-3, 24 | batch_size : %(batch_size)i, 25 | monitoring_batches : %(monitoring_batches)i, 26 | monitoring_dataset : *train, 27 | cost : !obj:pylearn2.costs.autoencoder.MeanSquaredReconstructionError {}, 28 | termination_criterion : !obj:pylearn2.termination_criteria.EpochCounter { 29 | max_epochs: %(max_epochs)i, 30 | }, 31 | }, 32 | save_path: '%(data_dir)s/auto_l1.pkl', 33 | save_freq: 1 34 | } 35 | -------------------------------------------------------------------------------- /old/HiggsBoson/auto_l2.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.transformer_dataset.TransformerDataset { 3 | raw: !obj:pylearn2.datasets.csv_dataset.CSVDataset { 4 | path: '%(data_dir)s/combined_nn.csv', 5 | task: 'classification', 6 | one_hot: True, 7 | expect_labels: True, 8 | expect_headers: True, 9 | delimiter: ',', 10 | start: %(train_start)i, 11 | stop: %(train_stop)i 12 | }, 13 | transformer: !pkl: '%(data_dir)s/auto_l1.pkl' 14 | }, 15 | model: !obj:pylearn2.models.autoencoder.DenoisingAutoencoder { 16 | nvis : %(num_features)i, 17 | nhid : %(hid_l2)i, 18 | irange : 0.05, 19 | corruptor: !obj:pylearn2.corruption.BinomialCorruptor { 20 | corruption_level: .3, 21 | }, 22 | act_enc: "tanh", 23 | act_dec: null, # Linear activation on the decoder sided 24 | }, 25 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 26 | learning_rate : 1e-3, 27 | batch_size : %(batch_size)i, 28 | monitoring_batches : %(monitoring_batches)i, 29 | monitoring_dataset : *train, 30 | cost : !obj:pylearn2.costs.autoencoder.MeanSquaredReconstructionError {}, 31 | termination_criterion : !obj:pylearn2.termination_criteria.EpochCounter { 32 | max_epochs: %(max_epochs)i, 33 | }, 34 | }, 35 | save_path: '%(data_dir)s/auto_l2.pkl', 36 | 
save_freq: 1 37 | } 38 | -------------------------------------------------------------------------------- /old/HiggsBoson/auto_mlp.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.csv_dataset.CSVDataset { 3 | path: '%(data_dir)s/training_nn.csv', 4 | task: 'classification', 5 | one_hot: True, 6 | expect_labels: True, 7 | expect_headers: True, 8 | delimiter: ',', 9 | start: %(train_start)i, 10 | stop: %(train_stop)i 11 | }, 12 | model: !obj:pylearn2.models.mlp.MLP { 13 | batch_size: %(batch_size)i, 14 | layers: [ 15 | !obj:pylearn2.models.mlp.PretrainedLayer { 16 | layer_name: 'h1', 17 | layer_content: !pkl: '%(data_dir)s/auto_l1.pkl' 18 | }, 19 | !obj:pylearn2.models.mlp.PretrainedLayer { 20 | layer_name: 'h2', 21 | layer_content: !pkl: '%(data_dir)s/auto_l2.pkl' 22 | }, 23 | !obj:pylearn2.models.mlp.Softmax { 24 | max_col_norm: 1.9365, 25 | layer_name: 'y', 26 | n_classes: 2, 27 | irange: .005 28 | } 29 | ], 30 | nvis: %(num_features)i 31 | }, 32 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD { 33 | learning_rate: .05, 34 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum { 35 | init_momentum: .5, 36 | }, 37 | monitoring_dataset: 38 | { 39 | 'valid' : !obj:pylearn2.datasets.csv_dataset.CSVDataset { 40 | path: '%(data_dir)s/training_nn.csv', 41 | task: 'classification', 42 | one_hot: True, 43 | expect_labels: True, 44 | expect_headers: True, 45 | delimiter: ',', 46 | start: %(valid_start)i, 47 | stop: %(valid_stop)i 48 | }, 49 | }, 50 | cost: !obj:pylearn2.costs.mlp.Default {}, 51 | termination_criterion: !obj:pylearn2.termination_criteria.And { 52 | criteria: [ 53 | !obj:pylearn2.termination_criteria.MonitorBased { 54 | channel_name: 'valid_y_misclass', 55 | prop_decrease: 0., 56 | N: 100 57 | }, 58 | !obj:pylearn2.termination_criteria.EpochCounter { 59 | max_epochs: %(max_epochs)i 60 | } 61 | ] 62 | }, 63 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay { 64 | decay_factor: 1.00004, 65 | min_lr: .000001 66 | } 67 | }, 68 | extensions: [ 69 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor { 70 | start: 1, 71 | saturate: 250, 72 | final_momentum: .7 73 | }, 74 | !obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 75 | channel_name: 'valid_y_misclass', 76 | save_path: '%(data_dir)s/auto_mlp.pkl' 77 | } 78 | ] 79 | } 80 | -------------------------------------------------------------------------------- /old/HiggsBoson/higgs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import time 4 | import pickle 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from sklearn import cross_validation 9 | from sklearn import decomposition 10 | from sklearn import ensemble 11 | from sklearn import linear_model 12 | from sklearn import naive_bayes 13 | from sklearn import preprocessing 14 | from sklearn import svm 15 | 16 | 17 | def ams(s, b): 18 | """ 19 | Approximate Median Significant function to evaluate solutions. 20 | """ 21 | br = 10.0 22 | radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s) 23 | if radicand < 0: 24 | print 'Radicand is negative.' 25 | exit() 26 | else: 27 | return math.sqrt(radicand) 28 | 29 | 30 | def load(filename): 31 | """ 32 | Load a previously training model from disk. 
33 | """ 34 | model_file = open(filename, 'rb') 35 | model = pickle.load(model_file) 36 | model_file.close() 37 | return model 38 | 39 | 40 | def save(model, filename): 41 | """ 42 | Persist a trained model to disk. 43 | """ 44 | model_file = open(filename, 'wb') 45 | pickle.dump(model, model_file) 46 | model_file.close() 47 | 48 | 49 | def process_training_data(filename, features, impute, standardize, whiten): 50 | """ 51 | Reads in training data and prepares numpy arrays. 52 | """ 53 | training_data = pd.read_csv(filename, sep=',') 54 | 55 | # add a nominal label (0, 1) 56 | temp = training_data['Label'].replace(to_replace=['s', 'b'], value=[1, 0]) 57 | training_data['Nominal'] = temp 58 | 59 | X = training_data.iloc[:, 1:features+1].values 60 | y = training_data.iloc[:, features+3].values 61 | w = training_data.iloc[:, features+1].values 62 | 63 | # optionally impute the -999 values 64 | if impute == 'mean': 65 | imp = preprocessing.Imputer(missing_values=-999) 66 | X = imp.fit_transform(X) 67 | elif impute == 'zeros': 68 | X[X == -999] = 0 69 | 70 | # create a standardization transform 71 | scaler = None 72 | if standardize: 73 | scaler = preprocessing.StandardScaler() 74 | scaler.fit(X) 75 | 76 | # create a PCA transform 77 | pca = None 78 | if whiten: 79 | pca = decomposition.PCA(whiten=True) 80 | pca.fit(X) 81 | 82 | return training_data, X, y, w, scaler, pca 83 | 84 | 85 | def visualize(training_data, X, y, scaler, pca, features): 86 | """ 87 | Computes statistics describing the data and creates some visualizations 88 | that attempt to highlight the underlying structure. 89 | 90 | Note: Use '%matplotlib inline' and '%matplotlib qt' at the IPython console 91 | to switch between display modes. 92 | """ 93 | 94 | # feature histograms 95 | fig1, ax1 = plt.subplots(4, 4, figsize=(20, 10)) 96 | for i in range(16): 97 | ax1[i % 4, i / 4].hist(X[:, i]) 98 | ax1[i % 4, i / 4].set_title(training_data.columns[i + 1]) 99 | ax1[i % 4, i / 4].set_xlim((min(X[:, i]), max(X[:, i]))) 100 | fig1.tight_layout() 101 | 102 | fig2, ax2 = plt.subplots(4, 4, figsize=(20, 10)) 103 | for i in range(16, features): 104 | ax2[i % 4, (i - 16) / 4].hist(X[:, i]) 105 | ax2[i % 4, (i - 16) / 4].set_title(training_data.columns[i + 1]) 106 | ax2[i % 4, (i - 16) / 4].set_xlim((min(X[:, i]), max(X[:, i]))) 107 | fig2.tight_layout() 108 | 109 | # covariance matrix 110 | if scaler is not None: 111 | X = scaler.transform(X) 112 | 113 | cov = np.cov(X, rowvar=0) 114 | 115 | fig3, ax3 = plt.subplots(figsize=(16, 10)) 116 | p = ax3.pcolor(cov) 117 | fig3.colorbar(p, ax=ax3) 118 | ax3.set_title('Feature Covariance Matrix') 119 | 120 | # pca plots 121 | if pca is not None: 122 | X = pca.transform(X) 123 | 124 | fig4, ax4 = plt.subplots(figsize=(16, 10)) 125 | ax4.scatter(X[:, 0], X[:, 1], c=y) 126 | ax4.set_title('First & Second Principal Components') 127 | 128 | fig5, ax5 = plt.subplots(figsize=(16, 10)) 129 | ax5.scatter(X[:, 1], X[:, 2], c=y) 130 | ax5.set_title('Second & Third Principal Components') 131 | 132 | 133 | def train(X, y, alg, scaler, pca): 134 | """ 135 | Trains a new model using the training data. 
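Any scaler or PCA transforms are applied before fitting; recognized values for alg are 'bayes', 'logistic', 'svm', and 'boost'.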
136 | """ 137 | if scaler is not None: 138 | X = scaler.transform(X) 139 | 140 | if pca is not None: 141 | X = pca.transform(X) 142 | 143 | t0 = time.time() 144 | 145 | if alg == 'bayes': 146 | model = naive_bayes.GaussianNB() 147 | elif alg == 'logistic': 148 | model = linear_model.LogisticRegression() 149 | elif alg == 'svm': 150 | model = svm.SVC() 151 | elif alg == 'boost': 152 | model = ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=7, min_samples_split=200, 153 | min_samples_leaf=200, max_features=30) 154 | else: 155 | print 'No model defined for ' + alg 156 | exit() 157 | 158 | model.fit(X, y) 159 | 160 | t1 = time.time() 161 | print 'Model trained in {0:3f} s.'.format(t1 - t0) 162 | 163 | return model 164 | 165 | 166 | def predict(X, model, threshold, scaler, pca): 167 | """ 168 | Predicts the probability of a positive outcome and converts the 169 | probability to a binary prediction based on the cutoff percentage. 170 | """ 171 | if scaler is not None: 172 | X = scaler.transform(X) 173 | 174 | if pca is not None: 175 | X = pca.transform(X) 176 | 177 | y_prob = model.predict_proba(X)[:, 1] 178 | cutoff = np.percentile(y_prob, threshold) 179 | y_est = y_prob > cutoff 180 | 181 | return y_prob, y_est 182 | 183 | 184 | def score(y, y_est, w): 185 | """ 186 | Create weighted signal and background sets and calculate the AMS. 187 | """ 188 | y_signal = w * (y == 1.0) 189 | y_background = w * (y == 0.0) 190 | s = np.sum(y_signal * (y_est == 1.0)) 191 | b = np.sum(y_background * (y_est == 1.0)) 192 | 193 | return ams(s, b) 194 | 195 | 196 | def cross_validate(X, y, w, alg, scaler, pca, threshold): 197 | """ 198 | Perform cross-validation on the training set and compute the AMS scores. 199 | """ 200 | scores = [0, 0, 0] 201 | folds = cross_validation.StratifiedKFold(y, n_folds=3) 202 | i = 0 203 | 204 | for i_train, i_val in folds: 205 | # create the training and validation sets 206 | X_train, X_val = X[i_train], X[i_val] 207 | y_train, y_val = y[i_train], y[i_val] 208 | w_train, w_val = w[i_train], w[i_val] 209 | 210 | # normalize the weights 211 | w_train[y_train == 1] *= (sum(w[y == 1]) / sum(w[y_train == 1])) 212 | w_train[y_train == 0] *= (sum(w[y == 0]) / sum(w_train[y_train == 0])) 213 | w_val[y_val == 1] *= (sum(w[y == 1]) / sum(w_val[y_val == 1])) 214 | w_val[y_val == 0] *= (sum(w[y == 0]) / sum(w_val[y_val == 0])) 215 | 216 | # train the model 217 | model = train(X_train, y_train, alg, scaler, pca) 218 | 219 | # predict and score performance on the validation set 220 | y_val_prob, y_val_est = predict(X_val, model, threshold, scaler, pca) 221 | scores[i] = score(y_val, y_val_est, w_val) 222 | i += 1 223 | 224 | return np.mean(scores) 225 | 226 | 227 | def process_test_data(filename, features, impute): 228 | """ 229 | Reads in test data and prepares numpy arrays. 230 | """ 231 | test_data = pd.read_csv(filename, sep=',') 232 | X_test = test_data.iloc[:, 1:features+1].values 233 | 234 | if impute == 'mean': 235 | imp = preprocessing.Imputer(missing_values=-999) 236 | X_test = imp.fit_transform(X_test) 237 | elif impute == 'zeros': 238 | X_test[X_test == -999] = 0 239 | 240 | return test_data, X_test 241 | 242 | 243 | def create_submission(test_data, y_test_prob, y_test_est, submit_file): 244 | """ 245 | Create a new data frame with the submission data. 
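Predicted probabilities are converted to the rank order required by the submission format and the integer classes are mapped to 's'/'b' before the CSV file is written.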
246 | """ 247 | temp = pd.DataFrame(y_test_prob, columns=['RankOrder']) 248 | temp2 = pd.DataFrame(y_test_est, columns=['Class']) 249 | submit = pd.DataFrame([test_data.EventId, temp.RankOrder, temp2.Class]).transpose() 250 | 251 | # sort it so they're in the ascending order by probability 252 | submit = submit.sort(['RankOrder'], ascending=True) 253 | 254 | # convert the probabilities to rank order (required by the submission guidelines) 255 | for i in range(0, y_test_est.shape[0], 1): 256 | submit.iloc[i, 1] = i + 1 257 | 258 | # re-sort by event ID 259 | submit = submit.sort(['EventId'], ascending=True) 260 | 261 | # convert the integer classification to (s, b) 262 | submit['Class'] = submit['Class'].map({1: 's', 0: 'b'}) 263 | 264 | # force pandas to treat these columns at int (otherwise will write as floats) 265 | submit[['EventId', 'RankOrder']] = submit[['EventId', 'RankOrder']].astype(int) 266 | 267 | # finally create the submission file 268 | submit.to_csv(submit_file, sep=',', index=False, index_label=False) 269 | 270 | 271 | def main(): 272 | # perform some initialization 273 | features = 30 274 | threshold = 85 275 | alg = 'boost' # bayes, logistic, svm, boost 276 | impute = 'none' # zeros, mean, none 277 | standardize = False 278 | whiten = False 279 | load_training_data = True 280 | load_model = False 281 | train_model = False 282 | save_model = False 283 | create_visualizations = True 284 | create_submission_file = False 285 | code_dir = '/home/john/git/kaggle/HiggsBoson/' 286 | data_dir = '/home/john/data/higgs-boson/' 287 | training_file = 'training.csv' 288 | test_file = 'test.csv' 289 | submit_file = 'submission.csv' 290 | model_file = 'model.pkl' 291 | 292 | os.chdir(code_dir) 293 | 294 | print 'Starting process...' 295 | print 'alg={0}, impute={1}, standardize={2}, whiten={3} threshold={4}'.format( 296 | alg, impute, standardize, whiten, threshold) 297 | 298 | if load_training_data: 299 | print 'Reading in training data...' 300 | training_data, X, y, w, scaler, pca = process_training_data( 301 | data_dir + training_file, features, impute, standardize, whiten) 302 | 303 | if create_visualizations: 304 | print 'Creating visualizations...' 305 | visualize(training_data, X, y, scaler, pca, features) 306 | 307 | if load_model: 308 | print 'Loading model from disk...' 309 | model = load(data_dir + model_file) 310 | 311 | if train_model: 312 | print 'Training model on full data set...' 313 | model = train(X, y, alg, scaler, pca) 314 | 315 | print 'Calculating predictions...' 316 | y_prob, y_est = predict(X, model, threshold, scaler, pca) 317 | 318 | print 'Calculating AMS...' 319 | ams_val = score(y, y_est, w) 320 | print 'AMS =', ams_val 321 | 322 | print 'Performing cross-validation...' 323 | val = cross_validate(X, y, w, alg, scaler, pca, threshold) 324 | print'Cross-validation AMS =', val 325 | 326 | if save_model: 327 | print 'Saving model to disk...' 328 | save(model, data_dir + model_file) 329 | 330 | if create_submission_file: 331 | print 'Reading in test data...' 332 | test_data, X_test = process_test_data(data_dir + test_file, features, impute) 333 | 334 | print 'Predicting test data...' 335 | y_test_prob, y_test_est = predict(X_test, model, threshold, scaler, pca) 336 | 337 | print 'Creating submission file...' 338 | create_submission(test_data, y_test_prob, y_test_est, data_dir + submit_file) 339 | 340 | print 'Process complete.' 
341 | 342 | 343 | if __name__ == "__main__": 344 | main() -------------------------------------------------------------------------------- /old/HiggsBoson/higgs_adv.py: -------------------------------------------------------------------------------- 1 | import os, math, time, pickle, sys 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | from sklearn import cross_validation 6 | from sklearn import decomposition 7 | from sklearn import ensemble 8 | from sklearn import linear_model 9 | from sklearn import naive_bayes 10 | from sklearn import preprocessing 11 | from sklearn import svm 12 | 13 | sys.path.append('/home/git/xgboost/wrapper') 14 | import xgboost as xgb 15 | 16 | 17 | def ams(s, b): 18 | """ 19 | Approximate Median Significant function to evaluate solutions. 20 | """ 21 | br = 10.0 22 | radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s) 23 | if radicand < 0: 24 | print 'Radicand is negative.' 25 | exit() 26 | else: 27 | return math.sqrt(radicand) 28 | 29 | 30 | def load(alg, filename): 31 | """ 32 | Load a previously training model from disk. 33 | """ 34 | if alg == 'xgboost': 35 | model = xgb.Booster({'nthread': 16}, model_file=filename) 36 | else: 37 | model_file = open(filename, 'rb') 38 | model = pickle.load(model_file) 39 | model_file.close() 40 | 41 | return model 42 | 43 | 44 | def save(alg, model, filename): 45 | """ 46 | Persist a trained model to disk. 47 | """ 48 | if alg == 'xgboost': 49 | model.save_model(filename) 50 | else: 51 | model_file = open(filename, 'wb') 52 | pickle.dump(model, model_file) 53 | model_file.close() 54 | 55 | 56 | def process_training_data(filename, features, impute, standardize, whiten): 57 | """ 58 | Reads in training data and prepares numpy arrays. 59 | """ 60 | training_data = pd.read_csv(filename, sep=',') 61 | 62 | # add a nominal label (0, 1) 63 | temp = training_data['Label'].replace(to_replace=['s', 'b'], value=[1, 0]) 64 | training_data['Nominal'] = temp 65 | 66 | X = training_data.iloc[:, 1:features+1].values 67 | y = training_data.iloc[:, features+3].values 68 | w = training_data.iloc[:, features+1].values 69 | 70 | # optionally impute the -999 values 71 | if impute == 'mean': 72 | imp = preprocessing.Imputer(missing_values=-999) 73 | X = imp.fit_transform(X) 74 | elif impute == 'zeros': 75 | X[X == -999] = 0 76 | 77 | # create a standardization transform 78 | scaler = None 79 | if standardize: 80 | scaler = preprocessing.StandardScaler() 81 | scaler.fit(X) 82 | 83 | # create a PCA transform 84 | pca = None 85 | if whiten: 86 | pca = decomposition.PCA(whiten=True) 87 | pca.fit(X) 88 | 89 | return training_data, X, y, w, scaler, pca 90 | 91 | 92 | def visualize(training_data, X, y, scaler, pca, features): 93 | """ 94 | Computes statistics describing the data and creates some visualizations 95 | that attempt to highlight the underlying structure. 96 | 97 | Note: Use '%matplotlib inline' and '%matplotlib qt' at the IPython console 98 | to switch between display modes. 
99 | """ 100 | 101 | # feature histograms 102 | fig1, ax1 = plt.subplots(4, 4, figsize=(20, 10)) 103 | for i in range(16): 104 | ax1[i % 4, i / 4].hist(X[:, i]) 105 | ax1[i % 4, i / 4].set_title(training_data.columns[i + 1]) 106 | ax1[i % 4, i / 4].set_xlim((min(X[:, i]), max(X[:, i]))) 107 | fig1.tight_layout() 108 | 109 | fig2, ax2 = plt.subplots(4, 4, figsize=(20, 10)) 110 | for i in range(16, features): 111 | ax2[i % 4, (i - 16) / 4].hist(X[:, i]) 112 | ax2[i % 4, (i - 16) / 4].set_title(training_data.columns[i + 1]) 113 | ax2[i % 4, (i - 16) / 4].set_xlim((min(X[:, i]), max(X[:, i]))) 114 | fig2.tight_layout() 115 | 116 | # covariance matrix 117 | if scaler is not None: 118 | X = scaler.transform(X) 119 | 120 | cov = np.cov(X, rowvar=0) 121 | 122 | fig3, ax3 = plt.subplots(figsize=(16, 10)) 123 | p = ax3.pcolor(cov) 124 | fig3.colorbar(p, ax=ax3) 125 | ax3.set_title('Feature Covariance Matrix') 126 | 127 | # pca plots 128 | if pca is not None: 129 | X = pca.transform(X) 130 | 131 | fig4, ax4 = plt.subplots(figsize=(16, 10)) 132 | ax4.scatter(X[:, 0], X[:, 1], c=y) 133 | ax4.set_title('First & Second Principal Components') 134 | 135 | fig5, ax5 = plt.subplots(figsize=(16, 10)) 136 | ax5.scatter(X[:, 1], X[:, 2], c=y) 137 | ax5.set_title('Second & Third Principal Components') 138 | 139 | 140 | def train(X, y, w, alg, scaler, pca): 141 | """ 142 | Trains a new model using the training data. 143 | """ 144 | if scaler is not None: 145 | X = scaler.transform(X) 146 | 147 | if pca is not None: 148 | X = pca.transform(X) 149 | 150 | if alg == 'xgboost': 151 | # use a separate process for the xgboost library 152 | return train_xgb(X, y, w, scaler, pca) 153 | 154 | t0 = time.time() 155 | 156 | if alg == 'bayes': 157 | model = naive_bayes.GaussianNB() 158 | elif alg == 'logistic': 159 | model = linear_model.LogisticRegression() 160 | elif alg == 'svm': 161 | model = svm.SVC() 162 | elif alg == 'boost': 163 | model = ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=7, 164 | min_samples_split=200, min_samples_leaf=200, max_features=30) 165 | else: 166 | print 'No model defined for ' + alg 167 | exit() 168 | 169 | model.fit(X, y) 170 | 171 | t1 = time.time() 172 | print 'Model trained in {0:3f} s.'.format(t1 - t0) 173 | 174 | return model 175 | 176 | 177 | def train_xgb(X, y, w, scaler, pca): 178 | """ 179 | Trains a boosted trees model using the XGBoost library. 180 | """ 181 | t0 = time.time() 182 | 183 | xgmat = xgb.DMatrix(X, label=y, missing=-999.0, weight=w) 184 | 185 | w_pos = sum(w[i] for i in range(len(y)) if y[i] == 1) 186 | w_neg = sum(w[i] for i in range(len(y)) if y[i] == 0) 187 | 188 | param = {} 189 | param['objective'] = 'binary:logitraw' 190 | param['scale_pos_weight'] = w_neg/w_pos 191 | param['eta'] = 0.08 192 | param['max_depth'] = 7 193 | param['subsample'] = 0.8 194 | param['eval_metric'] = 'auc' 195 | param['silent'] = 1 196 | 197 | plst = list(param.items()) 198 | watchlist = [] 199 | 200 | model = xgb.train(plst, xgmat, 128, watchlist) 201 | 202 | t1 = time.time() 203 | print 'Model trained in {0:3f} s.'.format(t1 - t0) 204 | 205 | return model 206 | 207 | 208 | def predict(X, model, alg, threshold, scaler, pca): 209 | """ 210 | Predicts the probability of a positive outcome and converts the 211 | probability to a binary prediction based on the cutoff percentage. 
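For the 'xgboost' model the features are wrapped in a DMatrix (with -999.0 treated as missing) before calling predict; other models use predict_proba. The cutoff is the given percentile of the predicted probabilities.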
212 | """ 213 | if scaler is not None: 214 | X = scaler.transform(X) 215 | 216 | if pca is not None: 217 | X = pca.transform(X) 218 | 219 | if alg == 'xgboost': 220 | xgmat = xgb.DMatrix(X, missing=-999.0) 221 | y_prob = model.predict(xgmat) 222 | else: 223 | y_prob = model.predict_proba(X)[:, 1] 224 | 225 | cutoff = np.percentile(y_prob, threshold) 226 | y_est = y_prob > cutoff 227 | 228 | return y_prob, y_est 229 | 230 | 231 | def score(y, y_est, w): 232 | """ 233 | Create weighted signal and background sets and calculate the AMS. 234 | """ 235 | y_signal = w * (y == 1.0) 236 | y_background = w * (y == 0.0) 237 | s = np.sum(y_signal * (y_est == 1.0)) 238 | b = np.sum(y_background * (y_est == 1.0)) 239 | 240 | return ams(s, b) 241 | 242 | 243 | def cross_validate(X, y, w, alg, scaler, pca, threshold): 244 | """ 245 | Perform cross-validation on the training set and compute the AMS scores. 246 | """ 247 | scores = [0, 0, 0] 248 | folds = cross_validation.StratifiedKFold(y, n_folds=3) 249 | i = 0 250 | 251 | for i_train, i_val in folds: 252 | # create the training and validation sets 253 | X_train, X_val = X[i_train], X[i_val] 254 | y_train, y_val = y[i_train], y[i_val] 255 | w_train, w_val = w[i_train], w[i_val] 256 | 257 | # normalize the weights 258 | w_train[y_train == 1] *= (sum(w[y == 1]) / sum(w[y_train == 1])) 259 | w_train[y_train == 0] *= (sum(w[y == 0]) / sum(w_train[y_train == 0])) 260 | w_val[y_val == 1] *= (sum(w[y == 1]) / sum(w_val[y_val == 1])) 261 | w_val[y_val == 0] *= (sum(w[y == 0]) / sum(w_val[y_val == 0])) 262 | 263 | # train the model 264 | model = train(X_train, y_train, w_train, alg, scaler, pca) 265 | 266 | # predict and score performance on the validation set 267 | y_val_prob, y_val_est = predict(X_val, model, alg, threshold, scaler, pca) 268 | scores[i] = score(y_val, y_val_est, w_val) 269 | i += 1 270 | 271 | return np.mean(scores) 272 | 273 | 274 | def process_test_data(filename, features, impute): 275 | """ 276 | Reads in test data and prepares numpy arrays. 277 | """ 278 | test_data = pd.read_csv(filename, sep=',') 279 | X_test = test_data.iloc[:, 1:features+1].values 280 | 281 | if impute == 'mean': 282 | imp = preprocessing.Imputer(missing_values=-999) 283 | X_test = imp.fit_transform(X_test) 284 | elif impute == 'zeros': 285 | X_test[X_test == -999] = 0 286 | 287 | return test_data, X_test 288 | 289 | 290 | def create_submission(test_data, y_test_prob, y_test_est, submit_file): 291 | """ 292 | Create a new data frame with the submission data. 
293 | """ 294 | temp = pd.DataFrame(y_test_prob, columns=['RankOrder']) 295 | temp2 = pd.DataFrame(y_test_est, columns=['Class']) 296 | submit = pd.DataFrame([test_data.EventId, temp.RankOrder, temp2.Class]).transpose() 297 | 298 | # sort it so they're in the ascending order by probability 299 | submit = submit.sort(['RankOrder'], ascending=True) 300 | 301 | # convert the probabilities to rank order (required by the submission guidelines) 302 | for i in range(0, y_test_est.shape[0], 1): 303 | submit.iloc[i, 1] = i + 1 304 | 305 | # re-sort by event ID 306 | submit = submit.sort(['EventId'], ascending=True) 307 | 308 | # convert the integer classification to (s, b) 309 | submit['Class'] = submit['Class'].map({1: 's', 0: 'b'}) 310 | 311 | # force pandas to treat these columns at int (otherwise will write as floats) 312 | submit[['EventId', 'RankOrder']] = submit[['EventId', 'RankOrder']].astype(int) 313 | 314 | # finally create the submission file 315 | submit.to_csv(submit_file, sep=',', index=False, index_label=False) 316 | 317 | 318 | def main(): 319 | # perform some initialization 320 | features = 30 321 | threshold = 85 322 | alg = 'xgboost' # bayes, logistic, boost, xgboost 323 | impute = 'none' # zeros, mean, none 324 | standardize = False 325 | whiten = False 326 | load_training_data = True 327 | load_model = False 328 | train_model = False 329 | save_model = False 330 | create_visualizations = True 331 | create_submission_file = False 332 | code_dir = '/home/john/git/kaggle/HiggsBoson/' 333 | data_dir = '/home/john/data/higgs-boson/' 334 | training_file = 'training.csv' 335 | test_file = 'test.csv' 336 | submit_file = 'submission.csv' 337 | model_file = 'model.pkl' 338 | 339 | os.chdir(code_dir) 340 | 341 | print 'Starting process...' 342 | print 'alg={0}, impute={1}, standardize={2}, whiten={3} threshold={4}'.format( 343 | alg, impute, standardize, whiten, threshold) 344 | 345 | if load_training_data: 346 | print 'Reading in training data...' 347 | training_data, X, y, w, scaler, pca = process_training_data( 348 | data_dir + training_file, features, impute, standardize, whiten) 349 | 350 | if create_visualizations: 351 | print 'Creating visualizations...' 352 | visualize(training_data, X, y, scaler, pca, features) 353 | 354 | if load_model: 355 | print 'Loading model from disk...' 356 | model = load(alg, data_dir + model_file) 357 | 358 | if train_model: 359 | print 'Training model on full data set...' 360 | model = train(X, y, w, alg, scaler, pca) 361 | 362 | print 'Calculating predictions...' 363 | y_prob, y_est = predict(X, model, alg, threshold, scaler, pca) 364 | 365 | print 'Calculating AMS...' 366 | ams_val = score(y, y_est, w) 367 | print 'AMS =', ams_val 368 | 369 | print 'Performing cross-validation...' 370 | val = cross_validate(X, y, w, alg, scaler, pca, threshold) 371 | print'Cross-validation AMS =', val 372 | 373 | if save_model: 374 | print 'Saving model to disk...' 375 | save(alg, model, data_dir + model_file) 376 | 377 | if create_submission_file: 378 | print 'Reading in test data...' 379 | test_data, X_test = process_test_data(data_dir + test_file, features, impute) 380 | 381 | print 'Predicting test data...' 382 | y_test_prob, y_test_est = predict(X_test, model, alg, threshold, scaler, pca) 383 | 384 | print 'Creating submission file...' 385 | create_submission(test_data, y_test_prob, y_test_est, data_dir + submit_file) 386 | 387 | print 'Process complete.' 
388 | 389 | 390 | if __name__ == "__main__": 391 | main() -------------------------------------------------------------------------------- /old/HiggsBoson/higgs_nn.py: -------------------------------------------------------------------------------- 1 | import os, math 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import decomposition 5 | from sklearn import preprocessing 6 | from theano import function 7 | from pylearn2.config import yaml_parse 8 | from pylearn2.utils import serial 9 | 10 | 11 | def ams(s, b): 12 | """ 13 | Approximate Median Significant function to evaluate solutions. 14 | """ 15 | br = 10.0 16 | radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s) 17 | if radicand < 0: 18 | print 'Radicand is negative.' 19 | exit() 20 | else: 21 | return math.sqrt(radicand) 22 | 23 | 24 | def process_training_data(filename, features, impute, standardize, whiten): 25 | """ 26 | Reads in training data and prepares numpy arrays. 27 | """ 28 | training_data = pd.read_csv(filename, sep=',') 29 | 30 | # add a nominal label (0, 1) 31 | temp = training_data['Label'].replace(to_replace=['s', 'b'], value=[1, 0]) 32 | training_data['Nominal'] = temp 33 | 34 | X = training_data.iloc[:, 1:features+1].values 35 | y = training_data.iloc[:, features+3].values 36 | w = training_data.iloc[:, features+1].values 37 | 38 | # optionally impute the -999 values 39 | if impute == 'mean': 40 | imp = preprocessing.Imputer(missing_values=-999) 41 | X = imp.fit_transform(X) 42 | elif impute == 'zeros': 43 | X[X == -999] = 0 44 | 45 | # create a standardization transform 46 | scaler = None 47 | if standardize: 48 | scaler = preprocessing.StandardScaler() 49 | scaler.fit(X) 50 | 51 | # create a PCA transform 52 | pca = None 53 | if whiten: 54 | pca = decomposition.PCA(whiten=True) 55 | pca.fit(X) 56 | 57 | return training_data, X, y, w, scaler, pca 58 | 59 | 60 | def create_nn_pre_train_file(original_filename, new_filename, impute, scaler, pca): 61 | """ 62 | Creates a non-labeled data set with transforms applied to be used 63 | by pylearn2's csv data set class. 64 | """ 65 | combined_data = pd.read_csv(original_filename, sep=',') 66 | 67 | X = combined_data.values 68 | 69 | if impute == 'mean': 70 | imp = preprocessing.Imputer(missing_values=-999) 71 | X = imp.fit_transform(X) 72 | elif impute == 'zeros': 73 | X[X == -999] = 0 74 | 75 | if scaler is not None: 76 | X = scaler.transform(X) 77 | 78 | if pca is not None: 79 | X = pca.transform(X) 80 | 81 | combined_data = pd.DataFrame(X, columns=combined_data.columns.values) 82 | combined_data.to_csv(new_filename, sep=',', index=False) 83 | 84 | 85 | def create_nn_training_file(training_data, features, impute, scaler, pca, filename): 86 | """ 87 | Creates a labeled training set with transforms applied to be used 88 | by pylearn2's csv data set class. 
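The nominal (0/1) label is moved to the first column and the EventId, Weight, and original Label columns are dropped before the imputation, scaling, and PCA transforms are applied.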
89 | """ 90 | nn_training_data = training_data 91 | 92 | nn_training_data.insert(0, 'NN_Label', nn_training_data['Nominal'].values) 93 | 94 | nn_training_data.drop('EventId', axis=1, inplace=True) 95 | nn_training_data.drop('Weight', axis=1, inplace=True) 96 | nn_training_data.drop('Label', axis=1, inplace=True) 97 | nn_training_data.drop('Nominal', axis=1, inplace=True) 98 | 99 | X = nn_training_data.iloc[:, 1:features+1].values 100 | 101 | if impute == 'mean': 102 | imp = preprocessing.Imputer(missing_values=-999) 103 | X = imp.fit_transform(X) 104 | elif impute == 'zeros': 105 | X[X == -999] = 0 106 | 107 | if scaler is not None: 108 | X = scaler.transform(X) 109 | 110 | if pca is not None: 111 | X = pca.transform(X) 112 | 113 | X = np.insert(X, 0, nn_training_data['NN_Label'].values, 1) 114 | 115 | nn_training_data = pd.DataFrame(X, columns=nn_training_data.columns.values) 116 | nn_training_data.to_csv(filename, sep=',', index=False) 117 | 118 | 119 | def train(model_definition_file, data_dir): 120 | """ 121 | Trains a neural network model using the pylearn2 library. 122 | """ 123 | with open(model_definition_file, 'r') as f: 124 | train_nn = f.read() 125 | 126 | hyper_params = {'data_dir': data_dir, 127 | 'num_features': 30, 128 | 'dim_h0': 50, 129 | 'batch_size': 100, 130 | 'max_epochs': 10, 131 | 'train_start': 0, 132 | 'train_stop': 150000, 133 | 'valid_start': 150001, 134 | 'valid_stop': 200000, 135 | 'test_start': 200001, 136 | 'test_stop': 250000} 137 | train_nn = train_nn % hyper_params 138 | train_nn = yaml_parse.load(train_nn) 139 | train_nn.main_loop() 140 | 141 | 142 | def predict(X, threshold, scaler, pca, model_file): 143 | """ 144 | Compiles a Theano function using the pylearn 2 model's fprop 145 | to predict the probability of a positive outcome, and converts 146 | to a binary prediction based on the cutoff percentage. 147 | """ 148 | if scaler is not None: 149 | X = scaler.transform(X) 150 | 151 | if pca is not None: 152 | X = pca.transform(X) 153 | 154 | # Load the model 155 | model = serial.load(model_file) 156 | 157 | # Create Theano function to compute probability 158 | x = model.get_input_space().make_theano_batch() 159 | y = model.fprop(x) 160 | pred = function([x], y) 161 | 162 | # Convert to a prediction 163 | y_prob = pred(X)[:, 1] 164 | cutoff = np.percentile(y_prob, threshold) 165 | y_est = y_prob > cutoff 166 | 167 | return y_prob, y_est 168 | 169 | 170 | def score(y, y_est, w): 171 | """ 172 | Create weighted signal and background sets and calculate the AMS. 173 | """ 174 | y_signal = w * (y == 1.0) 175 | y_background = w * (y == 0.0) 176 | s = np.sum(y_signal * (y_est == 1.0)) 177 | b = np.sum(y_background * (y_est == 1.0)) 178 | 179 | return ams(s, b) 180 | 181 | 182 | def process_test_data(filename, features, impute): 183 | """ 184 | Reads in test data and prepares numpy arrays. 185 | """ 186 | test_data = pd.read_csv(filename, sep=',') 187 | X_test = test_data.iloc[:, 1:features+1].values 188 | 189 | if impute == 'mean': 190 | imp = preprocessing.Imputer(missing_values=-999) 191 | X_test = imp.fit_transform(X_test) 192 | elif impute == 'zeros': 193 | X_test[X_test == -999] = 0 194 | 195 | return test_data, X_test 196 | 197 | 198 | def create_submission(test_data, y_test_prob, y_test_est, submit_file): 199 | """ 200 | Create a new data frame with the submission data. 
201 | """ 202 | temp = pd.DataFrame(y_test_prob, columns=['RankOrder']) 203 | temp2 = pd.DataFrame(y_test_est, columns=['Class']) 204 | submit = pd.DataFrame([test_data.EventId, temp.RankOrder, temp2.Class]).transpose() 205 | 206 | # sort it so they're in the ascending order by probability 207 | submit = submit.sort(['RankOrder'], ascending=True) 208 | 209 | # convert the probabilities to rank order (required by the submission guidelines) 210 | for i in range(0, y_test_est.shape[0], 1): 211 | submit.iloc[i, 1] = i + 1 212 | 213 | # re-sort by event ID 214 | submit = submit.sort(['EventId'], ascending=True) 215 | 216 | # convert the integer classification to (s, b) 217 | submit['Class'] = submit['Class'].map({1: 's', 0: 'b'}) 218 | 219 | # force pandas to treat these columns at int (otherwise will write as floats) 220 | submit[['EventId', 'RankOrder']] = submit[['EventId', 'RankOrder']].astype(int) 221 | 222 | # finally create the submission file 223 | submit.to_csv(submit_file, sep=',', index=False, index_label=False) 224 | 225 | 226 | def main(): 227 | # perform some initialization 228 | features = 30 229 | threshold = 85 230 | impute = 'zeros' # zeros, mean, none 231 | standardize = True 232 | whiten = False 233 | load_training_data = True 234 | train_model = True 235 | create_nn_files = True 236 | train_nn_model = True 237 | create_submission_file = False 238 | code_dir = '/home/john/git/kaggle/HiggsBoson/' 239 | data_dir = '/home/john/data/higgs-boson/' 240 | pretrain_file = 'combined.csv' 241 | training_file = 'training.csv' 242 | test_file = 'test.csv' 243 | submit_file = 'submission.csv' 244 | pretrain_nn_file = 'combined_nn.csv' 245 | training_nn_file = 'training_nn.csv' 246 | model_definition_file = 'mlp.yaml' 247 | model_file = 'mlp.pkl' 248 | 249 | os.chdir(code_dir) 250 | 251 | print 'Starting process...' 252 | print 'impute={0}, standardize={1}, whiten={2} threshold={3}'.format( 253 | impute, standardize, whiten, threshold) 254 | 255 | if load_training_data: 256 | print 'Reading in training data...' 257 | training_data, X, y, w, scaler, pca = process_training_data( 258 | data_dir + training_file, features, impute, standardize, whiten) 259 | 260 | if train_model: 261 | print 'Running neural network process...' 262 | 263 | if create_nn_files: 264 | print 'Creating training files...' 265 | create_nn_training_file(training_data, features, impute, scaler, pca, 266 | data_dir + training_nn_file) 267 | create_nn_pre_train_file(data_dir + pretrain_file, 268 | data_dir + pretrain_nn_file, impute, scaler, pca) 269 | 270 | if train_nn_model: 271 | print 'Training the model...' 272 | train(code_dir + model_definition_file, data_dir) 273 | 274 | print 'Calculating predictions...' 275 | y_prob, y_est = predict(X, threshold, scaler, pca, data_dir + model_file) 276 | 277 | print 'Calculating AMS...' 278 | ams_val = score(y, y_est, w) 279 | print 'AMS =', ams_val 280 | 281 | if create_submission_file: 282 | print 'Reading in test data...' 283 | test_data, X_test = process_test_data(data_dir + test_file, features, impute) 284 | 285 | print 'Predicting test data...' 286 | y_test_prob, y_test_est = predict(X_test, threshold, scaler, pca, data_dir + model_file) 287 | 288 | print 'Creating submission file...' 289 | create_submission(test_data, y_test_prob, y_test_est, data_dir + submit_file) 290 | 291 | print 'Process complete.' 
292 | 293 | 294 | if __name__ == "__main__": 295 | main() -------------------------------------------------------------------------------- /old/HiggsBoson/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Evaluation metric for the Higgs Boson Kaggle Competition, 4 | as described on: 5 | https://www.kaggle.com/c/higgs-boson/details/evaluation 6 | 7 | @author: Joyce Noah-Vanhoukce 8 | Created: Thu Apr 24 2014 9 | """ 10 | 11 | import os 12 | import csv 13 | import math 14 | 15 | 16 | def create_solution_dictionary(solution): 17 | """ Read solution file, return a dictionary with key EventId and value (weight,label). 18 | Solution file headers: EventId, Label, Weight """ 19 | 20 | solnDict = {} 21 | with open(solution, 'rb') as f: 22 | soln = csv.reader(f) 23 | soln.next() # header 24 | for row in soln: 25 | if row[0] not in solnDict: 26 | solnDict[row[0]] = (row[1], row[2]) 27 | return solnDict 28 | 29 | 30 | def check_submission(submission, Nelements): 31 | """ Check that submission RankOrder column is correct: 32 | 1. All numbers are in [1,NTestSet] 33 | 2. All numbers are unqiue 34 | """ 35 | rankOrderSet = set() 36 | with open(submission, 'rb') as f: 37 | sub = csv.reader(f) 38 | sub.next() # header 39 | for row in sub: 40 | rankOrderSet.add(row[1]) 41 | 42 | if len(rankOrderSet) != Nelements: 43 | print 'RankOrder column must contain unique values' 44 | exit() 45 | elif rankOrderSet.isdisjoint(set(xrange(1,Nelements+1))) == False: 46 | print 'RankOrder column must contain all numbers from [1..NTestSset]' 47 | exit() 48 | else: 49 | return True 50 | 51 | 52 | def AMS(s, b): 53 | """ Approximate Median Significance defined as: 54 | AMS = sqrt( 55 | 2 { (s + b + b_r) log[1 + (s/(b+b_r))] - s} 56 | ) 57 | where b_r = 10, b = background, s = signal, log is natural logarithm """ 58 | 59 | br = 10.0 60 | radicand = 2 *( (s+b+br) * math.log (1.0 + s/(b+br)) -s) 61 | if radicand < 0: 62 | print 'radicand is negative. Exiting' 63 | exit() 64 | else: 65 | return math.sqrt(radicand) 66 | 67 | 68 | def AMS_metric(solution, submission): 69 | """ Prints the AMS metric value to screen. 
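Only events predicted to be signal ('s') are scored; their solution-file weights are accumulated into the signal and background totals.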
70 | Solution File header: EventId, Class, Weight 71 | Submission File header: EventId, RankOrder, Class 72 | """ 73 | 74 | numEvents = 550000 # number of events = size of test set 75 | 76 | # solutionDict: key=eventId, value=(label, class) 77 | solutionDict = create_solution_dictionary(solution) 78 | 79 | signal = 0.0 80 | background = 0.0 81 | if check_submission(submission, numEvents): 82 | with open(submission, 'rb') as f: 83 | sub = csv.reader(f) 84 | sub.next() # header row 85 | for row in sub: 86 | if row[2] == 's': # only events predicted to be signal are scored 87 | if solutionDict[row[0]][0] == 's': 88 | signal += float(solutionDict[row[0]][1]) 89 | elif solutionDict[row[0]][0] == 'b': 90 | background += float(solutionDict[row[0]][1]) 91 | 92 | print 'signal = {0}, background = {1}'.format(signal, background) 93 | print 'AMS = ' + str(AMS(signal, background)) 94 | 95 | 96 | if __name__ == "__main__": 97 | 98 | # enter path and file names here 99 | path = "" 100 | solutionFile = "" 101 | submissionFile = "" 102 | 103 | AMS_metric(solutionFile, submissionFile) 104 | 105 | 106 | -------------------------------------------------------------------------------- /old/HiggsBoson/mlp.yaml: -------------------------------------------------------------------------------- 1 | !obj:pylearn2.train.Train { 2 | dataset: &train !obj:pylearn2.datasets.csv_dataset.CSVDataset { 3 | path: '%(data_dir)s/training_nn.csv', 4 | task: 'classification', 5 | one_hot: True, 6 | expect_labels: True, 7 | expect_headers: True, 8 | delimiter: ',', 9 | start: %(train_start)i, 10 | stop: %(train_stop)i 11 | }, 12 | model: !obj:pylearn2.models.mlp.MLP { 13 | layers: [ 14 | !obj:pylearn2.models.mlp.Sigmoid { 15 | layer_name: 'h0', 16 | dim: %(dim_h0)i, 17 | sparse_init: 15, 18 | }, !obj:pylearn2.models.mlp.Softmax { 19 | layer_name: 'y', 20 | n_classes: 2, 21 | irange: 0. 22 | } 23 | ], 24 | nvis: %(num_features)i, 25 | }, 26 | algorithm: !obj:pylearn2.training_algorithms.bgd.BGD { 27 | batch_size: %(batch_size)i, 28 | line_search_mode: 'exhaustive', 29 | conjugate: 1, 30 | updates_per_batch: 10, 31 | monitoring_dataset: { 32 | 'train' : *train, 33 | 'valid' : !obj:pylearn2.datasets.csv_dataset.CSVDataset { 34 | path: '%(data_dir)s/training_nn.csv', 35 | task: 'classification', 36 | one_hot: True, 37 | expect_labels: True, 38 | expect_headers: True, 39 | delimiter: ',', 40 | start: %(valid_start)i, 41 | stop: %(valid_stop)i 42 | }, 43 | 'test' : !obj:pylearn2.datasets.csv_dataset.CSVDataset { 44 | path: '%(data_dir)s/training_nn.csv', 45 | task: 'classification', 46 | one_hot: True, 47 | expect_labels: True, 48 | expect_headers: True, 49 | delimiter: ',', 50 | start: %(test_start)i, 51 | stop: %(test_stop)i 52 | } 53 | }, 54 | termination_criterion: !obj:pylearn2.termination_criteria.And { 55 | criteria: [ 56 | !obj:pylearn2.termination_criteria.MonitorBased { 57 | channel_name: 'valid_y_misclass' 58 | }, 59 | !obj:pylearn2.termination_criteria.EpochCounter { 60 | max_epochs: %(max_epochs)i 61 | } 62 | ] 63 | } 64 | }, 65 | extensions: [ 66 | !obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest { 67 | channel_name: 'valid_y_misclass', 68 | save_path: '%(data_dir)s/mlp.pkl' 69 | } 70 | ] 71 | } 72 | -------------------------------------------------------------------------------- /old/NerveSegmentation/README.md: -------------------------------------------------------------------------------- 1 | # Ultrasound Nerve Segmentation 2 | 3 | View the competition details here.
4 | 5 | I started this competition just to mess around with image classification and see what scripts others were coming up with. I didn't really do any original work on this one. -------------------------------------------------------------------------------- /old/NerveSegmentation/data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import numpy as np 4 | import cv2 5 | 6 | data_path = '/home/john/data/nerve-segmentation/' 7 | image_rows = 420 8 | image_cols = 580 9 | 10 | 11 | def create_train_data(): 12 | train_data_path = os.path.join(data_path, 'train') 13 | images = os.listdir(train_data_path) 14 | total = len(images) / 2 15 | 16 | imgs = np.ndarray((total, 1, image_rows, image_cols), dtype=np.uint8) 17 | imgs_mask = np.ndarray((total, 1, image_rows, image_cols), dtype=np.uint8) 18 | 19 | i = 0 20 | print('Creating training images...') 21 | for image_name in images: 22 | if 'mask' in image_name: 23 | continue 24 | image_mask_name = image_name.split('.')[0] + '_mask.tif' 25 | img = cv2.imread(os.path.join(train_data_path, image_name), cv2.IMREAD_GRAYSCALE) 26 | img_mask = cv2.imread(os.path.join(train_data_path, image_mask_name), cv2.IMREAD_GRAYSCALE) 27 | 28 | img = np.array([img]) 29 | img_mask = np.array([img_mask]) 30 | 31 | imgs[i] = img 32 | imgs_mask[i] = img_mask 33 | 34 | if i % 100 == 0: 35 | print('Done: {0}/{1} images'.format(i, total)) 36 | i += 1 37 | print('Loading done.') 38 | 39 | np.save(data_path + 'imgs_train.npy', imgs) 40 | np.save(data_path + 'imgs_mask_train.npy', imgs_mask) 41 | print('Saving to .npy files done.') 42 | 43 | 44 | def load_train_data(): 45 | imgs_train = np.load(data_path + 'imgs_train.npy') 46 | imgs_mask_train = np.load(data_path + 'imgs_mask_train.npy') 47 | return imgs_train, imgs_mask_train 48 | 49 | 50 | def create_test_data(): 51 | train_data_path = os.path.join(data_path, 'test') 52 | images = os.listdir(train_data_path) 53 | total = len(images) 54 | 55 | imgs = np.ndarray((total, 1, image_rows, image_cols), dtype=np.uint8) 56 | imgs_id = np.ndarray((total, ), dtype=np.int32) 57 | 58 | i = 0 59 | print('Creating test images...') 60 | for image_name in images: 61 | img_id = int(image_name.split('.')[0]) 62 | img = cv2.imread(os.path.join(train_data_path, image_name), cv2.IMREAD_GRAYSCALE) 63 | 64 | img = np.array([img]) 65 | 66 | imgs[i] = img 67 | imgs_id[i] = img_id 68 | 69 | if i % 100 == 0: 70 | print('Done: {0}/{1} images'.format(i, total)) 71 | i += 1 72 | print('Loading done.') 73 | 74 | np.save(data_path + 'imgs_test.npy', imgs) 75 | np.save(data_path + 'imgs_id_test.npy', imgs_id) 76 | print('Saving to .npy files done.') 77 | 78 | 79 | def load_test_data(): 80 | imgs_test = np.load(data_path + 'imgs_test.npy') 81 | imgs_id = np.load(data_path + 'imgs_id_test.npy') 82 | return imgs_test, imgs_id 83 | 84 | 85 | if __name__ == '__main__': 86 | create_train_data() 87 | create_test_data() 88 | print('Complete.') -------------------------------------------------------------------------------- /old/NerveSegmentation/submission.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.append('/home/john/git/kaggle/NerveSegmentation/') 4 | 5 | import numpy as np 6 | import cv2 7 | from data import image_cols, image_rows 8 | 9 | data_path = '/home/john/data/nerve-segmentation/' 10 | 11 | 12 | def prep(img): 13 | img = img.astype('float32') 14 | img 
= cv2.threshold(img, 0.5, 1., cv2.THRESH_BINARY)[1].astype(np.uint8) 15 | img = cv2.resize(img, (image_cols, image_rows)) 16 | return img 17 | 18 | 19 | def run_length_enc(label): 20 | from itertools import chain 21 | x = label.transpose().flatten() 22 | y = np.where(x > 0)[0] 23 | if len(y) < 10: # consider as empty 24 | return '' 25 | z = np.where(np.diff(y) > 1)[0] 26 | start = np.insert(y[z+1], 0, y[0]) 27 | end = np.append(y[z], y[-1]) 28 | length = end - start 29 | res = [[s+1, l+1] for s, l in zip(list(start), list(length))] 30 | res = list(chain.from_iterable(res)) 31 | return ' '.join([str(r) for r in res]) 32 | 33 | 34 | def submission(): 35 | from data import load_test_data 36 | imgs_test, imgs_id_test = load_test_data() 37 | imgs_test = np.load(data_path + 'imgs_mask_test.npy') 38 | 39 | argsort = np.argsort(imgs_id_test) 40 | imgs_id_test = imgs_id_test[argsort] 41 | imgs_test = imgs_test[argsort] 42 | 43 | total = imgs_test.shape[0] 44 | ids = [] 45 | rles = [] 46 | for i in range(total): 47 | img = imgs_test[i, 0] 48 | img = prep(img) 49 | rle = run_length_enc(img) 50 | 51 | rles.append(rle) 52 | ids.append(imgs_id_test[i]) 53 | 54 | if i % 100 == 0: 55 | print('{}/{}'.format(i, total)) 56 | 57 | first_row = 'img,pixels' 58 | file_name = data_path + 'submission.csv' 59 | 60 | with open(file_name, 'w+') as f: 61 | f.write(first_row + '\n') 62 | for i in range(total): 63 | s = str(ids[i]) + ',' + rles[i] 64 | f.write(s + '\n') 65 | 66 | 67 | if __name__ == '__main__': 68 | submission() 69 | print('Complete.') -------------------------------------------------------------------------------- /old/NerveSegmentation/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.append('/home/john/git/kaggle/NerveSegmentation/') 4 | 5 | import cv2 6 | import numpy as np 7 | from keras.models import Model 8 | from keras.layers import Input, merge, Convolution2D, MaxPooling2D, UpSampling2D 9 | from keras.optimizers import Adam 10 | from keras.callbacks import ModelCheckpoint 11 | from keras import backend as K 12 | from data import load_train_data, load_test_data 13 | 14 | data_path = '/home/john/data/nerve-segmentation/' 15 | img_rows = 64 16 | img_cols = 80 17 | smooth = 1. 18 | 19 | 20 | def dice_coef(y_true, y_pred): 21 | y_true_f = K.flatten(y_true) 22 | y_pred_f = K.flatten(y_pred) 23 | intersection = K.sum(y_true_f * y_pred_f) 24 | return (2. 
* intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth) 25 | 26 | 27 | def dice_coef_loss(y_true, y_pred): 28 | return -dice_coef(y_true, y_pred) 29 | 30 | 31 | def get_unet(): 32 | inputs = Input((1, img_rows, img_cols)) 33 | conv1 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(inputs) 34 | conv1 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(conv1) 35 | pool1 = MaxPooling2D(pool_size=(2, 2))(conv1) 36 | 37 | conv2 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(pool1) 38 | conv2 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(conv2) 39 | pool2 = MaxPooling2D(pool_size=(2, 2))(conv2) 40 | 41 | conv3 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(pool2) 42 | conv3 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(conv3) 43 | pool3 = MaxPooling2D(pool_size=(2, 2))(conv3) 44 | 45 | conv4 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(pool3) 46 | conv4 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(conv4) 47 | pool4 = MaxPooling2D(pool_size=(2, 2))(conv4) 48 | 49 | conv5 = Convolution2D(512, 3, 3, activation='relu', border_mode='same')(pool4) 50 | conv5 = Convolution2D(512, 3, 3, activation='relu', border_mode='same')(conv5) 51 | 52 | up6 = merge([UpSampling2D(size=(2, 2))(conv5), conv4], mode='concat', concat_axis=1) 53 | conv6 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(up6) 54 | conv6 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(conv6) 55 | 56 | up7 = merge([UpSampling2D(size=(2, 2))(conv6), conv3], mode='concat', concat_axis=1) 57 | conv7 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(up7) 58 | conv7 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(conv7) 59 | 60 | up8 = merge([UpSampling2D(size=(2, 2))(conv7), conv2], mode='concat', concat_axis=1) 61 | conv8 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(up8) 62 | conv8 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(conv8) 63 | 64 | up9 = merge([UpSampling2D(size=(2, 2))(conv8), conv1], mode='concat', concat_axis=1) 65 | conv9 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(up9) 66 | conv9 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(conv9) 67 | 68 | conv10 = Convolution2D(1, 1, 1, activation='sigmoid')(conv9) 69 | 70 | model = Model(input=inputs, output=conv10) 71 | 72 | model.compile(optimizer=Adam(lr=1e-5), loss=dice_coef_loss, metrics=[dice_coef]) 73 | 74 | return model 75 | 76 | 77 | def preprocess(imgs): 78 | imgs_p = np.ndarray((imgs.shape[0], imgs.shape[1], img_rows, img_cols), dtype=np.uint8) 79 | for i in range(imgs.shape[0]): 80 | imgs_p[i, 0] = cv2.resize(imgs[i, 0], (img_cols, img_rows), interpolation=cv2.INTER_CUBIC) 81 | return imgs_p 82 | 83 | 84 | def train_and_predict(): 85 | print('Loading and preprocessing train data...') 86 | imgs_train, imgs_mask_train = load_train_data() 87 | 88 | imgs_train = preprocess(imgs_train) 89 | imgs_mask_train = preprocess(imgs_mask_train) 90 | 91 | imgs_train = imgs_train.astype('float32') 92 | mean = np.mean(imgs_train) # mean for data centering 93 | std = np.std(imgs_train) # std for data normalization 94 | 95 | imgs_train -= mean 96 | imgs_train /= std 97 | 98 | imgs_mask_train = imgs_mask_train.astype('float32') 99 | imgs_mask_train /= 255. 
# scale masks to [0, 1] 100 | 101 | print('Creating and compiling model...') 102 | model = get_unet() 103 | model_checkpoint = ModelCheckpoint(data_path + 'unet.hdf5', monitor='loss', save_best_only=True) 104 | 105 | print('Fitting model...') 106 | model.fit(imgs_train, imgs_mask_train, batch_size=32, nb_epoch=20, verbose=1, shuffle=True, 107 | callbacks=[model_checkpoint]) 108 | 109 | print('Loading and preprocessing test data...') 110 | imgs_test, imgs_id_test = load_test_data() 111 | imgs_test = preprocess(imgs_test) 112 | 113 | imgs_test = imgs_test.astype('float32') 114 | imgs_test -= mean 115 | imgs_test /= std 116 | 117 | print('Loading saved weights...') 118 | model.load_weights(data_path + 'unet.hdf5') 119 | 120 | print('Predicting masks on test data...') 121 | imgs_mask_test = model.predict(imgs_test, verbose=1) 122 | np.save(data_path + 'imgs_mask_test.npy', imgs_mask_test) 123 | 124 | 125 | if __name__ == '__main__': 126 | train_and_predict() 127 | print('Complete.') -------------------------------------------------------------------------------- /old/OttoGroup/README.md: -------------------------------------------------------------------------------- 1 | # Otto Group Product Classification Challenge 2 | 3 | Otto 4 | 5 | View the competition details here.
6 | 7 | This directory includes the code I used to run experiments for the competition. I started very late (with only a few days remaining) so I didn't have much time to experiment, but I messed around with xgboost and keras (deep learning library) a bit.
8 | 9 | I used the Anaconda distribution of Python with the IPython kernel and the PyCharm IDE to run experiments, along with a few additional dependencies such as a properly configured BLAS for Theano. The primary script is otto.py; the others are example scripts collected from various places.<br>
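
To give a sense of the overall workflow without digging through the full scripts, here is a minimal sketch of the approach otto.py takes: standardize the features, encode the class labels, train a gradient-boosted model, and check multiclass log loss with cross-validation. This is a hypothetical example only, using synthetic data and current scikit-learn/xgboost APIs rather than the competition files and library versions used at the time.<br>

```python
# Minimal sketch (not the actual competition code): synthetic data stands in
# for the Otto train.csv, and the model settings are illustrative only.
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Fake multiclass data in place of the real 9-class Otto features
X, labels = make_classification(n_samples=1000, n_features=20, n_informative=10,
                                n_classes=9, random_state=0)

# Same preprocessing steps as otto.py: scale features, integer-encode labels
X = StandardScaler().fit_transform(X)
y = LabelEncoder().fit_transform(labels)

# Gradient-boosted classifier evaluated with multiclass log loss
model = XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=50,
                      subsample=0.9, colsample_bytree=0.8)
scores = cross_val_score(model, X, y, cv=3, scoring='neg_log_loss')
print('CV log loss: {0:.4f}'.format(-scores.mean()))
```

The full version in otto.py adds a Keras network built on the same preprocessing and a bagged ensemble of the boosted model for the final submission.<br>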
-------------------------------------------------------------------------------- /old/OttoGroup/Resources/Grafik.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/OttoGroup/Resources/Grafik.jpg -------------------------------------------------------------------------------- /old/OttoGroup/find_ensemble_weights.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.optimize import minimize 3 | from sklearn.cross_validation import StratifiedShuffleSplit 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import log_loss 7 | import os 8 | 9 | 10 | def log_loss_func(weights): 11 | """ 12 | scipy minimize will pass the weights as a numpy array 13 | """ 14 | final_prediction = 0 15 | for weight, prediction in zip(weights, predictions): 16 | final_prediction += weight*prediction 17 | 18 | return log_loss(test_y, final_prediction) 19 | 20 | 21 | os.system("ls ../input") 22 | 23 | train = pd.read_csv("../input/train.csv") 24 | print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape)) 25 | 26 | labels = train['target'] 27 | train.drop(['target', 'id'], axis=1, inplace=True) 28 | 29 | print(train.head()) 30 | 31 | # we need a test set that we didn't train on to find the best weights for combining the classifiers 32 | sss = StratifiedShuffleSplit(labels, test_size=0.05, random_state=1234) 33 | for train_index, test_index in sss: 34 | break 35 | 36 | train_x, train_y = train.values[train_index], labels.values[train_index] 37 | test_x, test_y = train.values[test_index], labels.values[test_index] 38 | 39 | # building the classifiers 40 | clfs = [] 41 | 42 | rfc = RandomForestClassifier(n_estimators=50, random_state=4141, n_jobs=-1) 43 | rfc.fit(train_x, train_y) 44 | print('RFC LogLoss {score}'.format(score=log_loss(test_y, rfc.predict_proba(test_x)))) 45 | clfs.append(rfc) 46 | 47 | # usually you'd use xgboost and neural nets here 48 | logreg = LogisticRegression() 49 | logreg.fit(train_x, train_y) 50 | print('LogisticRegression LogLoss {score}'.format(score=log_loss(test_y, logreg.predict_proba(test_x)))) 51 | clfs.append(logreg) 52 | 53 | rfc2 = RandomForestClassifier(n_estimators=50, random_state=1337, n_jobs=-1) 54 | rfc2.fit(train_x, train_y) 55 | print('RFC2 LogLoss {score}'.format(score=log_loss(test_y, rfc2.predict_proba(test_x)))) 56 | clfs.append(rfc2) 57 | 58 | 59 | # finding the optimum weights 60 | predictions = [] 61 | for clf in clfs: 62 | predictions.append(clf.predict_proba(test_x)) 63 | 64 | # the algorithms need a starting value, right not we chose 0.5 for all weights 65 | # its better to choose many random starting points and run minimize a few times 66 | starting_values = [0.5] * len(predictions) 67 | 68 | # adding constraints and a different solver as suggested by user 16universe 69 | cons = ({'type': 'eq', 'fun': lambda w: 1-sum(w)}) 70 | 71 | # our weights are bound between 0 and 1 72 | bounds = [(0, 1)] * len(predictions) 73 | 74 | res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons) 75 | 76 | print('Ensemble Score: {best_score}'.format(best_score=res['fun'])) 77 | print('Best Weights: {weights}'.format(weights=res['x'])) 78 | -------------------------------------------------------------------------------- 
/old/OttoGroup/graphlab_starter.py: -------------------------------------------------------------------------------- 1 | import graphlab as gl 2 | import math 3 | import random 4 | 5 | train = gl.SFrame.read_csv('data/train.csv') 6 | test = gl.SFrame.read_csv('data/test.csv') 7 | del train['id'] 8 | 9 | 10 | def make_submission(m, test, filename): 11 | preds = m.predict_topk(test, output_type='probability', k=9) 12 | preds['id'] = preds['id'].astype(int) + 1 13 | preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 14 | preds = preds.sort('id') 15 | preds.save(filename) 16 | 17 | 18 | def multiclass_logloss(model, test): 19 | preds = model.predict_topk(test, output_type='probability', k=9) 20 | preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '') 21 | preds['id'] = preds['id'].astype(int) + 1 22 | preds = preds.sort('id') 23 | preds['target'] = test['target'] 24 | neg_log_loss = 0 25 | for row in preds: 26 | label = row['target'] 27 | neg_log_loss += - math.log(row[label]) 28 | return neg_log_loss / preds.num_rows() 29 | 30 | 31 | def shuffle(sf): 32 | sf['_id'] = [random.random() for i in xrange(sf.num_rows())] 33 | sf = sf.sort('_id') 34 | del sf['_id'] 35 | return sf 36 | 37 | 38 | def evaluate_logloss(model, train, valid): 39 | return {'train_logloss': multiclass_logloss(model, train), 40 | 'valid_logloss': multiclass_logloss(model, valid)} 41 | 42 | 43 | params = {'target': 'target', 44 | 'max_iterations': 250, 45 | 'max_depth': 10, 46 | 'min_child_weight': 4, 47 | 'row_subsample': .9, 48 | 'min_loss_reduction': 1, 49 | 'column_subsample': .8, 50 | 'validation_set': None} 51 | 52 | train = shuffle(train) 53 | 54 | # Check performance on internal validation set 55 | tr, va = train.random_split(.8) 56 | m = gl.boosted_trees_classifier.create(tr, **params) 57 | print evaluate_logloss(m, tr, va) 58 | 59 | # Make final submission by using full training set 60 | m = gl.boosted_trees_classifier.create(train, **params) 61 | make_submission(m, test, 'submission.csv') 62 | 63 | -------------------------------------------------------------------------------- /old/OttoGroup/keras_starter.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from keras.models import Sequential 8 | from keras.layers.core import Dense, Dropout, Activation 9 | from keras.layers.normalization import BatchNormalization 10 | from keras.layers.advanced_activations import PReLU 11 | from keras.utils import np_utils, generic_utils 12 | 13 | from sklearn.preprocessing import LabelEncoder 14 | from sklearn.preprocessing import StandardScaler 15 | 16 | ''' 17 | This demonstrates how to reach a score of 0.4890 (local validation) 18 | on the Kaggle Otto challenge, with a deep net using Keras. 19 | Compatible Python 2.7-3.4 20 | Recommended to run on GPU: 21 | Command: THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python kaggle_otto_nn.py 22 | On EC2 g2.2xlarge instance: 19s/epoch. 6-7 minutes total training time. 23 | Best validation score at epoch 21: 0.4881 24 | Try it at home: 25 | - with/without BatchNormalization (BatchNormalization helps!) 26 | - with ReLU or with PReLU (PReLU helps!) 27 | - with smaller layers, largers layers 28 | - with more layers, less layers 29 | - with different optimizers (SGD+momentum+decay is probably better than Adam!) 
30 | ''' 31 | 32 | np.random.seed(1337) # for reproducibility 33 | 34 | 35 | def load_data(path, train=True): 36 | df = pd.read_csv(path) 37 | X = df.values.copy() 38 | if train: 39 | np.random.shuffle(X) # https://youtu.be/uyUXoap67N8 40 | X, labels = X[:, 1:-1].astype(np.float32), X[:, -1] 41 | return X, labels 42 | else: 43 | X, ids = X[:, 1:].astype(np.float32), X[:, 0].astype(str) 44 | return X, ids 45 | 46 | 47 | def preprocess_data(X, scaler=None): 48 | if not scaler: 49 | scaler = StandardScaler() 50 | scaler.fit(X) 51 | X = scaler.transform(X) 52 | return X, scaler 53 | 54 | 55 | def preprocess_labels(y, encoder=None, categorical=True): 56 | if not encoder: 57 | encoder = LabelEncoder() 58 | encoder.fit(labels) 59 | y = encoder.transform(labels).astype(np.int32) 60 | if categorical: 61 | y = np_utils.to_categorical(y) 62 | return y, encoder 63 | 64 | 65 | def make_submission(y_prob, ids, encoder, fname): 66 | with open(fname, 'w') as f: 67 | f.write('id,') 68 | f.write(','.join([str(i) for i in encoder.classes_])) 69 | f.write('\n') 70 | for i, probs in zip(ids, y_prob): 71 | probas = ','.join([i] + [str(p) for p in probs.tolist()]) 72 | f.write(probas) 73 | f.write('\n') 74 | print("Wrote submission to file {}.".format(fname)) 75 | 76 | 77 | print("Loading data...") 78 | X, labels = load_data('train.csv', train=True) 79 | X, scaler = preprocess_data(X) 80 | y, encoder = preprocess_labels(labels) 81 | 82 | X_test, ids = load_data('test.csv', train=False) 83 | X_test, _ = preprocess_data(X_test, scaler) 84 | 85 | nb_classes = y.shape[1] 86 | print(nb_classes, 'classes') 87 | 88 | dims = X.shape[1] 89 | print(dims, 'dims') 90 | 91 | print("Building model...") 92 | 93 | model = Sequential() 94 | model.add(Dense(dims, 512, init='glorot_uniform')) 95 | model.add(PReLU((512,))) 96 | model.add(BatchNormalization((512,))) 97 | model.add(Dropout(0.5)) 98 | 99 | model.add(Dense(512, 512, init='glorot_uniform')) 100 | model.add(PReLU((512,))) 101 | model.add(BatchNormalization((512,))) 102 | model.add(Dropout(0.5)) 103 | 104 | model.add(Dense(512, 512, init='glorot_uniform')) 105 | model.add(PReLU((512,))) 106 | model.add(BatchNormalization((512,))) 107 | model.add(Dropout(0.5)) 108 | 109 | model.add(Dense(512, nb_classes, init='glorot_uniform')) 110 | model.add(Activation('softmax')) 111 | 112 | model.compile(loss='categorical_crossentropy', optimizer="adam") 113 | 114 | print("Training model...") 115 | 116 | model.fit(X, y, nb_epoch=20, batch_size=16, validation_split=0.15) 117 | 118 | print("Generating submission...") 119 | 120 | proba = model.predict_proba(X_test) 121 | make_submission(proba, ids, encoder, fname='keras-otto.csv') 122 | 123 | -------------------------------------------------------------------------------- /old/OttoGroup/keras_wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from sklearn.preprocessing import * 6 | from sklearn.ensemble import * 7 | 8 | from keras.models import Sequential 9 | from keras.layers.core import Dense, Dropout, Activation 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.layers.advanced_activations import PReLU 12 | from keras.utils import np_utils 13 | from keras.wrappers.scikit_learn import KerasClassifier 14 | 15 | 16 | def load_training_data(path, filename): 17 | df = pd.read_csv(path + filename) 18 | X = df.values.copy() 19 | np.random.shuffle(X) 20 | X, labels = X[:, 1:-1].astype(np.float32), X[:, -1] 
21 | 22 | return X, labels 23 | 24 | 25 | def create_scaler(X): 26 | scaler = StandardScaler() 27 | scaler.fit(X) 28 | 29 | return scaler 30 | 31 | 32 | def apply_scaler(X, scaler): 33 | return scaler.transform(X) 34 | 35 | 36 | def preprocess_labels(labels): 37 | encoder = LabelEncoder() 38 | encoder.fit(labels) 39 | y = encoder.transform(labels).astype(np.int32) 40 | y_onehot = np_utils.to_categorical(y) 41 | 42 | return y, y_onehot, encoder 43 | 44 | 45 | def define_model(num_features, num_classes): 46 | layer_size = 512 47 | init_method = 'glorot_uniform' 48 | 49 | model = Sequential() 50 | model.add(Dense(num_features, layer_size, init=init_method)) 51 | model.add(PReLU((layer_size,))) 52 | model.add(BatchNormalization((layer_size,))) 53 | model.add(Dropout(0.5)) 54 | 55 | model.add(Dense(layer_size, layer_size, init=init_method)) 56 | model.add(PReLU((layer_size,))) 57 | model.add(BatchNormalization((layer_size,))) 58 | model.add(Dropout(0.5)) 59 | 60 | model.add(Dense(layer_size, layer_size, init=init_method)) 61 | model.add(PReLU((layer_size,))) 62 | model.add(BatchNormalization((layer_size,))) 63 | model.add(Dropout(0.5)) 64 | 65 | model.add(Dense(layer_size, num_classes, init=init_method)) 66 | model.add(Activation('softmax')) 67 | 68 | return model 69 | 70 | 71 | def main(): 72 | code_dir = '/home/john/git/kaggle/OttoGroup/' 73 | data_dir = '/home/john/data/otto/' 74 | training_file = 'train.csv' 75 | 76 | os.chdir(code_dir) 77 | np.random.seed(1337) 78 | 79 | print('Starting script...') 80 | 81 | print('Loading data...') 82 | X, labels = load_training_data(data_dir, training_file) 83 | 84 | print('Pre-processing...') 85 | scaler = create_scaler(X) 86 | X = apply_scaler(X, scaler) 87 | y, y_onehot, encoder = preprocess_labels(labels) 88 | num_features = X.shape[1] 89 | num_classes = y_onehot.shape[1] 90 | print('Features = ' + str(num_features)) 91 | print('Classes = ' + str(num_classes)) 92 | 93 | print('Building model...') 94 | model = define_model(num_features, num_classes) 95 | print('Complete.') 96 | 97 | print('Training model...') 98 | wrapper = KerasClassifier(model) 99 | wrapper.fit(X, y_onehot, nb_epoch=20) 100 | print('Complete.') 101 | 102 | print('Training score = ' + str(wrapper.score(X, y_onehot))) 103 | 104 | preds = wrapper.predict(X) 105 | print('Predictions shape = ' + str(preds.shape)) 106 | 107 | proba = wrapper.predict_proba(X) 108 | print('Probabilities shape = ' + str(proba.shape)) 109 | 110 | print('Building ensemble...') 111 | ensemble = BaggingClassifier(wrapper, n_estimators=3, max_samples=1.0, max_features=1.0) 112 | print('Complete.') 113 | 114 | print('Training ensemble...') 115 | ensemble.fit(X, y) 116 | print('Complete.') 117 | 118 | print('Ensemble score = ' + str(ensemble.score(X, y))) 119 | 120 | print('Script complete.') 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /old/OttoGroup/otto.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from sklearn.cross_validation import * 7 | from sklearn.preprocessing import * 8 | from sklearn.metrics import * 9 | from sklearn.ensemble import * 10 | 11 | import xgboost as xgb 12 | 13 | from keras.models import Sequential 14 | from keras.layers.core import Dense, Dropout, Activation 15 | from keras.layers.normalization import BatchNormalization 16 | from keras.layers.advanced_activations import 
PReLU 17 | from keras.utils import np_utils 18 | 19 | 20 | def predict_probability(X, model, scaler): 21 | X = apply_scaler(X, scaler) 22 | y_prob = model.predict_proba(X) 23 | 24 | return y_prob 25 | 26 | 27 | def score(X, y, model, scaler): 28 | X = apply_scaler(X, scaler) 29 | y_est = model.predict_proba(X) 30 | 31 | return log_loss(y, y_est) 32 | 33 | 34 | def load_training_data(path, filename): 35 | df = pd.read_csv(path + filename) 36 | X = df.values.copy() 37 | np.random.shuffle(X) 38 | X, labels = X[:, 1:-1].astype(np.float32), X[:, -1] 39 | 40 | return X, labels 41 | 42 | 43 | def load_test_data(path, filename): 44 | df = pd.read_csv(path + filename) 45 | X = df.values.copy() 46 | X, ids = X[:, 1:].astype(np.float32), X[:, 0].astype(str) 47 | 48 | return X, ids 49 | 50 | 51 | def create_scaler(X): 52 | scaler = StandardScaler() 53 | scaler.fit(X) 54 | 55 | return scaler 56 | 57 | 58 | def apply_scaler(X, scaler): 59 | return scaler.transform(X) 60 | 61 | 62 | def preprocess_labels(labels): 63 | encoder = LabelEncoder() 64 | encoder.fit(labels) 65 | y = encoder.transform(labels).astype(np.int32) 66 | y_onehot = np_utils.to_categorical(y) 67 | 68 | return y, y_onehot, encoder 69 | 70 | 71 | def define_xgb_model(): 72 | model = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=250, silent=True, 73 | objective="multi:softprob", nthread=-1, gamma=0, min_child_weight=4, 74 | max_delta_step=0, subsample=0.9, colsample_bytree=0.8, base_score=0.5, seed=0) 75 | 76 | return model 77 | 78 | 79 | def define_nn_model(num_features, num_classes): 80 | layer_size = 512 81 | init_method = 'glorot_uniform' 82 | loss_function = 'categorical_crossentropy' 83 | optimization_method = 'adam' 84 | 85 | model = Sequential() 86 | model.add(Dense(num_features, layer_size, init=init_method)) 87 | model.add(PReLU((layer_size,))) 88 | model.add(BatchNormalization((layer_size,))) 89 | model.add(Dropout(0.5)) 90 | 91 | model.add(Dense(layer_size, layer_size, init=init_method)) 92 | model.add(PReLU((layer_size,))) 93 | model.add(BatchNormalization((layer_size,))) 94 | model.add(Dropout(0.5)) 95 | 96 | model.add(Dense(layer_size, layer_size, init=init_method)) 97 | model.add(PReLU((layer_size,))) 98 | model.add(BatchNormalization((layer_size,))) 99 | model.add(Dropout(0.5)) 100 | 101 | model.add(Dense(layer_size, num_classes, init=init_method)) 102 | model.add(Activation('softmax')) 103 | 104 | model.compile(loss=loss_function, optimizer=optimization_method) 105 | 106 | return model 107 | 108 | 109 | def train_xgb_model(X, y, model, scaler): 110 | t0 = time.time() 111 | X = apply_scaler(X, scaler) 112 | model.fit(X, y) 113 | t1 = time.time() 114 | print('Model trained in {0:3f} s.'.format(t1 - t0)) 115 | 116 | return model 117 | 118 | 119 | def train_nn_model(X, y_onehot, model, scaler): 120 | t0 = time.time() 121 | X = apply_scaler(X, scaler) 122 | model.fit(X, y_onehot, nb_epoch=20, batch_size=16, verbose=0) 123 | t1 = time.time() 124 | print('Model trained in {0:3f} s.'.format(t1 - t0)) 125 | 126 | return model 127 | 128 | 129 | def cross_validate_xgb(X, y, scaler, folds=3): 130 | model = define_xgb_model() 131 | X = apply_scaler(X, scaler) 132 | t0 = time.time() 133 | 134 | scores = [] 135 | kf = KFold(y.shape[0], n_folds=folds, shuffle=True) 136 | for train_index, test_index in kf: 137 | model.fit(X[train_index], y[train_index]) 138 | predictions = model.predict_proba(X[test_index]) 139 | actuals = y[test_index] 140 | scores.append(log_loss(actuals, predictions)) 141 | 142 | t1 = time.time() 143 
| print('Cross-validation completed in {0:3f} s.'.format(t1 - t0)) 144 | 145 | return np.mean(scores) 146 | 147 | 148 | def cross_validate_nn(X, y, y_onehot, scaler, num_features, num_classes, folds=3): 149 | model = define_nn_model(num_features, num_classes) 150 | X = apply_scaler(X, scaler) 151 | t0 = time.time() 152 | 153 | scores = [] 154 | kf = KFold(y.shape[0], n_folds=folds, shuffle=True) 155 | for train_index, test_index in kf: 156 | model.fit(X[train_index], y_onehot[train_index], nb_epoch=20, batch_size=16, verbose=0) 157 | predictions = model.predict_proba(X[test_index]) 158 | actuals = y[test_index] 159 | scores.append(log_loss(actuals, predictions)) 160 | 161 | t1 = time.time() 162 | print('Cross-validation completed in {0:3f} s.'.format(t1 - t0)) 163 | 164 | return np.mean(scores) 165 | 166 | 167 | def make_submission(y_prob, ids, encoder, path, filename): 168 | with open(path + filename, 'w') as f: 169 | f.write('id,') 170 | f.write(','.join([str(i) for i in encoder.classes_])) 171 | f.write('\n') 172 | for i, probabilities in zip(ids, y_prob): 173 | p = ','.join([i] + [str(p) for p in probabilities.tolist()]) 174 | f.write(p) 175 | f.write('\n') 176 | 177 | 178 | def main(): 179 | code_dir = '/home/john/git/kaggle/OttoGroup/' 180 | data_dir = '/home/john/data/otto-group/' 181 | training_file = 'train.csv' 182 | test_file = 'test.csv' 183 | submit_file = 'submission.csv' 184 | 185 | os.chdir(code_dir) 186 | np.random.seed(1337) 187 | 188 | print('Starting script...') 189 | 190 | print('Loading data...') 191 | X, labels = load_training_data(data_dir, training_file) 192 | X_test, ids = load_test_data(data_dir, test_file) 193 | 194 | print('Pre-processing...') 195 | scaler = create_scaler(X) 196 | y, y_onehot, encoder = preprocess_labels(labels) 197 | num_features = X.shape[1] 198 | num_classes = y_onehot.shape[1] 199 | print('Features = ' + str(num_features)) 200 | print('Classes = ' + str(num_classes)) 201 | 202 | print('Building model...') 203 | model = define_xgb_model() 204 | 205 | print('Training model...') 206 | model = train_xgb_model(X, y, model, scaler) 207 | 208 | print('Training score = ' + str(score(X, y, model, scaler))) 209 | 210 | print('Running cross-validation...') 211 | val_score = cross_validate_xgb(X, y, scaler) 212 | print('Cross-validation score = ' + str(val_score)) 213 | 214 | print('Building ensemble...') 215 | ensemble = BaggingClassifier(model, n_estimators=5, max_samples=1.0, max_features=1.0) 216 | 217 | print('Training ensemble...') 218 | X = apply_scaler(X, scaler) 219 | ensemble.fit(X, y) 220 | 221 | print('Generating submission file...') 222 | y_prob = predict_probability(X_test, ensemble, scaler) 223 | make_submission(y_prob, ids, encoder, data_dir, submit_file) 224 | 225 | print('Script complete.') 226 | 227 | 228 | if __name__ == "__main__": 229 | main() 230 | -------------------------------------------------------------------------------- /old/OttoGroup/simple_svm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.svm import LinearSVC 4 | from sklearn.preprocessing import LabelEncoder 5 | 6 | train = pd.read_csv('../input/train.csv') 7 | test = pd.read_csv('../input/test.csv') 8 | sample_submission = pd.read_csv('../input/sampleSubmission.csv') 9 | training_labels = LabelEncoder().fit_transform(train['target']) 10 | 11 | # SVMs tend to like features that look similar to ~ N(0,1), so let's stabilise the long tails 12 | train_features = 
train.drop('target', axis=1) 13 | train_features[train_features > 4] = 4 14 | 15 | model = LinearSVC().fit(train_features, training_labels) 16 | 17 | scores = model.decision_function(test) 18 | predictions = 1.0 / (1.0 + np.exp(-scores)) 19 | row_sums = predictions.sum(axis=1) 20 | predictions_normalised = predictions / row_sums[:, np.newaxis] 21 | 22 | # create submission file 23 | prediction_DF = pd.DataFrame(predictions_normalised, index=sample_submission.id.values, 24 | columns=sample_submission.columns[1:]) 25 | prediction_DF.to_csv('svc_submission.csv', index_label='id') 26 | -------------------------------------------------------------------------------- /old/OttoGroup/xgboost_walkthrough.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import xgboost as xgb 3 | 4 | from sklearn.cross_validation import KFold 5 | from sklearn.grid_search import GridSearchCV 6 | from sklearn.metrics import confusion_matrix, mean_squared_error 7 | from sklearn.datasets import load_iris, load_digits, load_boston 8 | 9 | rng = np.random.RandomState(31337) 10 | 11 | # load file from text file, also binary buffer generated by xgboost 12 | dtrain = xgb.DMatrix('../data/agaricus.txt.train') 13 | dtest = xgb.DMatrix('../data/agaricus.txt.test') 14 | 15 | # specify parameters via map, definition are same as c++ version 16 | param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic' } 17 | 18 | # specify validations set to watch performance 19 | watchlist = [(dtest, 'eval'), (dtrain, 'train')] 20 | num_round = 2 21 | bst = xgb.train(param, dtrain, num_round, watchlist) 22 | 23 | # this is prediction 24 | preds = bst.predict(dtest) 25 | labels = dtest.get_label() 26 | print ('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds)))) 27 | bst.save_model('0001.model') 28 | 29 | # dump model 30 | bst.dump_model('dump.raw.txt') 31 | 32 | # dump model with feature map 33 | bst.dump_model('dump.nice.txt', '../data/featmap.txt') 34 | 35 | # save dmatrix into binary buffer 36 | dtest.save_binary('dtest.buffer') 37 | bst.save_model('xgb.model') 38 | 39 | # load model and data in 40 | bst2 = xgb.Booster(model_file='xgb.model') 41 | dtest2 = xgb.DMatrix('dtest.buffer') 42 | preds2 = bst2.predict(dtest2) 43 | 44 | # assert they are the same 45 | assert np.sum(np.abs(preds2-preds)) == 0 46 | 47 | print ('running cross validation') 48 | # do cross validation, this will print result out as 49 | # [iteration] metric_name:mean_value+std_value 50 | # std_value is standard deviation of the metric 51 | xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0) 52 | 53 | print ('running cross validation, disable standard deviation display') 54 | # do cross validation, this will print result out as 55 | # [iteration] metric_name:mean_value+std_value 56 | # std_value is standard deviation of the metric 57 | xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0, show_stdv=False) 58 | 59 | print ('running cross validation, with preprocessing function') 60 | # define the preprocessing function 61 | # used to return the preprocessed training, test data, and parameter 62 | # we can use this to do weight rescale, etc. 
63 | # as a example, we try to set scale_pos_weight 64 | def fpreproc(dtrain, dtest, param): 65 | label = dtrain.get_label() 66 | ratio = float(np.sum(label == 0)) / np.sum(label == 1) 67 | param['scale_pos_weight'] = ratio 68 | return dtrain, dtest, param 69 | 70 | # do cross validation, for each fold 71 | # the dtrain, dtest, param will be passed into fpreproc 72 | # then the return value of fpreproc will be used to generate 73 | # results of that fold 74 | xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed=0, fpreproc=fpreproc) 75 | 76 | print("Zeros and Ones from the Digits dataset: binary classification") 77 | digits = load_digits(2) 78 | y = digits['target'] 79 | X = digits['data'] 80 | kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) 81 | for train_index, test_index in kf: 82 | xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) 83 | predictions = xgb_model.predict(X[test_index]) 84 | actuals = y[test_index] 85 | print(confusion_matrix(actuals, predictions)) 86 | 87 | print("Iris: multiclass classification") 88 | iris = load_iris() 89 | y = iris['target'] 90 | X = iris['data'] 91 | kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) 92 | for train_index, test_index in kf: 93 | xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) 94 | predictions = xgb_model.predict(X[test_index]) 95 | actuals = y[test_index] 96 | print(confusion_matrix(actuals, predictions)) 97 | 98 | print("Boston Housing: regression") 99 | boston = load_boston() 100 | y = boston['target'] 101 | X = boston['data'] 102 | kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) 103 | for train_index, test_index in kf: 104 | xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index]) 105 | predictions = xgb_model.predict(X[test_index]) 106 | actuals = y[test_index] 107 | print(mean_squared_error(actuals, predictions)) 108 | 109 | print("Parameter optimization") 110 | y = boston['target'] 111 | X = boston['data'] 112 | xgb_model = xgb.XGBRegressor() 113 | clf = GridSearchCV(xgb_model, 114 | {'max_depth': [2, 4, 6], 115 | 'n_estimators': [50, 100, 200]}, verbose=1) 116 | clf.fit(X, y) 117 | print(clf.best_score_) 118 | print(clf.best_params_) 119 | 120 | -------------------------------------------------------------------------------- /old/PropertyInspection/README.md: -------------------------------------------------------------------------------- 1 | # Liberty Mutual Property Inspection Prediction Challenge 2 | 3 | Houses 4 | 5 | View the competition details here.
6 | 7 | I used this competition primarily to develop my knowledge of ensembling (particularly averaging and stacking diverse models) and get familiar with deep learning using Keras. Unfortunately I wasn't able to spend enough time on it to get a decent score. -------------------------------------------------------------------------------- /old/PropertyInspection/Resources/houses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/PropertyInspection/Resources/houses.png -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /scripts/pyro_basics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pyro 3 | import pyro.distributions as dist 4 | from torch.autograd import Variable 5 | 6 | 7 | def weather(): 8 | cloudy = pyro.sample('cloudy', dist.bernoulli, 9 | Variable(torch.Tensor([0.3]))) 10 | cloudy = 'cloudy' if cloudy.data[0] == 1.0 else 'sunny' 11 | mean_temp = {'cloudy': [55.0], 'sunny': [75.0]}[cloudy] 12 | sigma_temp = {'cloudy': [10.0], 'sunny': [15.0]}[cloudy] 13 | temp = pyro.sample('temp', dist.normal, 14 | Variable(torch.Tensor(mean_temp)), 15 | Variable(torch.Tensor(sigma_temp))) 16 | return cloudy, temp.data[0] 17 | 18 | 19 | for _ in range(3): 20 | print(weather()) 21 | 22 | 23 | def ice_cream_sales(): 24 | cloudy, temp = weather() 25 | expected_sales = [200] if cloudy == 'sunny' and temp > 80.0 else [50] 26 | ice_cream = pyro.sample('ice_cream', dist.normal, 27 | Variable(torch.Tensor(expected_sales)), 28 | Variable(torch.Tensor([10.0]))) 29 | return cloudy, temp, ice_cream.data[0] 30 | 31 | 32 | for _ in range(3): 33 | print(ice_cream_sales()) 34 | -------------------------------------------------------------------------------- /scripts/pytorch_basics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | from torch.autograd import Variable 6 | 7 | 8 | ########################################################################## 9 | # What Is PyTorch? 
10 | ########################################################################## 11 | 12 | x = torch.Tensor(5, 3) 13 | print(x) 14 | 15 | x = torch.rand(5, 3) 16 | print(x) 17 | print(x.size()) 18 | 19 | y = torch.rand(5, 3) 20 | print(x + y) 21 | print(torch.add(x, y)) 22 | 23 | result = torch.Tensor(5, 3) 24 | torch.add(x, y, out=result) 25 | print(result) 26 | 27 | print(x[:, 1]) 28 | 29 | x = torch.randn(4, 4) 30 | y = x.view(16) 31 | z = x.view(-1, 8) 32 | print(x.size(), y.size(), z.size()) 33 | 34 | a = torch.ones(5) 35 | b = a.numpy() 36 | a.add_(1) 37 | print(a) 38 | print(b) 39 | 40 | if torch.cuda.is_available(): 41 | x = x.cuda() 42 | y = y.cuda() 43 | print(x + y) 44 | 45 | ########################################################################## 46 | # Autograd: Automatic Differentiation 47 | ########################################################################## 48 | 49 | x = Variable(torch.ones(2, 2), requires_grad=True) 50 | print(x) 51 | 52 | y = x + 2 53 | print(y) 54 | 55 | print(y.grad_fn) 56 | 57 | z = y * y * 3 58 | out = z.mean() 59 | 60 | print(z, out) 61 | 62 | out.backward() 63 | print(x.grad) 64 | 65 | x = torch.randn(3) 66 | x = Variable(x, requires_grad=True) 67 | y = x * 2 68 | while y.data.norm() < 1000: 69 | y = y * 2 70 | print(y) 71 | 72 | gradients = torch.FloatTensor([0.1, 1.0, 0.0001]) 73 | y.backward(gradients) 74 | print(x.grad) 75 | 76 | ########################################################################## 77 | # Neural Networks 78 | ########################################################################## 79 | 80 | 81 | class Net(nn.Module): 82 | def __init__(self): 83 | super(Net, self).__init__() 84 | self.conv1 = nn.Conv2d(1, 6, 5) 85 | self.conv2 = nn.Conv2d(6, 16, 5) 86 | self.fc1 = nn.Linear(16 * 5 * 5, 120) 87 | self.fc2 = nn.Linear(120, 84) 88 | self.fc3 = nn.Linear(84, 10) 89 | 90 | def forward(self, x): 91 | x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2)) 92 | x = F.max_pool2d(F.relu(self.conv2(x)), 2) 93 | x = x.view(-1, self.num_flat_features(x)) 94 | x = F.relu(self.fc1(x)) 95 | x = F.relu(self.fc2(x)) 96 | x = self.fc3(x) 97 | return x 98 | 99 | def num_flat_features(self, x): 100 | size = x.size()[1:] 101 | num_features = 1 102 | for s in size: 103 | num_features *= s 104 | return num_features 105 | 106 | 107 | net = Net() 108 | print(net) 109 | 110 | params = list(net.parameters()) 111 | print(len(params)) 112 | print(params[0].size()) 113 | 114 | input = Variable(torch.randn(1, 1, 32, 32)) 115 | out = net(input) 116 | print(out) 117 | 118 | output = net(input) 119 | target = Variable(torch.arange(1, 11)) 120 | criterion = nn.MSELoss() 121 | loss = criterion(output, target) 122 | print(loss) 123 | 124 | print(loss.grad_fn) 125 | print(loss.grad_fn.next_functions[0][0]) 126 | print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) 127 | 128 | net.zero_grad() 129 | print('conv1.bias.grad before backward') 130 | print(net.conv1.bias.grad) 131 | 132 | loss.backward() 133 | print('conv1.bias.grad after backward') 134 | print(net.conv1.bias.grad) 135 | 136 | optimizer = optim.SGD(net.parameters(), lr=0.01) 137 | optimizer.zero_grad() 138 | output = net(input) 139 | loss = criterion(output, target) 140 | loss.backward() 141 | optimizer.step() 142 | print(loss) 143 | -------------------------------------------------------------------------------- /scripts/pytorch_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import 
torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from torch.utils.data import DataLoader, TensorDataset 6 | 7 | class TorchEmbeddingNet(nn.Module): 8 | def __init__(self, cat_vars, cont_vars, embedding_sizes): 9 | super(TorchNet, self).__init__() 10 | self.embeddings = nn.ModuleList([nn.Embedding(c, s) for c, s in embedding_sizes]) 11 | 12 | self.n_cat = len(cat_vars) 13 | self.n_cont = len(cont_vars) 14 | self.n_embed = sum(e.embedding_dim for e in self.embeddings) 15 | 16 | self.fc1 = nn.Linear(self.n_embed + self.n_cont, 1000) 17 | self.fc2 = nn.Linear(1000, 500) 18 | self.out = nn.Linear(500, 1) 19 | 20 | self.bn_cont = nn.BatchNorm1d(self.n_cont) 21 | self.bn1 = nn.BatchNorm1d(1000) 22 | self.bn2 = nn.BatchNorm1d(500) 23 | 24 | self.d_embed = nn.Dropout(0.04) 25 | self.d1 = nn.Dropout(0.001) 26 | self.d2 = nn.Dropout(0.01) 27 | 28 | for e in self.embeddings: 29 | e = e.weight.data 30 | sc = 2 / (e.size(1) + 1) 31 | e.uniform_(-sc, sc) 32 | 33 | nn.init.kaiming_normal(self.fc1.weight.data) 34 | nn.init.kaiming_normal(self.fc2.weight.data) 35 | nn.init.kaiming_normal(self.out.weight.data) 36 | 37 | def forward(self, x_cat, x_cont): 38 | x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)] 39 | x = torch.cat(x, 1) 40 | x = self.d_embed(x) 41 | 42 | x2 = self.bn_cont(x_cont) 43 | x = torch.cat([x, x2], 1) 44 | 45 | x = F.relu(self.fc1(x)) 46 | x = self.bn1(x) 47 | x = self.d1(x) 48 | 49 | x = F.relu(self.fc2(x)) 50 | x = self.bn2(x) 51 | x = self.d2(x) 52 | 53 | x = self.out(x) 54 | 55 | return x 56 | 57 | model = TorchEmbeddingNet(cat_vars, cont_vars, embedding_sizes) 58 | loss_fn = nn.L1Loss() 59 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001) 60 | print(model) 61 | 62 | def create_dataloaders(X, y, batch_size, val_data=None): 63 | X_cat = X[cat_vars].values.astype('int64') 64 | X_cont = X[cont_vars].values.astype('float32') 65 | y = y.values.astype('float32') 66 | train_ds = TensorDataset(torch.from_numpy(X_cat), torch.from_numpy(X_cont), torch.from_numpy(y)) 67 | train_dl = DataLoader(train_ds, batch_size) 68 | if val_data is not None: 69 | X_val, y_val = val_data 70 | X_val_cat = X_val[cat_vars].values.astype('int64') 71 | X_val_cont = X_val[cont_vars].values.astype('float32') 72 | y_val = y_val.values.astype('float32') 73 | val_ds = TensorDataset(torch.from_numpy(X_val_cat), torch.from_numpy(X_val_cont), torch.from_numpy(y_val)) 74 | val_dl = DataLoader(val_ds, batch_size) 75 | return train_dl, val_dl 76 | else: 77 | return train_dl 78 | 79 | train_dl, val_dl = create_dataloaders(X, y, batch_size, val_data=(X_val, y_val)) 80 | 81 | def fit(model, optimizer, loss_fn, train_dl, n_epochs, val_dl=None): 82 | for epoch in range(n_epochs): 83 | t0 = time.time() 84 | model.train() 85 | epoch_loss = 0.0 86 | epoch_val_loss = 0.0 87 | steps = 0 88 | val_steps = 0 89 | for i, data in enumerate(train_dl, 0): 90 | X_cat, X_cont, y = data 91 | y = y.view(-1, 1) 92 | y_pred = model(X_cat, X_cont) 93 | loss = loss_fn(y_pred, y) 94 | optimizer.zero_grad() 95 | loss.backward() 96 | optimizer.step() 97 | epoch_loss += loss.item() 98 | steps += 1 99 | if val_dl is not None: 100 | model.eval() 101 | for i, data in enumerate(val_dl, 0): 102 | X_cat, X_cont, y = data 103 | y = y.view(-1, 1) 104 | y_pred = model(X_cat, X_cont) 105 | val_loss = loss_fn(y_pred, y) 106 | epoch_val_loss += val_loss.item() 107 | val_steps += 1 108 | t1 = time.time() 109 | print('[Epoch {0:d}] loss: {1:.3f} | val loss: {2:.3f} | {3:.0f} s'.format( 110 | epoch + 1, epoch_loss / steps, 
epoch_val_loss / val_steps, t1 - t0)) 111 | else: 112 | t1 = time.time() 113 | print('[Epoch {0:d}] loss: {1:.3f} | {2:.0f} s'.format(epoch + 1, epoch_loss / steps, t1 - t0)) 114 | 115 | fit(model, optimizer, loss_fn, train_dl, n_epochs=n_epochs, val_dl=val_dl) 116 | -------------------------------------------------------------------------------- /scripts/pytorch_mnist.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torchvision import datasets, transforms 8 | from torch.autograd import Variable 9 | 10 | 11 | class MnistModel(nn.Module): 12 | def __init__(self): 13 | super(MnistModel, self).__init__() 14 | # input is 28x28 15 | # padding=2 for same padding 16 | self.conv1 = nn.Conv2d(1, 32, 5, padding=2) 17 | # feature map size is 14*14 by pooling 18 | # padding=2 for same padding 19 | self.conv2 = nn.Conv2d(32, 64, 5, padding=2) 20 | # feature map size is 7*7 by pooling 21 | self.fc1 = nn.Linear(64 * 7 * 7, 1024) 22 | self.fc2 = nn.Linear(1024, 10) 23 | 24 | def forward(self, x): 25 | x = F.max_pool2d(F.relu(self.conv1(x)), 2) 26 | x = F.max_pool2d(F.relu(self.conv2(x)), 2) 27 | x = x.view(-1, 64 * 7 * 7) # reshape Variable 28 | x = F.relu(self.fc1(x)) 29 | x = F.dropout(x, training=self.training) 30 | x = self.fc2(x) 31 | return F.log_softmax(x) 32 | 33 | 34 | model = MnistModel() 35 | batch_size = 50 36 | 37 | train_loader = torch.utils.data.DataLoader( 38 | datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()), 39 | batch_size=batch_size, shuffle=True) 40 | test_loader = torch.utils.data.DataLoader( 41 | datasets.MNIST('data', train=False, transform=transforms.ToTensor()), 42 | batch_size=1000) 43 | 44 | for p in model.parameters(): 45 | print(p.size()) 46 | 47 | optimizer = optim.Adam(model.parameters(), lr=0.0001) 48 | model.train() 49 | train_loss = [] 50 | train_accu = [] 51 | i = 0 52 | for epoch in range(15): 53 | for data, target in train_loader: 54 | data, target = Variable(data), Variable(target) 55 | optimizer.zero_grad() 56 | output = model(data) 57 | loss = F.nll_loss(output, target) 58 | loss.backward() # calc gradients 59 | train_loss.append(loss.data[0]) 60 | optimizer.step() # update gradients 61 | prediction = output.data.max(1)[1] # first column has actual prob. 62 | accuracy = prediction.eq(target.data).sum() / batch_size * 100 63 | train_accu.append(accuracy) 64 | if i % 1000 == 0: 65 | print('Train Step: {}\tLoss: {:.3f}\tAccuracy: {:.3f}'.format(i, loss.data[0], accuracy)) 66 | i += 1 67 | 68 | plt.plot(np.arange(len(train_loss)), train_loss) 69 | plt.plot(np.arange(len(train_accu)), train_accu) 70 | 71 | model.eval() 72 | correct = 0 73 | for data, target in test_loader: 74 | data, target = Variable(data, volatile=True), Variable(target) 75 | output = model(data) 76 | prediction = output.data.max(1)[1] 77 | correct += prediction.eq(target.data).sum() 78 | 79 | print('\nTest set: Accuracy: {:.2f}%'.format(100. 
* correct / len(test_loader.dataset))) 80 | -------------------------------------------------------------------------------- /scripts/tf_basics.py: -------------------------------------------------------------------------------- 1 | ########################################################################## 2 | # Graph Basics 3 | ########################################################################## 4 | 5 | import tensorflow as tf 6 | 7 | # Create a Constant op that produces a 1x2 matrix. The op is 8 | # added as a node to the default graph. 9 | # 10 | # The value returned by the constructor represents the output 11 | # of the Constant op. 12 | matrix1 = tf.constant([[3., 3.]]) 13 | 14 | # Create another Constant that produces a 2x1 matrix. 15 | matrix2 = tf.constant([[2.], [2.]]) 16 | 17 | # Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs. 18 | # The returned value, 'product', represents the result of the matrix 19 | # multiplication. 20 | product = tf.matmul(matrix1, matrix2) 21 | 22 | # Launch the default graph. 23 | sess = tf.Session() 24 | 25 | # To run the matmul op we call the session 'run()' method, passing 'product' 26 | # which represents the output of the matmul op. This indicates to the call 27 | # that we want to get the output of the matmul op back. 28 | # 29 | # All inputs needed by the op are run automatically by the session. They 30 | # typically are run in parallel. 31 | # 32 | # The call 'run(product)' thus causes the execution of threes ops in the 33 | # graph: the two constants and matmul. 34 | # 35 | # The output of the op is returned in 'result' as a numpy `ndarray` object. 36 | result = sess.run(product) 37 | print(result) 38 | # ==> [[ 12.]] 39 | 40 | # Close the Session when we're done. 41 | sess.close() 42 | 43 | ########################################################################## 44 | # Interactive Usage 45 | ########################################################################## 46 | 47 | # Enter an interactive TensorFlow Session. 48 | sess = tf.InteractiveSession() 49 | 50 | x = tf.Variable([1.0, 2.0]) 51 | a = tf.constant([3.0, 3.0]) 52 | 53 | # Initialize 'x' using the run() method of its initializer op. 54 | x.initializer.run() 55 | 56 | # Add an op to subtract 'a' from 'x'. Run it and print the result 57 | sub = tf.sub(x, a) 58 | print(sub.eval()) 59 | # ==> [-2. -1.] 60 | 61 | # Close the Session when we're done. 62 | sess.close() 63 | 64 | ########################################################################## 65 | # Variables 66 | ########################################################################## 67 | 68 | # Create a Variable, that will be initialized to the scalar value 0. 69 | state = tf.Variable(0, name="counter") 70 | 71 | # Create an Op to add one to `state`. 72 | 73 | one = tf.constant(1) 74 | new_value = tf.add(state, one) 75 | update = tf.assign(state, new_value) 76 | 77 | # Variables must be initialized by running an `init` Op after having 78 | # launched the graph. We first have to add the `init` Op to the graph. 79 | init_op = tf.initialize_all_variables() 80 | 81 | # Launch the graph and run the ops. 82 | with tf.Session() as sess: 83 | # Run the 'init' op 84 | sess.run(init_op) 85 | # Print the initial value of 'state' 86 | print(sess.run(state)) 87 | # Run the op that updates 'state' and print 'state'. 
88 | for _ in range(3): 89 | sess.run(update) 90 | print(sess.run(state)) 91 | 92 | # output: 93 | 94 | # 0 95 | # 1 96 | # 2 97 | # 3 98 | 99 | ########################################################################## 100 | # Fetches 101 | ########################################################################## 102 | 103 | input1 = tf.constant(3.0) 104 | input2 = tf.constant(2.0) 105 | input3 = tf.constant(5.0) 106 | intermed = tf.add(input2, input3) 107 | mul = tf.mul(input1, intermed) 108 | 109 | with tf.Session() as sess: 110 | result = sess.run([mul, intermed]) 111 | print(result) 112 | 113 | # output: 114 | # [array([ 21.], dtype=float32), array([ 7.], dtype=float32)] 115 | 116 | ########################################################################## 117 | # Feeds 118 | ########################################################################## 119 | 120 | input1 = tf.placeholder(tf.float32) 121 | input2 = tf.placeholder(tf.float32) 122 | output = tf.mul(input1, input2) 123 | 124 | with tf.Session() as sess: 125 | print(sess.run([output], feed_dict={input1: [7.], input2: [2.]})) 126 | 127 | # output: 128 | # [array([ 14.], dtype=float32)] 129 | -------------------------------------------------------------------------------- /scripts/tf_mnist.py: -------------------------------------------------------------------------------- 1 | # Import the MNIST data set. 2 | from tensorflow.examples.tutorials.mnist import input_data 3 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True) 4 | 5 | # Define some initial variables. 6 | import tensorflow as tf 7 | x = tf.placeholder(tf.float32, [None, 784]) 8 | W = tf.Variable(tf.zeros([784, 10])) 9 | b = tf.Variable(tf.zeros([10])) 10 | 11 | # Implement the model using the built-in softmax function. 12 | y = tf.nn.softmax(tf.matmul(x, W) + b) 13 | 14 | # Define the operation to compute cross-entropy. 15 | y_ = tf.placeholder(tf.float32, [None, 10]) 16 | cross_entropy = -tf.reduce_sum(y_ * tf.log(y)) 17 | 18 | # Use a built-in optimization algorithm to define how to proceed with training. 19 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy) 20 | 21 | # Create a session and initialize the variables. 22 | init = tf.initialize_all_variables() 23 | sess = tf.Session() 24 | sess.run(init) 25 | 26 | # Run the training algorithm for 1000 iterations. 27 | for i in range(1000): 28 | batch_xs, batch_ys = mnist.train.next_batch(100) 29 | sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) 30 | 31 | # Calculate the accuracy of the trained model. 32 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1)) 33 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 34 | print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels})) --------------------------------------------------------------------------------