├── .gitignore
├── README.md
├── old
│   ├── BagOfPopcorn
│   │   ├── README.md
│   │   ├── bag_of_words.py
│   │   ├── kaggle_utility.py
│   │   ├── word2vec_average_vectors.py
│   │   └── word2vec_bag_of_centroids.py
│   ├── BikeSharing
│   │   ├── README.md
│   │   └── bikes.py
│   ├── Expedia
│   │   ├── README.md
│   │   └── script.py
│   ├── ForestCover
│   │   ├── README.md
│   │   └── forest.py
│   ├── HiggsBoson
│   │   ├── README.md
│   │   ├── Resources
│   │   │   ├── ATLAS.png
│   │   │   └── documentation.pdf
│   │   ├── auto_l1.yaml
│   │   ├── auto_l2.yaml
│   │   ├── auto_mlp.yaml
│   │   ├── higgs.py
│   │   ├── higgs_adv.py
│   │   ├── higgs_nn.py
│   │   ├── metric.py
│   │   └── mlp.yaml
│   ├── NerveSegmentation
│   │   ├── README.md
│   │   ├── data.py
│   │   ├── submission.py
│   │   └── train.py
│   ├── OttoGroup
│   │   ├── README.md
│   │   ├── Resources
│   │   │   └── Grafik.jpg
│   │   ├── find_ensemble_weights.py
│   │   ├── graphlab_starter.py
│   │   ├── keras_starter.py
│   │   ├── keras_wrapper.py
│   │   ├── otto.py
│   │   ├── simple_svm.py
│   │   └── xgboost_walkthrough.py
│   └── PropertyInspection
│       ├── README.md
│       ├── Resources
│       │   └── houses.png
│       └── property.py
└── scripts
    ├── __init__.py
    ├── pyro_basics.py
    ├── pytorch_basics.py
    ├── pytorch_embedding.py
    ├── pytorch_mnist.py
    ├── tf_basics.py
    └── tf_mnist.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | # PyCharm
60 | .idea/
61 |
62 | # VS Code
63 | .vscode/
64 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Kaggle Competition Code
2 |
3 | Repository for code used in Kaggle competitions. Browse the folders from the main directory to see details about each competition.
4 |
--------------------------------------------------------------------------------
/old/BagOfPopcorn/README.md:
--------------------------------------------------------------------------------
1 | # Bag Of Popcorn (Word2Vec Tutorial)
2 |
3 | View the competition details here.
4 |
5 | This directory includes all of the code used for the competition. Since this challenge is really a learning exercise in natural language processing using gensim and word2vec, there is not much unique or interesting here; I'm mostly following the tutorial.
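
For reference, here is a minimal sketch of the gensim word2vec flow these scripts follow. It is illustrative rather than taken from the scripts: it uses the older gensim argument names the scripts rely on (`size=`, `model.most_similar`), which newer gensim releases rename to `vector_size=` and move onto `model.wv`, and `clean_review` is just a stand-in for the helpers in `kaggle_utility.py`.

```python
import re

from gensim.models import Word2Vec


def clean_review(text):
    # keep only letters, lowercase, and split into tokens (stand-in for kaggle_utility.py)
    return re.sub('[^a-zA-Z]', ' ', text).lower().split()


# toy corpus; the real scripts feed in tokenized sentences parsed from the IMDB reviews
reviews = ['A great movie!', 'An awful movie...', 'A great film.']
sentences = [clean_review(r) for r in reviews]

# train a tiny model using the old-style gensim arguments the scripts here use
model = Word2Vec(sentences, size=10, min_count=1, window=5, workers=1, seed=1)
print(model.most_similar('movie'))
```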
--------------------------------------------------------------------------------
/old/BagOfPopcorn/bag_of_words.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('/home/john/git/kaggle/old/BagOfPopcorn/')
3 |
4 | import pandas as pd
5 | from sklearn.feature_extraction.text import CountVectorizer
6 | from sklearn.ensemble import RandomForestClassifier
7 | from kaggle_utility import KaggleUtility
8 |
9 |
10 | def main():
11 | data_dir = '/home/john/data/bag-of-popcorn/'
12 |
13 | train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
14 | test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)
15 |
16 | print 'The first review is:'
17 | print train['review'][0]
18 |
19 | raw_input('Press Enter to continue...')
20 |
21 | # print 'Downloading text data sets...'
22 | # nltk.download()
23 |
24 | # Initialize an empty list to hold the clean reviews
25 | clean_train_reviews = []
26 |
27 | # Loop over each review; create an index i that goes from 0 to the length
28 | # of the movie review list
29 |
30 | print 'Cleaning and parsing the training set movie reviews...\n'
31 | for i in xrange(0, len(train['review'])):
32 | clean_train_reviews.append(' '.join(KaggleUtility.review_to_wordlist(train['review'][i], True)))
33 |
34 | # Create a bag of words from the training set
35 |
36 | print 'Creating the bag of words...\n'
37 |
38 | # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool
39 | vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
40 | stop_words=None, max_features=5000)
41 |
42 |     # fit_transform() does two things: first, it fits the model
43 | # and learns the vocabulary; second, it transforms our training data
44 | # into feature vectors. The input to fit_transform should be a list of
45 | # strings
46 | train_data_features = vectorizer.fit_transform(clean_train_reviews)
47 |
48 | # Numpy arrays are easy to work with, so convert the result to an array
49 | train_data_features = train_data_features.toarray()
50 |
51 | # Train a random forest using the bag of words
52 |
53 | print 'Training the random forest (this may take a while)...'
54 |
55 | # Initialize a Random Forest classifier with 100 trees
56 | forest = RandomForestClassifier(n_estimators=100)
57 |
58 | # Fit the forest to the training set, using the bag of words as
59 | # features and the sentiment labels as the response variable
60 |
61 | # This may take a few minutes to run
62 | forest = forest.fit(train_data_features, train['sentiment'])
63 |
64 | # Create an empty list and append the clean reviews one by one
65 | clean_test_reviews = []
66 |
67 | print 'Cleaning and parsing the test set movie reviews...\n'
68 | for i in xrange(0, len(test['review'])):
69 | clean_test_reviews.append(' '.join(KaggleUtility.review_to_wordlist(test['review'][i], True)))
70 |
71 | # Get a bag of words for the test set, and convert to a numpy array
72 | test_data_features = vectorizer.transform(clean_test_reviews)
73 | test_data_features = test_data_features.toarray()
74 |
75 | # Use the random forest to make sentiment label predictions
76 | print 'Predicting test labels...\n'
77 | result = forest.predict(test_data_features)
78 |
79 | # Copy the results to a pandas dataframe with an "id" column and
80 | # a "sentiment" column
81 | output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
82 |
83 | # Use pandas to write the comma-separated output file
84 | output.to_csv(data_dir + 'Bag_of_Words_model.csv', index=False, quoting=3)
85 | print 'Wrote results to Bag_of_Words_model.csv'
86 |
87 |
88 | if __name__ == "__main__":
89 | main()
--------------------------------------------------------------------------------
/old/BagOfPopcorn/kaggle_utility.py:
--------------------------------------------------------------------------------
1 | import re
2 | from bs4 import BeautifulSoup
3 | from nltk.corpus import stopwords
4 |
5 |
6 | class KaggleUtility(object):
7 | @staticmethod
8 | def review_to_wordlist(review, remove_stopwords=False):
9 | # Function to convert a document to a sequence of words,
10 | # optionally removing stop words. Returns a list of words.
11 |
12 | # 1. Remove HTML
13 | review_text = BeautifulSoup(review).get_text()
14 |
15 | # 2. Remove non-letters
16 | review_text = re.sub('[^a-zA-Z]', ' ', review_text)
17 |
18 | # 3. Convert words to lower case and split them
19 | words = review_text.lower().split()
20 |
21 | # 4. Optionally remove stop words (false by default)
22 | if remove_stopwords:
23 | stops = set(stopwords.words('english'))
24 | words = [w for w in words if w not in stops]
25 |
26 | # 5. Return a list of words
27 | return words
28 |
29 | @staticmethod
30 | def review_to_sentences(review, tokenizer, remove_stopwords=False):
31 | # Function to split a review into parsed sentences. Returns a
32 | # list of sentences, where each sentence is a list of words
33 |
34 | # 1. Use the NLTK tokenizer to split the paragraph into sentences
35 | raw_sentences = tokenizer.tokenize(review.decode('utf8').strip())
36 |
37 | # 2. Loop over each sentence
38 | sentences = []
39 | for raw_sentence in raw_sentences:
40 | # If a sentence is empty, skip it
41 | if len(raw_sentence) > 0:
42 | # Otherwise, call review_to_wordlist to get a list of words
43 | sentences.append(KaggleUtility.review_to_wordlist(raw_sentence, remove_stopwords))
44 |
45 | # Return the list of sentences (each sentence is a list of words,
46 | # so this returns a list of lists
47 | return sentences
--------------------------------------------------------------------------------
/old/BagOfPopcorn/word2vec_average_vectors.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('/home/john/git/kaggle/old/BagOfPopcorn/')
3 |
4 | import logging
5 | import nltk.data
6 | import numpy as np
7 | import pandas as pd
8 | from gensim.models import Word2Vec
9 | from sklearn.ensemble import RandomForestClassifier
10 | from kaggle_utility import KaggleUtility
11 |
12 |
13 | def make_feature_vec(words, model, num_features):
14 | # Function to average all of the word vectors in a given
15 | # paragraph
16 |
17 | # Pre-initialize an empty numpy array (for speed)
18 | feature_vec = np.zeros(num_features, dtype='float32')
19 |
20 | nwords = 0
21 |
22 | # Index2word is a list that contains the names of the words in
23 | # the model's vocabulary. Convert it to a set, for speed
24 | index2word_set = set(model.index2word)
25 |
26 | # Loop over each word in the review and, if it is in the model's
27 | # vocabulary, add its feature vector to the total
28 | for word in words:
29 | if word in index2word_set:
30 | nwords += 1
31 | feature_vec = np.add(feature_vec, model[word])
32 |
33 | # Divide the result by the number of words to get the average
34 | feature_vec = np.divide(feature_vec, nwords)
35 | return feature_vec
36 |
37 |
38 | def get_avg_feature_vecs(reviews, model, num_features):
39 | # Given a set of reviews (each one a list of words), calculate
40 | # the average feature vector for each one and return a 2D numpy array
41 |
42 | # Initialize a counter
43 | counter = 0
44 |
45 | # Preallocate a 2D numpy array, for speed
46 | review_feature_vecs = np.zeros((len(reviews), num_features), dtype='float32')
47 |
48 | # Loop through the reviews
49 | for review in reviews:
50 | # Print a status message every 1000th review
51 | if counter % 1000 == 0:
52 | print 'Review %d of %d' % (counter, len(reviews))
53 |
54 | # Call the function (defined above) that makes average feature vectors
55 | review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
56 |
57 | # Increment the counter
58 | counter += 1
59 | return review_feature_vecs
60 |
61 |
62 | def get_clean_reviews(reviews):
63 | clean_reviews = []
64 | for review in reviews['review']:
65 | clean_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))
66 | return clean_reviews
67 |
68 |
69 | def main():
70 | data_dir = '/home/john/data/bag-of-popcorn/'
71 |
72 | # Read data from files
73 | train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
74 | test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)
75 | unlabeled_train = pd.read_csv(data_dir + 'unlabeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
76 |
77 | # Verify the number of reviews that were read (100,000 in total)
78 | print 'Read %d labeled train reviews, %d labeled test reviews, and %d unlabeled reviews\n' % \
79 | (train['review'].size, test['review'].size, unlabeled_train['review'].size)
80 |
81 | # Load the punkt tokenizer
82 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
83 |
84 | # Split the labeled and unlabeled training sets into clean sentences
85 |
86 | # Initialize an empty list of sentences
87 | sentences = []
88 |
89 | print 'Parsing sentences from training set'
90 | for review in train['review']:
91 | sentences += KaggleUtility.review_to_sentences(review, tokenizer)
92 |
93 | print 'Parsing sentences from unlabeled set'
94 | for review in unlabeled_train['review']:
95 | sentences += KaggleUtility.review_to_sentences(review, tokenizer)
96 |
97 | # Set parameters and train the word2vec model
98 |
99 |     # Import the built-in logging module and configure it so that Word2Vec
100 | # creates nice output messages
101 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
102 |
103 | # Set values for various parameters
104 | num_features = 300 # Word vector dimensionality
105 | min_word_count = 40 # Minimum word count
106 | num_workers = 4 # Number of threads to run in parallel
107 | context = 10 # Context window size
108 | downsampling = 1e-3 # Downsample setting for frequent words
109 |
110 | # Initialize and train the model (this will take some time)
111 |     print 'Training Word2Vec model...'
112 | model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count,
113 | window=context, sample=downsampling, seed=1)
114 |
115 | # If you don't plan to train the model any further, calling
116 | # init_sims will make the model much more memory-efficient.
117 | model.init_sims(replace=True)
118 |
119 | # It can be helpful to create a meaningful model name and
120 |     # save the model for later use. You can load it later using Word2Vec.load()
121 | model_name = '300features_40minwords_10context'
122 | model.save(data_dir + model_name)
123 |
124 | model.doesnt_match('man woman child kitchen'.split())
125 | model.doesnt_match('france england germany berlin'.split())
126 | model.doesnt_match('paris berlin london austria'.split())
127 | model.most_similar('man')
128 | model.most_similar('queen')
129 | model.most_similar('awful')
130 |
131 | # Create average vectors for the training and test sets
132 |
133 | print 'Creating average feature vecs for training reviews'
134 |
135 | train_data_vecs = get_avg_feature_vecs(get_clean_reviews(train), model, num_features)
136 |
137 | print 'Creating average feature vecs for test reviews'
138 |
139 | test_data_vecs = get_avg_feature_vecs(get_clean_reviews(test), model, num_features)
140 |
141 | # Fit a random forest to the training set, then make predictions
142 |
143 | # Fit a random forest to the training data, using 100 trees
144 | forest = RandomForestClassifier(n_estimators=100)
145 |
146 | print 'Fitting a random forest to labeled training data...'
147 | forest = forest.fit(train_data_vecs, train['sentiment'])
148 |
149 | # Test & extract results
150 | result = forest.predict(test_data_vecs)
151 |
152 | # Write the test results
153 | output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
154 | output.to_csv(data_dir + 'Word2Vec_AverageVectors.csv', index=False, quoting=3)
155 | print 'Wrote Word2Vec_AverageVectors.csv'
156 |
157 |
158 | if __name__ == "__main__":
159 | main()
--------------------------------------------------------------------------------
/old/BagOfPopcorn/word2vec_bag_of_centroids.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('/home/john/git/kaggle/old/BagOfPopcorn/')
3 |
4 | import time
5 | import numpy as np
6 | import pandas as pd
7 | from gensim.models import Word2Vec
8 | from sklearn.cluster import KMeans
9 | from sklearn.ensemble import RandomForestClassifier
10 | from kaggle_utility import KaggleUtility
11 |
12 |
13 | def create_bag_of_centroids(wordlist, word_centroid_map):
14 | # The number of clusters is equal to the highest cluster index
15 | # in the word / centroid map
16 | num_centroids = max(word_centroid_map.values()) + 1
17 |
18 | # Pre-allocate the bag of centroids vector (for speed)
19 | bag_of_centroids = np.zeros(num_centroids, dtype='float32')
20 |
21 | # Loop over the words in the review. If the word is in the vocabulary,
22 | # find which cluster it belongs to, and increment that cluster count
23 | # by one
24 | for word in wordlist:
25 | if word in word_centroid_map:
26 | index = word_centroid_map[word]
27 | bag_of_centroids[index] += 1
28 |
29 | # Return the 'bag of centroids'
30 | return bag_of_centroids
31 |
32 |
33 | def main():
34 | data_dir = '/home/john/data/bag-of-popcorn/'
35 | model = Word2Vec.load(data_dir + '300features_40minwords_10context')
36 |
37 | # Run k-means on the word vectors and print a few clusters
38 |
39 | # Start time
40 | start = time.time()
41 |
42 | # Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
43 | # average of 5 words per cluster
44 | word_vectors = model.syn0
45 | num_clusters = word_vectors.shape[0] / 5
46 |
47 | # Initialize a k-means object and use it to extract centroids
48 | print 'Running K means'
49 | kmeans_clustering = KMeans(n_clusters=num_clusters)
50 | idx = kmeans_clustering.fit_predict(word_vectors)
51 |
52 | # Get the end time and print how long the process took
53 | end = time.time()
54 | elapsed = end - start
55 | print 'Time taken for K Means clustering: ', elapsed, 'seconds.'
56 |
57 | # Create a Word / Index dictionary, mapping each vocabulary word to
58 | # a cluster number
59 | word_centroid_map = dict(zip(model.index2word, idx))
60 |
61 | # Print the first ten clusters
62 | for cluster in xrange(0, 10):
63 |
64 | # Print the cluster number
65 | print '\nCluster %d' % cluster
66 |
67 | # Find all of the words for that cluster number, and print them out
68 | words = []
69 | for i in xrange(0, len(word_centroid_map.values())):
70 | if word_centroid_map.values()[i] == cluster:
71 | words.append(word_centroid_map.keys()[i])
72 | print words
73 |
74 | # Create clean_train_reviews and clean_test_reviews as we did before
75 |
76 | # Read data from files
77 | train = pd.read_csv(data_dir + 'labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
78 | test = pd.read_csv(data_dir + 'testData.tsv', header=0, delimiter='\t', quoting=3)
79 |
80 | print 'Cleaning training reviews'
81 | clean_train_reviews = []
82 | for review in train['review']:
83 | clean_train_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))
84 |
85 | print 'Cleaning test reviews'
86 | clean_test_reviews = []
87 | for review in test['review']:
88 | clean_test_reviews.append(KaggleUtility.review_to_wordlist(review, remove_stopwords=True))
89 |
90 | # Create bags of centroids
91 |
92 | # Pre-allocate an array for the training set bags of centroids (for speed)
93 | train_centroids = np.zeros((train['review'].size, num_clusters), dtype='float32')
94 |
95 | # Transform the training set reviews into bags of centroids
96 | counter = 0
97 | for review in clean_train_reviews:
98 | train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
99 | counter += 1
100 |
101 | # Repeat for test reviews
102 | test_centroids = np.zeros((test['review'].size, num_clusters), dtype='float32')
103 |
104 | counter = 0
105 | for review in clean_test_reviews:
106 | test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
107 | counter += 1
108 |
109 | # Fit a random forest and extract predictions
110 | forest = RandomForestClassifier(n_estimators=100)
111 |
112 | # Fitting the forest may take a few minutes
113 | print 'Fitting a random forest to labeled training data...'
114 | forest = forest.fit(train_centroids, train['sentiment'])
115 | result = forest.predict(test_centroids)
116 |
117 | # Write the test results
118 | output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
119 | output.to_csv(data_dir + 'BagOfCentroids.csv', index=False, quoting=3)
120 | print 'Wrote BagOfCentroids.csv'
121 |
122 |
123 | if __name__ == '__main__':
124 | main()
--------------------------------------------------------------------------------
/old/BikeSharing/README.md:
--------------------------------------------------------------------------------
1 | # Bike Sharing Demand
2 |
3 | View the competition details here.
4 |
5 | This directory includes all of the code used for the competition. Since the bike sharing challenge is a fairly easy problem designed for beginners, I used this competition to refine my forecasting/regression script a bit and learn some scikit-learn APIs that I wasn't overly familiar with.
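
For reference, here is a minimal sketch of the modeling idea in `bikes.py`: fit separate regressors for the casual and registered rental counts, then sum their predictions to produce the submitted total count. The data below is synthetic stand-in data, not the competition data.

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 5)                      # stand-in for the weather/time features
y_casual = rng.poisson(20, size=200)      # stand-in for casual rentals
y_registered = rng.poisson(80, size=200)  # stand-in for registered rentals

# fit one model per demand type, as bikes.py does
casual_model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y_casual)
registered_model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y_registered)

# the submitted 'count' is the sum of the two predictions
X_test = rng.rand(10, 5)
print(casual_model.predict(X_test) + registered_model.predict(X_test))
```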
--------------------------------------------------------------------------------
/old/BikeSharing/bikes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pickle
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | import seaborn as sb
8 | from sklearn.cross_validation import *
9 | from sklearn.decomposition import *
10 | from sklearn.ensemble import *
11 | from sklearn.feature_selection import *
12 | from sklearn.grid_search import *
13 | from sklearn.learning_curve import *
14 | from sklearn.linear_model import *
15 | from sklearn.manifold import *
16 | from sklearn.naive_bayes import *
17 | from sklearn.preprocessing import *
18 | from sklearn.svm import *
19 |
20 |
21 | def performance_test(x):
22 | """
23 | Test NumPy performance. Use to compare computation speed across machines.
24 | """
25 | A = np.random.random((x, x))
26 | B = np.random.random((x, x))
27 | t = time.time()
28 | np.dot(A, B)
29 | print(time.time() - t)
30 |
31 |
32 | def load_csv_data(directory, filename, dtype=None, index=None, convert_to_date=False):
33 | """
34 |     Load a CSV file into a pandas data frame, optionally converting a column to dates and setting it as the index.
35 | """
36 | data = pd.read_csv(directory + filename, sep=',', dtype=dtype)
37 |
38 | if index is not None:
39 | if convert_to_date:
40 | if type(index) is str:
41 | data[index] = data[index].convert_objects(convert_dates='coerce')
42 | else:
43 | for key in index:
44 | data[key] = data[key].convert_objects(convert_dates='coerce')
45 |
46 | data = data.set_index(index)
47 |
48 | print('Data file ' + filename + ' loaded successfully.')
49 |
50 | return data
51 |
52 |
53 | def load_model(filename):
54 | """
55 |     Load a previously trained model from disk.
56 | """
57 | model_file = open(filename, 'rb')
58 | model = pickle.load(model_file)
59 | model_file.close()
60 |
61 | return model
62 |
63 |
64 | def save_model(model, filename):
65 | """
66 | Persist a trained model to disk.
67 | """
68 | model_file = open(filename, 'wb')
69 | pickle.dump(model, model_file)
70 | model_file.close()
71 |
72 |
73 | def predict(X, model, transforms):
74 | """
75 | Predicts the class label.
76 | """
77 | X = apply_transforms(X, transforms)
78 | y_est = model.predict(X)
79 |
80 | return y_est
81 |
82 |
83 | def predict_probability(X, model, transforms):
84 | """
85 | Predicts the class probabilities.
86 | """
87 | X = apply_transforms(X, transforms)
88 | y_prob = model.predict_proba(X)[:, 1]
89 |
90 | return y_prob
91 |
92 |
93 | def score(X, y, model, transforms):
94 | """
95 | Scores the model's performance and returns the result.
96 | """
97 | X = apply_transforms(X, transforms)
98 |
99 | return model.score(X, y)
100 |
101 |
102 | def generate_features(data):
103 | """
104 | Generates new derived features to add to the data set for model training.
105 | """
106 | data['DayOfWeek'] = data.index.map(lambda x: x.dayofweek)
107 | data['TimeOfDay'] = data.index.map(lambda x: x.hour)
108 |
109 | return data
110 |
111 |
112 | def process_training_data(directory, filename, ex_generate_features):
113 | """
114 | Reads in training data and prepares numpy arrays.
115 | """
116 | training_data = load_csv_data(directory, filename, index='datetime', convert_to_date=True)
117 | num_features = len(training_data.columns) - 3
118 |
119 | # drop the total count label and move the registered/casual counts to the front
120 | cols = training_data.columns.tolist()
121 | cols = cols[-3:-1] + cols[0:num_features]
122 | training_data = training_data[cols]
123 |
124 | if ex_generate_features:
125 | training_data = generate_features(training_data)
126 |
127 | num_features = len(training_data.columns)
128 | X = training_data.iloc[:, 2:num_features].values
129 | y1 = training_data.iloc[:, 0].values
130 | y2 = training_data.iloc[:, 1].values
131 |
132 | return training_data, X, y1, y2
133 |
134 |
135 | def process_test_data(directory, filename, ex_generate_features):
136 | """
137 | Reads in the test data set and prepares it for prediction by the model.
138 | """
139 | test_data = load_csv_data(directory, filename, index='datetime', convert_to_date=True)
140 |
141 | if ex_generate_features:
142 | test_data = generate_features(test_data)
143 |
144 | num_features = len(test_data.columns)
145 | X_test = test_data.iloc[:, 0:num_features].values
146 |
147 | return test_data, X_test
148 |
149 |
150 | def create_transforms(X, transforms, missing='NaN', impute_strategy='mean', categories=None):
151 | """
152 | Creates transform objects to apply before training or scoring.
153 | """
154 | for i, (key, transform) in enumerate(transforms):
155 | if key == 'imputer':
156 | # impute missing values
157 | transform = Imputer(missing_values=missing, strategy=impute_strategy)
158 | X = transform.fit_transform(X)
159 | elif key == 'onehot':
160 | # create a category encoder
161 | transform = OneHotEncoder(categorical_features=categories, sparse=False)
162 | X = transform.fit_transform(X)
163 | elif key == 'selector':
164 | # create a feature selection transform
165 | transform = VarianceThreshold(threshold=0.0)
166 | X = transform.fit_transform(X)
167 | elif key == 'scaler':
168 | # create a standardization transform
169 | transform = StandardScaler()
170 | X = transform.fit_transform(X)
171 | elif key == 'pca':
172 | # create a PCA transform
173 | transform = PCA(whiten=True)
174 | transform.fit(X)
175 | elif key == 'isomap':
176 | # create an isomap transform
177 | transform = Isomap()
178 | transform.fit(X)
179 | elif key == 'lle':
180 | # create a modified LLE transform
181 | transform = LocallyLinearEmbedding(method='modified')
182 | transform.fit(X)
183 | elif key == 'mds':
184 | # create a multi-dimensional scaling transform
185 | transform = MDS()
186 | transform.fit(X)
187 | elif key == 't-sne':
188 | # create a t-SNE transform
189 | transform = TSNE()
190 | transform.fit(X)
191 |
192 | transforms[i] = (key, transform)
193 |
194 | return transforms
195 |
196 |
197 | def apply_transforms(X, transforms):
198 | """
199 | Applies pre-computed transformations to a data set.
200 | """
201 | for key, transform in transforms:
202 | if transform is not None:
203 | X = transform.transform(X)
204 |
205 | return X
206 |
207 |
208 | def visualize_variable_relationships(training_data, viz_type, category_vars, quantitative_vars):
209 | """
210 | Generates plots showing the relationship between several variables.
211 | """
212 | # compare the continuous variable distributions using a violin plot
213 | sub_data = training_data[quantitative_vars]
214 | fig, ax = plt.subplots(1, 1, figsize=(16, 12))
215 | sb.violinplot(sub_data, ax=ax)
216 | fig.tight_layout()
217 |
218 | # if categorical variables were provided, visualize the quantitative distributions by category
219 | if len(category_vars) > 0:
220 | fig, ax = plt.subplots(len(quantitative_vars), len(category_vars), figsize=(16, 12))
221 | for i, var in enumerate(quantitative_vars):
222 | for j, cat in enumerate(category_vars):
223 | sb.violinplot(training_data[var], training_data[cat], ax=ax[i, j])
224 | fig.tight_layout()
225 |
226 | # generate plots to directly compare the variables
227 | if len(category_vars) == 0:
228 | if len(quantitative_vars) == 2:
229 | sb.jointplot(quantitative_vars[0], quantitative_vars[1], training_data, kind=viz_type, size=16)
230 | else:
231 | sb.pairplot(training_data, vars=quantitative_vars, kind='scatter',
232 | diag_kind='kde', size=16 / len(quantitative_vars))
233 | else:
234 | if len(quantitative_vars) == 1:
235 | if len(category_vars) == 1:
236 | sb.factorplot(category_vars[0], quantitative_vars[0], None,
237 | training_data, kind='auto', size=16)
238 | else:
239 | sb.factorplot(category_vars[0], quantitative_vars[0], category_vars[1],
240 | training_data, kind='auto', size=16)
241 | if len(quantitative_vars) == 2:
242 | if len(category_vars) == 1:
243 | sb.lmplot(quantitative_vars[0], quantitative_vars[1], training_data,
244 | col=None, row=category_vars[0], size=16)
245 | else:
246 | sb.lmplot(quantitative_vars[0], quantitative_vars[1], training_data,
247 | col=category_vars[0], row=category_vars[1], size=16)
248 | else:
249 | sb.pairplot(training_data, hue=category_vars[0], vars=quantitative_vars, kind='scatter',
250 | diag_kind='kde', size=16 / len(quantitative_vars))
251 |
252 |
253 | def visualize_feature_distributions(training_data, viz_type, plot_size):
254 | """
255 | Generates feature distribution plots (histogram or kde) for each feature.
256 | """
257 | if viz_type == 'hist':
258 | hist = True
259 | kde = False
260 | else:
261 | hist = False
262 | kde = True
263 |
264 | num_features = plot_size if plot_size < len(training_data.columns) else len(training_data.columns)
265 | num_plots = num_features / 16 if num_features % 16 == 0 else num_features / 16 + 1
266 |
267 | for i in range(num_plots):
268 | fig, ax = plt.subplots(4, 4, figsize=(20, 10))
269 | for j in range(16):
270 | index = (i * 16) + j
271 | if index < num_features:
272 | if index != 3: # this column is all 0s in the bike set
273 | sb.distplot(training_data.iloc[:, index], hist=hist, kde=kde, label=training_data.columns[index],
274 | ax=ax[j / 4, j % 4], kde_kws={"shade": True})
275 | fig.tight_layout()
276 |
277 |
278 | def visualize_correlations(training_data):
279 | """
280 | Generates a correlation matrix heat map.
281 | """
282 | fig, ax = plt.subplots(figsize=(16, 10))
283 | colormap = sb.blend_palette(sb.color_palette('coolwarm'), as_cmap=True)
284 | if len(training_data.columns) < 30:
285 | sb.corrplot(training_data, annot=True, sig_stars=False, diag_names=True, cmap=colormap, ax=ax)
286 | else:
287 | sb.corrplot(training_data, annot=False, sig_stars=False, diag_names=False, cmap=colormap, ax=ax)
288 | fig.tight_layout()
289 |
290 |
291 | def visualize_sequential_relationships(training_data, plot_size, smooth=None, window=1):
292 | """
293 | Generates line plots to visualize sequential data. Assumes the data frame index is time series.
294 | """
295 | training_data.index.name = None
296 | num_features = plot_size if plot_size < len(training_data.columns) else len(training_data.columns)
297 | num_plots = num_features / 16 if num_features % 16 == 0 else num_features / 16 + 1
298 |
299 | for i in range(num_plots):
300 | fig, ax = plt.subplots(4, 4, sharex=True, figsize=(20, 10))
301 | for j in range(16):
302 | index = (i * 16) + j
303 | if index < num_features:
304 | if index != 3: # this column is all 0s in the bike set
305 | if smooth == 'mean':
306 | training_data.iloc[:, index] = pd.rolling_mean(training_data.iloc[:, index], window)
307 | elif smooth == 'var':
308 | training_data.iloc[:, index] = pd.rolling_var(training_data.iloc[:, index], window)
309 | elif smooth == 'skew':
310 | training_data.iloc[:, index] = pd.rolling_skew(training_data.iloc[:, index], window)
311 | elif smooth == 'kurt':
312 | training_data.iloc[:, index] = pd.rolling_kurt(training_data.iloc[:, index], window)
313 |
314 | training_data.iloc[:, index].plot(ax=ax[j / 4, j % 4], kind='line', legend=False,
315 | title=training_data.columns[index])
316 | fig.tight_layout()
317 |
318 |
319 | def visualize_principal_components(X, y1, y2, model_type, num_components, transforms):
320 | """
321 | Generates scatter plots to visualize the principal components of the data set.
322 | """
323 | X = apply_transforms(X, transforms)
324 | for y in (y1, y2):
325 | if model_type == 'classification':
326 | class_count = np.count_nonzero(np.unique(y))
327 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
328 |
329 | for i in range(num_components):
330 | fig, ax = plt.subplots(figsize=(16, 10))
331 | for j in range(class_count):
332 | ax.scatter(X[y == j, i], X[y == j, i + 1], s=30, c=colors[j], label=j)
333 | ax.set_title('Principal Components ' + str(i) + ' and ' + str(i + 1))
334 | ax.legend()
335 | fig.tight_layout()
336 | else:
337 | for i in range(num_components):
338 | fig, ax = plt.subplots(figsize=(16, 10))
339 | sc = ax.scatter(X[:, i], X[:, i + 1], s=30, c=y, cmap='Blues')
340 | ax.set_title('Principal Components ' + str(i) + ' and ' + str(i + 1))
341 | ax.legend()
342 | fig.colorbar(sc)
343 | fig.tight_layout()
344 |
345 |
346 | def define_model(model_type, algorithm):
347 | """
348 | Defines and returns a model object of the designated type.
349 | """
350 | model = None
351 |
352 | if model_type == 'classification':
353 | if algorithm == 'bayes':
354 | model = GaussianNB()
355 | elif algorithm == 'logistic':
356 | model = LogisticRegression(penalty='l2', C=1.0)
357 | elif algorithm == 'svm':
358 | model = SVC(C=1.0, kernel='rbf', shrinking=True, probability=False, cache_size=200)
359 | elif algorithm == 'sgd':
360 | model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, n_iter=1000, shuffle=False, n_jobs=-1)
361 | elif algorithm == 'forest':
362 | model = RandomForestClassifier(n_estimators=10, criterion='gini', max_features='auto', max_depth=None,
363 | min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, n_jobs=-1)
364 | elif algorithm == 'boost':
365 | model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0,
366 | min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features=None,
367 | max_leaf_nodes=None)
368 | else:
369 | print('No model defined for ' + algorithm)
370 | exit()
371 | else:
372 | if algorithm == 'ridge':
373 | model = Ridge(alpha=1.0)
374 | elif algorithm == 'svm':
375 | model = SVR(C=1.0, kernel='rbf', shrinking=True, probability=False, cache_size=200)
376 | elif algorithm == 'sgd':
377 | model = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, n_iter=1000, shuffle=False)
378 | elif algorithm == 'forest':
379 | model = RandomForestRegressor(n_estimators=10, criterion='mse', max_features='auto', max_depth=None,
380 | min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, n_jobs=-1)
381 | elif algorithm == 'boost':
382 | model = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
383 | min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features=None,
384 | max_leaf_nodes=None)
385 | else:
386 | print('No model defined for ' + algorithm)
387 | exit()
388 |
389 | return model
390 |
391 |
392 | def train_model(X, y, model_type, algorithm, transforms):
393 | """
394 | Trains a new model using the training data.
395 | """
396 | t0 = time.time()
397 | model = define_model(model_type, algorithm)
398 | X = apply_transforms(X, transforms)
399 | model.fit(X, y)
400 | t1 = time.time()
401 |     print('Model trained in {0:.3f} s.'.format(t1 - t0))
402 |
403 | return model
404 |
405 |
406 | def visualize_feature_importance(training_data, model, column_offset):
407 | """
408 | Generates a feature importance plot. Requires a trained random forest or gradient boosting model.
409 | Does not work properly if transformations are applied to training data that expands the number
410 | of features.
411 | """
412 | importance = model.feature_importances_
413 | importance = 100.0 * (importance / importance.max())
414 | importance = importance[0:30] if len(training_data.columns) > 30 else importance
415 | sorted_idx = np.argsort(importance)
416 | pos = np.arange(sorted_idx.shape[0])
417 |
418 | fig, ax = plt.subplots(figsize=(16, 10))
419 | ax.set_title('Variable Importance')
420 | ax.barh(pos, importance[sorted_idx], align='center')
421 | ax.set_yticks(pos)
422 | ax.set_yticklabels(training_data.columns[sorted_idx + column_offset])
423 | ax.set_xlabel('Relative Importance')
424 |
425 | fig.tight_layout()
426 |
427 |
428 | def cross_validate(X, y, model_type, algorithm, metric, transforms, folds=3):
429 | """
430 | Performs cross-validation to estimate the true performance of the model.
431 | """
432 | model = define_model(model_type, algorithm)
433 | X = apply_transforms(X, transforms)
434 |
435 | t0 = time.time()
436 | scores = cross_val_score(model, X, y, scoring=metric, cv=folds, n_jobs=-1)
437 | t1 = time.time()
438 |     print('Cross-validation completed in {0:.3f} s.'.format(t1 - t0))
439 |
440 | return np.mean(scores)
441 |
442 |
443 | def time_series_cross_validate(X, y, model_type, algorithm, transforms, strategy='traditional', folds=3,
444 | window_type='cumulative', min_window=0, forecast_range=1, plot=False):
445 | """
446 | Performs time series cross-validation to estimate the true performance of the model.
447 | """
448 | model = define_model(model_type, algorithm)
449 | X = apply_transforms(X, transforms)
450 |
451 | scores = []
452 | train_count = len(X)
453 |
454 | if strategy == 'walk-forward':
455 | folds = train_count - min_window - forecast_range
456 | fold_size = 1
457 | else:
458 | fold_size = train_count / folds
459 |
460 | t0 = time.time()
461 | for i in range(folds):
462 | if window_type == 'fixed':
463 | fold_start = i * fold_size
464 | else:
465 | fold_start = 0
466 |
467 | fold_end = (i + 1) * fold_size + min_window
468 | fold_train_end = fold_end - forecast_range
469 |
470 | X_train, X_val = X[fold_start:fold_train_end, :], X[fold_train_end:fold_end, :]
471 | y_train, y_val = y[fold_start:fold_train_end], y[fold_train_end:fold_end]
472 |
473 | model.fit(X_train, y_train)
474 | scores.append(model.score(X_val, y_val))
475 |
476 | if plot is True:
477 | y_est = model.predict(X_val)
478 | fig, ax = plt.subplots(figsize=(16, 10))
479 | ax.set_title('Estimation Error')
480 | ax.plot(y_est - y_val)
481 | fig.tight_layout()
482 |
483 | t1 = time.time()
484 |     print('Cross-validation completed in {0:.3f} s.'.format(t1 - t0))
485 |
486 | return np.mean(scores)
487 |
488 |
489 | def plot_learning_curve(X, y, model_type, algorithm, metric, transforms):
490 | """
491 | Plots a learning curve showing model performance against both training and
492 | validation data sets as a function of the number of training samples.
493 | """
494 | model = define_model(model_type, algorithm)
495 | X = apply_transforms(X, transforms)
496 |
497 | t0 = time.time()
498 | train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric, cv=3, n_jobs=-1)
499 | train_scores_mean = np.mean(train_scores, axis=1)
500 | train_scores_std = np.std(train_scores, axis=1)
501 | test_scores_mean = np.mean(test_scores, axis=1)
502 | test_scores_std = np.std(test_scores, axis=1)
503 |
504 | fig, ax = plt.subplots(figsize=(16, 10))
505 | ax.set_title('Learning Curve')
506 | ax.set_xlabel('Training Examples')
507 | ax.set_ylabel('Score')
508 | ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
509 | alpha=0.1, color='r')
510 | ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
511 |                     alpha=0.1, color='g')
512 | ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
513 |     ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
514 | ax.legend(loc='best')
515 | fig.tight_layout()
516 | t1 = time.time()
517 |     print('Learning curve generated in {0:.3f} s.'.format(t1 - t0))
518 |
519 |
520 | def parameter_search(X, y, model_type, algorithm, metric, transforms):
521 | """
522 | Performs an exhaustive search over the specified model parameters.
523 | """
524 | model = define_model(model_type, algorithm)
525 | X = apply_transforms(X, transforms)
526 |
527 | param_grid = None
528 | if algorithm == 'logistic':
529 | param_grid = [{'penalty': ['l1', 'l2'], 'C': [0.1, 0.3, 1.0, 3.0]}]
530 | elif algorithm == 'ridge':
531 | param_grid = [{'alpha': [0.1, 0.3, 1.0, 3.0, 10.0]}]
532 | elif algorithm == 'svm':
533 | param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
534 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]
535 | elif algorithm == 'sgd':
536 | param_grid = [{'loss': ['hinge', 'log', 'modified_huber'], 'penalty': ['l1', 'l2'],
537 |                        'alpha': [0.0001, 0.001, 0.01], 'n_iter': [100, 1000, 10000]}]
538 | elif algorithm == 'forest':
539 | param_grid = [{'n_estimators': [10, 30, 100, 300], 'criterion': ['gini', 'entropy'],
540 | 'max_features': ['auto', 'log2', None], 'max_depth': [3, 5, 7, None],
541 | 'min_samples_split': [2, 10, 30, 100], 'min_samples_leaf': [1, 3, 10, 30, 100]}]
542 | elif algorithm == 'boost':
543 | param_grid = [{'learning_rate': [0.1, 0.3, 1.0], 'subsample': [1.0, 0.9, 0.7, 0.5],
544 | 'n_estimators': [100, 300, 1000], 'max_features': ['auto', 'log2', None],
545 | 'max_depth': [3, 5, 7, None], 'min_samples_split': [2, 10, 30, 100],
546 | 'min_samples_leaf': [1, 3, 10, 30, 100]}]
547 |
548 | t0 = time.time()
549 | grid_estimator = GridSearchCV(model, param_grid, scoring=metric, cv=3, n_jobs=-1)
550 | grid_estimator.fit(X, y)
551 | t1 = time.time()
552 |     print('Grid search completed in {0:.3f} s.'.format(t1 - t0))
553 |
554 | return grid_estimator.best_estimator_, grid_estimator.best_params_, grid_estimator.best_score_
555 |
556 |
557 | def train_ensemble(X, y, model_type, algorithm, transforms):
558 | """
559 |     Creates a bagging ensemble built on the base model.
560 | """
561 | model = define_model(model_type, algorithm)
562 | X = apply_transforms(X, transforms)
563 |
564 | t0 = time.time()
565 |     bagging = BaggingClassifier if model_type == 'classification' else BaggingRegressor
566 |     ensemble_model = bagging(base_estimator=model, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True)
567 | ensemble_model.fit(X, y)
568 | t1 = time.time()
569 |     print('Ensemble training completed in {0:.3f} s.'.format(t1 - t0))
570 |
571 | return ensemble_model
572 |
573 |
574 | def create_submission(test_data, y_est, data_dir, submit_file):
575 | """
576 | Create a new submission file with test data and predictions generated by the model.
577 | """
578 | submit = pd.DataFrame(columns=['datetime', 'count'])
579 | submit['datetime'] = test_data.index
580 | submit['count'] = y_est
581 | submit.to_csv(data_dir + submit_file, sep=',', index=False, index_label=False)
582 |
583 |
584 | def experiments():
585 | """
586 | Testing area for miscellaneous experiments.
587 | """
588 |
589 |
590 | def main():
591 | ex_process_training_data = True
592 | ex_generate_features = True
593 | ex_create_transforms = True
594 | ex_load_model = False
595 | ex_save_model = False
596 | ex_visualize_variable_relationships = False
597 | ex_visualize_feature_distributions = False
598 | ex_visualize_correlations = False
599 | ex_visualize_sequential_relationships = False
600 | ex_visualize_principal_components = False
601 | ex_train_model = True
602 | ex_visualize_feature_importance = False
603 | ex_cross_validate = True
604 | ex_plot_learning_curve = False
605 | ex_parameter_search = False
606 | ex_train_ensemble = False
607 | ex_create_submission = False
608 |
609 | code_dir = '/home/john/git/kaggle/BikeSharing/'
610 | data_dir = '/home/john/data/bike-sharing/'
611 | training_file = 'train.csv'
612 | test_file = 'test.csv'
613 | submit_file = 'submission.csv'
614 | model_file = 'model.pkl'
615 |
616 | model_type = 'regression' # classification, regression
617 | algorithm = 'forest' # bayes, logistic, ridge, svm, sgd, forest, boost
618 |     metric = None  # accuracy, f1, roc_auc, mean_absolute_error, mean_squared_error, r2_score
619 | transforms = [('imputer', None), ('onehot', None), ('selector', None), ('scaler', None)]
620 | categories = [0, 1, 2, 3]
621 | column_offset = 2
622 | plot_size = 16
623 | num_components = 3
624 |
625 | training_data = None
626 | X = None
627 | y1 = None
628 | y2 = None
629 | model = None
630 | model2 = None
631 |
632 | os.chdir(code_dir)
633 |
634 | print('Starting process...')
635 | print('Algorithm = {0}'.format(algorithm))
636 | print('Scoring Metric = {0}'.format(metric))
637 | print('Generate Features = {0}'.format(ex_generate_features))
638 | print('Transforms = {0}'.format(transforms))
639 |
640 | if ex_process_training_data:
641 | print('Reading in training data...')
642 | training_data, X, y1, y2 = process_training_data(data_dir, training_file, ex_generate_features)
643 |
644 | if ex_create_transforms:
645 | transforms = create_transforms(X, transforms, categories=categories)
646 |
647 | if ex_visualize_variable_relationships:
648 | print('Visualizing pairwise relationships...')
649 | # scatter, reg, resid, kde, hex
650 | visualize_variable_relationships(training_data, 'scatter', ['season', 'weather'], ['casual', 'registered'])
651 |
652 | if ex_visualize_feature_distributions:
653 | print('Visualizing feature distributions...')
654 | # hist, kde
655 | visualize_feature_distributions(training_data, 'hist', plot_size)
656 |
657 | if ex_visualize_correlations:
658 | print('Visualizing feature correlations...')
659 | visualize_correlations(training_data)
660 |
661 | if ex_visualize_sequential_relationships:
662 | print('Visualizing sequential relationships...')
663 | visualize_sequential_relationships(training_data, plot_size)
664 |
665 | if ex_visualize_principal_components:
666 | print('Visualizing principal components...')
667 | visualize_principal_components(X, y1, y2, model_type, num_components, transforms)
668 |
669 | if ex_load_model:
670 | print('Loading model from disk...')
671 | model = load_model(data_dir + model_file)
672 |
673 | if ex_train_model:
674 | print('Training model on full data set...')
675 | model = train_model(X, y1, model_type, algorithm, transforms)
676 | model2 = train_model(X, y2, model_type, algorithm, transforms)
677 |
678 | print('Calculating training score...')
679 | model_score = score(X, y1, model, transforms)
680 |         print('Casual training score = {0}'.format(model_score))
681 |         model_score2 = score(X, y2, model2, transforms)
682 |         print('Registered training score = {0}'.format(model_score2))
683 |
684 | if ex_visualize_feature_importance and (algorithm == 'forest' or algorithm == 'boost'):
685 | print('Generating feature importance plot...')
686 | visualize_feature_importance(training_data, model, column_offset)
687 | visualize_feature_importance(training_data, model2, column_offset)
688 |
689 | if ex_cross_validate:
690 | print('Performing cross-validation...')
691 | cross_validation_score = time_series_cross_validate(X, y1, model_type, algorithm, transforms,
692 | forecast_range=258, plot=True)
693 |         print('Casual cross-validation score = {0}'.format(cross_validation_score))
694 | cross_validation_score2 = time_series_cross_validate(X, y2, model_type, algorithm, transforms,
695 | forecast_range=258, plot=True)
696 |         print('Registered cross-validation score = {0}'.format(cross_validation_score2))
697 |
698 | if ex_plot_learning_curve:
699 | print('Generating learning curve...')
700 | plot_learning_curve(X, y1, model_type, algorithm, metric, transforms)
701 |
702 | if ex_parameter_search:
703 | print('Performing hyper-parameter grid search...')
704 | best_model, best_params, best_score = parameter_search(X, y1, model_type, algorithm, metric, transforms)
705 | print('Best model = ', best_model)
706 | print('Best params = ', best_params)
707 | print('Best score = ', best_score)
708 |
709 | if ex_train_ensemble:
710 | print('Creating an ensemble of models...')
711 | model = train_ensemble(X, y1, model_type, algorithm, transforms)
712 |
713 | print('Calculating ensemble training score...')
714 | ensemble_score = score(X, y1, model, transforms)
715 |         print('Ensemble training score = {0}'.format(ensemble_score))
716 |
717 | if ex_save_model:
718 | print('Saving model to disk...')
719 | save_model(model, data_dir + model_file)
720 |
721 | if ex_create_submission:
722 | print('Reading in test data...')
723 | test_data, X_test = process_test_data(data_dir, test_file, ex_generate_features)
724 |
725 | print('Predicting test data...')
726 | y_est_1 = predict(X_test, model, transforms)
727 | y_est_2 = predict(X_test, model2, transforms)
728 | y_est = y_est_1 + y_est_2
729 |
730 | print('Creating submission file...')
731 | create_submission(test_data, y_est, data_dir, submit_file)
732 |
733 | print('Process complete.')
734 |
735 |
736 | if __name__ == "__main__":
737 | main()
--------------------------------------------------------------------------------
/old/Expedia/README.md:
--------------------------------------------------------------------------------
1 | # Expedia Hotel Recommendations
2 |
3 | View the competition details here.
4 |
5 | I started looking at this but didn't end up working on it due to time constraints. It looked like an interesting problem that seemed to require some creativity to make progress on. The included script is not original work; it was mostly copied from one of the "starter" scripts for the competition.
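
For reference, the included script scores its candidate hotel clusters with MAP@5. Here is a small worked example of that metric; `apk` below is a simplified restatement of the function in `script.py`, and the cluster ids are made up.

```python
def apk(actual, predicted, k=5):
    # average precision at k for a single row of predictions
    predicted = predicted[:k]
    score, hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k) if actual else 0.0


# the one relevant cluster (42) appears at rank 2, so the score is 1/2;
# mapk in script.py simply averages this quantity over all test rows
print(apk(actual=[42], predicted=[17, 42, 91], k=5))  # 0.5
```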
--------------------------------------------------------------------------------
/old/Expedia/script.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import random
4 | import operator
5 |
6 |
7 | def apk(actual, predicted, k=10):
8 | if len(predicted) > k:
9 | predicted = predicted[:k]
10 |
11 | score = 0.0
12 | num_hits = 0.0
13 |
14 | for i, p in enumerate(predicted):
15 | if p in actual and p not in predicted[:i]:
16 | num_hits += 1.0
17 | score += num_hits / (i + 1.0)
18 |
19 | if not actual:
20 | return 0.0
21 |
22 | return score / min(len(actual), k)
23 |
24 |
25 | def mapk(actual, predicted, k=10):
26 |     return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])
27 |
28 |
29 | def make_key(items):
30 | return '_'.join([str(i) for i in items])
31 |
32 |
33 | def generate_exact_matches(row, match_cols, groups):
34 | index = tuple([row[t] for t in match_cols])
35 | try:
36 | group = groups.get_group(index)
37 | except Exception:
38 | return []
39 |
40 | clus = list(set(group.hotel_cluster))
41 |
42 | return clus
43 |
44 |
45 | def f5(seq, idfun=None):
46 | if idfun is None:
47 | def idfun(x): return x
48 |
49 | seen = {}
50 | result = []
51 | for item in seq:
52 | marker = idfun(item)
53 | if marker in seen: continue
54 | seen[marker] = 1
55 | result.append(item)
56 |
57 | return result
58 |
59 |
60 | def generate_submission(data_dir, preds, test):
61 | write_p = [' '.join([str(l) for l in p]) for p in preds]
62 | write_frame = ['{0},{1}'.format(test['id'].iloc[i], write_p[i]) for i in range(len(preds))]
63 | write_frame = ['id,hotel_cluster'] + write_frame
64 |
65 | with open(data_dir + 'predictions.csv', 'w+') as f:
66 | f.write('\n'.join(write_frame))
67 |
68 |
69 | print('Loading data sets...')
70 | data_dir = '/home/john/data/expedia/'
71 |
72 | destinations = pd.read_csv(data_dir + 'destinations.csv')
73 |
74 | train = pd.read_csv(data_dir + 'train.csv',
75 | usecols=['date_time', 'user_location_country', 'user_location_region', 'user_location_city',
76 | 'user_id', 'is_booking', 'orig_destination_distance',
77 | 'hotel_cluster', 'srch_ci', 'srch_co', 'srch_destination_id',
78 | 'hotel_continent', 'hotel_country', 'hotel_market'],
79 | dtype={'date_time': np.str_, 'user_location_country': np.int8,
80 | 'user_location_region': np.int8, 'user_location_city': np.int8,
81 | 'user_id': np.int32, 'is_booking': np.int8,
82 | 'orig_destination_distance': np.float64,
83 | 'hotel_cluster': np.int8,
84 | 'srch_ci': np.str_, 'srch_co': np.str_,
85 | 'srch_destination_id': np.int32,
86 | 'hotel_continent': np.int8,
87 | 'hotel_country': np.int8,
88 | 'hotel_market': np.int8})
89 |
90 | test = pd.read_csv(data_dir + 'test.csv',
91 | usecols=['id', 'date_time', 'user_location_country', 'user_location_region',
92 | 'user_location_city',
93 | 'user_id', 'orig_destination_distance',
94 | 'srch_ci', 'srch_co', 'srch_destination_id',
95 | 'hotel_continent', 'hotel_country', 'hotel_market'],
96 | dtype={'id': np.int32, 'date_time': np.str_, 'user_location_country': np.int8,
97 | 'user_location_region': np.int8, 'user_location_city': np.int8,
98 | 'user_id': np.int32,
99 | 'orig_destination_distance': np.float64, 'srch_ci': np.str_, 'srch_co': np.str_,
100 | 'srch_destination_id': np.int32,
101 | 'hotel_continent': np.int8,
102 | 'hotel_country': np.int8,
103 | 'hotel_market': np.int8})
104 |
105 | print('Generating first set of predictions...')
106 |
107 | # add year and month features to the training data
108 | train['date_time'] = pd.to_datetime(train['date_time'])
109 | train['year'] = train['date_time'].dt.year
110 | train['month'] = train['date_time'].dt.month
111 |
112 | # generate a list of randomly selected unique user ids
113 | unique_users = train.user_id.unique()
114 | sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 10000))]
115 | sel_train = train[train.user_id.isin(sel_user_ids)]
116 |
117 | # create sampled training and validation data sets
118 | t1 = sel_train[((sel_train.year == 2012) | (sel_train.year == 2013))]
119 | t2 = sel_train[(sel_train.year == 2014)]
120 | t2 = t2[t2.is_booking == True]
121 |
122 | # skip sampling and use full data set
123 | # t1 = train
124 | # t2 = test
125 |
126 | # identify the most common clusters
127 | most_common_clusters = list(train.hotel_cluster.value_counts().head().index)
128 |
129 | # match clusters to search destination
130 | match_cols = ['srch_destination_id']
131 | cluster_cols = match_cols + ['hotel_cluster']
132 | groups = t1.groupby(cluster_cols)
133 |
134 | top_clusters = {}
135 | for name, group in groups:
136 | clicks = len(group.is_booking[group.is_booking == False])
137 | bookings = len(group.is_booking[group.is_booking == True])
138 |
139 | score = bookings + .15 * clicks
140 |
141 | clus_name = make_key(name[:len(match_cols)])
142 |
143 | if clus_name not in top_clusters:
144 | top_clusters[clus_name] = {}
145 |
146 | top_clusters[clus_name][name[-1]] = score
147 |
148 | # find the top 5 for each search destination
149 | cluster_dict = {}
150 | for n in top_clusters:
151 | tc = top_clusters[n]
152 | top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
153 | cluster_dict[n] = top
154 |
155 | # generate predictions based on the top clusters per search destination
156 | preds = []
157 | for index, row in t2.iterrows():
158 | key = make_key([row[m] for m in match_cols])
159 |
160 | if key in cluster_dict:
161 | preds.append(cluster_dict[key])
162 | else:
163 | preds.append([])
164 |
165 | print('Generating second set of predictions...')
166 |
167 | # use data leak to match users between train and test data
168 | match_cols = ['user_location_country', 'user_location_region', 'user_location_city', 'hotel_market', 'orig_destination_distance']
169 | groups = t1.groupby(match_cols)
170 |
171 | exact_matches = []
172 | for i in range(t2.shape[0]):
173 | exact_matches.append(generate_exact_matches(t2.iloc[i], match_cols, groups))
174 |
175 | # generate predictions
176 | full_preds = [f5(exact_matches[p] + preds[p] + most_common_clusters)[:5] for p in range(len(preds))]
177 |
178 | # evaluate the accuracy of this solution
179 | print('Score = ' + str(mapk([[l] for l in t2['hotel_cluster']], full_preds, k=5)))
180 |
181 | # print('Writing submission file...')
182 | # generate_submission(data_dir, full_preds, t2)
183 |
184 | print('Script complete.')
185 |
--------------------------------------------------------------------------------
/old/ForestCover/README.md:
--------------------------------------------------------------------------------
1 | # Forest Cover Type Prediction
2 |
3 | View the competition details here.
4 |
5 | This directory includes all of the code used for the competition. Since the forest cover challenge is a fairly easy problem designed for beginners, I used this competition to refine my classification script a bit and learn some scikit-learn APIs that I wasn't overly familiar with.
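
For reference, here is a minimal sketch of the kind of scikit-learn classification pipeline that `forest.py` wires together by hand: a scaler and a variance-threshold selector feeding a random forest. It uses the current `sklearn.model_selection` and `sklearn.pipeline` modules rather than the older modules the script imports, and the data below is synthetic rather than the competition data.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.rand(300, 10)            # stand-in for the cartographic features
y = rng.randint(1, 8, size=300)  # stand-in for the seven cover-type labels

# scaler -> selector -> classifier, similar to the transform chain in forest.py
pipeline = make_pipeline(StandardScaler(),
                         VarianceThreshold(threshold=0.0),
                         RandomForestClassifier(n_estimators=100, random_state=0))

print(cross_val_score(pipeline, X, y, cv=3).mean())
```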
--------------------------------------------------------------------------------
/old/ForestCover/forest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pickle
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | import seaborn as sb
8 | from sklearn import *
9 | from sklearn.ensemble import *
10 | from sklearn.grid_search import *
11 | from sklearn.feature_selection import *
12 | from sklearn.learning_curve import *
13 |
14 |
15 | def performance_test():
16 | """
17 | Test NumPy performance. Should run in less than a second on most machines.
18 | """
19 | A = np.random.random((2000, 2000))
20 | B = np.random.random((2000, 2000))
21 | t = time.time()
22 | np.dot(A, B)
23 | print(time.time()-t)
24 |
25 |
26 | def load(filename):
27 | """
28 | Load a previously training model from disk.
29 | """
30 | model_file = open(filename, 'rb')
31 | model = pickle.load(model_file)
32 | model_file.close()
33 | return model
34 |
35 |
36 | def save(model, filename):
37 | """
38 | Persist a trained model to disk.
39 | """
40 | model_file = open(filename, 'wb')
41 | pickle.dump(model, model_file)
42 | model_file.close()
43 |
44 |
45 | def generate_features(data):
46 | """
47 | Generates new derived features to add to the data set for model training.
48 | """
49 |     data['Aspect_Shifted'] = data['Aspect'].map(lambda x: x + 180 if x + 180 < 360 else x - 180)
50 | data['High_Water'] = data['Vertical_Distance_To_Hydrology'] < 0
51 | data['EVDtH'] = data['Elevation'] - data['Vertical_Distance_To_Hydrology']
52 | data['EHDtH'] = data['Elevation'] - data['Horizontal_Distance_To_Hydrology'] * 0.2
53 | data['DTH'] = (data['Horizontal_Distance_To_Hydrology'] ** 2 + data['Vertical_Distance_To_Hydrology'] ** 2) ** 0.5
54 | data['Hydro_Fire_1'] = data['Horizontal_Distance_To_Hydrology'] + data['Horizontal_Distance_To_Fire_Points']
55 | data['Hydro_Fire_2'] = abs(data['Horizontal_Distance_To_Hydrology'] - data['Horizontal_Distance_To_Fire_Points'])
56 | data['Hydro_Road_1'] = abs(data['Horizontal_Distance_To_Hydrology'] + data['Horizontal_Distance_To_Roadways'])
57 | data['Hydro_Road_2'] = abs(data['Horizontal_Distance_To_Hydrology'] - data['Horizontal_Distance_To_Roadways'])
58 | data['Fire_Road_1'] = abs(data['Horizontal_Distance_To_Fire_Points'] + data['Horizontal_Distance_To_Roadways'])
59 | data['Fire_Road_2'] = abs(data['Horizontal_Distance_To_Fire_Points'] - data['Horizontal_Distance_To_Roadways'])
60 |
61 | return data
62 |
63 |
64 | def process_training_data(filename, create_features):
65 | """
66 | Reads in training data and prepares numpy arrays.
67 | """
68 | training_data = pd.read_csv(filename, sep=',')
69 | num_features = len(training_data.columns) - 1
70 |
71 | # move the label to the first position and drop the ID column
72 | cols = training_data.columns.tolist()
73 | cols = cols[-1:] + cols[1:num_features]
74 | training_data = training_data[cols]
75 |
76 | if create_features:
77 | training_data = generate_features(training_data)
78 |
79 | num_features = len(training_data.columns)
80 | X = training_data.iloc[:, 1:num_features].values
81 | y = training_data.iloc[:, 0].values
82 |
83 | return training_data, X, y
84 |
85 |
86 | def create_transforms(X, standardize, whiten, select):
87 | """
88 | Creates transform objects to apply before training or scoring.
89 | """
90 | # create a standardization transform
91 | scaler = None
92 | if standardize:
93 | scaler = preprocessing.StandardScaler()
94 | scaler.fit(X)
95 |
96 | # create a PCA transform
97 | pca = None
98 | if whiten:
99 | pca = decomposition.PCA(whiten=True)
100 | pca.fit(X)
101 |
102 | # create a feature selection transform
103 | selector = None
104 | if select:
105 | selector = VarianceThreshold(threshold=0.0)
106 | selector.fit(X)
107 |
108 | return scaler, pca, selector
109 |
110 |
111 | def apply_transforms(X, scaler, pca, selector):
112 | """
113 | Applies pre-computed transformations to a data set.
114 | """
115 | if scaler is not None:
116 | X = scaler.transform(X)
117 |
118 | if pca is not None:
119 | X = pca.transform(X)
120 |
121 | if selector is not None:
122 | X = selector.transform(X)
123 |
124 | return X
125 |
126 |
127 | def visualize(training_data, X, y, pca):
128 | """
129 | Computes statistics describing the data and creates some visualizations
130 | that attempt to highlight the underlying structure.
131 |
132 | Note: Use '%matplotlib inline' and '%matplotlib qt' at the IPython console
133 | to switch between display modes.
134 | """
135 |
136 | print('Generating individual feature histograms...')
137 | num_features = len(training_data.columns)
138 |     num_plots = num_features // 16 if num_features % 16 == 0 else num_features // 16 + 1
139 |     for i in range(num_plots):
140 |         fig, ax = plt.subplots(4, 4, figsize=(20, 10))
141 |         for j in range(16):
142 |             index = (i * 16) + j
143 |             if index == 0:
144 |                 ax[j // 4, j % 4].hist(y, bins=30)
145 |                 ax[j // 4, j % 4].set_title(training_data.columns[index])
146 |                 ax[j // 4, j % 4].set_xlim((min(y), max(y)))
147 |             elif index < num_features:
148 |                 ax[j // 4, j % 4].hist(X[:, index - 1], bins=30)
149 |                 ax[j // 4, j % 4].set_title(training_data.columns[index])
150 |                 ax[j // 4, j % 4].set_xlim((min(X[:, index - 1]), max(X[:, index - 1])))
151 | fig.tight_layout()
152 |
153 | print('Generating correlation matrix...')
154 | fig2, ax2 = plt.subplots(figsize=(16, 10))
155 | colormap = sb.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF", "#FFE6F8", "#C71585", "#8B0000"], as_cmap=True)
156 | sb.corrplot(training_data, annot=False, sig_stars=False, diag_names=False, cmap=colormap, ax=ax2)
157 | fig2.tight_layout()
158 |
159 | if pca is not None:
160 | print('Generating principal component plots...')
161 | X = pca.transform(X)
162 | class_count = np.count_nonzero(np.unique(y))
163 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
164 |
165 | fig3, ax3 = plt.subplots(figsize=(16, 10))
166 | for i in range(class_count):
167 | class_idx = i + 1 # add 1 if class labels start at 1 instead of 0
168 | ax3.scatter(X[y == class_idx, 0], X[y == class_idx, 1], c=colors[i], label=class_idx)
169 | ax3.set_title('First & Second Principal Components')
170 | ax3.legend()
171 | fig3.tight_layout()
172 |
173 | fig4, ax4 = plt.subplots(figsize=(16, 10))
174 | for i in range(class_count):
175 | class_idx = i + 1 # add 1 if class labels start at 1 instead of 0
176 | ax4.scatter(X[y == class_idx, 1], X[y == class_idx, 2], c=colors[i], label=class_idx)
177 | ax4.set_title('Second & Third Principal Components')
178 | ax4.legend()
179 | fig4.tight_layout()
180 |
181 | fig5, ax5 = plt.subplots(figsize=(16, 10))
182 | for i in range(class_count):
183 | class_idx = i + 1 # add 1 if class labels start at 1 instead of 0
184 | ax5.scatter(X[y == class_idx, 2], X[y == class_idx, 3], c=colors[i], label=class_idx)
185 | ax5.set_title('Third & Fourth Principal Components')
186 | ax5.legend()
187 | fig5.tight_layout()
188 |
189 |
190 | def define_model(algorithm):
191 | """
192 | Defines and returns a model object of the designated type.
193 | """
194 | model = None
195 |
196 | if algorithm == 'bayes':
197 | model = naive_bayes.GaussianNB()
198 | elif algorithm == 'logistic':
199 | model = linear_model.LogisticRegression(penalty='l2', C=1.0)
200 | elif algorithm == 'svm':
201 | model = svm.SVC(C=1.0, kernel='rbf', shrinking=True, probability=False, cache_size=200)
202 | elif algorithm == 'sgd':
203 | model = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
204 | n_iter=1000, shuffle=False, n_jobs=-1)
205 | elif algorithm == 'forest':
206 | model = RandomForestClassifier(n_estimators=10, criterion='gini', max_features='auto', max_depth=None,
207 | min_samples_split=2, min_samples_leaf=1, max_leaf_nodes=None, n_jobs=-1)
208 | elif algorithm == 'boost':
209 | model = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0,
210 | min_samples_split=2, min_samples_leaf=1, max_depth=3, max_features=None,
211 | max_leaf_nodes=None)
212 | else:
213 | print('No model defined for ' + algorithm)
214 | exit()
215 |
216 | return model
217 |
218 |
219 | def train(training_data, X, y, algorithm, scaler, pca, selector):
220 | """
221 | Trains a new model using the training data.
222 | """
223 | t0 = time.time()
224 | model = define_model(algorithm)
225 | X = apply_transforms(X, scaler, pca, selector)
226 | model.fit(X, y)
227 | t1 = time.time()
228 | print('Model trained in {0:3f} s.'.format(t1 - t0))
229 |
230 | if algorithm == 'forest' or algorithm == 'boost':
231 | print('Generating feature importance plot...')
232 | fig, ax = plt.subplots(figsize=(16, 10))
233 |
234 | importance = model.feature_importances_
235 | importance = 100.0 * (importance / importance.max())
236 | importance = importance[0:30]
237 | sorted_idx = np.argsort(importance)
238 | pos = np.arange(sorted_idx.shape[0])
239 | ax.set_title('Variable Importance')
240 | ax.barh(pos, importance[sorted_idx], align='center')
241 | ax.set_yticks(pos)
242 | ax.set_yticklabels(training_data.columns[sorted_idx + 1])
243 | ax.set_xlabel('Relative Importance')
244 |
245 | fig.tight_layout()
246 |
247 | return model
248 |
249 |
250 | def predict(X, model, scaler, pca, selector):
251 | """
252 | Predicts the class label.
253 | """
254 | X = apply_transforms(X, scaler, pca, selector)
255 | y_est = model.predict(X)
256 |
257 | return y_est
258 |
259 |
260 | def predict_probability(X, model, scaler, pca, selector):
261 | """
262 | Predicts the class probabilities.
263 | """
264 | X = apply_transforms(X, scaler, pca, selector)
265 | y_prob = model.predict_proba(X)[:, 1]
266 |
267 | return y_prob
268 |
269 |
270 | def score(X, y, model, scaler, pca, selector):
271 | """
272 | Scores the model's performance and returns the result.
273 | """
274 | X = apply_transforms(X, scaler, pca, selector)
275 |
276 | return model.score(X, y)
277 |
278 |
279 | def cross_validate(X, y, algorithm, scaler, pca, selector, metric):
280 | """
281 | Performs cross-validation to estimate the true performance of the model.
282 | """
283 | model = define_model(algorithm)
284 | X = apply_transforms(X, scaler, pca, selector)
285 |
286 | t0 = time.time()
287 | scores = cross_validation.cross_val_score(model, X, y, scoring=metric, cv=3, n_jobs=-1)
288 | t1 = time.time()
289 | print('Cross-validation completed in {0:3f} s.'.format(t1 - t0))
290 |
291 | return np.mean(scores)
292 |
293 |
294 | def plot_learning_curve(X, y, algorithm, scaler, pca, selector, metric):
295 | """
296 | Plots a learning curve showing model performance against both training and
297 | validation data sets as a function of the number of training samples.
298 | """
299 | model = define_model(algorithm)
300 | X = apply_transforms(X, scaler, pca, selector)
301 |
302 | t0 = time.time()
303 | train_sizes, train_scores, test_scores = learning_curve(model, X, y, scoring=metric, cv=3, n_jobs=-1)
304 | train_scores_mean = np.mean(train_scores, axis=1)
305 | train_scores_std = np.std(train_scores, axis=1)
306 | test_scores_mean = np.mean(test_scores, axis=1)
307 | test_scores_std = np.std(test_scores, axis=1)
308 |
309 | fig, ax = plt.subplots(figsize=(16, 10))
310 | ax.set_title('Learning Curve')
311 | ax.set_xlabel('Training Examples')
312 | ax.set_ylabel('Score')
313 | ax.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
314 | alpha=0.1, color='r')
315 | ax.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
316 |                     alpha=0.1, color='g')
317 |     ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training score')
318 |     ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Cross-validation score')
319 | ax.legend(loc='best')
320 | fig.tight_layout()
321 | t1 = time.time()
322 | print('Learning curve generated in {0:3f} s.'.format(t1 - t0))
323 |
324 |
325 | def parameter_search(X, y, algorithm, scaler, pca, selector, metric):
326 | """
327 | Performs an exhaustive search over the specified model parameters.
328 | """
329 | model = define_model(algorithm)
330 | X = apply_transforms(X, scaler, pca, selector)
331 |
332 | param_grid = None
333 | if algorithm == 'logistic':
334 | param_grid = [{'penalty': ['l1', 'l2'], 'C': [0.1, 0.3, 1.0, 3.0]}]
335 | elif algorithm == 'svm':
336 | param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
337 | {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]
338 | elif algorithm == 'sgd':
339 | param_grid = [{'loss': ['hinge', 'log', 'modified_huber'], 'penalty': ['l1', 'l2'],
340 |                        'alpha': [0.0001, 0.001, 0.01], 'n_iter': [100, 1000, 10000]}]
341 | elif algorithm == 'forest':
342 | param_grid = [{'n_estimators': [10, 30, 100, 300], 'criterion': ['gini', 'entropy'],
343 | 'max_features': ['auto', 'log2', None], 'max_depth': [3, 5, 7, None],
344 | 'min_samples_split': [2, 10, 30, 100], 'min_samples_leaf': [1, 3, 10, 30, 100]}]
345 | elif algorithm == 'boost':
346 | param_grid = [{'learning_rate': [0.1, 0.3, 1.0], 'subsample': [1.0, 0.9, 0.7, 0.5],
347 | 'n_estimators': [100, 300, 1000], 'max_features': ['auto', 'log2', None],
348 | 'max_depth': [3, 5, 7, None], 'min_samples_split': [2, 10, 30, 100],
349 | 'min_samples_leaf': [1, 3, 10, 30, 100]}]
350 |
351 | t0 = time.time()
352 | grid_estimator = GridSearchCV(model, param_grid, scoring=metric, cv=3, n_jobs=-1)
353 | grid_estimator.fit(X, y)
354 | t1 = time.time()
355 | print('Grid search completed in {0:3f} s.'.format(t1 - t0))
356 |
357 | return grid_estimator.best_estimator_, grid_estimator.best_params_, grid_estimator.best_score_
358 |
359 |
360 | def train_ensemble(X, y, algorithm, scaler, pca, selector):
361 | """
362 | Creates an ensemble of many models together.
363 | """
364 | model = define_model(algorithm)
365 | X = apply_transforms(X, scaler, pca, selector)
366 |
367 | t0 = time.time()
368 | ensemble_model = BaggingClassifier(base_estimator=model, n_estimators=10, max_samples=1.0, max_features=1.0,
369 | bootstrap=True, bootstrap_features=False)
370 | ensemble_model.fit(X, y)
371 | t1 = time.time()
372 | print('Ensemble training completed in {0:3f} s.'.format(t1 - t0))
373 |
374 | return ensemble_model
375 |
376 |
377 | def process_test_data(filename, create_features):
378 | """
379 | Reads in the test data set and prepares it for prediction by the model.
380 | """
381 | test_data = pd.read_csv(filename, sep=',')
382 |
383 | if create_features:
384 | test_data = generate_features(test_data)
385 |
386 | num_features = len(test_data.columns)
387 | X_test = test_data.iloc[:, 1:num_features].values
388 |
389 | return test_data, X_test
390 |
391 |
392 | def create_submission(test_data, y_est, submit_file):
393 | """
394 | Create a new submission file with test data and predictions generated by the model.
395 | """
396 | submit = pd.DataFrame(columns=['Id', 'Cover_Type'])
397 | submit['Id'] = test_data['Id']
398 | submit['Cover_Type'] = y_est
399 | submit.to_csv(submit_file, sep=',', index=False, index_label=False)
400 |
401 |
402 | def main():
403 | load_training_data = True
404 | create_features = False
405 | create_visualizations = False
406 | load_model = False
407 | train_model = True
408 | create_learning_curve = False
409 | perform_grid_search = False
410 | perform_ensemble = False
411 | save_model = False
412 | create_submission_file = False
413 |
414 | code_dir = '/home/john/git/kaggle/ForestCover/'
415 | data_dir = '/home/john/data/forest-cover/'
416 | training_file = 'train.csv'
417 | test_file = 'test.csv'
418 | submit_file = 'submission.csv'
419 | model_file = 'model.pkl'
420 |
421 | algorithm = 'forest' # bayes, logistic, svm, sgd, forest, boost
422 |     metric = None  # accuracy, f1, roc_auc, mean_absolute_error, mean_squared_error, r2_score
423 | standardize = False
424 | whiten = False
425 | select = True
426 |
427 | training_data = None
428 | X = None
429 | y = None
430 | scaler = None
431 | pca = None
432 | selector = None
433 | model = None
434 | ensemble_model = None
435 |
436 | os.chdir(code_dir)
437 |
438 | print('Starting process...')
439 | print('Algorithm={0}, Create={1}, Select={2}, Standardize={3}, Whiten={4}'.format(
440 | algorithm, create_features, select, standardize, whiten))
441 |
442 | if load_training_data:
443 | print('Reading in training data...')
444 | training_data, X, y = process_training_data(data_dir + training_file, create_features)
445 |
446 | if standardize or whiten or select:
447 | print('Creating data transforms...')
448 | scaler, pca, selector = create_transforms(X, standardize, whiten, select)
449 |
450 | if create_visualizations:
451 | print('Creating visualizations...')
452 | visualize(training_data, X, y, pca)
453 |
454 | if load_model:
455 | print('Loading model from disk...')
456 | model = load(data_dir + model_file)
457 |
458 | if train_model:
459 | print('Training model on full data set...')
460 | model = train(training_data, X, y, algorithm, scaler, pca, selector)
461 |
462 | print('Calculating training score...')
463 | model_score = score(X, y, model, scaler, pca, selector)
464 |         print('Training score =', model_score)
465 |
466 | if create_learning_curve:
467 | print('Generating learning curve...')
468 | plot_learning_curve(X, y, algorithm, scaler, pca, selector, metric)
469 | else:
470 | print('Performing cross-validation...')
471 | cross_val_score = cross_validate(X, y, algorithm, scaler, pca, selector, metric)
472 |             print('Cross-validation score =', cross_val_score)
473 |
474 | if perform_grid_search:
475 | print('Performing hyper-parameter grid search...')
476 | best_model, best_params, best_score = parameter_search(X, y, algorithm, scaler, pca, selector, metric)
477 | print('Best model = ', best_model)
478 | print('Best params = ', best_params)
479 | print('Best score = ', best_score)
480 |
481 | if perform_ensemble:
482 | print('Creating an ensemble of models...')
483 | ensemble_model = train_ensemble(X, y, algorithm, scaler, pca, selector)
484 |
485 | print('Calculating ensemble training score...')
486 | ensemble_model_score = score(X, y, ensemble_model, scaler, pca, selector)
487 |         print('Ensemble training score =', ensemble_model_score)
488 |
489 | if save_model:
490 | print('Saving model to disk...')
491 | save(model, data_dir + model_file)
492 |
493 | if create_submission_file:
494 | print('Reading in test data...')
495 | test_data, X_test = process_test_data(data_dir + test_file, create_features)
496 |
497 | print('Predicting test data...')
498 | if perform_ensemble:
499 | y_est = predict(X_test, ensemble_model, scaler, pca, selector)
500 | else:
501 | y_est = predict(X_test, model, scaler, pca, selector)
502 |
503 | print('Creating submission file...')
504 | create_submission(test_data, y_est, data_dir + submit_file)
505 |
506 | print('Process complete.')
507 |
508 |
509 | if __name__ == "__main__":
510 | main()
--------------------------------------------------------------------------------
/old/HiggsBoson/README.md:
--------------------------------------------------------------------------------
1 | # Higgs Boson Machine Learning Challenge
2 |
3 |
4 |
5 | View the competition details here.
6 |
7 | This directory includes the code I used to run experiments for the competition. Despite starting only a few weeks before the deadline and having very limited time to invest, I managed to place in the top 25%.
8 |
9 | I used the Anaconda distribution of Python with the IPython kernel and the PyCharm IDE to run experiments. I also installed and configured several additional dependencies (xgboost and pylearn2). There are three scripts of interest:
10 | 
11 | - higgs.py - Primarily based on scikit-learn solutions
12 | - higgs_adv.py - Switched to a Linux VM and incorporated the xgboost library (see the sketch below)
13 | - higgs_nn.py - Set up pylearn2 and started experimenting with deep learning neural nets (unfortunately I ran out of time before making any significant progress here)
14 |
15 | The scripts are fairly basic due to time constraints but well-modularized and easy to follow. Enjoy!
16 |
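For reference, the core xgboost pattern used in higgs_adv.py (build a weighted DMatrix, train with a ranking-friendly objective, predict raw scores, then threshold by percentile) looks roughly like the sketch below. The arrays and several parameter values here are illustrative stand-ins; see train_xgb and predict in higgs_adv.py for the ones actually used.

```python
# Illustrative sketch of the xgboost pattern used in higgs_adv.py.
import numpy as np
import xgboost as xgb

# Hypothetical stand-ins for the real features, 0/1 labels, and event weights.
X = np.random.rand(1000, 30)
y = np.random.randint(0, 2, 1000)
w = np.random.rand(1000)

dtrain = xgb.DMatrix(X, label=y, weight=w, missing=-999.0)
params = {'objective': 'binary:logitraw',
          'scale_pos_weight': w[y == 0].sum() / w[y == 1].sum(),
          'eta': 0.08, 'max_depth': 7, 'subsample': 0.8,
          'eval_metric': 'auc'}
model = xgb.train(params, dtrain, num_boost_round=128)

# Convert raw scores to binary predictions using a percentile cutoff.
y_score = model.predict(xgb.DMatrix(X, missing=-999.0))
y_est = y_score > np.percentile(y_score, 85)
```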
--------------------------------------------------------------------------------
/old/HiggsBoson/Resources/ATLAS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/HiggsBoson/Resources/ATLAS.png
--------------------------------------------------------------------------------
/old/HiggsBoson/Resources/documentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/HiggsBoson/Resources/documentation.pdf
--------------------------------------------------------------------------------
/old/HiggsBoson/auto_l1.yaml:
--------------------------------------------------------------------------------
1 | !obj:pylearn2.train.Train {
2 | dataset: &train !obj:pylearn2.datasets.csv_dataset.CSVDataset {
3 | path: '%(data_dir)s/combined_nn.csv',
4 | task: 'classification',
5 | one_hot: True,
6 | expect_labels: True,
7 | expect_headers: True,
8 | delimiter: ',',
9 | start: %(train_start)i,
10 | stop: %(train_stop)i
11 | },
12 | model: !obj:pylearn2.models.autoencoder.DenoisingAutoencoder {
13 | nvis : %(num_features)i,
14 | nhid : %(hid_l1)i,
15 | irange : 0.05,
16 | corruptor: !obj:pylearn2.corruption.BinomialCorruptor {
17 | corruption_level: .2,
18 | },
19 | act_enc: "tanh",
20 | act_dec: null, # Linear activation on the decoder side
21 | },
22 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD {
23 | learning_rate : 1e-3,
24 | batch_size : %(batch_size)i,
25 | monitoring_batches : %(monitoring_batches)i,
26 | monitoring_dataset : *train,
27 | cost : !obj:pylearn2.costs.autoencoder.MeanSquaredReconstructionError {},
28 | termination_criterion : !obj:pylearn2.termination_criteria.EpochCounter {
29 | max_epochs: %(max_epochs)i,
30 | },
31 | },
32 | save_path: '%(data_dir)s/auto_l1.pkl',
33 | save_freq: 1
34 | }
35 |
--------------------------------------------------------------------------------
/old/HiggsBoson/auto_l2.yaml:
--------------------------------------------------------------------------------
1 | !obj:pylearn2.train.Train {
2 | dataset: &train !obj:pylearn2.datasets.transformer_dataset.TransformerDataset {
3 | raw: !obj:pylearn2.datasets.csv_dataset.CSVDataset {
4 | path: '%(data_dir)s/combined_nn.csv',
5 | task: 'classification',
6 | one_hot: True,
7 | expect_labels: True,
8 | expect_headers: True,
9 | delimiter: ',',
10 | start: %(train_start)i,
11 | stop: %(train_stop)i
12 | },
13 | transformer: !pkl: '%(data_dir)s/auto_l1.pkl'
14 | },
15 | model: !obj:pylearn2.models.autoencoder.DenoisingAutoencoder {
16 | nvis : %(num_features)i,
17 | nhid : %(hid_l2)i,
18 | irange : 0.05,
19 | corruptor: !obj:pylearn2.corruption.BinomialCorruptor {
20 | corruption_level: .3,
21 | },
22 | act_enc: "tanh",
23 |         act_dec: null, # Linear activation on the decoder side
24 | },
25 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD {
26 | learning_rate : 1e-3,
27 | batch_size : %(batch_size)i,
28 | monitoring_batches : %(monitoring_batches)i,
29 | monitoring_dataset : *train,
30 | cost : !obj:pylearn2.costs.autoencoder.MeanSquaredReconstructionError {},
31 | termination_criterion : !obj:pylearn2.termination_criteria.EpochCounter {
32 | max_epochs: %(max_epochs)i,
33 | },
34 | },
35 | save_path: '%(data_dir)s/auto_l2.pkl',
36 | save_freq: 1
37 | }
38 |
--------------------------------------------------------------------------------
/old/HiggsBoson/auto_mlp.yaml:
--------------------------------------------------------------------------------
1 | !obj:pylearn2.train.Train {
2 | dataset: &train !obj:pylearn2.datasets.csv_dataset.CSVDataset {
3 | path: '%(data_dir)s/training_nn.csv',
4 | task: 'classification',
5 | one_hot: True,
6 | expect_labels: True,
7 | expect_headers: True,
8 | delimiter: ',',
9 | start: %(train_start)i,
10 | stop: %(train_stop)i
11 | },
12 | model: !obj:pylearn2.models.mlp.MLP {
13 | batch_size: %(batch_size)i,
14 | layers: [
15 | !obj:pylearn2.models.mlp.PretrainedLayer {
16 | layer_name: 'h1',
17 | layer_content: !pkl: '%(data_dir)s/auto_l1.pkl'
18 | },
19 | !obj:pylearn2.models.mlp.PretrainedLayer {
20 | layer_name: 'h2',
21 | layer_content: !pkl: '%(data_dir)s/auto_l2.pkl'
22 | },
23 | !obj:pylearn2.models.mlp.Softmax {
24 | max_col_norm: 1.9365,
25 | layer_name: 'y',
26 | n_classes: 2,
27 | irange: .005
28 | }
29 | ],
30 | nvis: %(num_features)i
31 | },
32 | algorithm: !obj:pylearn2.training_algorithms.sgd.SGD {
33 | learning_rate: .05,
34 | learning_rule: !obj:pylearn2.training_algorithms.learning_rule.Momentum {
35 | init_momentum: .5,
36 | },
37 | monitoring_dataset:
38 | {
39 | 'valid' : !obj:pylearn2.datasets.csv_dataset.CSVDataset {
40 | path: '%(data_dir)s/training_nn.csv',
41 | task: 'classification',
42 | one_hot: True,
43 | expect_labels: True,
44 | expect_headers: True,
45 | delimiter: ',',
46 | start: %(valid_start)i,
47 | stop: %(valid_stop)i
48 | },
49 | },
50 | cost: !obj:pylearn2.costs.mlp.Default {},
51 | termination_criterion: !obj:pylearn2.termination_criteria.And {
52 | criteria: [
53 | !obj:pylearn2.termination_criteria.MonitorBased {
54 | channel_name: 'valid_y_misclass',
55 | prop_decrease: 0.,
56 | N: 100
57 | },
58 | !obj:pylearn2.termination_criteria.EpochCounter {
59 | max_epochs: %(max_epochs)i
60 | }
61 | ]
62 | },
63 | update_callbacks: !obj:pylearn2.training_algorithms.sgd.ExponentialDecay {
64 | decay_factor: 1.00004,
65 | min_lr: .000001
66 | }
67 | },
68 | extensions: [
69 | !obj:pylearn2.training_algorithms.learning_rule.MomentumAdjustor {
70 | start: 1,
71 | saturate: 250,
72 | final_momentum: .7
73 | },
74 | !obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest {
75 | channel_name: 'valid_y_misclass',
76 | save_path: '%(data_dir)s/auto_mlp.pkl'
77 | }
78 | ]
79 | }
80 |
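Note that these three YAML files are templates rather than standalone pylearn2 configs: the `%(...)s` / `%(...)i` placeholders are filled in by higgs_nn.py via Python string formatting before the result is handed to pylearn2's YAML parser. A minimal sketch of that pattern (the hyper-parameter values below are illustrative):

```python
# Sketch of how higgs_nn.py instantiates one of these YAML templates.
from pylearn2.config import yaml_parse

with open('auto_l1.yaml') as f:
    template = f.read()

hyper_params = {'data_dir': '/home/john/data/higgs-boson',  # illustrative values
                'num_features': 30, 'hid_l1': 50, 'batch_size': 100,
                'monitoring_batches': 5, 'max_epochs': 10,
                'train_start': 0, 'train_stop': 150000}
train_obj = yaml_parse.load(template % hyper_params)
train_obj.main_loop()
```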
--------------------------------------------------------------------------------
/old/HiggsBoson/higgs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import time
4 | import pickle
5 | import numpy as np
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | from sklearn import cross_validation
9 | from sklearn import decomposition
10 | from sklearn import ensemble
11 | from sklearn import linear_model
12 | from sklearn import naive_bayes
13 | from sklearn import preprocessing
14 | from sklearn import svm
15 |
16 |
17 | def ams(s, b):
18 | """
19 | Approximate Median Significant function to evaluate solutions.
20 | """
21 | br = 10.0
22 | radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s)
23 | if radicand < 0:
24 | print 'Radicand is negative.'
25 | exit()
26 | else:
27 | return math.sqrt(radicand)
28 |
29 |
30 | def load(filename):
31 | """
32 | Load a previously training model from disk.
33 | """
34 | model_file = open(filename, 'rb')
35 | model = pickle.load(model_file)
36 | model_file.close()
37 | return model
38 |
39 |
40 | def save(model, filename):
41 | """
42 | Persist a trained model to disk.
43 | """
44 | model_file = open(filename, 'wb')
45 | pickle.dump(model, model_file)
46 | model_file.close()
47 |
48 |
49 | def process_training_data(filename, features, impute, standardize, whiten):
50 | """
51 | Reads in training data and prepares numpy arrays.
52 | """
53 | training_data = pd.read_csv(filename, sep=',')
54 |
55 | # add a nominal label (0, 1)
56 | temp = training_data['Label'].replace(to_replace=['s', 'b'], value=[1, 0])
57 | training_data['Nominal'] = temp
58 |
59 | X = training_data.iloc[:, 1:features+1].values
60 | y = training_data.iloc[:, features+3].values
61 | w = training_data.iloc[:, features+1].values
62 |
63 | # optionally impute the -999 values
64 | if impute == 'mean':
65 | imp = preprocessing.Imputer(missing_values=-999)
66 | X = imp.fit_transform(X)
67 | elif impute == 'zeros':
68 | X[X == -999] = 0
69 |
70 | # create a standardization transform
71 | scaler = None
72 | if standardize:
73 | scaler = preprocessing.StandardScaler()
74 | scaler.fit(X)
75 |
76 | # create a PCA transform
77 | pca = None
78 | if whiten:
79 | pca = decomposition.PCA(whiten=True)
80 | pca.fit(X)
81 |
82 | return training_data, X, y, w, scaler, pca
83 |
84 |
85 | def visualize(training_data, X, y, scaler, pca, features):
86 | """
87 | Computes statistics describing the data and creates some visualizations
88 | that attempt to highlight the underlying structure.
89 |
90 | Note: Use '%matplotlib inline' and '%matplotlib qt' at the IPython console
91 | to switch between display modes.
92 | """
93 |
94 | # feature histograms
95 | fig1, ax1 = plt.subplots(4, 4, figsize=(20, 10))
96 | for i in range(16):
97 | ax1[i % 4, i / 4].hist(X[:, i])
98 | ax1[i % 4, i / 4].set_title(training_data.columns[i + 1])
99 | ax1[i % 4, i / 4].set_xlim((min(X[:, i]), max(X[:, i])))
100 | fig1.tight_layout()
101 |
102 | fig2, ax2 = plt.subplots(4, 4, figsize=(20, 10))
103 | for i in range(16, features):
104 | ax2[i % 4, (i - 16) / 4].hist(X[:, i])
105 | ax2[i % 4, (i - 16) / 4].set_title(training_data.columns[i + 1])
106 | ax2[i % 4, (i - 16) / 4].set_xlim((min(X[:, i]), max(X[:, i])))
107 | fig2.tight_layout()
108 |
109 | # covariance matrix
110 | if scaler is not None:
111 | X = scaler.transform(X)
112 |
113 | cov = np.cov(X, rowvar=0)
114 |
115 | fig3, ax3 = plt.subplots(figsize=(16, 10))
116 | p = ax3.pcolor(cov)
117 | fig3.colorbar(p, ax=ax3)
118 | ax3.set_title('Feature Covariance Matrix')
119 |
120 | # pca plots
121 | if pca is not None:
122 | X = pca.transform(X)
123 |
124 | fig4, ax4 = plt.subplots(figsize=(16, 10))
125 | ax4.scatter(X[:, 0], X[:, 1], c=y)
126 | ax4.set_title('First & Second Principal Components')
127 |
128 | fig5, ax5 = plt.subplots(figsize=(16, 10))
129 | ax5.scatter(X[:, 1], X[:, 2], c=y)
130 | ax5.set_title('Second & Third Principal Components')
131 |
132 |
133 | def train(X, y, alg, scaler, pca):
134 | """
135 | Trains a new model using the training data.
136 | """
137 | if scaler is not None:
138 | X = scaler.transform(X)
139 |
140 | if pca is not None:
141 | X = pca.transform(X)
142 |
143 | t0 = time.time()
144 |
145 | if alg == 'bayes':
146 | model = naive_bayes.GaussianNB()
147 | elif alg == 'logistic':
148 | model = linear_model.LogisticRegression()
149 | elif alg == 'svm':
150 | model = svm.SVC()
151 | elif alg == 'boost':
152 | model = ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=7, min_samples_split=200,
153 | min_samples_leaf=200, max_features=30)
154 | else:
155 | print 'No model defined for ' + alg
156 | exit()
157 |
158 | model.fit(X, y)
159 |
160 | t1 = time.time()
161 | print 'Model trained in {0:3f} s.'.format(t1 - t0)
162 |
163 | return model
164 |
165 |
166 | def predict(X, model, threshold, scaler, pca):
167 | """
168 | Predicts the probability of a positive outcome and converts the
169 | probability to a binary prediction based on the cutoff percentage.
170 | """
171 | if scaler is not None:
172 | X = scaler.transform(X)
173 |
174 | if pca is not None:
175 | X = pca.transform(X)
176 |
177 | y_prob = model.predict_proba(X)[:, 1]
178 | cutoff = np.percentile(y_prob, threshold)
179 | y_est = y_prob > cutoff
180 |
181 | return y_prob, y_est
182 |
183 |
184 | def score(y, y_est, w):
185 | """
186 | Create weighted signal and background sets and calculate the AMS.
187 | """
188 | y_signal = w * (y == 1.0)
189 | y_background = w * (y == 0.0)
190 | s = np.sum(y_signal * (y_est == 1.0))
191 | b = np.sum(y_background * (y_est == 1.0))
192 |
193 | return ams(s, b)
194 |
195 |
196 | def cross_validate(X, y, w, alg, scaler, pca, threshold):
197 | """
198 | Perform cross-validation on the training set and compute the AMS scores.
199 | """
200 | scores = [0, 0, 0]
201 | folds = cross_validation.StratifiedKFold(y, n_folds=3)
202 | i = 0
203 |
204 | for i_train, i_val in folds:
205 | # create the training and validation sets
206 | X_train, X_val = X[i_train], X[i_val]
207 | y_train, y_val = y[i_train], y[i_val]
208 | w_train, w_val = w[i_train], w[i_val]
209 |
210 | # normalize the weights
211 |         w_train[y_train == 1] *= (sum(w[y == 1]) / sum(w_train[y_train == 1]))
212 | w_train[y_train == 0] *= (sum(w[y == 0]) / sum(w_train[y_train == 0]))
213 | w_val[y_val == 1] *= (sum(w[y == 1]) / sum(w_val[y_val == 1]))
214 | w_val[y_val == 0] *= (sum(w[y == 0]) / sum(w_val[y_val == 0]))
215 |
216 | # train the model
217 | model = train(X_train, y_train, alg, scaler, pca)
218 |
219 | # predict and score performance on the validation set
220 | y_val_prob, y_val_est = predict(X_val, model, threshold, scaler, pca)
221 | scores[i] = score(y_val, y_val_est, w_val)
222 | i += 1
223 |
224 | return np.mean(scores)
225 |
226 |
227 | def process_test_data(filename, features, impute):
228 | """
229 | Reads in test data and prepares numpy arrays.
230 | """
231 | test_data = pd.read_csv(filename, sep=',')
232 | X_test = test_data.iloc[:, 1:features+1].values
233 |
234 | if impute == 'mean':
235 | imp = preprocessing.Imputer(missing_values=-999)
236 | X_test = imp.fit_transform(X_test)
237 | elif impute == 'zeros':
238 | X_test[X_test == -999] = 0
239 |
240 | return test_data, X_test
241 |
242 |
243 | def create_submission(test_data, y_test_prob, y_test_est, submit_file):
244 | """
245 | Create a new data frame with the submission data.
246 | """
247 | temp = pd.DataFrame(y_test_prob, columns=['RankOrder'])
248 | temp2 = pd.DataFrame(y_test_est, columns=['Class'])
249 | submit = pd.DataFrame([test_data.EventId, temp.RankOrder, temp2.Class]).transpose()
250 |
251 |     # sort so the rows are in ascending order of probability
252 | submit = submit.sort(['RankOrder'], ascending=True)
253 |
254 | # convert the probabilities to rank order (required by the submission guidelines)
255 | for i in range(0, y_test_est.shape[0], 1):
256 | submit.iloc[i, 1] = i + 1
257 |
258 | # re-sort by event ID
259 | submit = submit.sort(['EventId'], ascending=True)
260 |
261 | # convert the integer classification to (s, b)
262 | submit['Class'] = submit['Class'].map({1: 's', 0: 'b'})
263 |
264 |     # force pandas to treat these columns as int (otherwise they will be written as floats)
265 | submit[['EventId', 'RankOrder']] = submit[['EventId', 'RankOrder']].astype(int)
266 |
267 | # finally create the submission file
268 | submit.to_csv(submit_file, sep=',', index=False, index_label=False)
269 |
270 |
271 | def main():
272 | # perform some initialization
273 | features = 30
274 | threshold = 85
275 | alg = 'boost' # bayes, logistic, svm, boost
276 | impute = 'none' # zeros, mean, none
277 | standardize = False
278 | whiten = False
279 | load_training_data = True
280 | load_model = False
281 | train_model = False
282 | save_model = False
283 | create_visualizations = True
284 | create_submission_file = False
285 | code_dir = '/home/john/git/kaggle/HiggsBoson/'
286 | data_dir = '/home/john/data/higgs-boson/'
287 | training_file = 'training.csv'
288 | test_file = 'test.csv'
289 | submit_file = 'submission.csv'
290 | model_file = 'model.pkl'
291 |
292 | os.chdir(code_dir)
293 |
294 | print 'Starting process...'
295 | print 'alg={0}, impute={1}, standardize={2}, whiten={3} threshold={4}'.format(
296 | alg, impute, standardize, whiten, threshold)
297 |
298 | if load_training_data:
299 | print 'Reading in training data...'
300 | training_data, X, y, w, scaler, pca = process_training_data(
301 | data_dir + training_file, features, impute, standardize, whiten)
302 |
303 | if create_visualizations:
304 | print 'Creating visualizations...'
305 | visualize(training_data, X, y, scaler, pca, features)
306 |
307 | if load_model:
308 | print 'Loading model from disk...'
309 | model = load(data_dir + model_file)
310 |
311 | if train_model:
312 | print 'Training model on full data set...'
313 | model = train(X, y, alg, scaler, pca)
314 |
315 | print 'Calculating predictions...'
316 | y_prob, y_est = predict(X, model, threshold, scaler, pca)
317 |
318 | print 'Calculating AMS...'
319 | ams_val = score(y, y_est, w)
320 | print 'AMS =', ams_val
321 |
322 | print 'Performing cross-validation...'
323 | val = cross_validate(X, y, w, alg, scaler, pca, threshold)
324 |         print 'Cross-validation AMS =', val
325 |
326 | if save_model:
327 | print 'Saving model to disk...'
328 | save(model, data_dir + model_file)
329 |
330 | if create_submission_file:
331 | print 'Reading in test data...'
332 | test_data, X_test = process_test_data(data_dir + test_file, features, impute)
333 |
334 | print 'Predicting test data...'
335 | y_test_prob, y_test_est = predict(X_test, model, threshold, scaler, pca)
336 |
337 | print 'Creating submission file...'
338 | create_submission(test_data, y_test_prob, y_test_est, data_dir + submit_file)
339 |
340 | print 'Process complete.'
341 |
342 |
343 | if __name__ == "__main__":
344 | main()
--------------------------------------------------------------------------------
/old/HiggsBoson/higgs_adv.py:
--------------------------------------------------------------------------------
1 | import os, math, time, pickle, sys
2 | import numpy as np
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 | from sklearn import cross_validation
6 | from sklearn import decomposition
7 | from sklearn import ensemble
8 | from sklearn import linear_model
9 | from sklearn import naive_bayes
10 | from sklearn import preprocessing
11 | from sklearn import svm
12 |
13 | sys.path.append('/home/git/xgboost/wrapper')
14 | import xgboost as xgb
15 |
16 |
17 | def ams(s, b):
18 | """
19 | Approximate Median Significant function to evaluate solutions.
20 | """
21 | br = 10.0
22 | radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s)
23 | if radicand < 0:
24 | print 'Radicand is negative.'
25 | exit()
26 | else:
27 | return math.sqrt(radicand)
28 |
29 |
30 | def load(alg, filename):
31 | """
32 | Load a previously training model from disk.
33 | """
34 | if alg == 'xgboost':
35 | model = xgb.Booster({'nthread': 16}, model_file=filename)
36 | else:
37 | model_file = open(filename, 'rb')
38 | model = pickle.load(model_file)
39 | model_file.close()
40 |
41 | return model
42 |
43 |
44 | def save(alg, model, filename):
45 | """
46 | Persist a trained model to disk.
47 | """
48 | if alg == 'xgboost':
49 | model.save_model(filename)
50 | else:
51 | model_file = open(filename, 'wb')
52 | pickle.dump(model, model_file)
53 | model_file.close()
54 |
55 |
56 | def process_training_data(filename, features, impute, standardize, whiten):
57 | """
58 | Reads in training data and prepares numpy arrays.
59 | """
60 | training_data = pd.read_csv(filename, sep=',')
61 |
62 | # add a nominal label (0, 1)
63 | temp = training_data['Label'].replace(to_replace=['s', 'b'], value=[1, 0])
64 | training_data['Nominal'] = temp
65 |
66 | X = training_data.iloc[:, 1:features+1].values
67 | y = training_data.iloc[:, features+3].values
68 | w = training_data.iloc[:, features+1].values
69 |
70 | # optionally impute the -999 values
71 | if impute == 'mean':
72 | imp = preprocessing.Imputer(missing_values=-999)
73 | X = imp.fit_transform(X)
74 | elif impute == 'zeros':
75 | X[X == -999] = 0
76 |
77 | # create a standardization transform
78 | scaler = None
79 | if standardize:
80 | scaler = preprocessing.StandardScaler()
81 | scaler.fit(X)
82 |
83 | # create a PCA transform
84 | pca = None
85 | if whiten:
86 | pca = decomposition.PCA(whiten=True)
87 | pca.fit(X)
88 |
89 | return training_data, X, y, w, scaler, pca
90 |
91 |
92 | def visualize(training_data, X, y, scaler, pca, features):
93 | """
94 | Computes statistics describing the data and creates some visualizations
95 | that attempt to highlight the underlying structure.
96 |
97 | Note: Use '%matplotlib inline' and '%matplotlib qt' at the IPython console
98 | to switch between display modes.
99 | """
100 |
101 | # feature histograms
102 | fig1, ax1 = plt.subplots(4, 4, figsize=(20, 10))
103 | for i in range(16):
104 | ax1[i % 4, i / 4].hist(X[:, i])
105 | ax1[i % 4, i / 4].set_title(training_data.columns[i + 1])
106 | ax1[i % 4, i / 4].set_xlim((min(X[:, i]), max(X[:, i])))
107 | fig1.tight_layout()
108 |
109 | fig2, ax2 = plt.subplots(4, 4, figsize=(20, 10))
110 | for i in range(16, features):
111 | ax2[i % 4, (i - 16) / 4].hist(X[:, i])
112 | ax2[i % 4, (i - 16) / 4].set_title(training_data.columns[i + 1])
113 | ax2[i % 4, (i - 16) / 4].set_xlim((min(X[:, i]), max(X[:, i])))
114 | fig2.tight_layout()
115 |
116 | # covariance matrix
117 | if scaler is not None:
118 | X = scaler.transform(X)
119 |
120 | cov = np.cov(X, rowvar=0)
121 |
122 | fig3, ax3 = plt.subplots(figsize=(16, 10))
123 | p = ax3.pcolor(cov)
124 | fig3.colorbar(p, ax=ax3)
125 | ax3.set_title('Feature Covariance Matrix')
126 |
127 | # pca plots
128 | if pca is not None:
129 | X = pca.transform(X)
130 |
131 | fig4, ax4 = plt.subplots(figsize=(16, 10))
132 | ax4.scatter(X[:, 0], X[:, 1], c=y)
133 | ax4.set_title('First & Second Principal Components')
134 |
135 | fig5, ax5 = plt.subplots(figsize=(16, 10))
136 | ax5.scatter(X[:, 1], X[:, 2], c=y)
137 | ax5.set_title('Second & Third Principal Components')
138 |
139 |
140 | def train(X, y, w, alg, scaler, pca):
141 | """
142 | Trains a new model using the training data.
143 | """
144 | if scaler is not None:
145 | X = scaler.transform(X)
146 |
147 | if pca is not None:
148 | X = pca.transform(X)
149 |
150 | if alg == 'xgboost':
151 | # use a separate process for the xgboost library
152 | return train_xgb(X, y, w, scaler, pca)
153 |
154 | t0 = time.time()
155 |
156 | if alg == 'bayes':
157 | model = naive_bayes.GaussianNB()
158 | elif alg == 'logistic':
159 | model = linear_model.LogisticRegression()
160 | elif alg == 'svm':
161 | model = svm.SVC()
162 | elif alg == 'boost':
163 | model = ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=7,
164 | min_samples_split=200, min_samples_leaf=200, max_features=30)
165 | else:
166 | print 'No model defined for ' + alg
167 | exit()
168 |
169 | model.fit(X, y)
170 |
171 | t1 = time.time()
172 | print 'Model trained in {0:3f} s.'.format(t1 - t0)
173 |
174 | return model
175 |
176 |
177 | def train_xgb(X, y, w, scaler, pca):
178 | """
179 | Trains a boosted trees model using the XGBoost library.
180 | """
181 | t0 = time.time()
182 |
183 | xgmat = xgb.DMatrix(X, label=y, missing=-999.0, weight=w)
184 |
185 | w_pos = sum(w[i] for i in range(len(y)) if y[i] == 1)
186 | w_neg = sum(w[i] for i in range(len(y)) if y[i] == 0)
187 |
188 | param = {}
189 | param['objective'] = 'binary:logitraw'
190 | param['scale_pos_weight'] = w_neg/w_pos
191 | param['eta'] = 0.08
192 | param['max_depth'] = 7
193 | param['subsample'] = 0.8
194 | param['eval_metric'] = 'auc'
195 | param['silent'] = 1
196 |
197 | plst = list(param.items())
198 | watchlist = []
199 |
200 | model = xgb.train(plst, xgmat, 128, watchlist)
201 |
202 | t1 = time.time()
203 | print 'Model trained in {0:3f} s.'.format(t1 - t0)
204 |
205 | return model
206 |
207 |
208 | def predict(X, model, alg, threshold, scaler, pca):
209 | """
210 | Predicts the probability of a positive outcome and converts the
211 | probability to a binary prediction based on the cutoff percentage.
212 | """
213 | if scaler is not None:
214 | X = scaler.transform(X)
215 |
216 | if pca is not None:
217 | X = pca.transform(X)
218 |
219 | if alg == 'xgboost':
220 | xgmat = xgb.DMatrix(X, missing=-999.0)
221 | y_prob = model.predict(xgmat)
222 | else:
223 | y_prob = model.predict_proba(X)[:, 1]
224 |
225 | cutoff = np.percentile(y_prob, threshold)
226 | y_est = y_prob > cutoff
227 |
228 | return y_prob, y_est
229 |
230 |
231 | def score(y, y_est, w):
232 | """
233 | Create weighted signal and background sets and calculate the AMS.
234 | """
235 | y_signal = w * (y == 1.0)
236 | y_background = w * (y == 0.0)
237 | s = np.sum(y_signal * (y_est == 1.0))
238 | b = np.sum(y_background * (y_est == 1.0))
239 |
240 | return ams(s, b)
241 |
242 |
243 | def cross_validate(X, y, w, alg, scaler, pca, threshold):
244 | """
245 | Perform cross-validation on the training set and compute the AMS scores.
246 | """
247 | scores = [0, 0, 0]
248 | folds = cross_validation.StratifiedKFold(y, n_folds=3)
249 | i = 0
250 |
251 | for i_train, i_val in folds:
252 | # create the training and validation sets
253 | X_train, X_val = X[i_train], X[i_val]
254 | y_train, y_val = y[i_train], y[i_val]
255 | w_train, w_val = w[i_train], w[i_val]
256 |
257 | # normalize the weights
258 |         w_train[y_train == 1] *= (sum(w[y == 1]) / sum(w_train[y_train == 1]))
259 | w_train[y_train == 0] *= (sum(w[y == 0]) / sum(w_train[y_train == 0]))
260 | w_val[y_val == 1] *= (sum(w[y == 1]) / sum(w_val[y_val == 1]))
261 | w_val[y_val == 0] *= (sum(w[y == 0]) / sum(w_val[y_val == 0]))
262 |
263 | # train the model
264 | model = train(X_train, y_train, w_train, alg, scaler, pca)
265 |
266 | # predict and score performance on the validation set
267 | y_val_prob, y_val_est = predict(X_val, model, alg, threshold, scaler, pca)
268 | scores[i] = score(y_val, y_val_est, w_val)
269 | i += 1
270 |
271 | return np.mean(scores)
272 |
273 |
274 | def process_test_data(filename, features, impute):
275 | """
276 | Reads in test data and prepares numpy arrays.
277 | """
278 | test_data = pd.read_csv(filename, sep=',')
279 | X_test = test_data.iloc[:, 1:features+1].values
280 |
281 | if impute == 'mean':
282 | imp = preprocessing.Imputer(missing_values=-999)
283 | X_test = imp.fit_transform(X_test)
284 | elif impute == 'zeros':
285 | X_test[X_test == -999] = 0
286 |
287 | return test_data, X_test
288 |
289 |
290 | def create_submission(test_data, y_test_prob, y_test_est, submit_file):
291 | """
292 | Create a new data frame with the submission data.
293 | """
294 | temp = pd.DataFrame(y_test_prob, columns=['RankOrder'])
295 | temp2 = pd.DataFrame(y_test_est, columns=['Class'])
296 | submit = pd.DataFrame([test_data.EventId, temp.RankOrder, temp2.Class]).transpose()
297 |
298 |     # sort so the rows are in ascending order of probability
299 | submit = submit.sort(['RankOrder'], ascending=True)
300 |
301 | # convert the probabilities to rank order (required by the submission guidelines)
302 | for i in range(0, y_test_est.shape[0], 1):
303 | submit.iloc[i, 1] = i + 1
304 |
305 | # re-sort by event ID
306 | submit = submit.sort(['EventId'], ascending=True)
307 |
308 | # convert the integer classification to (s, b)
309 | submit['Class'] = submit['Class'].map({1: 's', 0: 'b'})
310 |
311 |     # force pandas to treat these columns as int (otherwise they will be written as floats)
312 | submit[['EventId', 'RankOrder']] = submit[['EventId', 'RankOrder']].astype(int)
313 |
314 | # finally create the submission file
315 | submit.to_csv(submit_file, sep=',', index=False, index_label=False)
316 |
317 |
318 | def main():
319 | # perform some initialization
320 | features = 30
321 | threshold = 85
322 | alg = 'xgboost' # bayes, logistic, boost, xgboost
323 | impute = 'none' # zeros, mean, none
324 | standardize = False
325 | whiten = False
326 | load_training_data = True
327 | load_model = False
328 | train_model = False
329 | save_model = False
330 | create_visualizations = True
331 | create_submission_file = False
332 | code_dir = '/home/john/git/kaggle/HiggsBoson/'
333 | data_dir = '/home/john/data/higgs-boson/'
334 | training_file = 'training.csv'
335 | test_file = 'test.csv'
336 | submit_file = 'submission.csv'
337 | model_file = 'model.pkl'
338 |
339 | os.chdir(code_dir)
340 |
341 | print 'Starting process...'
342 | print 'alg={0}, impute={1}, standardize={2}, whiten={3} threshold={4}'.format(
343 | alg, impute, standardize, whiten, threshold)
344 |
345 | if load_training_data:
346 | print 'Reading in training data...'
347 | training_data, X, y, w, scaler, pca = process_training_data(
348 | data_dir + training_file, features, impute, standardize, whiten)
349 |
350 | if create_visualizations:
351 | print 'Creating visualizations...'
352 | visualize(training_data, X, y, scaler, pca, features)
353 |
354 | if load_model:
355 | print 'Loading model from disk...'
356 | model = load(alg, data_dir + model_file)
357 |
358 | if train_model:
359 | print 'Training model on full data set...'
360 | model = train(X, y, w, alg, scaler, pca)
361 |
362 | print 'Calculating predictions...'
363 | y_prob, y_est = predict(X, model, alg, threshold, scaler, pca)
364 |
365 | print 'Calculating AMS...'
366 | ams_val = score(y, y_est, w)
367 | print 'AMS =', ams_val
368 |
369 | print 'Performing cross-validation...'
370 | val = cross_validate(X, y, w, alg, scaler, pca, threshold)
371 |         print 'Cross-validation AMS =', val
372 |
373 | if save_model:
374 | print 'Saving model to disk...'
375 | save(alg, model, data_dir + model_file)
376 |
377 | if create_submission_file:
378 | print 'Reading in test data...'
379 | test_data, X_test = process_test_data(data_dir + test_file, features, impute)
380 |
381 | print 'Predicting test data...'
382 | y_test_prob, y_test_est = predict(X_test, model, alg, threshold, scaler, pca)
383 |
384 | print 'Creating submission file...'
385 | create_submission(test_data, y_test_prob, y_test_est, data_dir + submit_file)
386 |
387 | print 'Process complete.'
388 |
389 |
390 | if __name__ == "__main__":
391 | main()
--------------------------------------------------------------------------------
/old/HiggsBoson/higgs_nn.py:
--------------------------------------------------------------------------------
1 | import os, math
2 | import numpy as np
3 | import pandas as pd
4 | from sklearn import decomposition
5 | from sklearn import preprocessing
6 | from theano import function
7 | from pylearn2.config import yaml_parse
8 | from pylearn2.utils import serial
9 |
10 |
11 | def ams(s, b):
12 | """
13 | Approximate Median Significant function to evaluate solutions.
14 | """
15 | br = 10.0
16 | radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s)
17 | if radicand < 0:
18 | print 'Radicand is negative.'
19 | exit()
20 | else:
21 | return math.sqrt(radicand)
22 |
23 |
24 | def process_training_data(filename, features, impute, standardize, whiten):
25 | """
26 | Reads in training data and prepares numpy arrays.
27 | """
28 | training_data = pd.read_csv(filename, sep=',')
29 |
30 | # add a nominal label (0, 1)
31 | temp = training_data['Label'].replace(to_replace=['s', 'b'], value=[1, 0])
32 | training_data['Nominal'] = temp
33 |
34 | X = training_data.iloc[:, 1:features+1].values
35 | y = training_data.iloc[:, features+3].values
36 | w = training_data.iloc[:, features+1].values
37 |
38 | # optionally impute the -999 values
39 | if impute == 'mean':
40 | imp = preprocessing.Imputer(missing_values=-999)
41 | X = imp.fit_transform(X)
42 | elif impute == 'zeros':
43 | X[X == -999] = 0
44 |
45 | # create a standardization transform
46 | scaler = None
47 | if standardize:
48 | scaler = preprocessing.StandardScaler()
49 | scaler.fit(X)
50 |
51 | # create a PCA transform
52 | pca = None
53 | if whiten:
54 | pca = decomposition.PCA(whiten=True)
55 | pca.fit(X)
56 |
57 | return training_data, X, y, w, scaler, pca
58 |
59 |
60 | def create_nn_pre_train_file(original_filename, new_filename, impute, scaler, pca):
61 | """
62 | Creates a non-labeled data set with transforms applied to be used
63 | by pylearn2's csv data set class.
64 | """
65 | combined_data = pd.read_csv(original_filename, sep=',')
66 |
67 | X = combined_data.values
68 |
69 | if impute == 'mean':
70 | imp = preprocessing.Imputer(missing_values=-999)
71 | X = imp.fit_transform(X)
72 | elif impute == 'zeros':
73 | X[X == -999] = 0
74 |
75 | if scaler is not None:
76 | X = scaler.transform(X)
77 |
78 | if pca is not None:
79 | X = pca.transform(X)
80 |
81 | combined_data = pd.DataFrame(X, columns=combined_data.columns.values)
82 | combined_data.to_csv(new_filename, sep=',', index=False)
83 |
84 |
85 | def create_nn_training_file(training_data, features, impute, scaler, pca, filename):
86 | """
87 | Creates a labeled training set with transforms applied to be used
88 | by pylearn2's csv data set class.
89 | """
90 | nn_training_data = training_data
91 |
92 | nn_training_data.insert(0, 'NN_Label', nn_training_data['Nominal'].values)
93 |
94 | nn_training_data.drop('EventId', axis=1, inplace=True)
95 | nn_training_data.drop('Weight', axis=1, inplace=True)
96 | nn_training_data.drop('Label', axis=1, inplace=True)
97 | nn_training_data.drop('Nominal', axis=1, inplace=True)
98 |
99 | X = nn_training_data.iloc[:, 1:features+1].values
100 |
101 | if impute == 'mean':
102 | imp = preprocessing.Imputer(missing_values=-999)
103 | X = imp.fit_transform(X)
104 | elif impute == 'zeros':
105 | X[X == -999] = 0
106 |
107 | if scaler is not None:
108 | X = scaler.transform(X)
109 |
110 | if pca is not None:
111 | X = pca.transform(X)
112 |
113 | X = np.insert(X, 0, nn_training_data['NN_Label'].values, 1)
114 |
115 | nn_training_data = pd.DataFrame(X, columns=nn_training_data.columns.values)
116 | nn_training_data.to_csv(filename, sep=',', index=False)
117 |
118 |
119 | def train(model_definition_file, data_dir):
120 | """
121 | Trains a neural network model using the pylearn2 library.
122 | """
123 | with open(model_definition_file, 'r') as f:
124 | train_nn = f.read()
125 |
126 | hyper_params = {'data_dir': data_dir,
127 | 'num_features': 30,
128 | 'dim_h0': 50,
129 | 'batch_size': 100,
130 | 'max_epochs': 10,
131 | 'train_start': 0,
132 | 'train_stop': 150000,
133 | 'valid_start': 150001,
134 | 'valid_stop': 200000,
135 | 'test_start': 200001,
136 | 'test_stop': 250000}
137 | train_nn = train_nn % hyper_params
138 | train_nn = yaml_parse.load(train_nn)
139 | train_nn.main_loop()
140 |
141 |
142 | def predict(X, threshold, scaler, pca, model_file):
143 | """
144 | Compiles a Theano function using the pylearn 2 model's fprop
145 | to predict the probability of a positive outcome, and converts
146 | to a binary prediction based on the cutoff percentage.
147 | """
148 | if scaler is not None:
149 | X = scaler.transform(X)
150 |
151 | if pca is not None:
152 | X = pca.transform(X)
153 |
154 | # Load the model
155 | model = serial.load(model_file)
156 |
157 | # Create Theano function to compute probability
158 | x = model.get_input_space().make_theano_batch()
159 | y = model.fprop(x)
160 | pred = function([x], y)
161 |
162 | # Convert to a prediction
163 | y_prob = pred(X)[:, 1]
164 | cutoff = np.percentile(y_prob, threshold)
165 | y_est = y_prob > cutoff
166 |
167 | return y_prob, y_est
168 |
169 |
170 | def score(y, y_est, w):
171 | """
172 | Create weighted signal and background sets and calculate the AMS.
173 | """
174 | y_signal = w * (y == 1.0)
175 | y_background = w * (y == 0.0)
176 | s = np.sum(y_signal * (y_est == 1.0))
177 | b = np.sum(y_background * (y_est == 1.0))
178 |
179 | return ams(s, b)
180 |
181 |
182 | def process_test_data(filename, features, impute):
183 | """
184 | Reads in test data and prepares numpy arrays.
185 | """
186 | test_data = pd.read_csv(filename, sep=',')
187 | X_test = test_data.iloc[:, 1:features+1].values
188 |
189 | if impute == 'mean':
190 | imp = preprocessing.Imputer(missing_values=-999)
191 | X_test = imp.fit_transform(X_test)
192 | elif impute == 'zeros':
193 | X_test[X_test == -999] = 0
194 |
195 | return test_data, X_test
196 |
197 |
198 | def create_submission(test_data, y_test_prob, y_test_est, submit_file):
199 | """
200 | Create a new data frame with the submission data.
201 | """
202 | temp = pd.DataFrame(y_test_prob, columns=['RankOrder'])
203 | temp2 = pd.DataFrame(y_test_est, columns=['Class'])
204 | submit = pd.DataFrame([test_data.EventId, temp.RankOrder, temp2.Class]).transpose()
205 |
206 |     # sort so the rows are in ascending order of probability
207 | submit = submit.sort(['RankOrder'], ascending=True)
208 |
209 | # convert the probabilities to rank order (required by the submission guidelines)
210 | for i in range(0, y_test_est.shape[0], 1):
211 | submit.iloc[i, 1] = i + 1
212 |
213 | # re-sort by event ID
214 | submit = submit.sort(['EventId'], ascending=True)
215 |
216 | # convert the integer classification to (s, b)
217 | submit['Class'] = submit['Class'].map({1: 's', 0: 'b'})
218 |
219 |     # force pandas to treat these columns as int (otherwise they will be written as floats)
220 | submit[['EventId', 'RankOrder']] = submit[['EventId', 'RankOrder']].astype(int)
221 |
222 | # finally create the submission file
223 | submit.to_csv(submit_file, sep=',', index=False, index_label=False)
224 |
225 |
226 | def main():
227 | # perform some initialization
228 | features = 30
229 | threshold = 85
230 | impute = 'zeros' # zeros, mean, none
231 | standardize = True
232 | whiten = False
233 | load_training_data = True
234 | train_model = True
235 | create_nn_files = True
236 | train_nn_model = True
237 | create_submission_file = False
238 | code_dir = '/home/john/git/kaggle/HiggsBoson/'
239 | data_dir = '/home/john/data/higgs-boson/'
240 | pretrain_file = 'combined.csv'
241 | training_file = 'training.csv'
242 | test_file = 'test.csv'
243 | submit_file = 'submission.csv'
244 | pretrain_nn_file = 'combined_nn.csv'
245 | training_nn_file = 'training_nn.csv'
246 | model_definition_file = 'mlp.yaml'
247 | model_file = 'mlp.pkl'
248 |
249 | os.chdir(code_dir)
250 |
251 | print 'Starting process...'
252 | print 'impute={0}, standardize={1}, whiten={2} threshold={3}'.format(
253 | impute, standardize, whiten, threshold)
254 |
255 | if load_training_data:
256 | print 'Reading in training data...'
257 | training_data, X, y, w, scaler, pca = process_training_data(
258 | data_dir + training_file, features, impute, standardize, whiten)
259 |
260 | if train_model:
261 | print 'Running neural network process...'
262 |
263 | if create_nn_files:
264 | print 'Creating training files...'
265 | create_nn_training_file(training_data, features, impute, scaler, pca,
266 | data_dir + training_nn_file)
267 | create_nn_pre_train_file(data_dir + pretrain_file,
268 | data_dir + pretrain_nn_file, impute, scaler, pca)
269 |
270 | if train_nn_model:
271 | print 'Training the model...'
272 | train(code_dir + model_definition_file, data_dir)
273 |
274 | print 'Calculating predictions...'
275 | y_prob, y_est = predict(X, threshold, scaler, pca, data_dir + model_file)
276 |
277 | print 'Calculating AMS...'
278 | ams_val = score(y, y_est, w)
279 | print 'AMS =', ams_val
280 |
281 | if create_submission_file:
282 | print 'Reading in test data...'
283 | test_data, X_test = process_test_data(data_dir + test_file, features, impute)
284 |
285 | print 'Predicting test data...'
286 | y_test_prob, y_test_est = predict(X_test, threshold, scaler, pca, data_dir + model_file)
287 |
288 | print 'Creating submission file...'
289 | create_submission(test_data, y_test_prob, y_test_est, data_dir + submit_file)
290 |
291 | print 'Process complete.'
292 |
293 |
294 | if __name__ == "__main__":
295 | main()
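296 | 
297 | 
298 | # --- Hedged aside (not part of the original script): the per-row loop in
299 | # create_submission() assigns rank order one element at a time; since the frame is
300 | # already sorted by probability at that point, a vectorized assignment such as the
301 | # helper below would be equivalent.
302 | def assign_rank_order(submit):
303 |     submit['RankOrder'] = np.arange(1, len(submit) + 1)
304 |     return submit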
--------------------------------------------------------------------------------
/old/HiggsBoson/metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Evaluation metric for the Higgs Boson Kaggle Competition,
4 | as described on:
5 | https://www.kaggle.com/c/higgs-boson/details/evaluation
6 |
7 | @author: Joyce Noah-Vanhoukce
8 | Created: Thu Apr 24 2014
9 | """
10 |
11 | import os
12 | import csv
13 | import math
14 |
15 |
16 | def create_solution_dictionary(solution):
17 |     """ Read solution file, return a dictionary with key EventId and value (label, weight).
18 |     Solution file headers: EventId, Label, Weight """
19 |
20 | solnDict = {}
21 | with open(solution, 'rb') as f:
22 | soln = csv.reader(f)
23 | soln.next() # header
24 | for row in soln:
25 | if row[0] not in solnDict:
26 | solnDict[row[0]] = (row[1], row[2])
27 | return solnDict
28 |
29 |
30 | def check_submission(submission, Nelements):
31 | """ Check that submission RankOrder column is correct:
32 | 1. All numbers are in [1,NTestSet]
33 |         2. All numbers are unique
34 | """
35 | rankOrderSet = set()
36 | with open(submission, 'rb') as f:
37 | sub = csv.reader(f)
38 | sub.next() # header
39 | for row in sub:
40 |             rankOrderSet.add(int(row[1]))
41 | 
42 |     if len(rankOrderSet) != Nelements:
43 |         print 'RankOrder column must contain unique values'
44 |         exit()
45 |     elif rankOrderSet != set(xrange(1, Nelements + 1)):
46 |         print 'RankOrder column must contain all numbers from [1..NTestSet]'
47 | exit()
48 | else:
49 | return True
50 |
51 |
52 | def AMS(s, b):
53 | """ Approximate Median Significance defined as:
54 | AMS = sqrt(
55 | 2 { (s + b + b_r) log[1 + (s/(b+b_r))] - s}
56 | )
57 | where b_r = 10, b = background, s = signal, log is natural logarithm """
58 |
59 | br = 10.0
60 |     radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s)
61 | if radicand < 0:
62 | print 'radicand is negative. Exiting'
63 | exit()
64 | else:
65 | return math.sqrt(radicand)
66 |
67 |
68 | def AMS_metric(solution, submission):
69 | """ Prints the AMS metric value to screen.
70 | Solution File header: EventId, Class, Weight
71 | Submission File header: EventId, RankOrder, Class
72 | """
73 |
74 | numEvents = 550000 # number of events = size of test set
75 |
76 |     # solutionDict: key=eventId, value=(label, weight)
77 | solutionDict = create_solution_dictionary(solution)
78 |
79 | signal = 0.0
80 | background = 0.0
81 | if check_submission(submission, numEvents):
82 | with open(submission, 'rb') as f:
83 | sub = csv.reader(f)
84 | sub.next() # header row
85 | for row in sub:
86 | if row[2] == 's': # only events predicted to be signal are scored
87 | if solutionDict[row[0]][0] == 's':
88 | signal += float(solutionDict[row[0]][1])
89 | elif solutionDict[row[0]][0] == 'b':
90 | background += float(solutionDict[row[0]][1])
91 |
92 | print 'signal = {0}, background = {1}'.format(signal, background)
93 | print 'AMS = ' + str(AMS(signal, background))
94 |
95 |
96 | if __name__ == "__main__":
97 |
98 | # enter path and file names here
99 | path = ""
100 | solutionFile = ""
101 | submissionFile = ""
102 |
103 | AMS_metric(solutionFile, submissionFile)
104 |
105 |
106 |
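107 | # --- Hedged aside (not part of the original script): a quick numeric sanity check of
108 | # AMS(). With toy values s=10, b=90 the radicand is
109 | # 2 * ((10 + 90 + 10) * log(1 + 10/100) - 10) ~= 0.968, so AMS(10, 90) ~= 0.98.
110 | def ams_sanity_check():
111 |     expected = math.sqrt(2 * ((10.0 + 90.0 + 10.0) * math.log(1.1) - 10.0))
112 |     return abs(AMS(10.0, 90.0) - expected) < 1e-12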
--------------------------------------------------------------------------------
/old/HiggsBoson/mlp.yaml:
--------------------------------------------------------------------------------
1 | !obj:pylearn2.train.Train {
2 | dataset: &train !obj:pylearn2.datasets.csv_dataset.CSVDataset {
3 | path: '%(data_dir)s/training_nn.csv',
4 | task: 'classification',
5 | one_hot: True,
6 | expect_labels: True,
7 | expect_headers: True,
8 | delimiter: ',',
9 | start: %(train_start)i,
10 | stop: %(train_stop)i
11 | },
12 | model: !obj:pylearn2.models.mlp.MLP {
13 | layers: [
14 | !obj:pylearn2.models.mlp.Sigmoid {
15 | layer_name: 'h0',
16 | dim: %(dim_h0)i,
17 | sparse_init: 15,
18 | }, !obj:pylearn2.models.mlp.Softmax {
19 | layer_name: 'y',
20 | n_classes: 2,
21 | irange: 0.
22 | }
23 | ],
24 | nvis: %(num_features)i,
25 | },
26 | algorithm: !obj:pylearn2.training_algorithms.bgd.BGD {
27 | batch_size: %(batch_size)i,
28 | line_search_mode: 'exhaustive',
29 | conjugate: 1,
30 | updates_per_batch: 10,
31 | monitoring_dataset: {
32 | 'train' : *train,
33 | 'valid' : !obj:pylearn2.datasets.csv_dataset.CSVDataset {
34 | path: '%(data_dir)s/training_nn.csv',
35 | task: 'classification',
36 | one_hot: True,
37 | expect_labels: True,
38 | expect_headers: True,
39 | delimiter: ',',
40 | start: %(valid_start)i,
41 | stop: %(valid_stop)i
42 | },
43 | 'test' : !obj:pylearn2.datasets.csv_dataset.CSVDataset {
44 | path: '%(data_dir)s/training_nn.csv',
45 | task: 'classification',
46 | one_hot: True,
47 | expect_labels: True,
48 | expect_headers: True,
49 | delimiter: ',',
50 | start: %(test_start)i,
51 | stop: %(test_stop)i
52 | }
53 | },
54 | termination_criterion: !obj:pylearn2.termination_criteria.And {
55 | criteria: [
56 | !obj:pylearn2.termination_criteria.MonitorBased {
57 | channel_name: 'valid_y_misclass'
58 | },
59 | !obj:pylearn2.termination_criteria.EpochCounter {
60 | max_epochs: %(max_epochs)i
61 | }
62 | ]
63 | }
64 | },
65 | extensions: [
66 | !obj:pylearn2.train_extensions.best_params.MonitorBasedSaveBest {
67 | channel_name: 'valid_y_misclass',
68 | save_path: '%(data_dir)s/mlp.pkl'
69 | }
70 | ]
71 | }
72 |
--------------------------------------------------------------------------------
/old/NerveSegmentation/README.md:
--------------------------------------------------------------------------------
1 | # Ultrasound Nerve Segmentation
2 |
3 | View the competition details here.
4 |
5 | I started this competition just to mess around with image classification and see what scripts others were coming up with. I didn't really do any original work on this one.
--------------------------------------------------------------------------------
/old/NerveSegmentation/data.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import os
3 | import numpy as np
4 | import cv2
5 |
6 | data_path = '/home/john/data/nerve-segmentation/'
7 | image_rows = 420
8 | image_cols = 580
9 |
10 |
11 | def create_train_data():
12 | train_data_path = os.path.join(data_path, 'train')
13 | images = os.listdir(train_data_path)
14 |     total = len(images) // 2  # each image has a paired _mask file
15 |
16 | imgs = np.ndarray((total, 1, image_rows, image_cols), dtype=np.uint8)
17 | imgs_mask = np.ndarray((total, 1, image_rows, image_cols), dtype=np.uint8)
18 |
19 | i = 0
20 | print('Creating training images...')
21 | for image_name in images:
22 | if 'mask' in image_name:
23 | continue
24 | image_mask_name = image_name.split('.')[0] + '_mask.tif'
25 | img = cv2.imread(os.path.join(train_data_path, image_name), cv2.IMREAD_GRAYSCALE)
26 | img_mask = cv2.imread(os.path.join(train_data_path, image_mask_name), cv2.IMREAD_GRAYSCALE)
27 |
28 | img = np.array([img])
29 | img_mask = np.array([img_mask])
30 |
31 | imgs[i] = img
32 | imgs_mask[i] = img_mask
33 |
34 | if i % 100 == 0:
35 | print('Done: {0}/{1} images'.format(i, total))
36 | i += 1
37 | print('Loading done.')
38 |
39 | np.save(data_path + 'imgs_train.npy', imgs)
40 | np.save(data_path + 'imgs_mask_train.npy', imgs_mask)
41 | print('Saving to .npy files done.')
42 |
43 |
44 | def load_train_data():
45 | imgs_train = np.load(data_path + 'imgs_train.npy')
46 | imgs_mask_train = np.load(data_path + 'imgs_mask_train.npy')
47 | return imgs_train, imgs_mask_train
48 |
49 |
50 | def create_test_data():
51 | train_data_path = os.path.join(data_path, 'test')
52 | images = os.listdir(train_data_path)
53 | total = len(images)
54 |
55 | imgs = np.ndarray((total, 1, image_rows, image_cols), dtype=np.uint8)
56 | imgs_id = np.ndarray((total, ), dtype=np.int32)
57 |
58 | i = 0
59 | print('Creating test images...')
60 | for image_name in images:
61 | img_id = int(image_name.split('.')[0])
62 | img = cv2.imread(os.path.join(train_data_path, image_name), cv2.IMREAD_GRAYSCALE)
63 |
64 | img = np.array([img])
65 |
66 | imgs[i] = img
67 | imgs_id[i] = img_id
68 |
69 | if i % 100 == 0:
70 | print('Done: {0}/{1} images'.format(i, total))
71 | i += 1
72 | print('Loading done.')
73 |
74 | np.save(data_path + 'imgs_test.npy', imgs)
75 | np.save(data_path + 'imgs_id_test.npy', imgs_id)
76 | print('Saving to .npy files done.')
77 |
78 |
79 | def load_test_data():
80 | imgs_test = np.load(data_path + 'imgs_test.npy')
81 | imgs_id = np.load(data_path + 'imgs_id_test.npy')
82 | return imgs_test, imgs_id
83 |
84 |
85 | if __name__ == '__main__':
86 | create_train_data()
87 | create_test_data()
88 | print('Complete.')
--------------------------------------------------------------------------------
/old/NerveSegmentation/submission.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.append('/home/john/git/kaggle/NerveSegmentation/')
4 |
5 | import numpy as np
6 | import cv2
7 | from data import image_cols, image_rows
8 |
9 | data_path = '/home/john/data/nerve-segmentation/'
10 |
11 |
12 | def prep(img):
13 | img = img.astype('float32')
14 | img = cv2.threshold(img, 0.5, 1., cv2.THRESH_BINARY)[1].astype(np.uint8)
15 | img = cv2.resize(img, (image_cols, image_rows))
16 | return img
17 |
18 |
19 | def run_length_enc(label):
20 | from itertools import chain
21 |     x = label.transpose().flatten()  # flatten in column-major order
22 |     y = np.where(x > 0)[0]  # indices of non-zero (mask) pixels
23 | if len(y) < 10: # consider as empty
24 | return ''
25 |     z = np.where(np.diff(y) > 1)[0]  # gaps between consecutive runs
26 |     start = np.insert(y[z+1], 0, y[0])  # first index of each run
27 |     end = np.append(y[z], y[-1])  # last index of each run
28 |     length = end - start
29 |     res = [[s+1, l+1] for s, l in zip(list(start), list(length))]  # (1-based start, run length) pairs
30 | res = list(chain.from_iterable(res))
31 | return ' '.join([str(r) for r in res])
32 |
33 |
34 | def submission():
35 | from data import load_test_data
36 | imgs_test, imgs_id_test = load_test_data()
37 |     imgs_test = np.load(data_path + 'imgs_mask_test.npy')  # predicted masks saved by train.py
38 |
39 | argsort = np.argsort(imgs_id_test)
40 | imgs_id_test = imgs_id_test[argsort]
41 | imgs_test = imgs_test[argsort]
42 |
43 | total = imgs_test.shape[0]
44 | ids = []
45 | rles = []
46 | for i in range(total):
47 | img = imgs_test[i, 0]
48 | img = prep(img)
49 | rle = run_length_enc(img)
50 |
51 | rles.append(rle)
52 | ids.append(imgs_id_test[i])
53 |
54 | if i % 100 == 0:
55 | print('{}/{}'.format(i, total))
56 |
57 | first_row = 'img,pixels'
58 | file_name = data_path + 'submission.csv'
59 |
60 | with open(file_name, 'w+') as f:
61 | f.write(first_row + '\n')
62 | for i in range(total):
63 | s = str(ids[i]) + ',' + rles[i]
64 | f.write(s + '\n')
65 |
66 |
67 | if __name__ == '__main__':
68 | submission()
69 | print('Complete.')
--------------------------------------------------------------------------------
/old/NerveSegmentation/train.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.append('/home/john/git/kaggle/NerveSegmentation/')
4 |
5 | import cv2
6 | import numpy as np
7 | from keras.models import Model
8 | from keras.layers import Input, merge, Convolution2D, MaxPooling2D, UpSampling2D
9 | from keras.optimizers import Adam
10 | from keras.callbacks import ModelCheckpoint
11 | from keras import backend as K
12 | from data import load_train_data, load_test_data
13 |
14 | data_path = '/home/john/data/nerve-segmentation/'
15 | img_rows = 64
16 | img_cols = 80
17 | smooth = 1.  # keeps the Dice coefficient defined when both masks are empty
18 |
19 |
20 | def dice_coef(y_true, y_pred):
21 | y_true_f = K.flatten(y_true)
22 | y_pred_f = K.flatten(y_pred)
23 | intersection = K.sum(y_true_f * y_pred_f)
24 | return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
25 |
26 |
27 | def dice_coef_loss(y_true, y_pred):
28 | return -dice_coef(y_true, y_pred)
29 |
30 |
31 | def get_unet():
32 | inputs = Input((1, img_rows, img_cols))
33 | conv1 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(inputs)
34 | conv1 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(conv1)
35 | pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
36 |
37 | conv2 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(pool1)
38 | conv2 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(conv2)
39 | pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
40 |
41 | conv3 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(pool2)
42 | conv3 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(conv3)
43 | pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
44 |
45 | conv4 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(pool3)
46 | conv4 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(conv4)
47 | pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)
48 |
49 | conv5 = Convolution2D(512, 3, 3, activation='relu', border_mode='same')(pool4)
50 | conv5 = Convolution2D(512, 3, 3, activation='relu', border_mode='same')(conv5)
51 |
52 | up6 = merge([UpSampling2D(size=(2, 2))(conv5), conv4], mode='concat', concat_axis=1)
53 | conv6 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(up6)
54 | conv6 = Convolution2D(256, 3, 3, activation='relu', border_mode='same')(conv6)
55 |
56 | up7 = merge([UpSampling2D(size=(2, 2))(conv6), conv3], mode='concat', concat_axis=1)
57 | conv7 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(up7)
58 | conv7 = Convolution2D(128, 3, 3, activation='relu', border_mode='same')(conv7)
59 |
60 | up8 = merge([UpSampling2D(size=(2, 2))(conv7), conv2], mode='concat', concat_axis=1)
61 | conv8 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(up8)
62 | conv8 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(conv8)
63 |
64 | up9 = merge([UpSampling2D(size=(2, 2))(conv8), conv1], mode='concat', concat_axis=1)
65 | conv9 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(up9)
66 | conv9 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(conv9)
67 |
68 | conv10 = Convolution2D(1, 1, 1, activation='sigmoid')(conv9)
69 |
70 | model = Model(input=inputs, output=conv10)
71 |
72 | model.compile(optimizer=Adam(lr=1e-5), loss=dice_coef_loss, metrics=[dice_coef])
73 |
74 | return model
75 |
76 |
77 | def preprocess(imgs):
78 | imgs_p = np.ndarray((imgs.shape[0], imgs.shape[1], img_rows, img_cols), dtype=np.uint8)
79 | for i in range(imgs.shape[0]):
80 | imgs_p[i, 0] = cv2.resize(imgs[i, 0], (img_cols, img_rows), interpolation=cv2.INTER_CUBIC)
81 | return imgs_p
82 |
83 |
84 | def train_and_predict():
85 | print('Loading and preprocessing train data...')
86 | imgs_train, imgs_mask_train = load_train_data()
87 |
88 | imgs_train = preprocess(imgs_train)
89 | imgs_mask_train = preprocess(imgs_mask_train)
90 |
91 | imgs_train = imgs_train.astype('float32')
92 | mean = np.mean(imgs_train) # mean for data centering
93 | std = np.std(imgs_train) # std for data normalization
94 |
95 | imgs_train -= mean
96 | imgs_train /= std
97 |
98 | imgs_mask_train = imgs_mask_train.astype('float32')
99 | imgs_mask_train /= 255. # scale masks to [0, 1]
100 |
101 | print('Creating and compiling model...')
102 | model = get_unet()
103 | model_checkpoint = ModelCheckpoint(data_path + 'unet.hdf5', monitor='loss', save_best_only=True)
104 |
105 | print('Fitting model...')
106 | model.fit(imgs_train, imgs_mask_train, batch_size=32, nb_epoch=20, verbose=1, shuffle=True,
107 | callbacks=[model_checkpoint])
108 |
109 | print('Loading and preprocessing test data...')
110 | imgs_test, imgs_id_test = load_test_data()
111 | imgs_test = preprocess(imgs_test)
112 |
113 | imgs_test = imgs_test.astype('float32')
114 | imgs_test -= mean
115 | imgs_test /= std
116 |
117 | print('Loading saved weights...')
118 | model.load_weights(data_path + 'unet.hdf5')
119 |
120 | print('Predicting masks on test data...')
121 | imgs_mask_test = model.predict(imgs_test, verbose=1)
122 | np.save(data_path + 'imgs_mask_test.npy', imgs_mask_test)
123 |
124 |
125 | if __name__ == '__main__':
126 | train_and_predict()
127 | print('Complete.')
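128 | 
129 | 
130 | # --- Hedged aside (not part of the original script): a plain-numpy version of the
131 | # Dice coefficient above, handy for sanity-checking saved predictions offline.
132 | def dice_coef_np(y_true, y_pred):
133 |     y_true_f = y_true.flatten()
134 |     y_pred_f = y_pred.flatten()
135 |     intersection = np.sum(y_true_f * y_pred_f)
136 |     return (2. * intersection + smooth) / (np.sum(y_true_f) + np.sum(y_pred_f) + smooth)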
--------------------------------------------------------------------------------
/old/OttoGroup/README.md:
--------------------------------------------------------------------------------
1 | # Otto Group Product Classification Challenge
2 |
3 |
4 |
5 | View the competition details here.
6 |
7 | This directory includes the code I used to run experiments for the competition. I started very late (with only a few days remaining), so I didn't have much time to experiment, but I messed around with xgboost and Keras (a deep learning library) a bit.
8 |
9 | I used the Anaconda distribution of Python with the IPython kernel and the PyCharm IDE to run experiments, with some additional dependencies configured, such as a decent BLAS for Theano. The primary script is otto.py; the others are example scripts I got from various places.
--------------------------------------------------------------------------------
/old/OttoGroup/Resources/Grafik.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/OttoGroup/Resources/Grafik.jpg
--------------------------------------------------------------------------------
/old/OttoGroup/find_ensemble_weights.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from scipy.optimize import minimize
3 | from sklearn.cross_validation import StratifiedShuffleSplit
4 | from sklearn.ensemble import RandomForestClassifier
5 | from sklearn.linear_model import LogisticRegression
6 | from sklearn.metrics import log_loss
7 | import os
8 |
9 |
10 | def log_loss_func(weights):
11 | """
12 | scipy minimize will pass the weights as a numpy array
13 | """
14 | final_prediction = 0
15 | for weight, prediction in zip(weights, predictions):
16 | final_prediction += weight*prediction
17 |
18 | return log_loss(test_y, final_prediction)
19 |
20 |
21 | os.system("ls ../input")
22 |
23 | train = pd.read_csv("../input/train.csv")
24 | print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape))
25 |
26 | labels = train['target']
27 | train.drop(['target', 'id'], axis=1, inplace=True)
28 |
29 | print(train.head())
30 |
31 | # we need a test set that we didn't train on to find the best weights for combining the classifiers
32 | sss = StratifiedShuffleSplit(labels, test_size=0.05, random_state=1234)
33 | for train_index, test_index in sss:
34 | break
35 |
36 | train_x, train_y = train.values[train_index], labels.values[train_index]
37 | test_x, test_y = train.values[test_index], labels.values[test_index]
38 |
39 | # building the classifiers
40 | clfs = []
41 |
42 | rfc = RandomForestClassifier(n_estimators=50, random_state=4141, n_jobs=-1)
43 | rfc.fit(train_x, train_y)
44 | print('RFC LogLoss {score}'.format(score=log_loss(test_y, rfc.predict_proba(test_x))))
45 | clfs.append(rfc)
46 |
47 | # usually you'd use xgboost and neural nets here
48 | logreg = LogisticRegression()
49 | logreg.fit(train_x, train_y)
50 | print('LogisticRegression LogLoss {score}'.format(score=log_loss(test_y, logreg.predict_proba(test_x))))
51 | clfs.append(logreg)
52 |
53 | rfc2 = RandomForestClassifier(n_estimators=50, random_state=1337, n_jobs=-1)
54 | rfc2.fit(train_x, train_y)
55 | print('RFC2 LogLoss {score}'.format(score=log_loss(test_y, rfc2.predict_proba(test_x))))
56 | clfs.append(rfc2)
57 |
58 |
59 | # finding the optimum weights
60 | predictions = []
61 | for clf in clfs:
62 | predictions.append(clf.predict_proba(test_x))
63 |
64 | # the algorithm needs a starting value; right now we choose 0.5 for all weights
65 | # it's better to choose many random starting points and run minimize a few times (see the sketch at the end of this file)
66 | starting_values = [0.5] * len(predictions)
67 |
68 | # adding constraints and a different solver as suggested by user 16universe
69 | cons = ({'type': 'eq', 'fun': lambda w: 1-sum(w)})
70 |
71 | # our weights are bound between 0 and 1
72 | bounds = [(0, 1)] * len(predictions)
73 |
74 | res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)
75 |
76 | print('Ensemble Score: {best_score}'.format(best_score=res['fun']))
77 | print('Best Weights: {weights}'.format(weights=res['x']))
78 |
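79 | 
80 | # --- Hedged sketch (not part of the original script): as the comment above suggests,
81 | # minimize() can be run from several random starting points, keeping the best result.
82 | # It assumes the module-level predictions, bounds, cons and log_loss_func defined above.
83 | def find_best_weights(n_restarts=10, seed=1234):
84 |     import numpy as np
85 |     rng = np.random.RandomState(seed)
86 |     best = None
87 |     for _ in range(n_restarts):
88 |         start = rng.dirichlet(np.ones(len(predictions)))  # random weights that sum to 1
89 |         candidate = minimize(log_loss_func, start, method='SLSQP', bounds=bounds, constraints=cons)
90 |         if best is None or candidate['fun'] < best['fun']:
91 |             best = candidate
92 |     return best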
--------------------------------------------------------------------------------
/old/OttoGroup/graphlab_starter.py:
--------------------------------------------------------------------------------
1 | import graphlab as gl
2 | import math
3 | import random
4 |
5 | train = gl.SFrame.read_csv('data/train.csv')
6 | test = gl.SFrame.read_csv('data/test.csv')
7 | del train['id']
8 |
9 |
10 | def make_submission(m, test, filename):
11 | preds = m.predict_topk(test, output_type='probability', k=9)
12 | preds['id'] = preds['id'].astype(int) + 1
13 | preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '')
14 | preds = preds.sort('id')
15 | preds.save(filename)
16 |
17 |
18 | def multiclass_logloss(model, test):
19 | preds = model.predict_topk(test, output_type='probability', k=9)
20 | preds = preds.unstack(['class', 'probability'], 'probs').unpack('probs', '')
21 | preds['id'] = preds['id'].astype(int) + 1
22 | preds = preds.sort('id')
23 | preds['target'] = test['target']
24 | neg_log_loss = 0
25 | for row in preds:
26 | label = row['target']
27 | neg_log_loss += - math.log(row[label])
28 | return neg_log_loss / preds.num_rows()
29 |
30 |
31 | def shuffle(sf):
32 | sf['_id'] = [random.random() for i in xrange(sf.num_rows())]
33 | sf = sf.sort('_id')
34 | del sf['_id']
35 | return sf
36 |
37 |
38 | def evaluate_logloss(model, train, valid):
39 | return {'train_logloss': multiclass_logloss(model, train),
40 | 'valid_logloss': multiclass_logloss(model, valid)}
41 |
42 |
43 | params = {'target': 'target',
44 | 'max_iterations': 250,
45 | 'max_depth': 10,
46 | 'min_child_weight': 4,
47 | 'row_subsample': .9,
48 | 'min_loss_reduction': 1,
49 | 'column_subsample': .8,
50 | 'validation_set': None}
51 |
52 | train = shuffle(train)
53 |
54 | # Check performance on internal validation set
55 | tr, va = train.random_split(.8)
56 | m = gl.boosted_trees_classifier.create(tr, **params)
57 | print evaluate_logloss(m, tr, va)
58 |
59 | # Make final submission by using full training set
60 | m = gl.boosted_trees_classifier.create(train, **params)
61 | make_submission(m, test, 'submission.csv')
62 |
63 |
--------------------------------------------------------------------------------
/old/OttoGroup/keras_starter.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import print_function
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 | from keras.models import Sequential
8 | from keras.layers.core import Dense, Dropout, Activation
9 | from keras.layers.normalization import BatchNormalization
10 | from keras.layers.advanced_activations import PReLU
11 | from keras.utils import np_utils, generic_utils
12 |
13 | from sklearn.preprocessing import LabelEncoder
14 | from sklearn.preprocessing import StandardScaler
15 |
16 | '''
17 | This demonstrates how to reach a score of 0.4890 (local validation)
18 | on the Kaggle Otto challenge, with a deep net using Keras.
19 | Compatible Python 2.7-3.4
20 | Recommended to run on GPU:
21 | Command: THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python kaggle_otto_nn.py
22 | On EC2 g2.2xlarge instance: 19s/epoch. 6-7 minutes total training time.
23 | Best validation score at epoch 21: 0.4881
24 | Try it at home:
25 | - with/without BatchNormalization (BatchNormalization helps!)
26 | - with ReLU or with PReLU (PReLU helps!)
27 |     - with smaller layers, larger layers
28 |     - with more layers, fewer layers
29 | - with different optimizers (SGD+momentum+decay is probably better than Adam!)
30 | '''
31 |
32 | np.random.seed(1337) # for reproducibility
33 |
34 |
35 | def load_data(path, train=True):
36 | df = pd.read_csv(path)
37 | X = df.values.copy()
38 | if train:
39 | np.random.shuffle(X) # https://youtu.be/uyUXoap67N8
40 | X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
41 | return X, labels
42 | else:
43 | X, ids = X[:, 1:].astype(np.float32), X[:, 0].astype(str)
44 | return X, ids
45 |
46 |
47 | def preprocess_data(X, scaler=None):
48 | if not scaler:
49 | scaler = StandardScaler()
50 | scaler.fit(X)
51 | X = scaler.transform(X)
52 | return X, scaler
53 |
54 |
55 | def preprocess_labels(y, encoder=None, categorical=True):
56 | if not encoder:
57 | encoder = LabelEncoder()
58 |         encoder.fit(y)
59 |     y = encoder.transform(y).astype(np.int32)
60 | if categorical:
61 | y = np_utils.to_categorical(y)
62 | return y, encoder
63 |
64 |
65 | def make_submission(y_prob, ids, encoder, fname):
66 | with open(fname, 'w') as f:
67 | f.write('id,')
68 | f.write(','.join([str(i) for i in encoder.classes_]))
69 | f.write('\n')
70 | for i, probs in zip(ids, y_prob):
71 | probas = ','.join([i] + [str(p) for p in probs.tolist()])
72 | f.write(probas)
73 | f.write('\n')
74 | print("Wrote submission to file {}.".format(fname))
75 |
76 |
77 | print("Loading data...")
78 | X, labels = load_data('train.csv', train=True)
79 | X, scaler = preprocess_data(X)
80 | y, encoder = preprocess_labels(labels)
81 |
82 | X_test, ids = load_data('test.csv', train=False)
83 | X_test, _ = preprocess_data(X_test, scaler)
84 |
85 | nb_classes = y.shape[1]
86 | print(nb_classes, 'classes')
87 |
88 | dims = X.shape[1]
89 | print(dims, 'dims')
90 |
91 | print("Building model...")
92 |
93 | model = Sequential()
94 | model.add(Dense(dims, 512, init='glorot_uniform'))
95 | model.add(PReLU((512,)))
96 | model.add(BatchNormalization((512,)))
97 | model.add(Dropout(0.5))
98 |
99 | model.add(Dense(512, 512, init='glorot_uniform'))
100 | model.add(PReLU((512,)))
101 | model.add(BatchNormalization((512,)))
102 | model.add(Dropout(0.5))
103 |
104 | model.add(Dense(512, 512, init='glorot_uniform'))
105 | model.add(PReLU((512,)))
106 | model.add(BatchNormalization((512,)))
107 | model.add(Dropout(0.5))
108 |
109 | model.add(Dense(512, nb_classes, init='glorot_uniform'))
110 | model.add(Activation('softmax'))
111 |
112 | model.compile(loss='categorical_crossentropy', optimizer="adam")
113 |
114 | print("Training model...")
115 |
116 | model.fit(X, y, nb_epoch=20, batch_size=16, validation_split=0.15)
117 |
118 | print("Generating submission...")
119 |
120 | proba = model.predict_proba(X_test)
121 | make_submission(proba, ids, encoder, fname='keras-otto.csv')
122 |
123 |
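124 | # --- Hedged variation (not part of the original script): the docstring above suggests
125 | # trying SGD with momentum and decay instead of Adam. With this (old) Keras API that
126 | # could look like the helper below; the exact hyperparameters are assumptions.
127 | def compile_with_sgd(m):
128 |     from keras.optimizers import SGD
129 |     sgd = SGD(lr=0.01, momentum=0.9, decay=1e-6, nesterov=True)
130 |     m.compile(loss='categorical_crossentropy', optimizer=sgd)
131 |     return m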
--------------------------------------------------------------------------------
/old/OttoGroup/keras_wrapper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 |
5 | from sklearn.preprocessing import *
6 | from sklearn.ensemble import *
7 |
8 | from keras.models import Sequential
9 | from keras.layers.core import Dense, Dropout, Activation
10 | from keras.layers.normalization import BatchNormalization
11 | from keras.layers.advanced_activations import PReLU
12 | from keras.utils import np_utils
13 | from keras.wrappers.scikit_learn import KerasClassifier
14 |
15 |
16 | def load_training_data(path, filename):
17 | df = pd.read_csv(path + filename)
18 | X = df.values.copy()
19 | np.random.shuffle(X)
20 | X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
21 |
22 | return X, labels
23 |
24 |
25 | def create_scaler(X):
26 | scaler = StandardScaler()
27 | scaler.fit(X)
28 |
29 | return scaler
30 |
31 |
32 | def apply_scaler(X, scaler):
33 | return scaler.transform(X)
34 |
35 |
36 | def preprocess_labels(labels):
37 | encoder = LabelEncoder()
38 | encoder.fit(labels)
39 | y = encoder.transform(labels).astype(np.int32)
40 | y_onehot = np_utils.to_categorical(y)
41 |
42 | return y, y_onehot, encoder
43 |
44 |
45 | def define_model(num_features, num_classes):
46 | layer_size = 512
47 | init_method = 'glorot_uniform'
48 |
49 | model = Sequential()
50 | model.add(Dense(num_features, layer_size, init=init_method))
51 | model.add(PReLU((layer_size,)))
52 | model.add(BatchNormalization((layer_size,)))
53 | model.add(Dropout(0.5))
54 |
55 | model.add(Dense(layer_size, layer_size, init=init_method))
56 | model.add(PReLU((layer_size,)))
57 | model.add(BatchNormalization((layer_size,)))
58 | model.add(Dropout(0.5))
59 |
60 | model.add(Dense(layer_size, layer_size, init=init_method))
61 | model.add(PReLU((layer_size,)))
62 | model.add(BatchNormalization((layer_size,)))
63 | model.add(Dropout(0.5))
64 |
65 | model.add(Dense(layer_size, num_classes, init=init_method))
66 | model.add(Activation('softmax'))
67 |
68 | return model
69 |
70 |
71 | def main():
72 | code_dir = '/home/john/git/kaggle/OttoGroup/'
73 | data_dir = '/home/john/data/otto/'
74 | training_file = 'train.csv'
75 |
76 | os.chdir(code_dir)
77 | np.random.seed(1337)
78 |
79 | print('Starting script...')
80 |
81 | print('Loading data...')
82 | X, labels = load_training_data(data_dir, training_file)
83 |
84 | print('Pre-processing...')
85 | scaler = create_scaler(X)
86 | X = apply_scaler(X, scaler)
87 | y, y_onehot, encoder = preprocess_labels(labels)
88 | num_features = X.shape[1]
89 | num_classes = y_onehot.shape[1]
90 | print('Features = ' + str(num_features))
91 | print('Classes = ' + str(num_classes))
92 |
93 | print('Building model...')
94 | model = define_model(num_features, num_classes)
95 | print('Complete.')
96 |
97 | print('Training model...')
98 | wrapper = KerasClassifier(model)
99 | wrapper.fit(X, y_onehot, nb_epoch=20)
100 | print('Complete.')
101 |
102 | print('Training score = ' + str(wrapper.score(X, y_onehot)))
103 |
104 | preds = wrapper.predict(X)
105 | print('Predictions shape = ' + str(preds.shape))
106 |
107 | proba = wrapper.predict_proba(X)
108 | print('Probabilities shape = ' + str(proba.shape))
109 |
110 | print('Building ensemble...')
111 | ensemble = BaggingClassifier(wrapper, n_estimators=3, max_samples=1.0, max_features=1.0)
112 | print('Complete.')
113 |
114 | print('Training ensemble...')
115 | ensemble.fit(X, y)
116 | print('Complete.')
117 |
118 | print('Ensemble score = ' + str(ensemble.score(X, y)))
119 |
120 | print('Script complete.')
121 |
122 |
123 | if __name__ == "__main__":
124 | main()
125 |
--------------------------------------------------------------------------------
/old/OttoGroup/otto.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pandas as pd
4 | import numpy as np
5 |
6 | from sklearn.cross_validation import *
7 | from sklearn.preprocessing import *
8 | from sklearn.metrics import *
9 | from sklearn.ensemble import *
10 |
11 | import xgboost as xgb
12 |
13 | from keras.models import Sequential
14 | from keras.layers.core import Dense, Dropout, Activation
15 | from keras.layers.normalization import BatchNormalization
16 | from keras.layers.advanced_activations import PReLU
17 | from keras.utils import np_utils
18 |
19 |
20 | def predict_probability(X, model, scaler):
21 | X = apply_scaler(X, scaler)
22 | y_prob = model.predict_proba(X)
23 |
24 | return y_prob
25 |
26 |
27 | def score(X, y, model, scaler):
28 | X = apply_scaler(X, scaler)
29 | y_est = model.predict_proba(X)
30 |
31 | return log_loss(y, y_est)
32 |
33 |
34 | def load_training_data(path, filename):
35 | df = pd.read_csv(path + filename)
36 | X = df.values.copy()
37 | np.random.shuffle(X)
38 | X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]
39 |
40 | return X, labels
41 |
42 |
43 | def load_test_data(path, filename):
44 | df = pd.read_csv(path + filename)
45 | X = df.values.copy()
46 | X, ids = X[:, 1:].astype(np.float32), X[:, 0].astype(str)
47 |
48 | return X, ids
49 |
50 |
51 | def create_scaler(X):
52 | scaler = StandardScaler()
53 | scaler.fit(X)
54 |
55 | return scaler
56 |
57 |
58 | def apply_scaler(X, scaler):
59 | return scaler.transform(X)
60 |
61 |
62 | def preprocess_labels(labels):
63 | encoder = LabelEncoder()
64 | encoder.fit(labels)
65 | y = encoder.transform(labels).astype(np.int32)
66 | y_onehot = np_utils.to_categorical(y)
67 |
68 | return y, y_onehot, encoder
69 |
70 |
71 | def define_xgb_model():
72 | model = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=250, silent=True,
73 | objective="multi:softprob", nthread=-1, gamma=0, min_child_weight=4,
74 | max_delta_step=0, subsample=0.9, colsample_bytree=0.8, base_score=0.5, seed=0)
75 |
76 | return model
77 |
78 |
79 | def define_nn_model(num_features, num_classes):
80 | layer_size = 512
81 | init_method = 'glorot_uniform'
82 | loss_function = 'categorical_crossentropy'
83 | optimization_method = 'adam'
84 |
85 | model = Sequential()
86 | model.add(Dense(num_features, layer_size, init=init_method))
87 | model.add(PReLU((layer_size,)))
88 | model.add(BatchNormalization((layer_size,)))
89 | model.add(Dropout(0.5))
90 |
91 | model.add(Dense(layer_size, layer_size, init=init_method))
92 | model.add(PReLU((layer_size,)))
93 | model.add(BatchNormalization((layer_size,)))
94 | model.add(Dropout(0.5))
95 |
96 | model.add(Dense(layer_size, layer_size, init=init_method))
97 | model.add(PReLU((layer_size,)))
98 | model.add(BatchNormalization((layer_size,)))
99 | model.add(Dropout(0.5))
100 |
101 | model.add(Dense(layer_size, num_classes, init=init_method))
102 | model.add(Activation('softmax'))
103 |
104 | model.compile(loss=loss_function, optimizer=optimization_method)
105 |
106 | return model
107 |
108 |
109 | def train_xgb_model(X, y, model, scaler):
110 | t0 = time.time()
111 | X = apply_scaler(X, scaler)
112 | model.fit(X, y)
113 | t1 = time.time()
114 |     print('Model trained in {0:.3f} s.'.format(t1 - t0))
115 |
116 | return model
117 |
118 |
119 | def train_nn_model(X, y_onehot, model, scaler):
120 | t0 = time.time()
121 | X = apply_scaler(X, scaler)
122 | model.fit(X, y_onehot, nb_epoch=20, batch_size=16, verbose=0)
123 | t1 = time.time()
124 |     print('Model trained in {0:.3f} s.'.format(t1 - t0))
125 |
126 | return model
127 |
128 |
129 | def cross_validate_xgb(X, y, scaler, folds=3):
130 | model = define_xgb_model()
131 | X = apply_scaler(X, scaler)
132 | t0 = time.time()
133 |
134 | scores = []
135 | kf = KFold(y.shape[0], n_folds=folds, shuffle=True)
136 | for train_index, test_index in kf:
137 | model.fit(X[train_index], y[train_index])
138 | predictions = model.predict_proba(X[test_index])
139 | actuals = y[test_index]
140 | scores.append(log_loss(actuals, predictions))
141 |
142 | t1 = time.time()
143 |     print('Cross-validation completed in {0:.3f} s.'.format(t1 - t0))
144 |
145 | return np.mean(scores)
146 |
147 |
148 | def cross_validate_nn(X, y, y_onehot, scaler, num_features, num_classes, folds=3):
149 | model = define_nn_model(num_features, num_classes)
150 | X = apply_scaler(X, scaler)
151 | t0 = time.time()
152 |
153 | scores = []
154 | kf = KFold(y.shape[0], n_folds=folds, shuffle=True)
155 | for train_index, test_index in kf:
156 | model.fit(X[train_index], y_onehot[train_index], nb_epoch=20, batch_size=16, verbose=0)
157 | predictions = model.predict_proba(X[test_index])
158 | actuals = y[test_index]
159 | scores.append(log_loss(actuals, predictions))
160 |
161 | t1 = time.time()
162 |     print('Cross-validation completed in {0:.3f} s.'.format(t1 - t0))
163 |
164 | return np.mean(scores)
165 |
166 |
167 | def make_submission(y_prob, ids, encoder, path, filename):
168 | with open(path + filename, 'w') as f:
169 | f.write('id,')
170 | f.write(','.join([str(i) for i in encoder.classes_]))
171 | f.write('\n')
172 | for i, probabilities in zip(ids, y_prob):
173 | p = ','.join([i] + [str(p) for p in probabilities.tolist()])
174 | f.write(p)
175 | f.write('\n')
176 |
177 |
178 | def main():
179 | code_dir = '/home/john/git/kaggle/OttoGroup/'
180 | data_dir = '/home/john/data/otto-group/'
181 | training_file = 'train.csv'
182 | test_file = 'test.csv'
183 | submit_file = 'submission.csv'
184 |
185 | os.chdir(code_dir)
186 | np.random.seed(1337)
187 |
188 | print('Starting script...')
189 |
190 | print('Loading data...')
191 | X, labels = load_training_data(data_dir, training_file)
192 | X_test, ids = load_test_data(data_dir, test_file)
193 |
194 | print('Pre-processing...')
195 | scaler = create_scaler(X)
196 | y, y_onehot, encoder = preprocess_labels(labels)
197 | num_features = X.shape[1]
198 | num_classes = y_onehot.shape[1]
199 | print('Features = ' + str(num_features))
200 | print('Classes = ' + str(num_classes))
201 |
202 | print('Building model...')
203 | model = define_xgb_model()
204 |
205 | print('Training model...')
206 | model = train_xgb_model(X, y, model, scaler)
207 |
208 | print('Training score = ' + str(score(X, y, model, scaler)))
209 |
210 | print('Running cross-validation...')
211 | val_score = cross_validate_xgb(X, y, scaler)
212 | print('Cross-validation score = ' + str(val_score))
213 |
214 | print('Building ensemble...')
215 | ensemble = BaggingClassifier(model, n_estimators=5, max_samples=1.0, max_features=1.0)
216 |
217 | print('Training ensemble...')
218 | X = apply_scaler(X, scaler)
219 | ensemble.fit(X, y)
220 |
221 | print('Generating submission file...')
222 | y_prob = predict_probability(X_test, ensemble, scaler)
223 | make_submission(y_prob, ids, encoder, data_dir, submit_file)
224 |
225 | print('Script complete.')
226 |
227 |
228 | if __name__ == "__main__":
229 | main()
230 |
--------------------------------------------------------------------------------
/old/OttoGroup/simple_svm.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.svm import LinearSVC
4 | from sklearn.preprocessing import LabelEncoder
5 |
6 | train = pd.read_csv('../input/train.csv')
7 | test = pd.read_csv('../input/test.csv')
8 | sample_submission = pd.read_csv('../input/sampleSubmission.csv')
9 | training_labels = LabelEncoder().fit_transform(train['target'])
10 |
11 | # SVMs tend to like features that look similar to ~ N(0,1), so let's stabilise the long tails
12 | train_features = train.drop('target', axis=1)
13 | train_features[train_features > 4] = 4
14 |
15 | model = LinearSVC().fit(train_features, training_labels)
16 |
17 | scores = model.decision_function(test)
18 | predictions = 1.0 / (1.0 + np.exp(-scores))
19 | row_sums = predictions.sum(axis=1)
20 | predictions_normalised = predictions / row_sums[:, np.newaxis]
21 |
22 | # create submission file
23 | prediction_DF = pd.DataFrame(predictions_normalised, index=sample_submission.id.values,
24 | columns=sample_submission.columns[1:])
25 | prediction_DF.to_csv('svc_submission.csv', index_label='id')
26 |
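27 | # --- Hedged note (not part of the original script): the clipping above is applied only to
28 | # the training frame (which still contains the id column), while decision_function() sees
29 | # the raw test frame. A more symmetric treatment might look like:
30 | # train_features = train.drop(['id', 'target'], axis=1).clip(upper=4)
31 | # test_features = test.drop('id', axis=1).clip(upper=4)
32 | # model = LinearSVC().fit(train_features, training_labels)
33 | # scores = model.decision_function(test_features)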
--------------------------------------------------------------------------------
/old/OttoGroup/xgboost_walkthrough.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import xgboost as xgb
3 |
4 | from sklearn.cross_validation import KFold
5 | from sklearn.grid_search import GridSearchCV
6 | from sklearn.metrics import confusion_matrix, mean_squared_error
7 | from sklearn.datasets import load_iris, load_digits, load_boston
8 |
9 | rng = np.random.RandomState(31337)
10 |
11 | # load file from text file, also binary buffer generated by xgboost
12 | dtrain = xgb.DMatrix('../data/agaricus.txt.train')
13 | dtest = xgb.DMatrix('../data/agaricus.txt.test')
14 |
15 | # specify parameters via map, definition are same as c++ version
16 | param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic' }
17 |
18 | # specify validations set to watch performance
19 | watchlist = [(dtest, 'eval'), (dtrain, 'train')]
20 | num_round = 2
21 | bst = xgb.train(param, dtrain, num_round, watchlist)
22 |
23 | # this is prediction
24 | preds = bst.predict(dtest)
25 | labels = dtest.get_label()
26 | print ('error=%f' % (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) / float(len(preds))))
27 | bst.save_model('0001.model')
28 |
29 | # dump model
30 | bst.dump_model('dump.raw.txt')
31 |
32 | # dump model with feature map
33 | bst.dump_model('dump.nice.txt', '../data/featmap.txt')
34 |
35 | # save dmatrix into binary buffer
36 | dtest.save_binary('dtest.buffer')
37 | bst.save_model('xgb.model')
38 |
39 | # load model and data in
40 | bst2 = xgb.Booster(model_file='xgb.model')
41 | dtest2 = xgb.DMatrix('dtest.buffer')
42 | preds2 = bst2.predict(dtest2)
43 |
44 | # assert they are the same
45 | assert np.sum(np.abs(preds2-preds)) == 0
46 |
47 | print ('running cross validation')
48 | # do cross validation, this will print result out as
49 | # [iteration] metric_name:mean_value+std_value
50 | # std_value is standard deviation of the metric
51 | xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0)
52 |
53 | print ('running cross validation, disable standard deviation display')
54 | # do cross validation, this will print result out as
55 | # [iteration] metric_name:mean_value+std_value
56 | # std_value is standard deviation of the metric
57 | xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0, show_stdv=False)
58 |
59 | print ('running cross validation, with preprocessing function')
60 | # define the preprocessing function
61 | # used to return the preprocessed training, test data, and parameter
62 | # we can use this to do weight rescale, etc.
63 | # as an example, we try to set scale_pos_weight
64 | def fpreproc(dtrain, dtest, param):
65 | label = dtrain.get_label()
66 | ratio = float(np.sum(label == 0)) / np.sum(label == 1)
67 | param['scale_pos_weight'] = ratio
68 | return dtrain, dtest, param
69 |
70 | # do cross validation, for each fold
71 | # the dtrain, dtest, param will be passed into fpreproc
72 | # then the return value of fpreproc will be used to generate
73 | # results of that fold
74 | xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed=0, fpreproc=fpreproc)
75 |
76 | print("Zeros and Ones from the Digits dataset: binary classification")
77 | digits = load_digits(2)
78 | y = digits['target']
79 | X = digits['data']
80 | kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
81 | for train_index, test_index in kf:
82 | xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
83 | predictions = xgb_model.predict(X[test_index])
84 | actuals = y[test_index]
85 | print(confusion_matrix(actuals, predictions))
86 |
87 | print("Iris: multiclass classification")
88 | iris = load_iris()
89 | y = iris['target']
90 | X = iris['data']
91 | kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
92 | for train_index, test_index in kf:
93 | xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
94 | predictions = xgb_model.predict(X[test_index])
95 | actuals = y[test_index]
96 | print(confusion_matrix(actuals, predictions))
97 |
98 | print("Boston Housing: regression")
99 | boston = load_boston()
100 | y = boston['target']
101 | X = boston['data']
102 | kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
103 | for train_index, test_index in kf:
104 | xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
105 | predictions = xgb_model.predict(X[test_index])
106 | actuals = y[test_index]
107 | print(mean_squared_error(actuals, predictions))
108 |
109 | print("Parameter optimization")
110 | y = boston['target']
111 | X = boston['data']
112 | xgb_model = xgb.XGBRegressor()
113 | clf = GridSearchCV(xgb_model,
114 | {'max_depth': [2, 4, 6],
115 | 'n_estimators': [50, 100, 200]}, verbose=1)
116 | clf.fit(X, y)
117 | print(clf.best_score_)
118 | print(clf.best_params_)
119 |
120 |
--------------------------------------------------------------------------------
/old/PropertyInspection/README.md:
--------------------------------------------------------------------------------
1 | # Liberty Mutual Property Inspection Prediction Challenge
2 |
3 |
4 |
5 | View the competition details here.
6 |
7 | I used this competition primarily to develop my knowledge of ensembling (particularly averaging and stacking diverse models) and get familiar with deep learning using Keras. Unfortunately I wasn't able to spend enough time on it to get a decent score.
--------------------------------------------------------------------------------
/old/PropertyInspection/Resources/houses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdwittenauer/kaggle/cc489100a0c93315e424551f68968ffba85d268f/old/PropertyInspection/Resources/houses.png
--------------------------------------------------------------------------------
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/scripts/pyro_basics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pyro
3 | import pyro.distributions as dist
4 | from torch.autograd import Variable
5 |
6 |
7 | def weather():
8 | cloudy = pyro.sample('cloudy', dist.bernoulli,
9 | Variable(torch.Tensor([0.3])))
10 | cloudy = 'cloudy' if cloudy.data[0] == 1.0 else 'sunny'
11 | mean_temp = {'cloudy': [55.0], 'sunny': [75.0]}[cloudy]
12 | sigma_temp = {'cloudy': [10.0], 'sunny': [15.0]}[cloudy]
13 | temp = pyro.sample('temp', dist.normal,
14 | Variable(torch.Tensor(mean_temp)),
15 | Variable(torch.Tensor(sigma_temp)))
16 | return cloudy, temp.data[0]
17 |
18 |
19 | for _ in range(3):
20 | print(weather())
21 |
22 |
23 | def ice_cream_sales():
24 | cloudy, temp = weather()
25 | expected_sales = [200] if cloudy == 'sunny' and temp > 80.0 else [50]
26 | ice_cream = pyro.sample('ice_cream', dist.normal,
27 | Variable(torch.Tensor(expected_sales)),
28 | Variable(torch.Tensor([10.0])))
29 | return cloudy, temp, ice_cream.data[0]
30 |
31 |
32 | for _ in range(3):
33 | print(ice_cream_sales())
34 |
--------------------------------------------------------------------------------
/scripts/pytorch_basics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | from torch.autograd import Variable
6 |
7 |
8 | ##########################################################################
9 | # What Is PyTorch?
10 | ##########################################################################
11 |
12 | x = torch.Tensor(5, 3)
13 | print(x)
14 |
15 | x = torch.rand(5, 3)
16 | print(x)
17 | print(x.size())
18 |
19 | y = torch.rand(5, 3)
20 | print(x + y)
21 | print(torch.add(x, y))
22 |
23 | result = torch.Tensor(5, 3)
24 | torch.add(x, y, out=result)
25 | print(result)
26 |
27 | print(x[:, 1])
28 |
29 | x = torch.randn(4, 4)
30 | y = x.view(16)
31 | z = x.view(-1, 8)
32 | print(x.size(), y.size(), z.size())
33 |
34 | a = torch.ones(5)
35 | b = a.numpy()
36 | a.add_(1)
37 | print(a)
38 | print(b)
39 |
40 | if torch.cuda.is_available():
41 | x = x.cuda()
42 | y = y.cuda()
43 | print(x + y)
44 |
45 | ##########################################################################
46 | # Autograd: Automatic Differentiation
47 | ##########################################################################
48 |
49 | x = Variable(torch.ones(2, 2), requires_grad=True)
50 | print(x)
51 |
52 | y = x + 2
53 | print(y)
54 |
55 | print(y.grad_fn)
56 |
57 | z = y * y * 3
58 | out = z.mean()
59 |
60 | print(z, out)
61 |
62 | out.backward()
63 | print(x.grad)
64 |
65 | x = torch.randn(3)
66 | x = Variable(x, requires_grad=True)
67 | y = x * 2
68 | while y.data.norm() < 1000:
69 | y = y * 2
70 | print(y)
71 |
72 | gradients = torch.FloatTensor([0.1, 1.0, 0.0001])
73 | y.backward(gradients)
74 | print(x.grad)
75 |
76 | ##########################################################################
77 | # Neural Networks
78 | ##########################################################################
79 |
80 |
81 | class Net(nn.Module):
82 | def __init__(self):
83 | super(Net, self).__init__()
84 | self.conv1 = nn.Conv2d(1, 6, 5)
85 | self.conv2 = nn.Conv2d(6, 16, 5)
86 | self.fc1 = nn.Linear(16 * 5 * 5, 120)
87 | self.fc2 = nn.Linear(120, 84)
88 | self.fc3 = nn.Linear(84, 10)
89 |
90 | def forward(self, x):
91 | x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
92 | x = F.max_pool2d(F.relu(self.conv2(x)), 2)
93 | x = x.view(-1, self.num_flat_features(x))
94 | x = F.relu(self.fc1(x))
95 | x = F.relu(self.fc2(x))
96 | x = self.fc3(x)
97 | return x
98 |
99 | def num_flat_features(self, x):
100 | size = x.size()[1:]
101 | num_features = 1
102 | for s in size:
103 | num_features *= s
104 | return num_features
105 |
106 |
107 | net = Net()
108 | print(net)
109 |
110 | params = list(net.parameters())
111 | print(len(params))
112 | print(params[0].size())
113 |
114 | input = Variable(torch.randn(1, 1, 32, 32))
115 | out = net(input)
116 | print(out)
117 |
118 | output = net(input)
119 | target = Variable(torch.arange(1, 11)).view(1, -1)  # reshape to match the (1, 10) output
120 | criterion = nn.MSELoss()
121 | loss = criterion(output, target)
122 | print(loss)
123 |
124 | print(loss.grad_fn)
125 | print(loss.grad_fn.next_functions[0][0])
126 | print(loss.grad_fn.next_functions[0][0].next_functions[0][0])
127 |
128 | net.zero_grad()
129 | print('conv1.bias.grad before backward')
130 | print(net.conv1.bias.grad)
131 |
132 | loss.backward()
133 | print('conv1.bias.grad after backward')
134 | print(net.conv1.bias.grad)
135 |
136 | optimizer = optim.SGD(net.parameters(), lr=0.01)
137 | optimizer.zero_grad()
138 | output = net(input)
139 | loss = criterion(output, target)
140 | loss.backward()
141 | optimizer.step()
142 | print(loss)
143 |
--------------------------------------------------------------------------------
/scripts/pytorch_embedding.py:
--------------------------------------------------------------------------------
1 | import time, torch  # 'time' is needed by fit() below
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | from torch.utils.data import DataLoader, TensorDataset
6 |
7 | class TorchEmbeddingNet(nn.Module):
8 | def __init__(self, cat_vars, cont_vars, embedding_sizes):
9 |         super(TorchEmbeddingNet, self).__init__()
10 | self.embeddings = nn.ModuleList([nn.Embedding(c, s) for c, s in embedding_sizes])
11 |
12 | self.n_cat = len(cat_vars)
13 | self.n_cont = len(cont_vars)
14 | self.n_embed = sum(e.embedding_dim for e in self.embeddings)
15 |
16 | self.fc1 = nn.Linear(self.n_embed + self.n_cont, 1000)
17 | self.fc2 = nn.Linear(1000, 500)
18 | self.out = nn.Linear(500, 1)
19 |
20 | self.bn_cont = nn.BatchNorm1d(self.n_cont)
21 | self.bn1 = nn.BatchNorm1d(1000)
22 | self.bn2 = nn.BatchNorm1d(500)
23 |
24 | self.d_embed = nn.Dropout(0.04)
25 | self.d1 = nn.Dropout(0.001)
26 | self.d2 = nn.Dropout(0.01)
27 |
28 | for e in self.embeddings:
29 | e = e.weight.data
30 | sc = 2 / (e.size(1) + 1)
31 | e.uniform_(-sc, sc)
32 |
33 | nn.init.kaiming_normal(self.fc1.weight.data)
34 | nn.init.kaiming_normal(self.fc2.weight.data)
35 | nn.init.kaiming_normal(self.out.weight.data)
36 |
37 | def forward(self, x_cat, x_cont):
38 | x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
39 | x = torch.cat(x, 1)
40 | x = self.d_embed(x)
41 |
42 | x2 = self.bn_cont(x_cont)
43 | x = torch.cat([x, x2], 1)
44 |
45 | x = F.relu(self.fc1(x))
46 | x = self.bn1(x)
47 | x = self.d1(x)
48 |
49 | x = F.relu(self.fc2(x))
50 | x = self.bn2(x)
51 | x = self.d2(x)
52 |
53 | x = self.out(x)
54 |
55 | return x
56 |
57 | model = TorchEmbeddingNet(cat_vars, cont_vars, embedding_sizes)
58 | loss_fn = nn.L1Loss()
59 | optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
60 | print(model)
61 |
62 | def create_dataloaders(X, y, batch_size, val_data=None):
63 | X_cat = X[cat_vars].values.astype('int64')
64 | X_cont = X[cont_vars].values.astype('float32')
65 | y = y.values.astype('float32')
66 | train_ds = TensorDataset(torch.from_numpy(X_cat), torch.from_numpy(X_cont), torch.from_numpy(y))
67 | train_dl = DataLoader(train_ds, batch_size)
68 | if val_data is not None:
69 | X_val, y_val = val_data
70 | X_val_cat = X_val[cat_vars].values.astype('int64')
71 | X_val_cont = X_val[cont_vars].values.astype('float32')
72 | y_val = y_val.values.astype('float32')
73 | val_ds = TensorDataset(torch.from_numpy(X_val_cat), torch.from_numpy(X_val_cont), torch.from_numpy(y_val))
74 | val_dl = DataLoader(val_ds, batch_size)
75 | return train_dl, val_dl
76 | else:
77 | return train_dl
78 |
79 | train_dl, val_dl = create_dataloaders(X, y, batch_size, val_data=(X_val, y_val))
80 |
81 | def fit(model, optimizer, loss_fn, train_dl, n_epochs, val_dl=None):
82 | for epoch in range(n_epochs):
83 | t0 = time.time()
84 | model.train()
85 | epoch_loss = 0.0
86 | epoch_val_loss = 0.0
87 | steps = 0
88 | val_steps = 0
89 | for i, data in enumerate(train_dl, 0):
90 | X_cat, X_cont, y = data
91 | y = y.view(-1, 1)
92 | y_pred = model(X_cat, X_cont)
93 | loss = loss_fn(y_pred, y)
94 | optimizer.zero_grad()
95 | loss.backward()
96 | optimizer.step()
97 | epoch_loss += loss.item()
98 | steps += 1
99 | if val_dl is not None:
100 | model.eval()
101 | for i, data in enumerate(val_dl, 0):
102 | X_cat, X_cont, y = data
103 | y = y.view(-1, 1)
104 | y_pred = model(X_cat, X_cont)
105 | val_loss = loss_fn(y_pred, y)
106 | epoch_val_loss += val_loss.item()
107 | val_steps += 1
108 | t1 = time.time()
109 | print('[Epoch {0:d}] loss: {1:.3f} | val loss: {2:.3f} | {3:.0f} s'.format(
110 | epoch + 1, epoch_loss / steps, epoch_val_loss / val_steps, t1 - t0))
111 | else:
112 | t1 = time.time()
113 | print('[Epoch {0:d}] loss: {1:.3f} | {2:.0f} s'.format(epoch + 1, epoch_loss / steps, t1 - t0))
114 |
115 | fit(model, optimizer, loss_fn, train_dl, n_epochs=n_epochs, val_dl=val_dl)
116 |
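117 | # --- Hedged note (not part of the original script): this file is a fragment and assumes
118 | # that cat_vars, cont_vars, embedding_sizes, X, y, X_val, y_val, batch_size and n_epochs
119 | # are defined elsewhere. One common (assumed) way to size the embeddings from a DataFrame df:
120 | # embedding_sizes = [(df[c].nunique() + 1, min(50, (df[c].nunique() + 1) // 2)) for c in cat_vars]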
--------------------------------------------------------------------------------
/scripts/pytorch_mnist.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | import torch.optim as optim
7 | from torchvision import datasets, transforms
8 | from torch.autograd import Variable
9 |
10 |
11 | class MnistModel(nn.Module):
12 | def __init__(self):
13 | super(MnistModel, self).__init__()
14 | # input is 28x28
15 | # padding=2 for same padding
16 | self.conv1 = nn.Conv2d(1, 32, 5, padding=2)
17 | # feature map size is 14*14 by pooling
18 | # padding=2 for same padding
19 | self.conv2 = nn.Conv2d(32, 64, 5, padding=2)
20 | # feature map size is 7*7 by pooling
21 | self.fc1 = nn.Linear(64 * 7 * 7, 1024)
22 | self.fc2 = nn.Linear(1024, 10)
23 |
24 | def forward(self, x):
25 | x = F.max_pool2d(F.relu(self.conv1(x)), 2)
26 | x = F.max_pool2d(F.relu(self.conv2(x)), 2)
27 | x = x.view(-1, 64 * 7 * 7) # reshape Variable
28 | x = F.relu(self.fc1(x))
29 | x = F.dropout(x, training=self.training)
30 | x = self.fc2(x)
31 | return F.log_softmax(x)
32 |
33 |
34 | model = MnistModel()
35 | batch_size = 50
36 |
37 | train_loader = torch.utils.data.DataLoader(
38 | datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()),
39 | batch_size=batch_size, shuffle=True)
40 | test_loader = torch.utils.data.DataLoader(
41 | datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
42 | batch_size=1000)
43 |
44 | for p in model.parameters():
45 | print(p.size())
46 |
47 | optimizer = optim.Adam(model.parameters(), lr=0.0001)
48 | model.train()
49 | train_loss = []
50 | train_accu = []
51 | i = 0
52 | for epoch in range(15):
53 | for data, target in train_loader:
54 | data, target = Variable(data), Variable(target)
55 | optimizer.zero_grad()
56 | output = model(data)
57 | loss = F.nll_loss(output, target)
58 | loss.backward() # calc gradients
59 |         train_loss.append(loss.item())
60 | optimizer.step() # update gradients
61 |         prediction = output.data.max(1)[1]  # max(1) returns (values, indices); take the class indices
62 |         accuracy = prediction.eq(target.data).sum().item() / batch_size * 100
63 | train_accu.append(accuracy)
64 | if i % 1000 == 0:
65 |             print('Train Step: {}\tLoss: {:.3f}\tAccuracy: {:.3f}'.format(i, loss.item(), accuracy))
66 | i += 1
67 |
68 | plt.plot(np.arange(len(train_loss)), train_loss)
69 | plt.plot(np.arange(len(train_accu)), train_accu)
70 |
71 | model.eval()
72 | correct = 0
73 | with torch.no_grad():  # replaces deprecated volatile=True; no autograd graph during evaluation
74 |     for data, target in test_loader:
75 |         output = model(data)
76 |         prediction = output.data.max(1)[1]
77 |         correct += prediction.eq(target.data).sum().item()
78 | 
79 | print('\nTest set: Accuracy: {:.2f}%'.format(100. * correct / len(test_loader.dataset)))
80 |
--------------------------------------------------------------------------------
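If a CUDA device is available, the training loop above can be adapted to run on the GPU by moving the model and each batch to the device. A minimal sketch, reusing MnistModel, optim, F, and train_loader from pytorch_mnist.py (illustrative only, not the committed code):

    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MnistModel().to(device)                 # move parameters to the selected device
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    model.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)  # batches must live on the same device
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()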
/scripts/tf_basics.py:
--------------------------------------------------------------------------------
1 | ##########################################################################
2 | # Graph Basics
3 | ##########################################################################
4 |
5 | import tensorflow as tf
6 |
7 | # Create a Constant op that produces a 1x2 matrix. The op is
8 | # added as a node to the default graph.
9 | #
10 | # The value returned by the constructor represents the output
11 | # of the Constant op.
12 | matrix1 = tf.constant([[3., 3.]])
13 |
14 | # Create another Constant that produces a 2x1 matrix.
15 | matrix2 = tf.constant([[2.], [2.]])
16 |
17 | # Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs.
18 | # The returned value, 'product', represents the result of the matrix
19 | # multiplication.
20 | product = tf.matmul(matrix1, matrix2)
21 |
22 | # Launch the default graph.
23 | sess = tf.Session()
24 |
25 | # To run the matmul op we call the session 'run()' method, passing 'product'
26 | # which represents the output of the matmul op. This indicates to the call
27 | # that we want to get the output of the matmul op back.
28 | #
29 | # All inputs needed by the op are run automatically by the session. They
30 | # typically are run in parallel.
31 | #
32 | # The call 'run(product)' thus causes the execution of three ops in the
33 | # graph: the two constants and matmul.
34 | #
35 | # The output of the op is returned in 'result' as a numpy `ndarray` object.
36 | result = sess.run(product)
37 | print(result)
38 | # ==> [[ 12.]]
39 |
40 | # Close the Session when we're done.
41 | sess.close()
42 |
43 | ##########################################################################
44 | # Interactive Usage
45 | ##########################################################################
46 |
47 | # Enter an interactive TensorFlow Session.
48 | sess = tf.InteractiveSession()
49 |
50 | x = tf.Variable([1.0, 2.0])
51 | a = tf.constant([3.0, 3.0])
52 |
53 | # Initialize 'x' using the run() method of its initializer op.
54 | x.initializer.run()
55 |
56 | # Add an op to subtract 'a' from 'x'. Run it and print the result
57 | sub = tf.subtract(x, a)  # tf.sub was renamed to tf.subtract in TensorFlow 1.0
58 | print(sub.eval())
59 | # ==> [-2. -1.]
60 |
61 | # Close the Session when we're done.
62 | sess.close()
63 |
64 | ##########################################################################
65 | # Variables
66 | ##########################################################################
67 |
68 | # Create a Variable that will be initialized to the scalar value 0.
69 | state = tf.Variable(0, name="counter")
70 |
71 | # Create an Op to add one to `state`.
72 |
73 | one = tf.constant(1)
74 | new_value = tf.add(state, one)
75 | update = tf.assign(state, new_value)
76 |
77 | # Variables must be initialized by running an `init` Op after having
78 | # launched the graph. We first have to add the `init` Op to the graph.
79 | init_op = tf.global_variables_initializer()
80 |
81 | # Launch the graph and run the ops.
82 | with tf.Session() as sess:
83 | # Run the 'init' op
84 | sess.run(init_op)
85 | # Print the initial value of 'state'
86 | print(sess.run(state))
87 | # Run the op that updates 'state' and print 'state'.
88 | for _ in range(3):
89 | sess.run(update)
90 | print(sess.run(state))
91 |
92 | # output:
93 |
94 | # 0
95 | # 1
96 | # 2
97 | # 3
98 |
99 | ##########################################################################
100 | # Fetches
101 | ##########################################################################
102 |
103 | input1 = tf.constant(3.0)
104 | input2 = tf.constant(2.0)
105 | input3 = tf.constant(5.0)
106 | intermed = tf.add(input2, input3)
107 | mul = tf.multiply(input1, intermed)
108 |
109 | with tf.Session() as sess:
110 | result = sess.run([mul, intermed])
111 | print(result)
112 |
113 | # output:
114 | # [array([ 21.], dtype=float32), array([ 7.], dtype=float32)]
115 |
116 | ##########################################################################
117 | # Feeds
118 | ##########################################################################
119 |
120 | input1 = tf.placeholder(tf.float32)
121 | input2 = tf.placeholder(tf.float32)
122 | output = tf.multiply(input1, input2)
123 |
124 | with tf.Session() as sess:
125 | print(sess.run([output], feed_dict={input1: [7.], input2: [2.]}))
126 |
127 | # output:
128 | # [array([ 14.], dtype=float32)]
129 |
--------------------------------------------------------------------------------
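tf_basics.py targets the TensorFlow 1.x graph/session API. On TensorFlow 2.x the same style can still be exercised through the v1 compatibility module; a minimal sketch of the first example under that assumption:

    import tensorflow.compat.v1 as tf
    tf.disable_eager_execution()        # restore graph-plus-session semantics

    matrix1 = tf.constant([[3., 3.]])
    matrix2 = tf.constant([[2.], [2.]])
    product = tf.matmul(matrix1, matrix2)

    with tf.Session() as sess:
        print(sess.run(product))        # ==> [[12.]]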
/scripts/tf_mnist.py:
--------------------------------------------------------------------------------
1 | # Import the MNIST data set.
2 | from tensorflow.examples.tutorials.mnist import input_data
3 | mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
4 |
5 | # Define some initial variables.
6 | import tensorflow as tf
7 | x = tf.placeholder(tf.float32, [None, 784])
8 | W = tf.Variable(tf.zeros([784, 10]))
9 | b = tf.Variable(tf.zeros([10]))
10 |
11 | # Implement the model using the built-in softmax function.
12 | y = tf.nn.softmax(tf.matmul(x, W) + b)
13 |
14 | # Define the operation to compute cross-entropy.
15 | y_ = tf.placeholder(tf.float32, [None, 10])
16 | cross_entropy = -tf.reduce_sum(y_ * tf.log(y))
17 |
18 | # Use a built-in optimization algorithm to define how to proceed with training.
19 | train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
20 |
21 | # Create a session and initialize the variables.
22 | init = tf.global_variables_initializer()
23 | sess = tf.Session()
24 | sess.run(init)
25 |
26 | # Run the training algorithm for 1000 iterations.
27 | for i in range(1000):
28 | batch_xs, batch_ys = mnist.train.next_batch(100)
29 | sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
30 |
31 | # Calculate the accuracy of the trained model.
32 | correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
33 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
34 | print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
--------------------------------------------------------------------------------
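The hand-rolled cross-entropy in tf_mnist.py, -tf.reduce_sum(y_ * tf.log(y)), can be numerically unstable when predicted probabilities approach zero. A common alternative is to keep the un-normalized logits and use TensorFlow's fused softmax cross-entropy; a sketch using the same x, W, b, and y_ names from the script:

    logits = tf.matmul(x, W) + b        # un-normalized class scores
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)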