├── lib ├── __init__.py ├── preprocessing.py └── ml.py ├── README.md ├── requirements.txt ├── run_model.py └── LICENSE /lib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['preprocessing', 'ml'] 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | kaggle_ctr_model 2 | ================ 3 | 4 | Kaggle Avazu beat-the-benchmark model. Very low memory implementation, scores 0.4037 with a lot of room for feature engineering improvements. 5 | 6 | 7 | Usage: 8 | 9 | 1) Get the Data: 10 | ``` 11 | mkdir original_data 12 | ``` 13 | Download the data from https://www.kaggle.com/c/avazu-ctr-prediction/data into original_data directory. 14 | 15 | 2) Subset the data (feel free to use more or less): 16 | ``` 17 | mkdir submissions 18 | mkdir modified_data 19 | (head -1 original_data/train && tail -10000000 original_data/train) > modified_data/sub_train10 20 | ```` 21 | 3) Run the model: 22 | ``` 23 | python run_model.py --neg_rate=.05 --submission_num=1 --n_iter=250 --train_path=modified_data/sub_train10 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /lib/preprocessing.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | 3 | 4 | features = ['hour', 5 | 'banner_pos', 6 | 'site_id', 7 | 'site_domain', 8 | 'site_category', 9 | 'app_id', 10 | 'app_domain', 11 | 'app_category', 12 | 'device_id', 13 | 'device_ip', 14 | 'device_model', 15 | 'device_type', 16 | 'device_conn_type', 17 | 'C1', 18 | 'C15', 19 | 'C16', 20 | 'C20', 21 | 'C14', 22 | 'C15', 23 | 'C17', 24 | 'C19', 25 | 'C21'] 26 | 27 | 28 | def _make_interact(iterable): 29 | return ':'.join(iterable) 30 | 31 | 32 | def _clean_row(X): 33 | X['hour'] = str(int(X['hour']) % 24) 34 | 35 | 36 | def clean_parse_row(row, features=features): 37 | 38 | _clean_row(row) 39 | 40 | for k in features: 41 | yield (':'.join((k, row[k])), 1.0) 42 | 43 | for j in features: 44 | if k != j: 45 | yield (_make_interact([k, j, row[k],row[j]]), 1.0) 46 | 47 | yield (_make_interact([row['app_category'], row['site_category'], row['banner_pos']]), 1.0) 48 | 49 | 50 | def get_int_field(field, path): 51 | with open(path) as f: 52 | for row in DictReader(f): 53 | yield (int(row[field]) if row[field] != '' else 0) 54 | 55 | 56 | def data_generator(parser, path, rowfeatures=features): 57 | """ 58 | Needs to return vanilla str, not numpy str to work with hashing 59 | """ 60 | with open(path) as f: 61 | for row in DictReader(f): 62 | yield (parser(row, rowfeatures)) 63 | 64 | 65 | def write_submission(number, ids, preds): 66 | with open('submissions/submission{}.csv'.format(number), 'w') as f: 67 | f.write('id,click\n') 68 | for id_, prob in zip(ids, preds): 69 | f.write('%s,%f\n' % (id_, prob)) 70 | 71 | 72 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython==0.20.1 2 | DataShape==0.1.1 3 | Jinja2==2.7.3 4 | MDP==3.3 5 | MarkupSafe==0.23 6 | MySQL-python==1.2.5 7 | PIL==1.1.7 8 | PyAudio==0.2.7 9 | PySAL==1.6.0 10 | PyYAML==3.11 11 | Pygments==1.6 12 | Sphinx==1.2.1 13 | VTK==5.10.1 14 | WTForms==2.0.1 15 | Werkzeug==0.9.6 16 | XlsxWriter==0.5.2 17 | apptools==4.2.1 18 | argcomplete==0.6.7 19 | astropy==0.3 20 | atom==0.3.7 21 | backports.ssl-match-hostname==3.4.0.2 22 | beautifulsoup4==4.3.1 23 | binstar==0.4.4 24 | biopython==1.63 25 | bitarray==0.8.1 26 | blaze==0.4.2 27 | blinker==1.3 28 | blz==0.6.1 29 | bokeh==0.6.1 30 | boto==2.25.0 31 | casuarius==1.1 32 | cdecimal==2.3 33 | chaco==4.4.1 34 | colorama==0.2.7 35 | conda==3.7.1 36 | conda-build==1.2.0 37 | configobj==4.7.2 38 | cubes==0.10.2 39 | cytoolz==0.7.0 40 | docopt==0.6.2 41 | docutils==0.11 42 | enable==4.3.0 43 | enaml==0.9.1 44 | envisage==4.4.0 45 | future==0.11.2 46 | geopy==1.1.2 47 | gevent==1.0.1 48 | gevent-websocket==0.9.3 49 | gevent-zeromq==0.2.2 50 | gnureadline==6.3.3 51 | greenlet==0.4.4 52 | grin==1.2.1 53 | h5py==2.2.1 54 | ipython==2.1.0 55 | itsdangerous==0.24 56 | joblib==0.8.2 57 | keyring==3.3 58 | kiwisolver==0.1.2 59 | llvmpy==0.12.3 60 | lxml==3.3.1 61 | matplotlib==1.3.1 62 | mayavi==4.3.1 63 | mimerender==0.5.4 64 | mock==1.0.1 65 | netCDF4==1.0.8 66 | networkx==1.8.1 67 | nltk==2.0.4 68 | nose==1.3.0 69 | numba==0.12.1 70 | numexpr==2.3.1 71 | numpy==1.8.2 72 | openpyxl==1.8.2 73 | pandas==0.14.1 74 | passlib==1.6.2 75 | patsy==0.2.1 76 | pep8==1.4.6 77 | plotly==1.3.0 78 | ply==3.4 79 | psutil==1.2.1 80 | py==1.4.20 81 | pycosat==0.6.1 82 | pycparser==2.10 83 | pycrypto==2.6.1 84 | pycurl==7.19.0 85 | pyface==4.4.0 86 | pyflakes==0.7.3 87 | pykit==0.2.0 88 | pyparsing==2.0.1 89 | pysam==0.6 90 | pytest==2.5.2 91 | python-dateutil==2.2 92 | python-mimeparse==0.1.4 93 | scikit-image==0.9.3 94 | scikit-learn==0.15.1 95 | scipy==0.14.0 96 | simplejson==3.6.2 97 | six==1.8.0 98 | statsmodels==0.5.0 99 | sympy==0.7.4.1 100 | tables==3.1.0 101 | traits==4.4.0 102 | traitsui==4.4.0 103 | ujson==1.33 104 | virtualenv==1.11.6 105 | wsgiref==0.1.2 106 | xlrd==0.9.2 107 | xlwt==0.7.5 108 | -------------------------------------------------------------------------------- /run_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from sklearn.feature_extraction import FeatureHasher 4 | from sklearn.linear_model import SGDClassifier 5 | 6 | import lib.ml as ml 7 | import lib.preprocessing as pp 8 | 9 | 10 | def main(neg_rate, submission_num, n_iter, train_path): 11 | ids = [x for x in pp.get_int_field('id', 'original_data/test')] 12 | clicks = pp.get_int_field('click', train_path) 13 | # Get Data Generators 14 | train = pp.data_generator(pp.clean_parse_row, train_path) 15 | test = pp.data_generator(pp.clean_parse_row, 'original_data/test') 16 | 17 | # Define estimators 18 | fh = FeatureHasher(n_features=2 ** 20, input_type='pair') 19 | sgd = SGDClassifier(loss='log', n_iter=1, alpha=.003, penalty='l2') 20 | 21 | #Fit pipeline 22 | pipeline = ml.PartialFitter([fh, sgd], 23 | batch_size=10000, 24 | logging=True, 25 | n_iter=n_iter, 26 | neg_rate=neg_rate) 27 | 28 | pipeline.partial_fit(X=train, y=clicks) 29 | # Correct Intercept 30 | pipeline.steps[-1].intercept_[0] += np.log(neg_rate) 31 | preds = pipeline.predict_proba(newX=test)[:, 1] 32 | pp.write_submission(number=submission_num, ids=ids, preds=preds) 33 | 34 | 35 | if __name__ == '__main__': 36 | logging.basicConfig(filename='train_errors.log', level=logging.WARNING) 37 | from docopt import docopt 38 | import sys 39 | 40 | usage = '''Train SGD model and create entry for Kaggle Avazu Competition. 41 | Usage: 42 | %(program_name)s --neg_rate= --submission_num= --n_iter= --train_path=

43 | %(program_name)s (-h | --help) 44 | Options: 45 | -h --help Show this screen. 46 | --neg_rate= Rate at which to sample negative cases 47 | --submission_num= Submission number. 48 | --n_iter= Number of fitting iterations over training data 49 | --train_path=

Path to training dataset 50 | ''' % {'program_name': sys.argv[0]} 51 | 52 | arguments = docopt(usage) 53 | 54 | main(np.float(arguments['--neg_rate']), 55 | arguments['--submission_num'], 56 | np.int(arguments['--n_iter']), 57 | arguments['--train_path'] 58 | ) 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /lib/ml.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | 4 | import cytoolz.itertoolz as toolz 5 | import numpy as np 6 | import scipy as sp 7 | 8 | from sklearn.metrics import make_scorer 9 | from sklearn.base import BaseEstimator, TransformerMixin 10 | 11 | 12 | class PartialFitter(BaseEstimator, TransformerMixin): 13 | def __init__(self, 14 | steps, 15 | batch_size=2000, 16 | n_iter=10, 17 | logging=False, 18 | neg_rate=None): 19 | 20 | self.batch_size = batch_size 21 | self.steps = steps 22 | self.logging = logging 23 | self.n_iter = n_iter 24 | self.neg_rate = neg_rate 25 | 26 | def partial_fit(self, X, y): 27 | 28 | for i, batch in enumerate(toolz.partition_all(self.batch_size, X)): 29 | print 'batch {}'.format(i) 30 | 31 | rows = [] 32 | response = [] 33 | 34 | for row in batch: 35 | 36 | try: 37 | row_y = y.next() 38 | if self.check_response(row_y): 39 | row = self._transform(row) 40 | rows.append(row) 41 | response.append(row_y) 42 | except Exception as e: 43 | if self.logging: 44 | logging.exception(e) 45 | 46 | shuffledRange = range(len(rows)) 47 | # need to shuffle data during each iteration 48 | for _ in range(self.n_iter): 49 | random.shuffle(shuffledRange) 50 | batch_data = sp.sparse.vstack([rows[i] for i in shuffledRange]) 51 | shuffled_response = [response[i] for i in shuffledRange] 52 | self.steps[-1].partial_fit(batch_data, shuffled_response, classes=[0, 1]) 53 | 54 | def predict_proba(self, newX): 55 | 56 | all_preds = [] 57 | for batch in toolz.partition_all(self.batch_size, newX): 58 | pred_rows = [] 59 | for newrow in batch: 60 | newrow = self._transform(newrow) 61 | pred_rows.append(newrow) 62 | test_data = sp.sparse.vstack(pred_rows) 63 | all_preds.append(self.steps[-1].predict_proba(test_data)) 64 | 65 | return np.vstack(all_preds) 66 | 67 | def _transform(self, datapoint): 68 | for t in self.steps[:-1]: 69 | # Requiers stateless transformers 70 | datapoint = t.transform([datapoint]) 71 | return datapoint 72 | 73 | def check_response(self, response_i): 74 | if self.neg_rate is not None: 75 | return (response_i == 1 or np.random.random() < self.neg_rate) 76 | else: 77 | return True 78 | 79 | 80 | def llfun(act, pred): 81 | p_true = pred[:, 1] 82 | epsilon = 1e-15 83 | p_true = sp.maximum(epsilon, p_true) 84 | p_true = sp.minimum(1 - epsilon, p_true) 85 | ll = sum(act * sp.log(p_true) + sp.subtract(1, act) * sp.log(sp.subtract(1, p_true))) 86 | ll = ll * -1.0 / len(act) 87 | return ll 88 | 89 | 90 | def _make_scorer(score_function): 91 | return make_scorer(score_function, 92 | greater_is_better=False, 93 | needs_proba=True) 94 | 95 | 96 | ll_scorer = _make_scorer(llfun) 97 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | --------------------------------------------------------------------------------