├── 3rdparty ├── fast_retraining │ ├── experiments │ │ ├── __init__.py │ │ ├── libs │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── metrics.py │ │ │ ├── timer.py │ │ │ ├── notebook_memory_management.py │ │ │ ├── planet_kaggle.py │ │ │ ├── conversion.py │ │ │ ├── loaders.py │ │ │ └── football.py │ │ ├── 02_BCI_GPU.ipynb │ │ ├── 04_PlanetKaggle.ipynb │ │ ├── 04_PlanetKaggle_GPU.ipynb │ │ └── 06_HIGGS.ipynb │ ├── environment │ │ ├── deactivate_env_vars.sh │ │ └── activate_env_vars.sh │ ├── requirements.txt │ ├── LICENSE │ ├── .gitignore │ ├── README.md │ └── INSTALL.md ├── README.md └── codebase │ └── python │ └── machine_learning │ └── metrics.py ├── .gitlab-ci.yml ├── .gitignore ├── LICENSE ├── metrics.py ├── json2csv.py ├── Dockerfile ├── README.md ├── runme.py ├── datasets.py ├── algorithms.py └── .pylintrc /3rdparty/fast_retraining/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | test: 2 | script: 3 | - apt-get update -qy 4 | - apt-get install -y python3 python3-pip 5 | - pip3 install pylint 6 | - pylint *.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.tsv 3 | *.json 4 | *.csv 5 | *.log 6 | test/ 7 | train/ 8 | catboost_info/ 9 | learn/ 10 | dask-worker-space/ 11 | file-*.model 12 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/environment/deactivate_env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export PYTHONPATH=$OLD_PYTHON_PATH 4 | export PATH=$OLD_PATH 5 | export MOUNT_POINT= 6 | export CACHE_DIR= 7 | echo Noooooooooooooooo -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import multiprocessing 3 | 4 | 5 | def get_number_processors(): 6 | try: 7 | num = os.cpu_count() 8 | except: 9 | num = multiprocessing.cpu_count() 10 | return num 11 | 12 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.18.1 2 | scipy>=0.19.0 3 | tqdm>=4.11.2 4 | bokeh>=0.12.6 5 | selenium>=3.4.3 6 | matplotlib>=1.5.3 7 | arff>=0.9 8 | glob2>=0.5 9 | ipython>=6.1.0 10 | tensorflow>=1.1.0 11 | Keras>=2.0.3 12 | memory_profiler>=0.47 13 | psutil>=5.2.2 14 | ipykernel>=4.6.1 15 | -------------------------------------------------------------------------------- /3rdparty/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This folder contains all the third-party codes that we borrowed for this 3 | project. They are basically a snapshot from their original locations, denoting 4 | when we borrowed their code. 
5 | 
6 | # fast_retraining
7 | Link: https://github.com/Azure/fast_retraining
8 | Commit ID when last borrowed: e43c9195213189ee0476c4a114dd8395ae11ed26
9 | 
10 | # metrics.py
11 | Link: https://github.com/miguelgfierro/codebase
12 | Commit ID when last borrowed: 1080ba63a97bb13b2d61ca8ad9b83c7593337e86
13 | 
-------------------------------------------------------------------------------- /3rdparty/fast_retraining/environment/activate_env_vars.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # The location of the repository on the host machine
4 | REPOSPATH=$HOME'/repos/'
5 | export PROJECTPATH=$REPOSPATH'fast_retraining'
6 | 
7 | 
8 | # Add custom libraries to the python path
9 | export OLD_PYTHON_PATH=$PYTHONPATH
10 | export PYTHONPATH=$PYTHONPATH:$PROJECTPATH # Adds the repository to the python path
11 | 
12 | # Add scripts to path
13 | export OLD_PATH=$PATH
14 | export PATH=$PATH:$PROJECTPATH
15 | 
16 | # The mounting location for the data
17 | export MOUNT_POINT=/fileshare
18 | echo Me Gusta!
19 | 
-------------------------------------------------------------------------------- /3rdparty/fast_retraining/LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/metrics.py: -------------------------------------------------------------------------------- 1 | #Original source: https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py 2 | import numpy as np 3 | from sklearn.metrics import roc_auc_score,accuracy_score, precision_score, recall_score, f1_score 4 | 5 | 6 | def classification_metrics_binary(y_true, y_pred): 7 | m_acc = accuracy_score(y_true, y_pred) 8 | m_f1 = f1_score(y_true, y_pred) 9 | m_precision = precision_score(y_true, y_pred) 10 | m_recall = recall_score(y_true, y_pred) 11 | report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1} 12 | return report 13 | 14 | 15 | def classification_metrics_binary_prob(y_true, y_prob): 16 | m_auc = roc_auc_score(y_true, y_prob) 17 | report = {'AUC':m_auc} 18 | return report 19 | 20 | 21 | def classification_metrics_multilabel(y_true, y_pred, labels): 22 | m_acc = accuracy_score(y_true, y_pred) 23 | m_f1 = f1_score(y_true, y_pred, labels, average='weighted') 24 | m_precision = precision_score(y_true, y_pred, labels, average='weighted') 25 | m_recall = recall_score(y_true, y_pred, labels, average='weighted') 26 | report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1} 27 | return report 28 | 29 | 30 | def binarize_prediction(y, threshold=0.5): 31 | y_pred = np.where(y > threshold, 1, 0) 32 | return y_pred 33 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/timer.py: -------------------------------------------------------------------------------- 1 | #code based on https://github.com/miguelgfierro/codebase/ 2 | 3 | from timeit import default_timer 4 | 5 | class Timer(object): 6 | """Timer class. 7 | Examples: 8 | >>> big_num = 100000 9 | >>> t = Timer() 10 | >>> t.start() 11 | >>> for i in range(big_num): 12 | >>> r = 1 13 | >>> t.stop() 14 | >>> print(t.interval) 15 | 0.0946876304844 16 | >>> with Timer() as t: 17 | >>> for i in range(big_num): 18 | >>> r = 1 19 | >>> print(t.interval) 20 | 0.0766928562442 21 | >>> try: 22 | >>> with Timer() as t: 23 | >>> for i in range(big_num): 24 | >>> r = 1 25 | >>> raise(Exception("Get out!")) 26 | >>> finally: 27 | >>> print(t.interval) 28 | 0.0757778924471 29 | 30 | """ 31 | def __init__(self): 32 | self._timer = default_timer 33 | 34 | def __enter__(self): 35 | self.start() 36 | return self 37 | 38 | def __exit__(self, *args): 39 | self.stop() 40 | 41 | def start(self): 42 | """Start the timer.""" 43 | self.start = self._timer() 44 | 45 | def stop(self): 46 | """Stop the timer. Calculate the interval in seconds.""" 47 | self.end = self._timer() 48 | self.interval = self.end - self.start 49 | 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | ########## 92 | *-Copy*.ipynb 93 | experiments/*.svg 94 | experiments/*.pk 95 | Untitled.ipynb 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/notebook_memory_management.py: -------------------------------------------------------------------------------- 1 | #Source: https://github.com/ianozsvald/ipython_memory_usage 2 | """Profile mem usage envelope of IPython commands and report interactively""" 3 | from __future__ import division # 1/2 == 0.5, as in Py3 4 | from __future__ import absolute_import # avoid hiding global modules with locals 5 | from __future__ import print_function # force use of print("hello") 6 | from __future__ import unicode_literals # force unadorned strings "" to be unicode without prepending u"" 7 | import time 8 | import memory_profiler 9 | from IPython import get_ipython 10 | import threading 11 | 12 | 13 | # keep a global accounting for the last known memory usage 14 | # which is the reference point for the memory delta calculation 15 | previous_call_memory_usage = memory_profiler.memory_usage()[0] 16 | t1 = time.time() # will be set to current time later 17 | keep_watching = True 18 | watching_memory = True 19 | input_cells = get_ipython().user_ns['In'] 20 | 21 | 22 | def start_watching_memory(): 23 | """Register memory profiling tools to IPython instance.""" 24 | global watching_memory 25 | watching_memory = True 26 | ip = get_ipython() 27 | ip.events.register("post_run_cell", watch_memory) 28 | ip.events.register("pre_run_cell", pre_run_cell) 29 | 30 | 31 | def stop_watching_memory(): 32 | """Unregister memory profiling tools from IPython instance.""" 33 | global watching_memory 34 | watching_memory = False 35 | ip = get_ipython() 36 | try: 37 | ip.events.unregister("post_run_cell", watch_memory) 38 | except ValueError: 39 | pass 40 | try: 41 | ip.events.unregister("pre_run_cell", pre_run_cell) 42 | except ValueError: 43 | pass 44 | 45 | 46 | def watch_memory(): 47 | # bring in the global memory usage value from the previous iteration 48 | global previous_call_memory_usage, peak_memory_usage, keep_watching, \ 49 | watching_memory, input_cells 50 | new_memory_usage = memory_profiler.memory_usage()[0] 51 | memory_delta = new_memory_usage - previous_call_memory_usage 52 | keep_watching = False 53 | # calculate time delta using global t1 (from the pre-run event) and current 54 | # time 55 | time_delta_secs = time.time() - t1 56 | num_commands = len(input_cells) - 1 57 | cmd = "In [{}]".format(num_commands) 58 | # convert the results into a pretty string 59 | 
output_template = ("{cmd} used {memory_delta:0.4f} MiB RAM in " 60 | "{time_delta:0.2f}s, total RAM usage " 61 | "{memory_usage:0.2f} MiB") 62 | output = output_template.format(time_delta=time_delta_secs, 63 | cmd=cmd, 64 | memory_delta=memory_delta, 65 | memory_usage=new_memory_usage) 66 | if watching_memory: 67 | print(str(output)) 68 | previous_call_memory_usage = new_memory_usage 69 | 70 | 71 | def pre_run_cell(): 72 | """Capture current time before we execute the current command""" 73 | global t1 74 | t1 = time.time() 75 | 76 | 77 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/planet_kaggle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import glob 4 | from tqdm import tqdm 5 | import shutil 6 | from keras.preprocessing import image 7 | from keras.applications.imagenet_utils import preprocess_input 8 | 9 | 10 | def labels_from(labels_df): 11 | """ Extracts the unique labels from the labels dataframe 12 | """ 13 | # Build list with unique labels 14 | label_list = [] 15 | for tag_str in labels_df.tags.values: 16 | labels = tag_str.split(' ') 17 | for label in labels: 18 | if label not in label_list: 19 | label_list.append(label) 20 | return label_list 21 | 22 | 23 | def enrich_with_feature_encoding(labels_df): 24 | # Add onehot features for every label 25 | for label in labels_from(labels_df): 26 | labels_df[label] = labels_df['tags'].apply(lambda x: 1 if label in x.split(' ') else 0) 27 | return labels_df 28 | 29 | 30 | def to_multi_label_dict(enriched_labels_df): 31 | df = enriched_labels_df.set_index('image_name').drop('tags', axis=1) 32 | return dict((filename, encoded_array) for filename, encoded_array in zip(df.index, df.values)) 33 | 34 | 35 | def get_file_count(folderpath): 36 | """ Returns the number of files in a folder 37 | """ 38 | return len(glob.glob(folderpath)) 39 | 40 | 41 | def threshold_prediction(pred_y, threshold=0.5):# TODO: Needs to be tuned? 42 | return pred_y > threshold 43 | 44 | 45 | def read_images(filepath, filenames): 46 | """ Read images in batches 47 | """ 48 | img_data = list() 49 | for name in filenames: 50 | img_path = os.path.join(filepath, name+'.jpg') 51 | img = image.load_img(img_path, target_size=(224, 224)) 52 | x = image.img_to_array(img) 53 | x = np.expand_dims(x, axis=0) 54 | img_data.append(preprocess_input(x)) 55 | return np.concatenate(img_data) 56 | 57 | 58 | def chunks(l, n): 59 | for i in range(0, len(l), n): 60 | yield l[i:i + n] 61 | 62 | 63 | def featurise_images(model, filepath, nameformat, num_iter, batch_size=32, desc=None): 64 | """ Use DL model to featurise images 65 | """ 66 | features = list() 67 | img_names = list() 68 | num_list = list(num_iter) 69 | num_batches = np.ceil(len(num_list)/batch_size) 70 | 71 | for num_chunk in tqdm(chunks(num_list, batch_size), total=num_batches, desc=desc): 72 | filenames = [nameformat.format(index) for index in num_chunk] 73 | batch_images = read_images(filepath, filenames) 74 | img_names.extend(filenames) 75 | features.extend(model.predict_on_batch(batch_images).squeeze()) 76 | return np.array(features), img_names 77 | 78 | 79 | def generate_validation_files(train_path, val_path, num_train = 35000): 80 | """ Creates the validation files from the train files. 
81 | """ 82 | num_train_ini = get_file_count(os.path.join(train_path, '*.jpg')) 83 | assert num_train_ini > num_train 84 | 85 | order = 'mv ' + train_path + '/train_{' + str(num_train) + '..' + str(num_train_ini) + '}.jpg ' + val_path 86 | os.system(order) 87 | 88 | 89 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/conversion.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def _get_nominal_integer_dict(nominal_vals): 5 | """Convert nominal values in integers, starting at 0. 6 | Parameters: 7 | nominal_vals (pd.Series): A series. 8 | Returns: 9 | d (dict): An dictionary with numeric values. 10 | 11 | """ 12 | d = {} 13 | for val in nominal_vals: 14 | if val not in d: 15 | current_max = max(d.values()) if len(d) > 0 else -1 16 | d[val] = current_max+1 17 | return d 18 | 19 | 20 | def _convert_to_integer(srs, d): 21 | """Convert series to integer, given a dictionary. 22 | Parameters: 23 | srs (pd.Series): A series. 24 | d (dict): A dictionary mapping values to integers 25 | Returns: 26 | srs (pd.Series): An series with numeric values. 27 | 28 | """ 29 | return srs.map(lambda x: d[x]) 30 | 31 | 32 | def convert_cols_categorical_to_numeric(df, col_list=None): 33 | """Convert categorical columns to numeric and leave numeric columns 34 | as they are. You can force to convert a numerical column if it is 35 | included in col_list 36 | Parameters: 37 | df (pd.DataFrame): Dataframe. 38 | col_list (list): List of columns. 39 | Returns: 40 | ret (pd.DataFrame): An dataframe with numeric values. 41 | Examples: 42 | >>> df = pd.DataFrame({'letters':['a','b','c'],'numbers':[1,2,3]}) 43 | >>> df_numeric = convert_cols_categorical_to_numeric(df) 44 | >>> print(df_numeric) 45 | letters numbers 46 | 0 0 1 47 | 1 1 2 48 | 2 2 3 49 | 50 | """ 51 | if col_list is None: col_list = [] 52 | ret = pd.DataFrame() 53 | for column_name in df.columns: 54 | column = df[column_name] 55 | if column.dtype == 'object' or column_name in col_list: 56 | col_dict = _get_nominal_integer_dict(column) 57 | ret[column_name] = _convert_to_integer(column, col_dict) 58 | else: 59 | ret[column_name] = column 60 | return ret 61 | 62 | 63 | def convert_related_cols_categorical_to_numeric(df, col_list): 64 | """Convert categorical columns, that are related between each other, 65 | to numeric and leave numeric columns 66 | as they are. 67 | Parameters: 68 | df (pd.DataFrame): Dataframe. 69 | col_list (list): List of columns. 70 | Returns: 71 | ret (pd.DataFrame): An dataframe with numeric values. 
72 | Examples: 73 | >>> df = pd.DataFrame({'letters':['a','b','c'],'letters2':['c','d','e'],'numbers':[1,2,3]}) 74 | >>> df_numeric = convert_related_cols_categorical_to_numeric(df, col_list=['letters','letters2']) 75 | >>> print(df_numeric) 76 | letters letters2 numbers 77 | 0 0 2 1 78 | 1 1 3 2 79 | 2 2 4 3 80 | 81 | """ 82 | ret = pd.DataFrame() 83 | values=None 84 | for c in col_list: 85 | values = pd.concat([values,df[c]], axis=0) 86 | values = pd.Series(values.unique()) 87 | col_dict = _get_nominal_integer_dict(values) 88 | for column_name in df.columns: 89 | column = df[column_name] 90 | if column_name in col_list: 91 | ret[column_name] = _convert_to_integer(column, col_dict) 92 | else: 93 | ret[column_name] = column 94 | return ret 95 | 96 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | # BSD License 2 | # 3 | # Copyright (c) 2016-present, Miguel Gonzalez-Fierro. All rights reserved. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without modification, 7 | # are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name Miguel Gonzalez-Fierro nor the names of its contributors may be used to 17 | # endorse or promote products derived from this software without specific 18 | # prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 24 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | import numpy as np 32 | import sklearn.metrics as sklm 33 | from datasets import LearningTask 34 | 35 | 36 | def get_metrics(data, pred): 37 | if data.learning_task == LearningTask.REGRESSION: 38 | return regression_metrics(data.y_test, pred) 39 | if data.learning_task == LearningTask.CLASSIFICATION: 40 | return classification_metrics(data.y_test, pred) 41 | if data.learning_task == LearningTask.MULTICLASS_CLASSIFICATION: 42 | return classification_metrics_multilabel(data.y_test, pred) 43 | raise ValueError("No metrics defined for learning task: " + str(data.learning_task)) 44 | 45 | 46 | def evaluate_metrics(y_true, y_pred, metrics): 47 | res = {} 48 | for metric_name, metric in metrics.items(): 49 | res[metric_name] = float(metric(y_true, y_pred)) 50 | return res 51 | 52 | 53 | def classification_metrics(y_true, y_prob, threshold=0.5): 54 | y_pred = np.where(y_prob > threshold, 1, 0) 55 | metrics = { 56 | "Accuracy": sklm.accuracy_score, 57 | "Log_Loss": lambda real, pred: sklm.log_loss(real, y_prob, eps=1e-5), 58 | # yes, I'm using y_prob here! 59 | "AUC": lambda real, pred: sklm.roc_auc_score(real, y_prob), 60 | "Precision": sklm.precision_score, 61 | "Recall": sklm.recall_score, 62 | } 63 | return evaluate_metrics(y_true, y_pred, metrics) 64 | 65 | 66 | def classification_metrics_multilabel(y_true, y_pred): 67 | metrics = { 68 | "Accuracy": sklm.accuracy_score, 69 | "Precision": lambda real, pred: sklm.precision_score(real, pred, 70 | average="weighted"), 71 | "Recall": lambda real, pred: sklm.recall_score(real, pred, 72 | average="weighted"), 73 | "F1": lambda real, pred: sklm.f1_score(real, pred, 74 | average="weighted"), 75 | } 76 | return evaluate_metrics(y_true, y_pred, metrics) 77 | 78 | 79 | def regression_metrics(y_true, y_pred): 80 | metrics = { 81 | "MeanAbsError": sklm.mean_absolute_error, 82 | "MeanSquaredError": sklm.mean_squared_error, 83 | "MedianAbsError": sklm.median_absolute_error, 84 | } 85 | return evaluate_metrics(y_true, y_pred, metrics) 86 | -------------------------------------------------------------------------------- /json2csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | import sys 29 | import json 30 | import os 31 | import csv 32 | 33 | TIMINGS = ["train_time", "test_time"] 34 | METRICS = ["AUC", "Accuracy", "F1", "Precision", "Recall", "MeanAbsError", "MeanSquaredError", 35 | "MedianAbsError"] 36 | ALLMETRICS = TIMINGS + METRICS 37 | 38 | 39 | def load_perf_data(json_file): 40 | file = open(json_file, "r") 41 | data = json.load(file) 42 | file.close() 43 | return data 44 | 45 | 46 | def load_all_perf_data(files): 47 | data = {} 48 | for json_file in files: 49 | dataset = os.path.basename(json_file) 50 | dataset = dataset.replace(".json", "") 51 | data[dataset] = load_perf_data(json_file) 52 | return data 53 | 54 | 55 | def get_all_datasets(data): 56 | return data.keys() 57 | 58 | 59 | def get_all_algos(data): 60 | algos = {} 61 | for dset in data.keys(): 62 | for algo in data[dset].keys(): 63 | algos[algo] = 1 64 | return algos.keys() 65 | 66 | 67 | def read_from_dict(hashmap, key, def_val="-na-"): 68 | return hashmap[key] if key in hashmap else def_val 69 | 70 | 71 | def combine_perf_data(data, datasets, algos): 72 | all_data = {} 73 | for dataset in datasets: 74 | out = [] 75 | dset = read_from_dict(data, dataset, {}) 76 | for algo in algos: 77 | algo_data = read_from_dict(dset, algo, {}) 78 | perf = [algo] 79 | for timing in TIMINGS: 80 | perf.append(read_from_dict(algo_data, timing)) 81 | metric_data = read_from_dict(algo_data, "accuracy", {}) 82 | for metric in METRICS: 83 | perf.append(read_from_dict(metric_data, metric)) 84 | out.append(perf) 85 | all_data[dataset] = out 86 | return all_data 87 | 88 | 89 | def write_csv(all_data, datasets): 90 | writer = csv.writer(sys.stdout) 91 | header = ['dataset', 'algorithm'] + ALLMETRICS 92 | writer.writerow(header) 93 | for dataset in sorted(datasets): 94 | for row in all_data[dataset]: 95 | writer.writerow([dataset] + row) 96 | 97 | 98 | def main(): 99 | data = load_perf_data(sys.argv[1]) 100 | datasets = get_all_datasets(data) 101 | algos = get_all_algos(data) 102 | table = combine_perf_data(data, datasets, algos) 103 | write_csv(table, datasets) 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION 2 | FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu18.04 3 | SHELL ["/bin/bash", "-c"] 4 | # Install conda (and use python 3.7) 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | build-essential \ 8 | ca-certificates \ 9 | curl \ 10 | doxygen \ 11 | git \ 12 | graphviz \ 13 | libcurl4-openssl-dev \ 14 | libboost-all-dev \ 15 | make \ 16 | tar \ 17 | unzip \ 18 | wget \ 19 | zlib1g-dev && \ 20 | rm -rf /var/lib/apt/* 21 | 22 | RUN curl -o /opt/miniconda.sh \ 23 | https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 24 | chmod +x /opt/miniconda.sh && \ 25 | /opt/miniconda.sh -b -p /opt/conda && \ 26 
| /opt/conda/bin/conda update -n base conda && \ 27 | rm /opt/miniconda.sh 28 | ENV PATH /opt/conda/bin:$PATH 29 | RUN conda install -c conda-forge -c rapidsai -c nvidia -c defaults \ 30 | bokeh \ 31 | cmake>=3.14 \ 32 | h5py \ 33 | ipython \ 34 | ipywidgets \ 35 | jupyter \ 36 | kaggle \ 37 | matplotlib \ 38 | nose \ 39 | numpy \ 40 | pandas \ 41 | Pillow \ 42 | pydot \ 43 | pylint\ 44 | psutil\ 45 | scikit-learn \ 46 | scipy \ 47 | six \ 48 | dask \ 49 | distributed \ 50 | tqdm \ 51 | cudf=0.18.0 \ 52 | dask-cuda \ 53 | rmm \ 54 | librmm \ 55 | rapids-xgboost \ 56 | cuml=0.18 && \ 57 | conda clean -ya 58 | 59 | # lightgbm 60 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 61 | RUN apt-get update && \ 62 | apt-get install -y --no-install-recommends \ 63 | build-essential \ 64 | bzip2 \ 65 | ca-certificates \ 66 | curl \ 67 | git \ 68 | libblas-dev \ 69 | libboost-dev \ 70 | libboost-filesystem-dev \ 71 | libboost-system-dev \ 72 | libbz2-dev \ 73 | libc6 \ 74 | libglib2.0-0 \ 75 | liblapack-dev \ 76 | libsm6 \ 77 | libxext6 \ 78 | libxrender1 \ 79 | make \ 80 | tar \ 81 | unzip \ 82 | wget && \ 83 | rm -rf /var/lib/apt/* 84 | RUN mkdir -p /etc/OpenCL/vendors && \ 85 | echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd 86 | ENV OPENCL_LIBRARIES /usr/local/cuda/lib64 87 | ENV OPENCL_INCLUDE_DIR /usr/local/cuda/include 88 | RUN git config --global http.sslVerify false && \ 89 | git clone --recursive https://github.com/Microsoft/LightGBM /opt/LightGBM && \ 90 | cd /opt/LightGBM && \ 91 | mkdir build && \ 92 | cd build && \ 93 | cmake .. \ 94 | -DUSE_GPU=1 \ 95 | -DOpenCL_LIBRARY=$OPENCL_LIBRARIES/libOpenCL.so \ 96 | -DOpenCL_INCLUDE_DIR=$OPENCL_INCLUDE_DIR && \ 97 | make OPENCL_HEADERS="/usr/local/cuda/targets/x86_64-linux/include" \ 98 | LIBOPENCL="/usr/local/cuda/targets/x86_64-linux/lib" -j4 && \ 99 | cd ../python-package && \ 100 | python setup.py install --precompile 101 | 102 | # catboost 103 | RUN if [ "`echo $CUDA_VERSION | sed -e 's/[.].*//'`" -lt "11" ]; then git config --global http.sslVerify false && \ 104 | git clone --recursive "https://github.com/catboost/catboost" /opt/catboost && \ 105 | cd /opt/catboost && \ 106 | cd catboost/python-package/catboost && \ 107 | ../../../ya make \ 108 | -r \ 109 | -o ../../.. \ 110 | -DUSE_ARCADIA_PYTHON=no \ 111 | -DUSE_SYSTEM_PYTHON=3.7\ 112 | -DPYTHON_CONFIG=python3-config \ 113 | -DCUDA_ROOT=$(dirname $(dirname $(which nvcc))); \ 114 | fi 115 | ENV if [ "`echo $CUDA_VERSION | sed -e 's/[.].*//'`" -lt "11" ]; then PYTHONPATH=$PYTHONPATH:/opt/catboost/catboost/python-package; fi 116 | 117 | 118 | 119 | # xgboost 120 | RUN git config --global http.sslVerify false && \ 121 | git clone --recursive https://github.com/dmlc/xgboost /opt/xgboost && \ 122 | cd /opt/xgboost && \ 123 | mkdir build && \ 124 | cd build && \ 125 | RMM_ROOT=/opt/conda cmake .. \ 126 | -DUSE_CUDA=ON \ 127 | -DUSE_NCCL=ON \ 128 | -DPLUGIN_RMM=ON && \ 129 | make -j4 && \ 130 | cd ../python-package && \ 131 | pip uninstall -y xgboost && \ 132 | python setup.py install 133 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/README.md: -------------------------------------------------------------------------------- 1 | # Fast Retraining 2 | 3 | In this repo we compare two of the fastest boosted decision tree libraries: [XGBoost](https://github.com/dmlc/xgboost) and [LightGBM](https://github.com/microsoft/LightGBM). We will evaluate them across datasets of several domains and different sizes. 
4 | 
5 | On July 25, 2017, we published a blog post evaluating both libraries and discussing the benchmark results. The post is [Lessons Learned From Benchmarking Fast Machine Learning Algorithms](https://blogs.technet.microsoft.com/machinelearning/2017/07/25/lessons-learned-benchmarking-fast-machine-learning-algorithms/).
6 | 
7 | ## Installation and Setup
8 | 
9 | The installation instructions can be found [here](./INSTALL.md).
10 | 
11 | ## Project
12 | 
13 | In the folder [experiments](./experiments) you can find the different experiments of the project. We developed 6 experiments with the CPU and GPU versions of the libraries.
14 | 
15 | * Airline
16 | * BCI
17 | * Football
18 | * Planet Kaggle
19 | * Fraud Detection
20 | * HIGGS
21 | 
22 | In the folder [experiment/libs](./experiment/libs) there is the common code for the project.
23 | 
24 | ## Benchmark
25 | 
26 | In the following table there are summarized the time results (in seconds) and the ratio of the benchmarks performed in the experiments:
27 | 
28 | | Dataset | Experiment | Data size | Features | xgb time:<br/>CPU (GPU) | xgb_hist time:<br/>CPU (GPU) | lgb time:<br/>CPU (GPU) | ratio xgb/lgb:<br/>CPU (GPU) | ratio xgb_hist/lgb:<br/>CPU (GPU) |
29 | | --- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
30 | | Football | [Link CPU](./experiments/03_football.ipynb)<br/>[Link GPU](./experiments/03_football_GPU.ipynb) | 19673 | 46 | 2.27 (7.09) | 2.47 (4.58) | 0.58 (0.97) | 3.90 (7.26) | 4.25 (4.69) |
31 | | Fraud Detection | [Link CPU](./experiments/05_FraudDetection.ipynb)<br/>[Link GPU](./experiments/05_FraudDetection_GPU.ipynb) | 284807 | 30 | 4.34 (5.80) | 2.01 (1.64) | 0.66 (0.29) | 6.58 (19.74) | 3.04 (5.58) |
32 | | BCI | [Link CPU](./experiments/02_BCI.ipynb)<br/>[Link GPU](./experiments/02_BCI_GPU.ipynb) | 20497 | 2048 | 11.51 (12.93) | 41.84 (42.69) | 7.31 (2.76) | 1.57 (4.67) | 5.72 (15.43) |
33 | | Planet Kaggle | [Link CPU](./experiments/04_PlanetKaggle.ipynb)<br/>[Link GPU](./experiments/04_PlanetKaggle_GPU.ipynb) | 40479 | 2048 | 313.89 (-) | 2115.28 (2028.43) | 194.57 (317.68) | 1.61 (-) | 10.87 (6.38) |
34 | | HIGGS | [Link CPU](./experiments/06_HIGGS.ipynb)<br/>[Link GPU](./experiments/06_HIGGS_GPU.ipynb) | 11000000 | 28 | 2996.16 (-) | 121.21 (114.88) | 119.34 (71.87) | 25.10 (-) | 1.01 (1.59) |
35 | | Airline | [Link CPU](./experiments/01_airline.ipynb)<br/>[Link GPU](./experiments/01_airline_GPU.ipynb) | 115069017 | 13 | - (-) | 1242.09 (1271.91) | 1056.20 (645.40) | - (-) | 1.17 (1.97) |
36 | 
37 | 
38 | In the next table we summarize the performance results using the [F1-Score](https://en.wikipedia.org/wiki/F1_score).
39 | 
40 | | Dataset | Experiment | Data size | Features | xgb F1:<br/>CPU (GPU) | xgb_hist F1:<br/>CPU (GPU) | lgb F1:<br/>CPU (GPU) |
41 | | --- | :---: | :---: | :---: | :---: | :---: | :---: |
42 | | Football | [Link](./experiments/03_football.ipynb)<br/>[Link](./experiments/03_football_GPU.ipynb) | 19673 | 46 | 0.458 (0.470) | 0.460 (0.472) | 0.459 (0.470) |
43 | | Fraud Detection | [Link](./experiments/05_FraudDetection.ipynb)<br/>[Link](./experiments/05_FraudDetection_GPU.ipynb) | 284807 | 30 | 0.824 (0.821) | 0.802 (0.814) | 0.813 (0.811) |
44 | | BCI | [Link](./experiments/02_BCI.ipynb)<br/>[Link](./experiments/02_BCI_GPU.ipynb) | 20497 | 2048 | 0.110 (0.093) | 0.142 (0.120) | 0.137 (0.138) |
45 | | Planet Kaggle | [Link](./experiments/04_PlanetKaggle.ipynb)<br/>[Link](./experiments/04_PlanetKaggle_GPU.ipynb) | 40479 | 2048 | 0.805 (-) | 0.822 (0.822) | 0.822 (0.821) |
46 | | HIGGS | [Link](./experiments/06_HIGGS.ipynb)<br/>[Link](./experiments/06_HIGGS_GPU.ipynb) | 11000000 | 28 | 0.763 (-) | 0.767 (0.767) | 0.768 (0.767) |
47 | | Airline | [Link](./experiments/01_airline.ipynb)<br/>[Link](./experiments/01_airline_GPU.ipynb) | 115069017 | 13 | - (-) | 0.741 (0.745) | 0.732 (0.745) |
48 | 
49 | The experiments were run on an Azure NV24 VM with 24 cores and 224 GB memory. The machine has 4 NVIDIA M60 GPUs. In both cases we used Ubuntu 16.04.
50 | 
51 | 
52 | ## Contributing
53 | 
54 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
55 | 
56 | 
-------------------------------------------------------------------------------- /3rdparty/fast_retraining/INSTALL.md: --------------------------------------------------------------------------------
1 | # Installation and Setup
2 | 
3 | Here we present the instructions for setting up the project on an [Ubuntu Azure VM](https://azure.microsoft.com/en-us/services/virtual-machines/). The VM we used for the experiment was a NV24 with 4 NVIDIA M60 GPUs. The OS was Ubuntu 16.04. We recommend to use the [Azure Data Science VM](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/microsoft-ads.standard-data-science-vm) which comes with many machine learning tools already installed.
4 | 
5 | ## Setting up the environment
6 | 
7 | Clone this repo to your desired location
8 | ```bash
9 | git clone https://github.com/Azure/fast_retraining.git
10 | ```
11 | 
12 | Create a conda environment if you haven't already done so. The command below creates a python 3 environment called fast.
13 | ```bash
14 | conda create --name fast python=3.5 anaconda
15 | ```
16 | 
17 | Edit [activate_env_vars.sh](environment/activate_env_vars.sh ) and [deactivate_env_vars.sh](environment/deactivate_env_vars.sh )
18 | so that they contain the correct information.
19 | 
20 | Install command line json parser
21 | ```bash
22 | apt-get install jq
23 | ```
24 | 
25 | Activate the conda environment and install the requirements.
26 | ```bash
27 | source activate fast
28 | pip install -r requirements.txt
29 | ```
30 | 
31 | Get the currently activated environment and assign it to env_path.
32 | Get info of current env and output to json | look for default_prefix element in JSON | remove all quotes 33 | ```bash 34 | env_path=$(conda info --json | jq '.default_prefix' | tr -d '"') 35 | ``` 36 | 37 | Make sure you are in the environment folder of the project and run the following 38 | ```bash 39 | activate_script_path=$(readlink -f activate_env_vars.sh) 40 | deactivate_script_path=$(readlink -f deactivate_env_vars.sh) 41 | ``` 42 | 43 | Then we create the activation and deactivation scripts and make sure they point to our now modified activation 44 | and deactivation scripts in our environment folder 45 | ```bash 46 | mkdir -p $env_path/etc/conda/activate.d 47 | mkdir -p $env_path/etc/conda/deactivate.d 48 | echo 'source '$activate_script_path >> $env_path/etc/conda/activate.d/env_vars.sh 49 | echo 'source '$deactivate_script_path >> $env_path/etc/conda/deactivate.d/env_vars.sh 50 | ``` 51 | 52 | Exit the environment 53 | ```bash 54 | source deactivate 55 | ``` 56 | 57 | Enter the environment again 58 | ```bash 59 | source activate fast 60 | ``` 61 | 62 | Finally, to register the environment in the jupyter notebook: 63 | ```bash 64 | python -m ipykernel install --user --name fast --display-name "Python Fast" 65 | ``` 66 | 67 | ## Installation of boosted tree libraries 68 | 69 | We need to install [XGBoost](https://github.com/dmlc/xgboost) and [LightGBM](https://github.com/microsoft/LightGBM). Even though both libraries have pypi versions, for creating the experiments contained in this repo we compiled from source. 70 | 71 | To install XGBoost you can follow the [installation guide](https://xgboost.readthedocs.io/en/latest/build.html). To build in CPU, using the specific commit we used: 72 | 73 | git clone --recursive https://github.com/dmlc/xgboost 74 | cd xgboost 75 | git checkout 6776292951565c8cd72e69afd9d94de1474f00c0 76 | git submodule update --recursive 77 | make -j$(nproc) 78 | 79 | In case you want to use the last version, just skip the commands `git checkout` and `git submodule`. 80 | 81 | If you want to build in GPU, the instructions are [here](https://github.com/dmlc/xgboost/tree/master/plugin/updater_gpu). You first need to download and unzip [CUB 1.6.4](https://nvlabs.github.io/cub/). 82 | 83 | git clone --recursive https://github.com/dmlc/xgboost 84 | cd xgboost 85 | git checkout 6776292951565c8cd72e69afd9d94de1474f00c0 86 | git submodule update --recursive 87 | mkdir build 88 | cd build 89 | cmake .. -DPLUGIN_UPDATER_GPU=ON -DCUB_DIRECTORY=/path/to/cub-1.6.4 90 | make -j$(nproc) 91 | 92 | To install LighGBM you can follow the [installation guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide). To build on CPU: 93 | 94 | git clone --recursive https://github.com/Microsoft/LightGBM ; cd LightGBM 95 | git checkout 73968a96829e212b333c88cd44725c8c39c03ad1 96 | mkdir build ; cd build 97 | cmake .. 98 | make -j$(nproc) 99 | 100 | To install the GPU version: 101 | 102 | git clone --recursive https://github.com/Microsoft/LightGBM ; cd LightGBM 103 | git checkout 73968a96829e212b333c88cd44725c8c39c03ad1 104 | mkdir build ; cd build 105 | cmake .. -DUSE_GPU=1 106 | make -j$(nproc) 107 | 108 | To install the python biddings you have to compile in the python directory. 
Both libraries have the exact same name for the python package, so you just need to do the following step in both libraries: 109 | 110 | cd python-package 111 | python setup.py install 112 | 113 | Finally, to check that the libraries are correctly installed, try to load them from python: 114 | 115 | python -c "import xgboost; import lightgbm" 116 | 117 | 118 | ## Installation of bokeh functionality to export plots 119 | 120 | To generate png exports with bokeh you have to follow the instructions explained in [this link](http://bokeh.pydata.org/en/0.12.6/docs/user_guide/export.html). 121 | 122 | sudo apt-get install npm 123 | sudo npm install -g phantomjs-prebuilt 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repo tries to benchmark boosting frameworks against some of the popular 3 | ML datasets. This is a more scriptable version of Microsoft's work on comparing 4 | LightGBM and XGBoost: https://github.com/Azure/fast_retraining/. Most of the 5 | datasets used here are the same as in the above repo. 6 | 7 | # Dependencies 8 | - Cuda 9.2 or greater 9 | - Nvidia docker 2.0 10 | 11 | # Setting up this repo 12 | ```bash 13 | 14 | $ git clone https://github.com/NVIDIA/gbm-bench.git 15 | $ cd gbm-bench 16 | ``` 17 | Create a docker image for cuda 10.0 18 | ```bash 19 | $ docker build -t gbm-bench:10.0 . --build-arg CUDA_VERSION=10.0 20 | ``` 21 | You can create docker images with different cuda versions as below. You will not be able to create an image for a cuda version greater than what is installed on your system. The GBM libraries may not support very recent versions of cuda. 22 | ```bash 23 | $ docker build -t gbm-bench:9.2 . --build-arg CUDA_VERSION=9.2 24 | ``` 25 | 26 | # Datasets 27 | gbm-bench will automatically download datasets as needed using wget or the [Kaggle API](https://github.com/Kaggle/kaggle-api). To use the kaggle datasets you will need a valid kaggle account and API token. Create a folder 'gbm-datasets' in some location with sufficient space for large datasets. Mounting this folder on fast local storage as opposed to network storage is recommended. 28 | 29 | ```bash 30 | $ mkdir gbm-datasets 31 | ``` 32 | Upon launching docker you will pass this folder as well as the location of the kaggle API key as volumes to the container. 
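As a point of reference, the sketch below shows one way to put the Kaggle API token where the Kaggle client looks for it by default; the download path for `kaggle.json` is only an assumption, so adjust it to wherever you saved the token from your Kaggle account page.

```bash
# Assumes kaggle.json was downloaded from your Kaggle account page to ~/Downloads
$ mkdir -p ~/.kaggle
$ mv ~/Downloads/kaggle.json ~/.kaggle/kaggle.json
$ chmod 600 ~/.kaggle/kaggle.json   # the Kaggle API expects the token to be readable only by your user
```

The resulting `~/.kaggle` folder is what you then mount into the container in the docker run command below.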
33 | 34 | | Name | Rows | Columns | Task | 35 | |--------------------------------------------------------------------------------|--------|---------|----------------| 36 | | [airline](http://kt.ijs.si/elena_ikonomovska/data.html) | 115M | 13 | Classification | 37 | | [airline_regression](http://kt.ijs.si/elena_ikonomovska/data.html) | 115M | 13 | Regression | 38 | | [bosch](https://www.kaggle.com/c/bosch-production-line-performance) | 1.184M | 968 | Classification | 39 | | [fraud](https://www.kaggle.com/mlg-ulb/creditcardfraud) | 285K | 28 | Classification | 40 | | [higgs](https://archive.ics.uci.edu/ml/datasets/HIGGS) | 11M | 28 | Classification | 41 | | [year](https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd) | 515K | 90 | Regression | 42 | | [covtype](https://archive.ics.uci.edu/ml/datasets/covertype) | 581K | 54 | Multiclass | 43 | | [epsilon](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html) | 500K | 2000 | Classification | 44 | 45 | # Benchmarking 46 | This section assumes that one has elevated permissions on the system where this 47 | docker image will be run for benchmarking! In case this is not true, update 48 | your flow accordingly. 49 | 50 | ## Launching container 51 | ```bash 52 | docker run --runtime=nvidia -it --rm \ 53 | -w /opt/gbm-bench \ 54 | -v {YOUR-LOCATION/gbm-datasets}:/opt/gbm-datasets \ 55 | -v {YOUR-LOCATION/gbm-bench}:/opt/gbm-bench \ 56 | -v {KAGGLE-API-LOCATION/.kaggle}:/root/.kaggle \ 57 | gbm-bench:10.0 /bin/bash 58 | ``` 59 | The above command launches an interactive session and mounts the dataset folder, the gbm-bench repo and your kaggle API key inside the container. "gbm-bench:10.0" refers to the docker image, modify this if you are using a different cuda version. 60 | 61 | ## Running benchmarks 62 | Benchmarks are launched from the python runme.py script 63 | ```bash 64 | python runme.py --help 65 | usage: runme.py [-h] [-dataset DATASET] [-root ROOT] [-algorithm ALGORITHM] 66 | [-gpus GPUS] [-cpus CPUS] [-output OUTPUT] [-ntrees NTREES] 67 | [-nrows NROWS] [-warmup] [-verbose] [-extra EXTRA] 68 | 69 | Benchmark xgboost/lightgbm/catboost on real datasets 70 | 71 | optional arguments: 72 | -h, --help show this help message and exit 73 | -dataset DATASET The dataset to be used for benchmarking. 'all' for all 74 | datasets. 75 | -root ROOT The root datasets folder 76 | -algorithm ALGORITHM Comma-separated list of algorithms to run; 'all' run 77 | all 78 | -gpus GPUS #GPUs to use for the benchmarks; ignored when not 79 | supported. Default is to use all. 80 | -cpus CPUS #CPUs to use for the benchmarks; 0 means 81 | psutil.cpu_count(logical=False) 82 | -output OUTPUT Output json file with runtime/accuracy stats 83 | -ntrees NTREES Number of trees. Default is as specified in the 84 | respective dataset configuration 85 | -nrows NROWS Subset of rows in the datasets to use. Useful for test 86 | running benchmarks on small amounts of data. WARNING: 87 | Some datasets will give incorrect accuracy results if 88 | nrows is specified as they have predefined train/test 89 | splits. 90 | -warmup Whether to run a small benchmark (fraud) as a warmup 91 | -verbose Produce verbose output 92 | -extra EXTRA Extra arguments as a python dictionary 93 | ``` 94 | 95 | As an example, launch the xgb-gpu algorithm on the year dataset. 96 | ```bash 97 | python runme.py -dataset year -algorithm xgb-gpu 98 | ``` 99 | # Yet another boosting tree benchmark? 
100 | * This is more scriptable (and configurable) version (eg: for automated benchmarking) 101 | * Also adds CatBoost to the comparison list 102 | * Tries to keep the boosting hyper-params the same across frameworks for a fair 103 | comparison. Reference: [this paper](https://openreview.net/pdf?id=ryexWdLRtm) 104 | * Supports multi-GPU as well as multi-node benchmarking (assuming underlying framework allows) 105 | 106 | # Third party codes and licensing 107 | The third party codes which we borrowed from, and their license texts, are released 108 | "as-received" under the folder named "3rdparty". Refer to 3rdparty/README.md as to 109 | when they are borrowed and their respective licenses. 110 | 111 | # License for this project 112 | This project is released under BSD License. Refer to LICENSE for more details. 113 | -------------------------------------------------------------------------------- /runme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | 28 | import os 29 | import sys 30 | import argparse 31 | import json 32 | import ast 33 | import psutil 34 | import algorithms 35 | from metrics import get_metrics 36 | from datasets import prepare_dataset 37 | 38 | 39 | def get_number_processors(args): 40 | if args.cpus == 0: 41 | return psutil.cpu_count(logical=False) 42 | return args.cpus 43 | 44 | 45 | def print_sys_info(args): 46 | try: 47 | import xgboost # pylint: disable=import-outside-toplevel 48 | print("Xgboost : %s" % xgboost.__version__) 49 | except ImportError: 50 | pass 51 | try: 52 | import lightgbm # pylint: disable=import-outside-toplevel 53 | print("LightGBM: %s" % lightgbm.__version__) 54 | except (ImportError, OSError): 55 | pass 56 | try: 57 | import catboost # pylint: disable=import-outside-toplevel 58 | print("Catboost: %s" % catboost.__version__) 59 | except ImportError: 60 | pass 61 | print("System : %s" % sys.version) 62 | print("#jobs : %d" % args.cpus) 63 | 64 | 65 | def parse_args(): 66 | parser = argparse.ArgumentParser( 67 | description="Benchmark xgboost/lightgbm/catboost on real datasets") 68 | parser.add_argument("-dataset", default="all", type=str, 69 | help="The dataset to be used for benchmarking. 'all' for all datasets.") 70 | parser.add_argument("-root", default="/opt/gbm-datasets", 71 | type=str, help="The root datasets folder") 72 | parser.add_argument("-algorithm", default="all", type=str, 73 | help=("Comma-separated list of algorithms to run; " 74 | "'all' run all")) 75 | parser.add_argument("-gpus", default=-1, type=int, 76 | help=("#GPUs to use for the benchmarks; " 77 | "ignored when not supported. Default is to use all.")) 78 | parser.add_argument("-cpus", default=0, type=int, 79 | help=("#CPUs to use for the benchmarks; " 80 | "0 means psutil.cpu_count(logical=False)")) 81 | parser.add_argument("-output", default=sys.path[0] + "/results.json", type=str, 82 | help="Output json file with runtime/accuracy stats") 83 | parser.add_argument("-ntrees", default=500, type=int, 84 | help=("Number of trees. Default is as specified in " 85 | "the respective dataset configuration")) 86 | parser.add_argument("-nrows", default=None, type=int, 87 | help=( 88 | "Subset of rows in the datasets to use. Useful for test running " 89 | "benchmarks on small amounts of data. WARNING: Some datasets will " 90 | "give incorrect accuracy results if nrows is specified as they have " 91 | "predefined train/test splits.")) 92 | parser.add_argument("-warmup", action="store_true", 93 | help=("Whether to run a small benchmark (fraud) as a warmup")) 94 | parser.add_argument("-verbose", action="store_true", help="Produce verbose output") 95 | parser.add_argument("-extra", default='{}', help="Extra arguments as a python dictionary") 96 | args = parser.parse_args() 97 | # default value for output json file 98 | if not args.output: 99 | args.output = "%s.json" % args.dataset 100 | return args 101 | 102 | 103 | # benchmarks a single dataset 104 | def benchmark(args, dataset_folder, dataset): 105 | data = prepare_dataset(dataset_folder, dataset, args.nrows) 106 | results = {} 107 | # "all" runs all algorithms 108 | if args.algorithm == "all": 109 | args.algorithm = "xgb-gpu,xgb-cpu,xgb-gpu-dask,lgbm-cpu,lgbm-gpu,cat-cpu,cat-gpu" 110 | for alg in args.algorithm.split(","): 111 | print("Running '%s' ..." 
% alg) 112 | runner = algorithms.Algorithm.create(alg) 113 | with runner: 114 | train_time = runner.fit(data, args) 115 | pred = runner.test(data) 116 | results[alg] = { 117 | "train_time": train_time, 118 | "accuracy": get_metrics(data, pred), 119 | } 120 | 121 | return results 122 | 123 | 124 | def main(): 125 | args = parse_args() 126 | args.cpus = get_number_processors(args) 127 | args.extra = ast.literal_eval(args.extra) 128 | print_sys_info(args) 129 | if args.warmup: 130 | benchmark(args, os.path.join(args.root, "fraud"), "fraud") 131 | if args.dataset == 'all': 132 | args.dataset = 'airline,bosch,fraud,higgs,year,epsilon,covtype,newsgroups' 133 | results = {} 134 | for dataset in args.dataset.split(","): 135 | folder = os.path.join(args.root, dataset) 136 | results.update({dataset: benchmark(args, folder, dataset)}) 137 | print(json.dumps({dataset: results[dataset]}, indent=2, sort_keys=True)) 138 | output = json.dumps(results, indent=2, sort_keys=True) 139 | output_file = open(args.output, "w") 140 | output_file.write(output + "\n") 141 | output_file.close() 142 | print("Results written to file '%s'" % args.output) 143 | 144 | 145 | if __name__ == "__main__": 146 | main() 147 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import arff 4 | import numpy as np 5 | from functools import reduce 6 | import sqlite3 7 | import logging 8 | from libs.planet_kaggle import (to_multi_label_dict, get_file_count, enrich_with_feature_encoding, 9 | featurise_images, generate_validation_files) 10 | import tensorflow as tf 11 | from keras.applications.resnet50 import ResNet50 12 | 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | _FRAUD_PATH = 'fraud_detection', 'credit_card_fraud_kaggle', 'creditcard.csv' 19 | _IOT_PATH = 'iot', 'sensor_stream_berkeley', 'sensor.arff' 20 | _AIRLINE_PATH = 'airline', 'airline_14col.data' 21 | _FOOTBALL_PATH = 'football', 'database.sqlite' 22 | _BCI_PATH = 'bci', 'data.npz' 23 | _HIGGS_PATH = 'higgs', 'HIGGS.csv' 24 | _KAGGLE_ROOT = 'planet' 25 | _PLANET_KAGGLE_LABEL_CSV = 'train_v2.csv' 26 | _PLANET_KAGGLE_TRAIN_DIR = 'train-jpg' 27 | _PLANET_KAGGLE_VAL_DIR = 'validate-jpg' 28 | 29 | 30 | def _get_datapath(): 31 | try: 32 | datapath = os.environ['MOUNT_POINT'] 33 | except KeyError: 34 | logger.info("MOUNT_POINT not found in environment. Defaulting to /fileshare") 35 | datapath = '/fileshare' 36 | return datapath 37 | 38 | 39 | def load_fraud(): 40 | """ Loads the credit card fraud data 41 | 42 | The datasets contains transactions made by credit cards in September 2013 by european cardholders. 43 | This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. 44 | The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. 45 | It contains only numerical input variables which are the result of a PCA transformation. 46 | 47 | Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about 48 | the data. 49 | Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed 50 | with PCA are 'Time' and 'Amount'. 
Feature 'Time' contains the seconds elapsed between each transaction and the first 51 | transaction in the dataset. 52 | The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. 53 | Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise. 54 | Given the class imbalance ratio, we recommend measuring the accuracy using the Area Under the Precision-Recall Curve 55 | (AUPRC). 56 | Confusion matrix accuracy is not meaningful for unbalanced classification. 57 | 58 | The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group 59 | (http://mlg.ulb.ac.be) of ULB (Universite Libre de Bruxelles) on big data mining and fraud detection. More details 60 | on current and past projects on related topics are available on http://mlg.ulb.ac.be/BruFence 61 | and http://mlg.ulb.ac.be/ARTML 62 | Please cite: Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with 63 | Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015 64 | 65 | Returns 66 | ------- 67 | pandas DataFrame 68 | 69 | """ 70 | return pd.read_csv(reduce(os.path.join, _FRAUD_PATH, _get_datapath())) 71 | 72 | 73 | def load_iot(): 74 | """ Loads iot data 75 | 76 | Sensor stream contains information (temperature, humidity, light, and sensor voltage) collected from 54 sensors deployed 77 | in Intel Berkeley Research Lab. The whole stream contains consecutive information recorded over a 2 months 78 | period (1 reading per 1-3 minutes). I used the sensor ID as the class label, so the learning task of the stream is 79 | to correctly identify the sensor ID (1 out of 54 sensors) purely based on the sensor data and the corresponding recording 80 | time. 81 | 82 | While the data stream flow over time, so does the concepts underlying the stream. For example, the lighting during 83 | the working hours is generally stronger than the night, and the temperature of specific sensors (conference room) 84 | may regularly rise during the meetings. 85 | 86 | Returns 87 | ------- 88 | pandas DataFrame 89 | """ 90 | dataset = arff.load(open(reduce(os.path.join, _IOT_PATH, _get_datapath()))) 91 | columns = [i[0] for i in dataset['attributes']] 92 | return pd.DataFrame(dataset['data'], columns=columns) 93 | 94 | 95 | def load_airline(): 96 | """ Loads airline data 97 | The dataset consists of a large amount of records, containing flight arrival and departure details for all the 98 | commercial flights within the USA, from October 1987 to April 2008. Its size is around 116 million records and 99 | 5.76 GB of memory. 100 | There are 13 attributes, each represented in a separate column: Year (1987-2008), Month (1-12), Day of Month (1-31), 101 | Day of Week (1:Monday - 7:Sunday), CRS Departure Time (local time as hhmm), CRS Arrival Time (local time as hhmm), 102 | Unique Carrier, Flight Number, Actual Elapsed Time (in min), Origin, Destination, Distance (in miles), and Diverted 103 | (1=yes, 0=no). 104 | The target attribute is Arrival Delay, it is a positive or negative value measured in minutes. 
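    A minimal usage sketch (hypothetical, not part of the original docstring; assumes the data share is mounted at the MOUNT_POINT location described below):
        >>> df = load_airline()                            # doctest: +SKIP
        >>> label = (df['ArrDelay'] > 0).astype('int8')    # binarise the delay for classification  # doctest: +SKIP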
105 | Link to the source: http://kt.ijs.si/elena_ikonomovska/data.html 106 | 107 | Returns 108 | ------- 109 | pandas DataFrame 110 | """ 111 | cols = ['Year', 'Month', 'DayofMonth', 'DayofWeek', 'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'ActualElapsedTime', 'Origin', 'Dest', 'Distance', 'Diverted', 'ArrDelay'] 112 | return pd.read_csv(reduce(os.path.join, _AIRLINE_PATH, _get_datapath()), names=cols) 113 | 114 | 115 | def load_football(): 116 | """ Loads football data 117 | Dataset of football stats. +25,000 matches, +10,000 players from 11 European Countries with their lead championship 118 | Seasons 2008 to 2016. It also contains players attributes sourced from EA Sports' FIFA video game series, 119 | including the weekly updates, team line up with squad formation (X, Y coordinates), betting odds from up to 10 120 | providers and detailed match events (goal types, possession, corner, cross, fouls, cards etc...) for +10,000 matches. 121 | The meaning of the columns can be found here: http://www.football-data.co.uk/notes.txt 122 | Number of attributes in each table (size of the dataframe): 123 | countries (11, 2) 124 | matches (25979, 115) 125 | leagues (11, 3) 126 | teams (299, 5) 127 | players (183978, 42) 128 | Link to the source: https://www.kaggle.com/hugomathien/soccer 129 | 130 | Returns 131 | ------- 132 | list of pandas DataFrame 133 | """ 134 | database_path = reduce(os.path.join, _FOOTBALL_PATH, _get_datapath()) 135 | with sqlite3.connect(database_path) as con: 136 | countries = pd.read_sql_query("SELECT * from Country", con) 137 | matches = pd.read_sql_query("SELECT * from Match", con) 138 | leagues = pd.read_sql_query("SELECT * from League", con) 139 | teams = pd.read_sql_query("SELECT * from Team", con) 140 | players = pd.read_sql("SELECT * FROM Player_Attributes;", con) 141 | return countries, matches, leagues, teams, players 142 | 143 | 144 | def load_bci(): 145 | """ Loads BCI data 146 | 147 | Contains measurements from 64 EEG sensors on the scalp of a single participant. 148 | The purpose of the recording is to determine from the electrical brain activity when the participant is paying attention. 149 | 150 | Returns 151 | ------- 152 | A tuple containing four numpy arrays 153 | train features 154 | train labels 155 | test features 156 | test labels 157 | """ 158 | 159 | npzfile = np.load(reduce(os.path.join, _BCI_PATH, _get_datapath())) 160 | return npzfile['train_X'], npzfile['train_y'], npzfile['test_X'], npzfile['test_y'] 161 | 162 | 163 | 164 | def load_higgs(): 165 | """ Loads HIGGS data 166 | 167 | Dataset of atomic particles measurements. The total size of the data is 11 millions of observations. 168 | It can be used in a classification problem to distinguish between a signal process which produces Higgs 169 | bosons and a background process which does not. 170 | The data has been produced using Monte Carlo simulations. The first 21 features (columns 2-22) are kinematic 171 | properties measured by the particle detectors in the accelerator. The last seven features are functions of 172 | the first 21 features; these are high-level features derived by physicists to help discriminate between the 173 | two classes. 
The first column is the class label (1 for signal, 0 for background), followed by the 28 174 | features (21 low-level features then 7 high-level features): lepton pT, lepton eta, lepton phi, 175 | missing energy magnitude, missing energy phi, jet 1 pt, jet 1 eta, jet 1 phi, jet 1 b-tag, jet 2 pt, jet 2 eta, 176 | jet 2 phi, jet 2 b-tag, jet 3 pt, jet 3 eta, jet 3 phi, jet 3 b-tag, jet 4 pt, jet 4 eta, jet 4 phi, 177 | jet 4 b-tag, m_jj, m_jjj, m_lv, m_jlv, m_bb, m_wbb, m_wwbb. 178 | Link to the source: https://archive.ics.uci.edu/ml/datasets/HIGGS 179 | 180 | Returns 181 | ------- 182 | pandas DataFrame 183 | """ 184 | cols = ['boson','lepton_pT','lepton_eta','lepton_phi','missing_energy_magnitude','missing_energy_phi','jet_1_pt','jet_1_eta','jet_1_phi','jet_1_b-tag','jet_2_pt','jet_2_eta','jet_2_phi','jet_2_b-tag','jet_3_pt','jet_3_eta','jet_3_phi','jet_3_b-tag','jet_4_pt','jet_4_eta','jet_4_phi','jet_4_b-tag','m_jj','m_jjj','m_lv','m_jlv','m_bb','m_wbb','m_wwbb'] 185 | return pd.read_csv(reduce(os.path.join, _HIGGS_PATH, _get_datapath()), names=cols) 186 | 187 | 188 | def load_planet_kaggle(): 189 | """ Loads Planet Kaggle data 190 | 191 | Dataset of satellite images of the Amazon. The objective of this dataset is to label satellite image chips 192 | with atmospheric conditions and various classes of land cover/land use. Resulting algorithms will help the 193 | global community better understand where, how, and why deforestation happens all over the world. The images 194 | use the GeoTiff format and each contain four bands of data: red, green, blue, and near infrared. 195 | To treat the images we used transfer learning with the CNN ResNet50. The images are featurized with this 196 | deep neural network. Once the features are generated we can use a boosted tree to classify them. 
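    Usage sketch (hypothetical, editorial addition; requires the Kaggle images under the mount point and a working Keras/TensorFlow install):
        >>> X_train, y_train, X_val, y_val = load_planet_kaggle()    # doctest: +SKIP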
197 | Link to the source: https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/data 198 | 199 | Returns 200 | ------- 201 | A tuple containing four numpy arrays 202 | train_features 203 | y_train 204 | validation_features 205 | y_val 206 | """ 207 | csv_path = reduce(os.path.join, (_KAGGLE_ROOT, _PLANET_KAGGLE_LABEL_CSV), _get_datapath()) 208 | train_path = reduce(os.path.join, (_KAGGLE_ROOT, _PLANET_KAGGLE_TRAIN_DIR), _get_datapath()) 209 | val_path = reduce(os.path.join, (_KAGGLE_ROOT, _PLANET_KAGGLE_VAL_DIR), _get_datapath()) 210 | assert os.path.isfile(csv_path) 211 | assert os.path.exists(train_path) 212 | if not os.path.exists(val_path): os.mkdir(val_path) 213 | if not os.listdir(val_path): 214 | logger.info('Validation folder is empty, moving files...') 215 | generate_validation_files(train_path, val_path) 216 | 217 | logger.info('Reading in labels') 218 | labels_df = pd.read_csv(csv_path).pipe(enrich_with_feature_encoding) 219 | multi_label_dict = to_multi_label_dict(labels_df) 220 | 221 | nb_train_samples = get_file_count(os.path.join(train_path, '*.jpg')) 222 | nb_validation_samples = get_file_count(os.path.join(val_path, '*.jpg')) 223 | 224 | logger.debug('Number of training files {}'.format(nb_train_samples)) 225 | logger.debug('Number of validation files {}'.format(nb_validation_samples)) 226 | logger.debug('Loading model') 227 | 228 | model = ResNet50(include_top=False) 229 | train_features, train_names = featurise_images(model, 230 | train_path, 231 | 'train_{}', 232 | range(nb_train_samples), 233 | desc='Featurising training images') 234 | 235 | validation_features, validation_names = featurise_images(model, 236 | val_path, 237 | 'train_{}', 238 | range(nb_train_samples, nb_train_samples+nb_validation_samples), 239 | desc='Featurising validation images') 240 | 241 | # Prepare data 242 | y_train = np.array([multi_label_dict[name] for name in train_names]) 243 | y_val = np.array([multi_label_dict[name] for name in validation_names]) 244 | 245 | return train_features, y_train, validation_features, y_val 246 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE 22 | 23 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
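# Editorial note (not part of the original sources): most prepare_<name>()
# helpers below follow the same pattern -- download the raw file if it is
# missing, load it with pandas/scikit-learn, split train/test with a fixed
# random_state (or the dataset's predefined split), and cache the resulting
# Data object as a pickle keyed by the dataset name and `nrows`.
# A minimal, hypothetical usage sketch:
#
#     from datasets import prepare_dataset
#     data = prepare_dataset("/opt/gbm-datasets/higgs", "higgs", nrows=100000)
#     print(data.X_train.shape, data.learning_task)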
24 | 25 | import os 26 | from enum import Enum 27 | import pickle 28 | from urllib.request import urlretrieve 29 | import numpy as np 30 | from sklearn.model_selection import train_test_split 31 | from sklearn import datasets 32 | import pandas as pd 33 | import tqdm 34 | 35 | pbar = None 36 | 37 | 38 | def show_progress(block_num, block_size, total_size): 39 | global pbar 40 | if pbar is None: 41 | pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') 42 | 43 | downloaded = block_num * block_size 44 | if downloaded < total_size: 45 | pbar.update(block_size / 1024) 46 | else: 47 | pbar.close() 48 | pbar = None 49 | 50 | 51 | def retrieve(url, filename=None): 52 | return urlretrieve(url, filename, reporthook=show_progress) 53 | 54 | 55 | class LearningTask(Enum): 56 | REGRESSION = 1 57 | CLASSIFICATION = 2 58 | MULTICLASS_CLASSIFICATION = 3 59 | 60 | 61 | class Data: # pylint: disable=too-few-public-methods,too-many-arguments 62 | def __init__(self, X_train, X_test, y_train, y_test, learning_task, qid_train=None, 63 | qid_test=None): 64 | self.X_train = X_train 65 | self.X_test = X_test 66 | self.y_train = y_train 67 | self.y_test = y_test 68 | self.learning_task = learning_task 69 | # For ranking task 70 | self.qid_train = qid_train 71 | self.qid_test = qid_test 72 | 73 | 74 | def prepare_dataset(dataset_folder, dataset, nrows): 75 | if not os.path.exists(dataset_folder): 76 | os.makedirs(dataset_folder) 77 | prepare_function = globals()["prepare_" + dataset] 78 | return prepare_function(dataset_folder, nrows) 79 | 80 | 81 | def __prepare_airline(dataset_folder, nrows, regression=False): # pylint: disable=too-many-locals 82 | url = 'http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2' 83 | pkl_base_name = "airline" 84 | if regression: 85 | pkl_base_name += "-regression" 86 | local_url = os.path.join(dataset_folder, os.path.basename(url)) 87 | pickle_url = os.path.join(dataset_folder, 88 | pkl_base_name 89 | + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 90 | if os.path.exists(pickle_url): 91 | return pickle.load(open(pickle_url, "rb")) 92 | if not os.path.isfile(local_url): 93 | retrieve(url, local_url) 94 | 95 | cols = [ 96 | "Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", 97 | "CRSArrTime", "UniqueCarrier", "FlightNum", "ActualElapsedTime", 98 | "Origin", "Dest", "Distance", "Diverted", "ArrDelay" 99 | ] 100 | 101 | # load the data as int16 102 | dtype = np.int16 103 | 104 | dtype_columns = { 105 | "Year": dtype, "Month": dtype, "DayofMonth": dtype, "DayofWeek": dtype, 106 | "CRSDepTime": dtype, "CRSArrTime": dtype, "FlightNum": dtype, 107 | "ActualElapsedTime": dtype, "Distance": 108 | dtype, 109 | "Diverted": dtype, "ArrDelay": dtype, 110 | } 111 | 112 | df = pd.read_csv(local_url, 113 | names=cols, dtype=dtype_columns, nrows=nrows) 114 | 115 | # Encode categoricals as numeric 116 | for col in df.select_dtypes(['object']).columns: 117 | df[col] = df[col].astype("category").cat.codes 118 | 119 | # Turn into binary classification problem 120 | if not regression: 121 | df["ArrDelay"] = 1 * (df["ArrDelay"] > 0) 122 | 123 | X = df[df.columns.difference(["ArrDelay"])].to_numpy(dtype=np.float32) 124 | y = df["ArrDelay"].to_numpy(dtype=np.float32) 125 | del df 126 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 127 | test_size=0.2, 128 | ) 129 | if regression: 130 | task = LearningTask.REGRESSION 131 | else: 132 | task = LearningTask.CLASSIFICATION 133 | data = Data(X_train, X_test, y_train, y_test, task) 134 | 
pickle.dump(data, open(pickle_url, "wb"), protocol=4) 135 | return data 136 | 137 | 138 | def prepare_airline(dataset_folder, nrows): 139 | return __prepare_airline(dataset_folder, nrows, False) 140 | 141 | 142 | def prepare_airline_regression(dataset_folder, nrows): 143 | return __prepare_airline(dataset_folder, nrows, True) 144 | 145 | 146 | def prepare_bosch(dataset_folder, nrows): 147 | filename = "train_numeric.csv.zip" 148 | local_url = os.path.join(dataset_folder, filename) 149 | pickle_url = os.path.join(dataset_folder, 150 | "bosch" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 151 | if os.path.exists(pickle_url): 152 | return pickle.load(open(pickle_url, "rb")) 153 | 154 | os.system("kaggle competitions download -c bosch-production-line-performance -f " + 155 | filename + " -p " + dataset_folder) 156 | X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32, 157 | nrows=nrows) 158 | y = X.iloc[:, -1].to_numpy(dtype=np.float32) 159 | X.drop(X.columns[-1], axis=1, inplace=True) 160 | X = X.to_numpy(dtype=np.float32) 161 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 162 | test_size=0.2, 163 | ) 164 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 165 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 166 | return data 167 | 168 | 169 | def prepare_fraud(dataset_folder, nrows): 170 | if not os.path.exists(dataset_folder): 171 | os.makedirs(dataset_folder) 172 | filename = "creditcard.csv" 173 | local_url = os.path.join(dataset_folder, filename) 174 | pickle_url = os.path.join(dataset_folder, 175 | "creditcard" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 176 | if os.path.exists(pickle_url): 177 | return pickle.load(open(pickle_url, "rb")) 178 | 179 | os.system("kaggle datasets download mlg-ulb/creditcardfraud -f" + 180 | filename + " -p " + dataset_folder) 181 | df = pd.read_csv(local_url + ".zip", dtype=np.float32, nrows=nrows) 182 | X = df[[col for col in df.columns if col.startswith('V')]].to_numpy(dtype=np.float32) 183 | y = df['Class'].to_numpy(dtype=np.float32) 184 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 185 | test_size=0.2, 186 | ) 187 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 188 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 189 | return data 190 | 191 | 192 | def prepare_higgs(dataset_folder, nrows): 193 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' 194 | local_url = os.path.join(dataset_folder, os.path.basename(url)) 195 | pickle_url = os.path.join(dataset_folder, 196 | "higgs" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 197 | 198 | if os.path.exists(pickle_url): 199 | return pickle.load(open(pickle_url, "rb")) 200 | 201 | if not os.path.isfile(local_url): 202 | retrieve(url, local_url) 203 | higgs = pd.read_csv(local_url, nrows=nrows) 204 | X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32) 205 | y = higgs.iloc[:, 0].to_numpy(dtype=np.float32) 206 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 207 | test_size=0.2, 208 | ) 209 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 210 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 211 | return data 212 | 213 | 214 | def prepare_year(dataset_folder, nrows): 215 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt' \ 216 | '.zip' 217 | local_url = os.path.join(dataset_folder, 
os.path.basename(url)) 218 | pickle_url = os.path.join(dataset_folder, 219 | "year" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 220 | 221 | if os.path.exists(pickle_url): 222 | return pickle.load(open(pickle_url, "rb")) 223 | 224 | if not os.path.isfile(local_url): 225 | retrieve(url, local_url) 226 | year = pd.read_csv(local_url, nrows=nrows, header=None) 227 | X = year.iloc[:, 1:].to_numpy(dtype=np.float32) 228 | y = year.iloc[:, 0].to_numpy(dtype=np.float32) 229 | 230 | if nrows is None: 231 | # this dataset requires a specific train/test split, 232 | # with the specified number of rows at the start belonging to the train set, 233 | # and the rest being the test set 234 | X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, 235 | train_size=463715, 236 | test_size=51630) 237 | else: 238 | print( 239 | "Warning: nrows is specified, not using predefined test/train split for " 240 | "YearPredictionMSD.") 241 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 242 | test_size=0.2, 243 | ) 244 | 245 | data = Data(X_train, X_test, y_train, y_test, LearningTask.REGRESSION) 246 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 247 | return data 248 | 249 | 250 | def prepare_epsilon(dataset_folder, nrows): 251 | url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ 252 | '/epsilon_normalized.bz2' 253 | url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ 254 | '/epsilon_normalized.t.bz2' 255 | pickle_url = os.path.join(dataset_folder, 256 | "epsilon" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 257 | local_url_train = os.path.join(dataset_folder, os.path.basename(url_train)) 258 | local_url_test = os.path.join(dataset_folder, os.path.basename(url_test)) 259 | 260 | if os.path.exists(pickle_url): 261 | return pickle.load(open(pickle_url, "rb")) 262 | 263 | if not os.path.isfile(local_url_train): 264 | retrieve(url_train, local_url_train) 265 | if not os.path.isfile(local_url_test): 266 | retrieve(url_test, local_url_test) 267 | 268 | X_train, y_train = datasets.load_svmlight_file(local_url_train, 269 | dtype=np.float32) 270 | X_test, y_test = datasets.load_svmlight_file(local_url_test, 271 | dtype=np.float32) 272 | X_train = X_train.toarray() 273 | X_test = X_test.toarray() 274 | y_train[y_train <= 0] = 0 275 | y_test[y_test <= 0] = 0 276 | 277 | if nrows is not None: 278 | print("Warning: nrows is specified, not using predefined test/train split for epsilon.") 279 | 280 | X_train = np.vstack((X_train, X_test)) 281 | y_train = np.append(y_train, y_test) 282 | X_train = X_train[:nrows] 283 | y_train = y_train[:nrows] 284 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=77, 285 | test_size=0.2, 286 | ) 287 | 288 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 289 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 290 | return data 291 | 292 | 293 | def prepare_covtype(dataset_folder, nrows): # pylint: disable=unused-argument 294 | X, y = datasets.fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg 295 | if nrows is not None: 296 | X = X[0:nrows] 297 | y = y[0:nrows] 298 | 299 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 300 | test_size=0.2, 301 | ) 302 | return Data(X_train, X_test, y_train, y_test, LearningTask.MULTICLASS_CLASSIFICATION) 303 | 304 | 305 | def prepare_newsgroups(dataset_folder, nrows): # pylint: disable=unused-argument 306 | X, y = 
datasets.fetch_20newsgroups_vectorized(subset='all',return_X_y=True) # pylint: disable=unexpected-keyword-arg 307 | if nrows is not None: 308 | X = X[0:nrows] 309 | y = y[0:nrows] 310 | 311 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 312 | test_size=0.2, 313 | ) 314 | 315 | return Data(X_train, X_test, y_train, y_test, LearningTask.MULTICLASS_CLASSIFICATION) -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/football.py: -------------------------------------------------------------------------------- 1 | #code from https://www.kaggle.com/airback/match-outcome-prediction-in-football 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def get_fifa_stats(match, player_stats): 8 | ''' Aggregates fifa stats for a given match. ''' 9 | #Define variables 10 | match_id = match.match_api_id 11 | date = match['date'] 12 | players = ['home_player_1', 'home_player_2', 'home_player_3', "home_player_4", "home_player_5", 13 | "home_player_6", "home_player_7", "home_player_8", "home_player_9", "home_player_10", 14 | "home_player_11", "away_player_1", "away_player_2", "away_player_3", "away_player_4", 15 | "away_player_5", "away_player_6", "away_player_7", "away_player_8", "away_player_9", 16 | "away_player_10", "away_player_11"] 17 | player_stats_new = pd.DataFrame() 18 | names = [] 19 | 20 | #Loop through all players 21 | for player in players: 22 | 23 | #Get player ID 24 | player_id = match[player] 25 | 26 | #Get player stats 27 | stats = player_stats[player_stats.player_api_id == player_id] 28 | 29 | #Identify current stats 30 | current_stats = stats[stats.date < date].sort_values(by = 'date', ascending = False)[:1] 31 | 32 | if np.isnan(player_id) == True: 33 | overall_rating = pd.Series(0) 34 | else: 35 | current_stats.reset_index(inplace = True, drop = True) 36 | overall_rating = pd.Series(current_stats.loc[0, "overall_rating"]) 37 | 38 | #Rename stat 39 | name = "{}_overall_rating".format(player) 40 | names.append(name) 41 | 42 | #Aggregate stats 43 | player_stats_new = pd.concat([player_stats_new, overall_rating], axis = 1) 44 | 45 | player_stats_new.columns = names 46 | player_stats_new['match_api_id'] = match_id 47 | 48 | player_stats_new.reset_index(inplace = True, drop = True) 49 | 50 | #Return player stats 51 | return player_stats_new.ix[0] 52 | 53 | def get_fifa_data(matches, player_stats): 54 | ''' Gets fifa data for all matches. ''' 55 | #Apply get_fifa_stats for each match 56 | fifa_data = matches.apply(lambda x :get_fifa_stats(x, player_stats), axis = 1) 57 | return fifa_data 58 | 59 | def get_match_label(match): 60 | ''' Derives a label for a given match. ''' 61 | 62 | #Define variables 63 | home_goals = match['home_team_goal'] 64 | away_goals = match['away_team_goal'] 65 | 66 | label = pd.DataFrame() 67 | label.loc[0,'match_api_id'] = match['match_api_id'] 68 | 69 | #Identify match label 70 | if home_goals > away_goals: 71 | label.loc[0,'label'] = "Win" 72 | if home_goals == away_goals: 73 | label.loc[0,'label'] = "Draw" 74 | if home_goals < away_goals: 75 | label.loc[0,'label'] = "Defeat" 76 | 77 | #Return label 78 | return label.loc[0] 79 | 80 | 81 | def get_overall_fifa_rankings(fifa, get_overall = False): 82 | ''' Get overall fifa rankings from fifa data. 
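    Args (editorial addition, inferred from the code below):
        fifa (pd.DataFrame): per-match FIFA ratings as produced by get_fifa_data().
        get_overall (bool): if True, keep only the overall_rating columns plus match_api_id;
            otherwise drop any date_stat columns and keep the rest.
    Returns:
        pd.DataFrame: one row of rating features per match.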
''' 83 | 84 | temp_data = fifa 85 | 86 | #Check if only overall player stats are desired 87 | if get_overall == True: 88 | 89 | #Get overall stats 90 | data = temp_data.loc[:,(fifa.columns.str.contains('overall_rating'))] 91 | data.loc[:,'match_api_id'] = temp_data.loc[:,'match_api_id'] 92 | else: 93 | 94 | #Get all stats except for stat date 95 | cols = fifa.loc[:,(fifa.columns.str.contains('date_stat'))] 96 | temp_data = fifa.drop(cols.columns, axis = 1) 97 | data = temp_data 98 | 99 | #Return data 100 | return data 101 | 102 | def get_last_matches(matches, date, team, x = 10): 103 | ''' Get the last x matches of a given team. ''' 104 | 105 | #Filter team matches from matches 106 | team_matches = matches[(matches['home_team_api_id'] == team) | (matches['away_team_api_id'] == team)] 107 | 108 | #Filter x last matches from team matches 109 | last_matches = team_matches[team_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:] 110 | 111 | #Return last matches 112 | return last_matches 113 | 114 | def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10): 115 | ''' Get the last x matches of two given teams. ''' 116 | 117 | #Find matches of both teams 118 | home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)] 119 | away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)] 120 | total_matches = pd.concat([home_matches, away_matches]) 121 | 122 | #Get last x matches 123 | try: 124 | last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:] 125 | except: 126 | last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:total_matches.shape[0],:] 127 | 128 | #Check for error in data 129 | if(last_matches.shape[0] > x): 130 | print("Error in obtaining matches") 131 | 132 | #Return data 133 | return last_matches 134 | 135 | def get_goals(matches, team): 136 | ''' Get the goals of a specfic team from a set of matches. ''' 137 | 138 | #Find home and away goals 139 | home_goals = int(matches.home_team_goal[matches.home_team_api_id == team].sum()) 140 | away_goals = int(matches.away_team_goal[matches.away_team_api_id == team].sum()) 141 | 142 | total_goals = home_goals + away_goals 143 | 144 | #Return total goals 145 | return total_goals 146 | 147 | def get_goals_conceided(matches, team): 148 | ''' Get the goals conceided of a specfic team from a set of matches. ''' 149 | 150 | #Find home and away goals 151 | home_goals = int(matches.home_team_goal[matches.away_team_api_id == team].sum()) 152 | away_goals = int(matches.away_team_goal[matches.home_team_api_id == team].sum()) 153 | 154 | total_goals = home_goals + away_goals 155 | 156 | #Return total goals 157 | return total_goals 158 | 159 | def get_wins(matches, team): 160 | ''' Get the number of wins of a specfic team from a set of matches. ''' 161 | 162 | #Find home and away wins 163 | home_wins = int(matches.home_team_goal[(matches.home_team_api_id == team) & (matches.home_team_goal > matches.away_team_goal)].count()) 164 | away_wins = int(matches.away_team_goal[(matches.away_team_api_id == team) & (matches.away_team_goal > matches.home_team_goal)].count()) 165 | 166 | total_wins = home_wins + away_wins 167 | 168 | #Return total wins 169 | return total_wins 170 | 171 | def get_match_features(match, matches, x = 10): 172 | ''' Create match specific features for a given match. 
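    Args (editorial addition, inferred from the code below):
        match (pd.Series): the match row to featurise.
        matches (pd.DataFrame): full match table used to derive recent-form statistics.
        x (int): intended window of past matches; note the calls below pass fixed values (10 and 3).
    Returns:
        pd.Series: engineered features for the given match.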
''' 173 | 174 | #Define variables 175 | date = match.date 176 | home_team = match.home_team_api_id 177 | away_team = match.away_team_api_id 178 | 179 | #Get last x matches of home and away team 180 | matches_home_team = get_last_matches(matches, date, home_team, x = 10) 181 | matches_away_team = get_last_matches(matches, date, away_team, x = 10) 182 | 183 | #Get last x matches of both teams against each other 184 | last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3) 185 | 186 | #Create goal variables 187 | home_goals = get_goals(matches_home_team, home_team) 188 | away_goals = get_goals(matches_away_team, away_team) 189 | home_goals_conceided = get_goals_conceided(matches_home_team, home_team) 190 | away_goals_conceided = get_goals_conceided(matches_away_team, away_team) 191 | 192 | #Define result data frame 193 | result = pd.DataFrame() 194 | 195 | #Define ID features 196 | result.loc[0, 'match_api_id'] = match.match_api_id 197 | result.loc[0, 'league_id'] = match.league_id 198 | 199 | #Create match features 200 | result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided 201 | result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided 202 | result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team) 203 | result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team) 204 | result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team) 205 | result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team) 206 | 207 | #Add season 208 | result.loc[0, 'season'] = int(match['season'].split('/')[0]) 209 | 210 | #Return match features 211 | return result.loc[0] 212 | 213 | def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, all_leagues = True, verbose = True): 214 | ''' Create and aggregate features and labels for all matches. 
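    Args (editorial addition, inferred from the code below):
        matches (pd.DataFrame): match table.
        fifa (pd.DataFrame): FIFA rating features from get_fifa_data().
        bookkeepers (list of str): bookmaker name prefixes whose odds columns should be used.
        get_overall, horizontal, x, all_leagues, verbose: behaviour switches; note that some are
            effectively fixed inside the function body (e.g. x=10 and horizontal=True in the calls below).
    Returns:
        pd.DataFrame: match features merged with bookmaker probabilities and labels, with NA rows dropped.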
''' 215 | 216 | #Get fifa stats features 217 | fifa_stats = get_overall_fifa_rankings(fifa, get_overall) 218 | 219 | 220 | if verbose == True: 221 | print("Generating match features...") 222 | 223 | #Get match features for all matches 224 | match_stats = matches.apply(lambda x: get_match_features(x, matches, x = 10), axis = 1) 225 | 226 | #Create dummies for league ID feature 227 | if all_leagues: 228 | dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x)) 229 | match_stats = pd.concat([match_stats, dummies], axis = 1) 230 | match_stats.drop(['league_id'], inplace = True, axis = 1) 231 | 232 | 233 | if verbose == True: 234 | print("Generating match labels...") 235 | 236 | #Create match labels 237 | labels = matches.apply(get_match_label, axis = 1) 238 | 239 | if verbose == True: 240 | print("Generating bookkeeper data...") 241 | 242 | #Get bookkeeper quotas for all matches 243 | bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True) 244 | bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id'] 245 | 246 | #Merges features and labels into one frame 247 | features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left') 248 | features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left') 249 | feables = pd.merge(features, labels, on = 'match_api_id', how = 'left') 250 | 251 | #Drop NA values 252 | feables.dropna(inplace = True) 253 | 254 | #Return preprocessed data 255 | return feables 256 | 257 | 258 | def convert_odds_to_prob(match_odds): 259 | ''' Converts bookkeeper odds to probabilities. ''' 260 | 261 | #Define variables 262 | match_id = match_odds.loc[:,'match_api_id'] 263 | bookkeeper = match_odds.loc[:,'bookkeeper'] 264 | win_odd = match_odds.loc[:,'Win'] 265 | draw_odd = match_odds.loc[:,'Draw'] 266 | loss_odd = match_odds.loc[:,'Defeat'] 267 | 268 | #Converts odds to prob 269 | win_prob = 1 / win_odd 270 | draw_prob = 1 / draw_odd 271 | loss_prob = 1 / loss_odd 272 | 273 | total_prob = win_prob + draw_prob + loss_prob 274 | 275 | probs = pd.DataFrame() 276 | 277 | #Define output format and scale probs by sum over all probs 278 | probs.loc[:,'match_api_id'] = match_id 279 | probs.loc[:,'bookkeeper'] = bookkeeper 280 | probs.loc[:,'Win'] = win_prob / total_prob 281 | probs.loc[:,'Draw'] = draw_prob / total_prob 282 | probs.loc[:,'Defeat'] = loss_prob / total_prob 283 | 284 | #Return probs and meta data 285 | return probs 286 | 287 | def get_bookkeeper_data(matches, bookkeepers, horizontal = True): 288 | ''' Aggregates bookkeeper data for all matches and bookkeepers. 
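    Args (editorial addition, inferred from the code below):
        matches (pd.DataFrame): match table whose odds columns start with the bookmaker name.
        bookkeepers (list of str): bookmaker name prefixes used to select the odds columns.
        horizontal (bool): if True, convert each bookmaker's odds to probabilities and concatenate
            them column-wise per match; otherwise stack the raw odds row-wise with a 'bookkeeper' column.
    Returns:
        pd.DataFrame: aggregated bookmaker data.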
''' 289 | 290 | bk_data = pd.DataFrame() 291 | 292 | #Loop through bookkeepers 293 | for bookkeeper in bookkeepers: 294 | 295 | #Find columns containing data of bookkeeper 296 | temp_data = matches.loc[:,(matches.columns.str.contains(bookkeeper))] 297 | temp_data.loc[:, 'bookkeeper'] = str(bookkeeper) 298 | temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id'] 299 | 300 | #Rename odds columns and convert to numeric 301 | cols = temp_data.columns.values 302 | cols[:3] = ['Win','Draw','Defeat'] 303 | temp_data.columns = cols 304 | temp_data.loc[:,'Win'] = pd.to_numeric(temp_data['Win']) 305 | temp_data.loc[:,'Draw'] = pd.to_numeric(temp_data['Draw']) 306 | temp_data.loc[:,'Defeat'] = pd.to_numeric(temp_data['Defeat']) 307 | 308 | #Check if data should be aggregated horizontally 309 | if(horizontal == True): 310 | 311 | #Convert data to probs 312 | temp_data = convert_odds_to_prob(temp_data) 313 | temp_data.drop('match_api_id', axis = 1, inplace = True) 314 | temp_data.drop('bookkeeper', axis = 1, inplace = True) 315 | 316 | #Rename columns with bookkeeper names 317 | win_name = bookkeeper + "_" + "Win" 318 | draw_name = bookkeeper + "_" + "Draw" 319 | defeat_name = bookkeeper + "_" + "Defeat" 320 | temp_data.columns.values[:3] = [win_name, draw_name, defeat_name] 321 | 322 | #Aggregate data 323 | bk_data = pd.concat([bk_data, temp_data], axis = 1) 324 | else: 325 | #Aggregate vertically 326 | bk_data = bk_data.append(temp_data, ignore_index = True) 327 | 328 | #If horizontal add match api id to data 329 | if(horizontal == True): 330 | temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id'] 331 | 332 | #Return bookkeeper data 333 | return bk_data 334 | 335 | def get_bookkeeper_probs(matches, bookkeepers, horizontal = False): 336 | ''' Get bookkeeper data and convert to probabilities for vertical aggregation. ''' 337 | 338 | #Get bookkeeper data 339 | data = get_bookkeeper_data(matches, bookkeepers, horizontal = False) 340 | 341 | #Convert odds to probabilities 342 | probs = convert_odds_to_prob(data) 343 | 344 | #Return data 345 | return probs 346 | 347 | -------------------------------------------------------------------------------- /3rdparty/codebase/python/machine_learning/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score, 2 | recall_score, mean_squared_error, mean_absolute_error, r2_score) 3 | import numpy as np 4 | 5 | 6 | def classification_metrics_binary(y_true, y_pred): 7 | """Returns a report with different metrics for a binary classification problem. 8 | - Accuracy: Number of correct predictions made as a ratio of all predictions. Useful when there are equal number 9 | of observations in each class and all predictions and prediction errors are equally important. 10 | - Confusion matrix: C_ij where observations are known to be in group i but predicted to be in group j. In binary 11 | classification true negatives is C_00, false negatives is C_10, true positives is C_11 and false positives is C_01. 12 | - Precision: Number of true positives divided by the number of true and false positives. It is the ability of the 13 | classifier not to label as positive a sample that is negative. 14 | - Recall: Number of true positives divided by the number of true positives and false negatives. It is the ability 15 | of the classifier to find all the positive samples. 
16 | High Precision and low Recall will return few positive results but most of them will be correct. 17 | High Recall and low Precision will return many positive results but most of them will be incorrect. 18 | - F1 Score: 2*((precision*recall)/(precision+recall)). It measures the balance between precision and recall. 19 | Args: 20 | y_true (list or array): True labels. 21 | y_pred (list or array): Predicted labels (binary). 22 | Returns: 23 | report (dict): Dictionary with metrics. 24 | Examples: 25 | >>> from collections import OrderedDict 26 | >>> y_true = [0,1,0,0,1] 27 | >>> y_pred = [0,1,0,1,1] 28 | >>> result = classification_metrics_binary(y_true, y_pred) 29 | >>> OrderedDict(sorted(result.items())) 30 | OrderedDict([('Accuracy', 0.8), ('Confusion Matrix', array([[2, 1], 31 | [0, 2]])), ('F1', 0.8), ('Precision', 0.6666666666666666), ('Recall', 1.0)]) 32 | 33 | """ 34 | m_acc = accuracy_score(y_true, y_pred) 35 | m_f1 = f1_score(y_true, y_pred) 36 | m_precision = precision_score(y_true, y_pred) 37 | m_recall = recall_score(y_true, y_pred) 38 | m_conf = confusion_matrix(y_true, y_pred) 39 | report = {'Accuracy': m_acc, 'Precision': m_precision, 'Recall': m_recall, 'F1': m_f1, 'Confusion Matrix': m_conf} 40 | return report 41 | 42 | 43 | def classification_metrics_multilabel(y_true, y_pred, labels): 44 | """Returns a report with different metrics for a multilabel classification problem. 45 | - Accuracy: Number of correct predictions made as a ratio of all predictions. Useful when there are equal number 46 | of observations in each class and all predictions and prediction errors are equally important. 47 | - Confusion matrix: C_ij where observations are known to be in group i but predicted to be in group j. In multilabel 48 | classification true predictions are in the diagonal and false predictions outside the diagonal. 49 | - Precision: Number of true positives divided by the number of true and false positives. It is the ability of the 50 | classifier not to label as positive a sample that is negative. 51 | - Recall: Number of true positives divided by the number of true positives and false negatives. It is the ability 52 | of the classifier to find all the positive samples. 53 | High Precision and low Recall will return few positive results but most of them will be correct. 54 | High Recall and low Precision will return many positive results but most of them will be incorrect. 55 | - F1 Score: 2*((precision*recall)/(precision+recall)). It measures the balance between precision and recall. 56 | Args: 57 | y_true (list or array): True labels. 58 | y_pred (list or array): Predicted labels. 59 | labels (list): Label index or name. 60 | Returns: 61 | report (dict): Dictionary with metrics. 
62 | Examples: 63 | >>> from collections import OrderedDict 64 | >>> y_true = [0,1,2,0,1] 65 | >>> y_pred = [0,1,0,1,1] 66 | >>> result = classification_metrics_multilabel(y_true, y_pred, [0,1,2]) 67 | >>> OrderedDict(sorted(result.items())) 68 | OrderedDict([('Accuracy', 0.6), ('Confusion Matrix', array([[1, 1, 0], 69 | [0, 2, 0], 70 | [1, 0, 0]])), ('F1', 0.52), ('Precision', 0.4666666666666666), ('Recall', 0.6)]) 71 | 72 | """ 73 | m_acc = accuracy_score(y_true, y_pred) 74 | m_f1 = f1_score(y_true, y_pred, labels, average='weighted') 75 | m_precision = precision_score(y_true, y_pred, labels, average='weighted') 76 | m_recall = recall_score(y_true, y_pred, labels, average='weighted') 77 | m_conf = confusion_matrix(y_true, y_pred, labels) 78 | report = {'Accuracy': m_acc, 'Precision': m_precision, 'Recall': m_recall, 'F1': m_f1, 'Confusion Matrix': m_conf} 79 | return report 80 | 81 | 82 | def classification_metrics_binary_prob(y_true, y_prob): 83 | """Returns a report with different metrics for a binary classification problem. 84 | - AUC: The Area Under the Curve represents the ability to discriminate between positive and negative classes. An 85 | area of 1 represent perfect scoring and an area of 0.5 means random guessing. 86 | - Log loss: Also called logistic regression loss or cross-entropy loss. It quantifies the performance by 87 | penalizing false classifications. Minimizing the Log Loss is equivalent to minimizing the squared error but using 88 | probabilistic predictions. Log loss penalize heavily classifiers that are confident about incorrect classifications. 89 | Args: 90 | y_true (list or array): True labels. 91 | y_prob (list or array): Predicted labels (probability). 92 | Returns: 93 | report (dict): Dictionary with metrics. 94 | Examples: 95 | >>> from collections import OrderedDict 96 | >>> y_true = [0,1,0,0,1] 97 | >>> y_prob = [0.2,0.7,0.4,0.3,0.2] 98 | >>> result = classification_metrics_binary_prob(y_true, y_prob) 99 | >>> OrderedDict(sorted(result.items())) 100 | OrderedDict([('AUC', 0.5833333333333333), ('Log loss', 0.6113513950783531)]) 101 | >>> y_prob = [0.2,0.7,0.4,0.3,0.3] 102 | >>> result = classification_metrics_binary_prob(y_true, y_prob) 103 | >>> OrderedDict(sorted(result.items())) 104 | OrderedDict([('AUC', 0.75), ('Log loss', 0.5302583734567203)]) 105 | 106 | """ 107 | m_auc = roc_auc_score(y_true, y_prob) 108 | m_logloss = log_loss(y_true, y_prob) 109 | report = {'AUC': m_auc, 'Log loss': m_logloss} 110 | return report 111 | 112 | 113 | def regression_metrics(y_true, y_pred): 114 | """Returns a report with different metrics for a regression problem. 115 | - Mean Squared Error: MSE is a risk metric corresponding to the expected value of the squared (quadratic) error. 116 | It has the disadvantage of heavily weighting outliers. 117 | - Mean Absolute Error: MAE is a risk metric corresponding to the expected value of the absolute error or L1 loss. 118 | Not as sensitive to outliers. 119 | - R Square: R2 is statistical measure of how close the data are to the fitted regression line. It's best possible 120 | score is 1.0 and it can be negative (because the model can be arbitrarily worse). A score of 0 means that the 121 | variables are not linearly correlated. 122 | - Root Mean Squared Error: RMSE is the square root of MSE. It also gives a relatively high weight to large errors. 123 | Args: 124 | y_true (list or array): True values. 125 | y_pred (list or array): Predicted values. 126 | Returns: 127 | report (dict): Dictionary with metrics. 
128 | Examples: 129 | >>> from collections import OrderedDict 130 | >>> y_true = [5,1,0,7,1] 131 | >>> y_pred = [6,0.7,0.4,10,20] 132 | >>> result = regression_metrics(y_true, y_pred) 133 | >>> OrderedDict(sorted(result.items())) 134 | OrderedDict([('MAE', 4.74), ('MSE', 74.25), ('R2', -9.088315217391303), ('RMSE', 8.616843969807043)]) 135 | >>> y_true = [5,1,0,7,1] 136 | >>> y_pred = [6,0.7,0.4,10,2] 137 | >>> result = regression_metrics(y_true, y_pred) 138 | >>> OrderedDict(sorted(result.items())) 139 | OrderedDict([('MAE', 1.1400000000000001), ('MSE', 2.25), ('R2', 0.6942934782608696), ('RMSE', 1.5)]) 140 | 141 | """ 142 | mse = mean_squared_error(y_true, y_pred) 143 | mae = mean_absolute_error(y_true, y_pred) 144 | r2 = r2_score(y_true, y_pred) 145 | report = {'MSE': mse, 'MAE': mae, 'R2': r2, 'RMSE': np.sqrt(mse)} 146 | return report 147 | 148 | 149 | def precision_at_k(y_true, y_pred, k=None): 150 | """Precision at K. 151 | Args: 152 | y_true (list or array): True values. 153 | y_pred (list or array): Predicted values. 154 | k (int): Limit of predicted values. 155 | Returns: 156 | result (float): precision at k (max=1, min=0) 157 | Examples: 158 | >>> y_true = [5,1,0,7,2] 159 | >>> y_pred = [2,5,0,1,7] 160 | >>> precision_at_k(y_true, y_pred, k=3) 161 | 1.0 162 | >>> y_true = np.array([5,1,0,7,2]) 163 | >>> y_pred = np.array([9,0,8,1,7]) 164 | >>> precision_at_k(y_true, y_pred, k=3) 165 | 0.3333333333333333 166 | 167 | """ 168 | predictions = y_pred[:k] 169 | num_hit = len(set(predictions).intersection(set(y_true))) 170 | return float(num_hit) / len(predictions) 171 | 172 | 173 | def recall_at_k(y_true, y_pred, k=None): 174 | """Recall at K. 175 | Args: 176 | y_true (list or array): True values. 177 | y_pred (list or array): Predicted values. 178 | k (int): Limit of predicted values. 179 | Returns: 180 | result (float): recall at k (max=1, min=0) 181 | Examples: 182 | >>> y_true = [5,1,0,7,2] 183 | >>> y_pred = [2,5,0,1,7] 184 | >>> recall_at_k(y_true, y_pred, k=3) 185 | 0.6 186 | >>> y_true = np.array([5,1,0,7,2]) 187 | >>> y_pred = np.array([9,0,8,1,7]) 188 | >>> recall_at_k(y_true, y_pred, k=3) 189 | 0.2 190 | 191 | """ 192 | predictions = y_pred[:k] 193 | num_hit = len(set(predictions).intersection(set(y_true))) 194 | return float(num_hit) / len(y_true) 195 | 196 | 197 | def discounted_cumulative_gain(y_true, y_pred, k=None): 198 | """Discounted Cumulative Gain (DCG). 199 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 200 | Args: 201 | y_true (list or array): True values. 202 | y_pred (list or array): Predicted values. 203 | k (int): Limit of predicted values. 204 | Returns: 205 | result (float): DCG 206 | Examples: 207 | >>> y_true = [5,1,0,7,2] 208 | >>> y_pred = [2,5,0,1,7] 209 | >>> discounted_cumulative_gain(y_true, y_pred, k=3) 210 | 5.130929753571458 211 | >>> y_true = np.array([5,1,0,7,2]) 212 | >>> y_pred = np.array([9,0,8,1,7]) 213 | >>> discounted_cumulative_gain(y_true, y_pred, k=3) 214 | 6.0 215 | 216 | """ 217 | order = np.argsort(y_pred)[::-1] 218 | y_true = np.take(y_true, order[:k]) 219 | return (y_true / np.log2(np.arange(y_true.shape[0]) + 2)).sum() 220 | 221 | 222 | def exponential_discounted_cumulative_gain(y_true, y_pred, k=None): 223 | """Exponential Discounted Cumulative Gain (eDCG). 224 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 225 | Args: 226 | y_true (list or array): True values. 227 | y_pred (list or array): Predicted values. 228 | k (int): Limit of predicted values. 
229 | Returns: 230 | result (float): eDCG 231 | Examples: 232 | >>> y_true = [5,1,0,7,2] 233 | >>> y_pred = [2,5,0,1,7] 234 | >>> exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 235 | 19.130929753571458 236 | >>> y_true = np.array([5,1,0,7,2]) 237 | >>> y_pred = np.array([9,0,8,1,7]) 238 | >>> exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 239 | 32.0 240 | 241 | """ 242 | order = np.argsort(y_pred)[::-1] 243 | y_true = np.take(y_true, order[:k]) 244 | return ((2 ** y_true - 1) / np.log2(np.arange(y_true.shape[0]) + 2)).sum() 245 | 246 | 247 | def normalized_discounted_cumulative_gain(y_true, y_pred, k=None): 248 | """Normalized Discounted Cumulative Gain (nDCG). 249 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 250 | Args: 251 | y_true (list or array): True values. 252 | y_pred (list or array): Predicted values. 253 | k (int): Limit of predicted values. 254 | Returns: 255 | result (float): nDCG (max=1, min=0) 256 | Examples: 257 | >>> y_true = [5,1,0,7,2] 258 | >>> y_pred = [2,5,0,1,7] 259 | >>> normalized_discounted_cumulative_gain(y_true, y_pred, k=3) 260 | 0.4599812921368268 261 | >>> y_true = np.array([5,1,0,7,2]) 262 | >>> y_pred = np.array([9,0,8,1,7]) 263 | >>> normalized_discounted_cumulative_gain(y_true, y_pred, k=3) 264 | 0.537892328558952 265 | 266 | """ 267 | return discounted_cumulative_gain(y_true, y_pred, k) / discounted_cumulative_gain(y_true, y_true, k) 268 | 269 | 270 | def normalized_exponential_discounted_cumulative_gain(y, y_pred, k=None): 271 | """Normalized Exponential Discounted Cumulative Gain (neDCG). 272 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 273 | Args: 274 | y_true (list or array): True values. 275 | y_pred (list or array): Predicted values. 276 | k (int): Limit of predicted values. 277 | Returns: 278 | result (float): neDCG (max=1, min=0) 279 | Examples: 280 | >>> y_true = [5,1,0,7,2] 281 | >>> y_pred = [2,5,0,1,7] 282 | >>> normalized_exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 283 | 0.1292116839006246 284 | >>> y_true = np.array([5,1,0,7,2]) 285 | >>> y_pred = np.array([9,0,8,1,7]) 286 | >>> normalized_exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 287 | 0.21950735175253772 288 | 289 | """ 290 | return exponential_discounted_cumulative_gain(y, y_pred, k)/exponential_discounted_cumulative_gain(y, y, k) 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/02_BCI_GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 02: BCI (GPU version)\n", 11 | "\n", 12 | "This experiment uses a Brain Computer Interface dataset. The purpose is to try and predict when the participant is paying attention. The dataset consists of recordings from a number of electrodes placed over the scalp.\n", 13 | "\n", 14 | "The details of the machine we used and the version of the libraries can be found in [experiment 01](01_airline.ipynb)." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "Using TensorFlow backend.\n" 31 | ] 32 | }, 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n", 38 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 39 | "XGBoost version: 0.6\n", 40 | "LightGBM version: 0.2\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import json\n", 46 | "import sys\n", 47 | "import warnings\n", 48 | "import numpy as np\n", 49 | "import pandas as pd\n", 50 | "import pkg_resources\n", 51 | "from libs.loaders import load_bci\n", 52 | "from libs.timer import Timer\n", 53 | "from libs.metrics import classification_metrics_binary, classification_metrics_binary_prob, binarize_prediction\n", 54 | "import xgboost as xgb\n", 55 | "import lightgbm as lgb\n", 56 | "\n", 57 | "warnings.filterwarnings('ignore')\n", 58 | "\n", 59 | "print(\"System version: {}\".format(sys.version))\n", 60 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 61 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "deletable": true, 68 | "editable": true 69 | }, 70 | "source": [ 71 | "## Data loading and management\n", 72 | "\n", 73 | "\n", 74 | "The dataset has been preprepared by extracting 800ms epochs from each channel. The data was then lowpass filtered at 18Hz and downsampled by a factor of 6. This results is a feature vector of 2048. " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 2, 80 | "metadata": { 81 | "collapsed": false, 82 | "deletable": true, 83 | "editable": true 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stderr", 88 | "output_type": "stream", 89 | "text": [ 90 | "INFO:libs.loaders:MOUNT_POINT not found in environment. 
Defaulting to /fileshare\n" 91 | ] 92 | }, 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "CPU times: user 2.1 s, sys: 472 ms, total: 2.57 s\n", 98 | "Wall time: 18.9 s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "%%time\n", 104 | "X, y, X_test, y_test = load_bci()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "metadata": { 111 | "collapsed": false, 112 | "deletable": true, 113 | "editable": true 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "(14519, 2048)\n", 121 | "(14519,)\n", 122 | "(5978, 2048)\n", 123 | "(5978,)\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "X_train = np.concatenate(X)\n", 129 | "y_train = np.concatenate(y)\n", 130 | "X_test = np.concatenate(X_test)\n", 131 | "y_test = np.concatenate(y_test)\n", 132 | "print(X_train.shape)\n", 133 | "print(y_train.shape)\n", 134 | "print(X_test.shape)\n", 135 | "print(y_test.shape)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 4, 141 | "metadata": { 142 | "collapsed": true, 143 | "deletable": true, 144 | "editable": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "dtrain = xgb.DMatrix(data=X_train, label=y_train)\n", 149 | "dtest = xgb.DMatrix(data=X_test, label=y_test)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "collapsed": false, 157 | "deletable": true, 158 | "editable": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)\n", 163 | "lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "deletable": true, 170 | "editable": true 171 | }, 172 | "source": [ 173 | "### XGBoost" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "collapsed": true, 181 | "deletable": true, 182 | "editable": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "results_dict = dict()\n", 187 | "num_rounds = 100" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "metadata": { 194 | "collapsed": true, 195 | "deletable": true, 196 | "editable": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "params = {'max_depth':3, \n", 201 | " 'objective':'binary:logistic', \n", 202 | " 'min_child_weight':1, \n", 203 | " 'eta':0.1, \n", 204 | " 'colsample_bytree':1, \n", 205 | " 'scale_pos_weight':2, \n", 206 | " 'gamma':0.1, \n", 207 | " 'reg_lamda':1, \n", 208 | " 'subsample':1,\n", 209 | " 'tree_method':'exact', \n", 210 | " 'updater':'grow_gpu'\n", 211 | " }\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 13, 217 | "metadata": { 218 | "collapsed": true, 219 | "deletable": true, 220 | "editable": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "with Timer() as t_train:\n", 225 | " xgb_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)\n", 226 | " \n", 227 | "with Timer() as t_test:\n", 228 | " y_prob_xgb = xgb_clf_pipeline.predict(dtest)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 14, 234 | "metadata": { 235 | "collapsed": true, 236 | "deletable": true, 237 | "editable": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "y_pred_xgb = binarize_prediction(y_prob_xgb)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | 
"execution_count": 15, 247 | "metadata": { 248 | "collapsed": true, 249 | "deletable": true, 250 | "editable": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "report_xgb = classification_metrics_binary(y_test, y_pred_xgb)\n", 255 | "report2_xgb = classification_metrics_binary_prob(y_test, y_prob_xgb)\n", 256 | "report_xgb.update(report2_xgb)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 17, 262 | "metadata": { 263 | "collapsed": false, 264 | "deletable": true, 265 | "editable": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "results_dict['xgb']={\n", 270 | " 'train_time': t_train.interval,\n", 271 | " 'test_time': t_test.interval,\n", 272 | " 'performance': report_xgb \n", 273 | "}" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 18, 279 | "metadata": { 280 | "collapsed": true, 281 | "deletable": true, 282 | "editable": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "del xgb_clf_pipeline" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "deletable": true, 293 | "editable": true 294 | }, 295 | "source": [ 296 | "Now let's try with XGBoost histogram." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 19, 302 | "metadata": { 303 | "collapsed": true, 304 | "deletable": true, 305 | "editable": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "params = {'max_depth':0, \n", 310 | " 'objective':'binary:logistic', \n", 311 | " 'min_child_weight':1, \n", 312 | " 'eta':0.1, \n", 313 | " 'colsample_bytree':0.80, \n", 314 | " 'scale_pos_weight':2, \n", 315 | " 'gamma':0.1, \n", 316 | " 'reg_lamda':1, \n", 317 | " 'subsample':1,\n", 318 | " 'tree_method':'hist', \n", 319 | " 'max_leaves':2**3, \n", 320 | " 'grow_policy':'lossguide', \n", 321 | " }\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 20, 327 | "metadata": { 328 | "collapsed": true, 329 | "deletable": true, 330 | "editable": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "with Timer() as t_train:\n", 335 | " xgb_hist_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)\n", 336 | " \n", 337 | "with Timer() as t_test:\n", 338 | " y_prob_xgb_hist = xgb_hist_clf_pipeline.predict(dtest)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 21, 344 | "metadata": { 345 | "collapsed": true, 346 | "deletable": true, 347 | "editable": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "y_pred_xgb_hist = binarize_prediction(y_prob_xgb_hist)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 22, 357 | "metadata": { 358 | "collapsed": true, 359 | "deletable": true, 360 | "editable": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "report_xgb_hist = classification_metrics_binary(y_test, y_pred_xgb_hist)\n", 365 | "report2_xgb_hist = classification_metrics_binary_prob(y_test, y_prob_xgb_hist)\n", 366 | "report_xgb_hist.update(report2_xgb_hist)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 23, 372 | "metadata": { 373 | "collapsed": true, 374 | "deletable": true, 375 | "editable": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "results_dict['xgb_hist']={\n", 380 | " 'train_time': t_train.interval,\n", 381 | " 'test_time': t_test.interval,\n", 382 | " 'performance': report_xgb_hist\n", 383 | "}" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 24, 389 | "metadata": { 390 | "collapsed": true, 391 | 
"deletable": true, 392 | "editable": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "del xgb_hist_clf_pipeline" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "deletable": true, 403 | "editable": true 404 | }, 405 | "source": [ 406 | "### LightGBM" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 25, 412 | "metadata": { 413 | "collapsed": true, 414 | "deletable": true, 415 | "editable": true 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "params = {'num_leaves': 2**3,\n", 420 | " 'learning_rate': 0.1,\n", 421 | " 'scale_pos_weight': 2,\n", 422 | " 'min_split_gain': 0.1,\n", 423 | " 'min_child_weight': 1,\n", 424 | " 'reg_lambda': 1,\n", 425 | " 'subsample': 1,\n", 426 | " 'objective':'binary',\n", 427 | " 'task': 'train'\n", 428 | " }" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 30, 434 | "metadata": { 435 | "collapsed": true, 436 | "deletable": true, 437 | "editable": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "with Timer() as t_train:\n", 442 | " lgbm_clf_pipeline = lgb.train(params, lgb_train, num_boost_round=num_rounds)\n", 443 | " \n", 444 | "with Timer() as t_test:\n", 445 | " y_prob_lgbm = lgbm_clf_pipeline.predict(X_test)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 31, 451 | "metadata": { 452 | "collapsed": true, 453 | "deletable": true, 454 | "editable": true 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "y_pred_lgbm = binarize_prediction(y_prob_lgbm)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 32, 464 | "metadata": { 465 | "collapsed": true, 466 | "deletable": true, 467 | "editable": true 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "report_lgbm = classification_metrics_binary(y_test, y_pred_lgbm)\n", 472 | "report2_lgbm = classification_metrics_binary_prob(y_test, y_prob_lgbm)\n", 473 | "report_lgbm.update(report2_lgbm)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 33, 479 | "metadata": { 480 | "collapsed": false, 481 | "deletable": true, 482 | "editable": true 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "results_dict['lgbm']={\n", 487 | " 'train_time': t_train.interval,\n", 488 | " 'test_time': t_test.interval,\n", 489 | " 'performance': report_lgbm \n", 490 | "}" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 34, 496 | "metadata": { 497 | "collapsed": true, 498 | "deletable": true, 499 | "editable": true 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "del lgbm_clf_pipeline" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": { 509 | "deletable": true, 510 | "editable": true 511 | }, 512 | "source": [ 513 | "Finally, we show the results" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 35, 519 | "metadata": { 520 | "collapsed": false, 521 | "deletable": true, 522 | "editable": true 523 | }, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "{\n", 530 | " \"lgbm\": {\n", 531 | " \"performance\": {\n", 532 | " \"AUC\": 0.7714542124542124,\n", 533 | " \"Accuracy\": 0.8813984610237537,\n", 534 | " \"F1\": 0.13851761846901578,\n", 535 | " \"Precision\": 0.6,\n", 536 | " \"Recall\": 0.0782967032967033\n", 537 | " },\n", 538 | " \"test_time\": 0.009907090001433971,\n", 539 | " \"train_time\": 2.7659428379993187\n", 540 | " },\n", 541 | " \"xgb\": {\n", 542 | " \"performance\": {\n", 543 | " 
\"AUC\": 0.7716584249084248,\n", 544 | " \"Accuracy\": 0.8798929407828705,\n", 545 | " \"F1\": 0.09343434343434343,\n", 546 | " \"Precision\": 0.578125,\n", 547 | " \"Recall\": 0.050824175824175824\n", 548 | " },\n", 549 | " \"test_time\": 0.0064387769998575095,\n", 550 | " \"train_time\": 12.934047714998997\n", 551 | " },\n", 552 | " \"xgb_hist\": {\n", 553 | " \"performance\": {\n", 554 | " \"AUC\": 0.7736170852956569,\n", 555 | " \"Accuracy\": 0.8805620608899297,\n", 556 | " \"F1\": 0.12068965517241378,\n", 557 | " \"Precision\": 0.5833333333333334,\n", 558 | " \"Recall\": 0.0673076923076923\n", 559 | " },\n", 560 | " \"test_time\": 0.00308577800024068,\n", 561 | " \"train_time\": 42.69890288699935\n", 562 | " }\n", 563 | "}\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "# Results\n", 569 | "print(json.dumps(results_dict, indent=4, sort_keys=True))" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "collapsed": true, 577 | "deletable": true, 578 | "editable": true 579 | }, 580 | "outputs": [], 581 | "source": [] 582 | } 583 | ], 584 | "metadata": { 585 | "kernelspec": { 586 | "display_name": "Python 3.5", 587 | "language": "python", 588 | "name": "python3" 589 | }, 590 | "language_info": { 591 | "codemirror_mode": { 592 | "name": "ipython", 593 | "version": 3 594 | }, 595 | "file_extension": ".py", 596 | "mimetype": "text/x-python", 597 | "name": "python", 598 | "nbconvert_exporter": "python", 599 | "pygments_lexer": "ipython3", 600 | "version": "3.5.2" 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 0 605 | } 606 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/04_PlanetKaggle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 04: Amazon Planet\n", 11 | "\n", 12 | "This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). Here we use a pretrained ResNet50 model to generate the features from the dataset.\n", 13 | "\n", 14 | "For details of virtual machine we used and the versions of LightGBM and XGBoost, please refer to [experiment 1](01_airline.ipynb)." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true, 24 | "scrolled": true 25 | }, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "Using TensorFlow backend.\n" 32 | ] 33 | }, 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n", 39 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 40 | "XGBoost version: 0.6\n", 41 | "LightGBM version: 0.2\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import sys\n", 47 | "from collections import defaultdict\n", 48 | "import numpy as np\n", 49 | "import pkg_resources\n", 50 | "from libs.loaders import load_planet_kaggle\n", 51 | "from libs.planet_kaggle import threshold_prediction\n", 52 | "from libs.timer import Timer\n", 53 | "from libs.utils import get_number_processors\n", 54 | "from lightgbm import LGBMClassifier\n", 55 | "import xgboost as xgb\n", 56 | "import lightgbm as lgb\n", 57 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 58 | "from tqdm import tqdm\n", 59 | "import json\n", 60 | "import warnings; warnings.simplefilter('ignore')\n", 61 | "\n", 62 | "print(\"System version: {}\".format(sys.version))\n", 63 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 64 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": { 71 | "collapsed": false, 72 | "deletable": true, 73 | "editable": true 74 | }, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "env: MOUNT_POINT=/datadrive\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "%env MOUNT_POINT=/datadrive" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "source": [ 95 | "The images are loaded and featurised using a pretrained ResNet50 model available from Keras" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 3, 101 | "metadata": { 102 | "collapsed": false, 103 | "deletable": true, 104 | "editable": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "X_train, y_train, X_test, y_test = load_planet_kaggle()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": { 115 | "collapsed": false, 116 | "deletable": true, 117 | "editable": true 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "(35000, 2048)\n", 125 | "(35000, 17)\n", 126 | "(5479, 2048)\n", 127 | "(5479, 17)\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "print(X_train.shape)\n", 133 | "print(y_train.shape)\n", 134 | "print(X_test.shape)\n", 135 | "print(y_test.shape)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "deletable": true, 142 | "editable": true 143 | }, 144 | "source": [ 145 | "## XGBoost \n", 146 | "\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 5, 152 | "metadata": { 153 | "collapsed": false, 154 | "deletable": true, 155 | "editable": true 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "Number of processors: 24\n" 163 | ] 164 | } 165 | ], 166 | 
"source": [ 167 | "number_processors = get_number_processors()\n", 168 | "print(\"Number of processors: \", number_processors)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "deletable": true, 175 | "editable": true 176 | }, 177 | "source": [ 178 | "We will use a one-v-rest. So each classifier will be responsible for determining whether the assigned tag applies to the image" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 6, 184 | "metadata": { 185 | "collapsed": false, 186 | "deletable": true, 187 | "editable": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "def train_and_validate_xgboost(params, train_features, train_labels, validation_features, num_boost_round):\n", 192 | " n_classes = train_labels.shape[1]\n", 193 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 194 | " time_results = defaultdict(list)\n", 195 | " for class_i in tqdm(range(n_classes)):\n", 196 | " dtrain = xgb.DMatrix(data=train_features, label=train_labels[:, class_i])\n", 197 | " dtest = xgb.DMatrix(data=validation_features)\n", 198 | " with Timer() as t:\n", 199 | " model = xgb.train(params, dtrain, num_boost_round=num_boost_round)\n", 200 | " time_results['train_time'].append(t.interval)\n", 201 | " \n", 202 | " with Timer() as t:\n", 203 | " y_val_pred[:, class_i] = model.predict(dtest)\n", 204 | " time_results['test_time'].append(t.interval)\n", 205 | " \n", 206 | " return y_val_pred, time_results" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 7, 212 | "metadata": { 213 | "collapsed": true, 214 | "deletable": true, 215 | "editable": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "def train_and_validate_lightgbm(params, train_features, train_labels, validation_features, num_boost_round):\n", 220 | " n_classes = train_labels.shape[1]\n", 221 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 222 | " time_results = defaultdict(list)\n", 223 | " for class_i in tqdm(range(n_classes)):\n", 224 | " lgb_train = lgb.Dataset(train_features, train_labels[:, class_i], free_raw_data=False)\n", 225 | " with Timer() as t:\n", 226 | " model = lgb.train(params, lgb_train, num_boost_round = num_boost_round)\n", 227 | " time_results['train_time'].append(t.interval)\n", 228 | " \n", 229 | " with Timer() as t:\n", 230 | " y_val_pred[:, class_i] = model.predict(validation_features)\n", 231 | " time_results['test_time'].append(t.interval)\n", 232 | " \n", 233 | " return y_val_pred, time_results" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 8, 239 | "metadata": { 240 | "collapsed": false, 241 | "deletable": true, 242 | "editable": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "metrics_dict = {\n", 247 | " 'Accuracy': accuracy_score,\n", 248 | " 'Precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average='samples'),\n", 249 | " 'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='samples'),\n", 250 | " 'F1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='samples'),\n", 251 | "}\n", 252 | "\n", 253 | "def classification_metrics(metrics, y_true, y_pred):\n", 254 | " return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 9, 260 | "metadata": { 261 | "collapsed": true, 262 | "deletable": true, 263 | "editable": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "results_dict 
= dict()\n", 268 | "num_rounds = 50" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "deletable": true, 275 | "editable": true 276 | }, 277 | "source": [ 278 | "Now we are going to define the different models." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 10, 284 | "metadata": { 285 | "collapsed": true, 286 | "deletable": true, 287 | "editable": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "xgb_params = {'max_depth':6, \n", 292 | " 'objective':'binary:logistic', \n", 293 | " 'min_child_weight':1, \n", 294 | " 'learning_rate':0.1, \n", 295 | " 'scale_pos_weight':2, \n", 296 | " 'gamma':0.1, \n", 297 | " 'reg_lamda':1, \n", 298 | " 'subsample':1,\n", 299 | " 'nthread':number_processors\n", 300 | " }" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 11, 306 | "metadata": { 307 | "collapsed": false, 308 | "deletable": true, 309 | "editable": true 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stderr", 314 | "output_type": "stream", 315 | "text": [ 316 | "100%|██████████| 17/17 [05:36<00:00, 19.88s/it]\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "y_pred, timing_results = train_and_validate_xgboost(xgb_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 12, 327 | "metadata": { 328 | "collapsed": false, 329 | "deletable": true, 330 | "editable": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "results_dict['xgb']={\n", 335 | " 'train_time': np.sum(timing_results['train_time']),\n", 336 | " 'test_time': np.sum(timing_results['test_time']),\n", 337 | " 'performance': classification_metrics(metrics_dict, \n", 338 | " y_test, \n", 339 | " threshold_prediction(y_pred, threshold=0.1)) \n", 340 | "}" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 13, 346 | "metadata": { 347 | "collapsed": true, 348 | "deletable": true, 349 | "editable": true 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "xgb_hist_params = {'max_depth':0, \n", 354 | " 'max_leaves':2**6, \n", 355 | " 'objective':'binary:logistic', \n", 356 | " 'min_child_weight':1, \n", 357 | " 'learning_rate':0.1, \n", 358 | " 'scale_pos_weight':2, \n", 359 | " 'gamma':0.1, \n", 360 | " 'reg_lamda':1, \n", 361 | " 'subsample':1,\n", 362 | " 'nthread':number_processors,\n", 363 | " 'tree_method':'hist', \n", 364 | " 'grow_policy':'lossguide',\n", 365 | " 'max_bins': 63\n", 366 | " }" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 14, 372 | "metadata": { 373 | "collapsed": false, 374 | "deletable": true, 375 | "editable": true 376 | }, 377 | "outputs": [ 378 | { 379 | "name": "stderr", 380 | "output_type": "stream", 381 | "text": [ 382 | "100%|██████████| 17/17 [35:26<00:00, 116.33s/it]\n" 383 | ] 384 | } 385 | ], 386 | "source": [ 387 | "y_pred, timing_results = train_and_validate_xgboost(xgb_hist_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 15, 393 | "metadata": { 394 | "collapsed": true, 395 | "deletable": true, 396 | "editable": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "results_dict['xgb_hist']={\n", 401 | " 'train_time': np.sum(timing_results['train_time']),\n", 402 | " 'test_time': np.sum(timing_results['test_time']),\n", 403 | " 'performance': classification_metrics(metrics_dict, \n", 404 | " y_test, \n", 405 | " threshold_prediction(y_pred, threshold=0.1)) 
\n", 406 | "}" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": { 412 | "deletable": true, 413 | "editable": true 414 | }, 415 | "source": [ 416 | "## LightGBM" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 16, 422 | "metadata": { 423 | "collapsed": false, 424 | "deletable": true, 425 | "editable": true 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "lgb_params = {'num_leaves': 2**6,\n", 430 | " 'learning_rate': 0.1,\n", 431 | " 'scale_pos_weight': 2,\n", 432 | " 'min_split_gain': 0.1,\n", 433 | " 'min_child_weight': 1,\n", 434 | " 'reg_lambda': 1,\n", 435 | " 'subsample': 1,\n", 436 | " 'objective':'binary',\n", 437 | " 'task': 'train',\n", 438 | " 'nthread':number_processors,\n", 439 | " 'max_bin': 63\n", 440 | " }" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 17, 446 | "metadata": { 447 | "collapsed": false, 448 | "deletable": true, 449 | "editable": true 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stderr", 454 | "output_type": "stream", 455 | "text": [ 456 | "100%|██████████| 17/17 [03:13<00:00, 7.91s/it]\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "y_pred, timing_results = train_and_validate_lightgbm(lgb_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 18, 467 | "metadata": { 468 | "collapsed": false, 469 | "deletable": true, 470 | "editable": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "results_dict['lgbm']={\n", 475 | " 'train_time': np.sum(timing_results['train_time']),\n", 476 | " 'test_time': np.sum(timing_results['test_time']),\n", 477 | " 'performance': classification_metrics(metrics_dict, \n", 478 | " y_test, \n", 479 | " threshold_prediction(y_pred, threshold=0.1)) \n", 480 | "}" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": { 486 | "deletable": true, 487 | "editable": true 488 | }, 489 | "source": [ 490 | "Finally, we show the results." 
491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 19, 496 | "metadata": { 497 | "collapsed": false, 498 | "deletable": true, 499 | "editable": true 500 | }, 501 | "outputs": [ 502 | { 503 | "name": "stdout", 504 | "output_type": "stream", 505 | "text": [ 506 | "{\n", 507 | " \"lgbm\": {\n", 508 | " \"performance\": {\n", 509 | " \"Accuracy\": 0.37233071728417594,\n", 510 | " \"F1\": 0.822258366139549,\n", 511 | " \"Precision\": 0.7439077632634851,\n", 512 | " \"Recall\": 0.9734099462015139\n", 513 | " },\n", 514 | " \"test_time\": 0.1641630920021271,\n", 515 | " \"train_time\": 194.57900593099475\n", 516 | " },\n", 517 | " \"xgb\": {\n", 518 | " \"performance\": {\n", 519 | " \"Accuracy\": 0.34057309728052565,\n", 520 | " \"F1\": 0.8048263053953228,\n", 521 | " \"Precision\": 0.7184218531362171,\n", 522 | " \"Recall\": 0.9766441564762427\n", 523 | " },\n", 524 | " \"test_time\": 0.1852665030019125,\n", 525 | " \"train_time\": 313.8951129560046\n", 526 | " },\n", 527 | " \"xgb_hist\": {\n", 528 | " \"performance\": {\n", 529 | " \"Accuracy\": 0.37871874429640445,\n", 530 | " \"F1\": 0.8220252909027159,\n", 531 | " \"Precision\": 0.7447899193746976,\n", 532 | " \"Recall\": 0.9720717197264013\n", 533 | " },\n", 534 | " \"test_time\": 0.19687007299944526,\n", 535 | " \"train_time\": 2115.2851170680005\n", 536 | " }\n", 537 | "}\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "# Results\n", 543 | "print(json.dumps(results_dict, indent=4, sort_keys=True))" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "deletable": true, 550 | "editable": true 551 | }, 552 | "source": [ 553 | "This dataset shows an interesting behavior. It is the only notebook where XGBoost hist behaves worse than XGBoost. The reason could be because the number of features is high, 2048, and that could be causing a memory overhead. LightGBM and the standard version of XGBoost can manage this high number of features, so there is no overhead. You can try to use a higher complexity to improve the performance. For example, setting `max_depth=8` in XGBoost, `max_leaves=2**8` in XGBoost hist and `num_leaves=2**6` in LightGBM. This will cause an overhead in XGBoost hist." 554 | ] 555 | } 556 | ], 557 | "metadata": { 558 | "kernelspec": { 559 | "display_name": "Python3.5 (Strata)", 560 | "language": "python", 561 | "name": "strata" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.5.2" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 0 578 | } 579 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/04_PlanetKaggle_GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 04: Amazon Planet (GPU version)\n", 11 | "\n", 12 | "This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). 
Here we use a pretrained ResNet50 model to generate the features from the dataset.\n", 13 | "\n", 14 | "The details of the machine we used and the version of the libraries can be found in [experiment 01](01_airline.ipynb)." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true, 24 | "scrolled": true 25 | }, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "Using TensorFlow backend.\n" 32 | ] 33 | }, 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n", 39 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 40 | "XGBoost version: 0.6\n", 41 | "LightGBM version: 0.2\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import sys, os\n", 47 | "from collections import defaultdict\n", 48 | "import numpy as np\n", 49 | "import pkg_resources\n", 50 | "from libs.loaders import load_planet_kaggle\n", 51 | "from libs.planet_kaggle import threshold_prediction\n", 52 | "from libs.timer import Timer\n", 53 | "import lightgbm as lgb\n", 54 | "import xgboost as xgb\n", 55 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 56 | "from tqdm import tqdm\n", 57 | "import tensorflow as tf\n", 58 | "from keras.backend.tensorflow_backend import set_session, get_session\n", 59 | "\n", 60 | "print(\"System version: {}\".format(sys.version))\n", 61 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 62 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": { 69 | "collapsed": false, 70 | "deletable": true, 71 | "editable": true 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "env: MOUNT_POINT=/datadrive\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%env MOUNT_POINT=/datadrive" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": { 90 | "collapsed": true, 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "#Configure TF to use only one GPU, by default TF allocates memory in all GPUs\n", 97 | "config = tf.ConfigProto(device_count = {'GPU': 1})\n", 98 | "#Configure TF to limit the amount of GPU memory, by default TF takes all of them. 
\n", 99 | "config.gpu_options.per_process_gpu_memory_fraction = 0.3\n", 100 | "set_session(tf.Session(config=config))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "deletable": true, 107 | "editable": true 108 | }, 109 | "source": [ 110 | "The images are loaded and featurised using a pretrained ResNet50 model available from Keras" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": { 117 | "collapsed": false, 118 | "deletable": true, 119 | "editable": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "X_train, y_train, X_test, y_test = load_planet_kaggle()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "metadata": { 130 | "collapsed": false, 131 | "deletable": true, 132 | "editable": true 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "(35000, 2048)\n", 140 | "(35000, 17)\n", 141 | "(5479, 2048)\n", 142 | "(5479, 17)\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "print(X_train.shape)\n", 148 | "print(y_train.shape)\n", 149 | "print(X_test.shape)\n", 150 | "print(y_test.shape)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "deletable": true, 157 | "editable": true 158 | }, 159 | "source": [ 160 | "## XGBoost " 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "source": [ 170 | "We will use a one-v-rest. So each classifier will be responsible for determining whether the assigned tag applies to the image" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": { 177 | "collapsed": false, 178 | "deletable": true, 179 | "editable": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "def train_and_validate_xgboost(params, train_features, train_labels, validation_features, num_boost_round):\n", 184 | " n_classes = train_labels.shape[1]\n", 185 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 186 | " time_results = defaultdict(list)\n", 187 | " for class_i in tqdm(range(n_classes)):\n", 188 | " dtrain = xgb.DMatrix(data=train_features, label=train_labels[:, class_i])\n", 189 | " dtest = xgb.DMatrix(data=validation_features)\n", 190 | " with Timer() as t:\n", 191 | " model = xgb.train(params, dtrain, num_boost_round=num_boost_round)\n", 192 | " time_results['train_time'].append(t.interval)\n", 193 | " \n", 194 | " with Timer() as t:\n", 195 | " y_val_pred[:, class_i] = model.predict(dtest)\n", 196 | " time_results['test_time'].append(t.interval)\n", 197 | " \n", 198 | " return y_val_pred, time_results" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 7, 204 | "metadata": { 205 | "collapsed": true, 206 | "deletable": true, 207 | "editable": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "def train_and_validate_lightgbm(params, train_features, train_labels, validation_features, num_boost_round):\n", 212 | " n_classes = train_labels.shape[1]\n", 213 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 214 | " time_results = defaultdict(list)\n", 215 | " for class_i in tqdm(range(n_classes)):\n", 216 | " lgb_train = lgb.Dataset(train_features, train_labels[:, class_i], free_raw_data=False)\n", 217 | " with Timer() as t:\n", 218 | " model = lgb.train(params, lgb_train, num_boost_round = num_boost_round)\n", 219 | " 
time_results['train_time'].append(t.interval)\n", 220 | " \n", 221 | " with Timer() as t:\n", 222 | " y_val_pred[:, class_i] = model.predict(validation_features)\n", 223 | " time_results['test_time'].append(t.interval)\n", 224 | " \n", 225 | " return y_val_pred, time_results" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 8, 231 | "metadata": { 232 | "collapsed": false, 233 | "deletable": true, 234 | "editable": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "metrics_dict = {\n", 239 | " 'Accuracy': accuracy_score,\n", 240 | " 'Precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average='samples'),\n", 241 | " 'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='samples'),\n", 242 | " 'F1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='samples'),\n", 243 | "}\n", 244 | "\n", 245 | "def classification_metrics(metrics, y_true, y_pred):\n", 246 | " return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 9, 252 | "metadata": { 253 | "collapsed": true, 254 | "deletable": true, 255 | "editable": true 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "results_dict = dict()\n", 260 | "num_rounds = 50" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "deletable": true, 267 | "editable": true 268 | }, 269 | "source": [ 270 | "Now we are going to define the different models." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 10, 276 | "metadata": { 277 | "collapsed": true, 278 | "deletable": true, 279 | "editable": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "xgb_params = {'max_depth':2, #'max_depth':6 \n", 284 | " 'objective':'binary:logistic', \n", 285 | " 'min_child_weight':1, \n", 286 | " 'learning_rate':0.1, \n", 287 | " 'scale_pos_weight':2, \n", 288 | " 'gamma':0.1, \n", 289 | " 'reg_lamda':1, \n", 290 | " 'subsample':1,\n", 291 | " 'tree_method':'exact', \n", 292 | " 'updater':'grow_gpu',\n", 293 | " }" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "deletable": true, 300 | "editable": true 301 | }, 302 | "source": [ 303 | "*NOTE: We got an out of memory error with xgb. 
Please see the comments at the end of the notebook.*" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": false, 311 | "deletable": true, 312 | "editable": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "y_pred, timing_results = train_and_validate_xgboost(xgb_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": false, 324 | "deletable": true, 325 | "editable": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "results_dict['xgb']={\n", 330 | " 'train_time': np.sum(timing_results['train_time']),\n", 331 | " 'test_time': np.sum(timing_results['test_time']),\n", 332 | " 'performance': classification_metrics(metrics_dict, \n", 333 | " y_test, \n", 334 | " threshold_prediction(y_pred, threshold=0.1)) \n", 335 | "}" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "deletable": true, 342 | "editable": true 343 | }, 344 | "source": [ 345 | "\n", 346 | "\n", 347 | "Now let's try with XGBoost histogram.\n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 12, 353 | "metadata": { 354 | "collapsed": false, 355 | "deletable": true, 356 | "editable": true 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "xgb_hist_params = {'max_depth':0, \n", 361 | " 'max_leaves':2**6, \n", 362 | " 'objective':'binary:logistic', \n", 363 | " 'min_child_weight':1, \n", 364 | " 'learning_rate':0.1, \n", 365 | " 'scale_pos_weight':2, \n", 366 | " 'gamma':0.1, \n", 367 | " 'reg_lamda':1, \n", 368 | " 'subsample':1,\n", 369 | " 'tree_method':'hist', \n", 370 | " 'grow_policy':'lossguide',\n", 371 | " 'updater':'grow_gpu_hist',\n", 372 | " 'max_bins': 63\n", 373 | " }" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": false, 381 | "deletable": true, 382 | "editable": true 383 | }, 384 | "outputs": [ 385 | { 386 | "name": "stderr", 387 | "output_type": "stream", 388 | "text": [ 389 | "\n", 390 | " 0%| | 0/17 [00:00= 0: 439 | params["devices"] = "0-" + str(args.gpus) 440 | 441 | if data.learning_task == LearningTask.REGRESSION: 442 | params["objective"] = "RMSE" 443 | elif data.learning_task == LearningTask.CLASSIFICATION: 444 | params["objective"] = "Logloss" 445 | params["scale_pos_weight"] = len(data.y_train) / np.count_nonzero(data.y_train) 446 | elif data.learning_task == LearningTask.MULTICLASS_CLASSIFICATION: 447 | params["objective"] = "MultiClassOneVsAll" 448 | params["classes_count"] = np.max(data.y_test) + 1 449 | params.update(args.extra) 450 | return params 451 | 452 | def fit(self, data, args): 453 | dtrain = cat.Pool(data.X_train, data.y_train) 454 | params = self.configure(data, args) 455 | params["iterations"] = args.ntrees 456 | self.model = cat.CatBoost(params) 457 | with Timer() as t: 458 | self.model.fit(dtrain) 459 | return t.interval 460 | 461 | def test(self, data): 462 | dtest = cat.Pool(data.X_test) 463 | if data.learning_task == LearningTask.MULTICLASS_CLASSIFICATION: 464 | prob = self.model.predict(dtest) 465 | return np.argmax(prob, axis=1) 466 | return self.model.predict(dtest) 467 | 468 | def __exit__(self, exc_type, exc_value, traceback): 469 | del self.model 470 | 471 | 472 | class CatCPUAlgorithm(CatAlgorithm): 473 | def configure(self, data, args): 474 | params = super(CatCPUAlgorithm, self).configure(data, args) 475 | 
params.update({"task_type": "CPU"}) 476 | return params 477 | 478 | 479 | class CatGPUAlgorithm(CatAlgorithm): 480 | def configure(self, data, args): 481 | params = super(CatGPUAlgorithm, self).configure(data, args) 482 | params.update({"task_type": "GPU"}) 483 | return params 484 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python modules names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 
63 | disable=print-statement, 64 | parameter-unpacking, 65 | unpacking-in-except, 66 | old-raise-syntax, 67 | backtick, 68 | long-suffix, 69 | old-ne-operator, 70 | old-octal-literal, 71 | import-star-module-level, 72 | non-ascii-bytes-literal, 73 | raw-checker-failed, 74 | bad-inline-option, 75 | locally-disabled, 76 | locally-enabled, 77 | file-ignored, 78 | suppressed-message, 79 | useless-suppression, 80 | deprecated-pragma, 81 | use-symbolic-message-instead, 82 | apply-builtin, 83 | basestring-builtin, 84 | buffer-builtin, 85 | cmp-builtin, 86 | coerce-builtin, 87 | execfile-builtin, 88 | file-builtin, 89 | long-builtin, 90 | raw_input-builtin, 91 | reduce-builtin, 92 | standarderror-builtin, 93 | unicode-builtin, 94 | xrange-builtin, 95 | coerce-method, 96 | delslice-method, 97 | getslice-method, 98 | setslice-method, 99 | no-absolute-import, 100 | old-division, 101 | dict-iter-method, 102 | dict-view-method, 103 | next-method-called, 104 | metaclass-assignment, 105 | indexing-exception, 106 | raising-string, 107 | reload-builtin, 108 | oct-method, 109 | hex-method, 110 | nonzero-method, 111 | cmp-method, 112 | input-builtin, 113 | round-builtin, 114 | intern-builtin, 115 | unichr-builtin, 116 | map-builtin-not-iterating, 117 | zip-builtin-not-iterating, 118 | range-builtin-not-iterating, 119 | filter-builtin-not-iterating, 120 | using-cmp-argument, 121 | eq-without-hash, 122 | div-method, 123 | idiv-method, 124 | rdiv-method, 125 | exception-message-attribute, 126 | invalid-str-codec, 127 | sys-max-int, 128 | bad-python3-import, 129 | deprecated-string-function, 130 | deprecated-str-translate-call, 131 | deprecated-itertools-function, 132 | deprecated-types-field, 133 | next-method-defined, 134 | dict-items-not-iterating, 135 | dict-keys-not-iterating, 136 | dict-values-not-iterating, 137 | deprecated-operator-function, 138 | deprecated-urllib-function, 139 | xreadlines-attribute, 140 | deprecated-sys-function, 141 | exception-escape, 142 | comprehension-escape, 143 | invalid-name, 144 | no-self-use, 145 | import-error, 146 | missing-docstring, 147 | unbalanced-tuple-unpacking 148 | 149 | 150 | # Enable the message, report, category or checker with the given id(s). You can 151 | # either give multiple identifier separated by comma (,) or put this option 152 | # multiple time (only on the command line, not in the configuration file where 153 | # it should appear only once). See also the "--disable" option for examples. 154 | enable=c-extension-no-member 155 | 156 | 157 | [REPORTS] 158 | 159 | # Python expression which should return a note less than 10 (10 is the highest 160 | # note). You have access to the variables errors warning, statement which 161 | # respectively contain the number of errors / warnings messages and the total 162 | # number of statements analyzed. This is used by the global evaluation report 163 | # (RP0004). 164 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 165 | 166 | # Template used to display messages. This is a python new-style format string 167 | # used to format the message information. See doc for all details. 168 | #msg-template= 169 | 170 | # Set the output format. Available formats are text, parseable, colorized, json 171 | # and msvs (visual studio). You can also give a reporter class, e.g. 172 | # mypackage.mymodule.MyReporterClass. 173 | output-format=text 174 | 175 | # Tells whether to display a full report or only the messages. 176 | reports=no 177 | 178 | # Activate the evaluation score. 
179 | score=yes 180 | 181 | 182 | [REFACTORING] 183 | 184 | # Maximum number of nested blocks for function / method body 185 | max-nested-blocks=5 186 | 187 | # Complete name of functions that never returns. When checking for 188 | # inconsistent-return-statements if a never returning function is called then 189 | # it will be considered as an explicit return statement and no message will be 190 | # printed. 191 | never-returning-functions=sys.exit 192 | 193 | 194 | [BASIC] 195 | 196 | # Naming style matching correct argument names. 197 | argument-naming-style=snake_case 198 | 199 | # Regular expression matching correct argument names. Overrides argument- 200 | # naming-style. 201 | #argument-rgx= 202 | 203 | # Naming style matching correct attribute names. 204 | attr-naming-style=snake_case 205 | 206 | # Regular expression matching correct attribute names. Overrides attr-naming- 207 | # style. 208 | #attr-rgx= 209 | 210 | # Bad variable names which should always be refused, separated by a comma. 211 | bad-names=foo, 212 | bar, 213 | baz, 214 | toto, 215 | tutu, 216 | tata 217 | 218 | # Naming style matching correct class attribute names. 219 | class-attribute-naming-style=any 220 | 221 | # Regular expression matching correct class attribute names. Overrides class- 222 | # attribute-naming-style. 223 | #class-attribute-rgx= 224 | 225 | # Naming style matching correct class names. 226 | class-naming-style=PascalCase 227 | 228 | # Regular expression matching correct class names. Overrides class-naming- 229 | # style. 230 | #class-rgx= 231 | 232 | # Naming style matching correct constant names. 233 | const-naming-style=UPPER_CASE 234 | 235 | # Regular expression matching correct constant names. Overrides const-naming- 236 | # style. 237 | #const-rgx= 238 | 239 | # Minimum line length for functions/classes that require docstrings, shorter 240 | # ones are exempt. 241 | docstring-min-length=-1 242 | 243 | # Naming style matching correct function names. 244 | function-naming-style=snake_case 245 | 246 | # Regular expression matching correct function names. Overrides function- 247 | # naming-style. 248 | #function-rgx= 249 | 250 | # Good variable names which should always be accepted, separated by a comma. 251 | good-names=i, 252 | j, 253 | k, 254 | ex, 255 | Run, 256 | _ 257 | 258 | # Include a hint for the correct naming format with invalid-name. 259 | include-naming-hint=no 260 | 261 | # Naming style matching correct inline iteration names. 262 | inlinevar-naming-style=any 263 | 264 | # Regular expression matching correct inline iteration names. Overrides 265 | # inlinevar-naming-style. 266 | #inlinevar-rgx= 267 | 268 | # Naming style matching correct method names. 269 | method-naming-style=snake_case 270 | 271 | # Regular expression matching correct method names. Overrides method-naming- 272 | # style. 273 | #method-rgx= 274 | 275 | # Naming style matching correct module names. 276 | module-naming-style=snake_case 277 | 278 | # Regular expression matching correct module names. Overrides module-naming- 279 | # style. 280 | #module-rgx= 281 | 282 | # Colon-delimited sets of names that determine each other's naming style when 283 | # the name regexes allow several styles. 284 | name-group= 285 | 286 | # Regular expression which should only match function or class names that do 287 | # not require a docstring. 288 | no-docstring-rgx=^_ 289 | 290 | # List of decorators that produce properties, such as abc.abstractproperty. 
Add 291 | # to this list to register other decorators that produce valid properties. 292 | # These decorators are taken in consideration only for invalid-name. 293 | property-classes=abc.abstractproperty 294 | 295 | # Naming style matching correct variable names. 296 | variable-naming-style=snake_case 297 | 298 | # Regular expression matching correct variable names. Overrides variable- 299 | # naming-style. 300 | #variable-rgx= 301 | 302 | 303 | [FORMAT] 304 | 305 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 306 | expected-line-ending-format= 307 | 308 | # Regexp for a line that is allowed to be longer than the limit. 309 | ignore-long-lines=^\s*(# )??$ 310 | 311 | # Number of spaces of indent required inside a hanging or continued line. 312 | indent-after-paren=4 313 | 314 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 315 | # tab). 316 | indent-string=' ' 317 | 318 | # Maximum number of characters on a single line. 319 | max-line-length=100 320 | 321 | # Maximum number of lines in a module. 322 | max-module-lines=1000 323 | 324 | # List of optional constructs for which whitespace checking is disabled. `dict- 325 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 326 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 327 | # `empty-line` allows space-only lines. 328 | no-space-check=trailing-comma, 329 | dict-separator 330 | 331 | # Allow the body of a class to be on the same line as the declaration if body 332 | # contains single statement. 333 | single-line-class-stmt=no 334 | 335 | # Allow the body of an if to be on the same line as the test if there is no 336 | # else. 337 | single-line-if-stmt=no 338 | 339 | 340 | [MISCELLANEOUS] 341 | 342 | # List of note tags to take in consideration, separated by a comma. 343 | notes=FIXME, 344 | XXX, 345 | TODO 346 | 347 | 348 | [SPELLING] 349 | 350 | # Limits count of emitted suggestions for spelling mistakes. 351 | max-spelling-suggestions=4 352 | 353 | # Spelling dictionary name. Available dictionaries: none. To make it working 354 | # install python-enchant package.. 355 | spelling-dict= 356 | 357 | # List of comma separated words that should not be checked. 358 | spelling-ignore-words= 359 | 360 | # A path to a file that contains private dictionary; one word per line. 361 | spelling-private-dict-file= 362 | 363 | # Tells whether to store unknown words to indicated private dictionary in 364 | # --spelling-private-dict-file option instead of raising a message. 365 | spelling-store-unknown-words=no 366 | 367 | 368 | [SIMILARITIES] 369 | 370 | # Ignore comments when computing similarities. 371 | ignore-comments=yes 372 | 373 | # Ignore docstrings when computing similarities. 374 | ignore-docstrings=yes 375 | 376 | # Ignore imports when computing similarities. 377 | ignore-imports=no 378 | 379 | # Minimum lines number of a similarity. 380 | min-similarity-lines=4 381 | 382 | 383 | [VARIABLES] 384 | 385 | # List of additional names supposed to be defined in builtins. Remember that 386 | # you should avoid to define new builtins when possible. 387 | additional-builtins= 388 | 389 | # Tells whether unused global variables should be treated as a violation. 390 | allow-global-unused-variables=yes 391 | 392 | # List of strings which can identify a callback function by name. A callback 393 | # name must start or end with one of those strings. 
394 | callbacks=cb_, 395 | _cb 396 | 397 | # A regular expression matching the name of dummy variables (i.e. expected to 398 | # not be used). 399 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 400 | 401 | # Argument names that match this expression will be ignored. Default to name 402 | # with leading underscore. 403 | ignored-argument-names=_.*|^ignored_|^unused_ 404 | 405 | # Tells whether we should check for unused import in __init__ files. 406 | init-import=no 407 | 408 | # List of qualified module names which can have objects that can redefine 409 | # builtins. 410 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 411 | 412 | 413 | [LOGGING] 414 | 415 | # Logging modules to check that the string format arguments are in logging 416 | # function parameter format. 417 | logging-modules=logging 418 | 419 | 420 | [TYPECHECK] 421 | 422 | # List of decorators that produce context managers, such as 423 | # contextlib.contextmanager. Add to this list to register other decorators that 424 | # produce valid context managers. 425 | contextmanager-decorators=contextlib.contextmanager 426 | 427 | # List of members which are set dynamically and missed by pylint inference 428 | # system, and so shouldn't trigger E1101 when accessed. Python regular 429 | # expressions are accepted. 430 | generated-members= 431 | 432 | # Tells whether missing members accessed in mixin class should be ignored. A 433 | # mixin class is detected if its name ends with "mixin" (case insensitive). 434 | ignore-mixin-members=yes 435 | 436 | # Tells whether to warn about missing members when the owner of the attribute 437 | # is inferred to be None. 438 | ignore-none=yes 439 | 440 | # This flag controls whether pylint should warn about no-member and similar 441 | # checks whenever an opaque object is returned when inferring. The inference 442 | # can return multiple potential results while evaluating a Python object, but 443 | # some branches might not be evaluated, which results in partial inference. In 444 | # that case, it might be useful to still emit no-member and other checks for 445 | # the rest of the inferred objects. 446 | ignore-on-opaque-inference=yes 447 | 448 | # List of class names for which member attributes should not be checked (useful 449 | # for classes with dynamically set attributes). This supports the use of 450 | # qualified names. 451 | ignored-classes=optparse.Values,thread._local,_thread._local 452 | 453 | # List of module names for which member attributes should not be checked 454 | # (useful for modules/projects where namespaces are manipulated during runtime 455 | # and thus existing member attributes cannot be deduced by static analysis. It 456 | # supports qualified module names, as well as Unix pattern matching. 457 | ignored-modules= 458 | 459 | # Show a hint with possible names when a member name was not found. The aspect 460 | # of finding the hint is based on edit distance. 461 | missing-member-hint=yes 462 | 463 | # The minimum edit distance a name should have in order to be considered a 464 | # similar match for a missing member name. 465 | missing-member-hint-distance=1 466 | 467 | # The total number of similar names that should be taken in consideration when 468 | # showing a hint for a missing member. 469 | missing-member-max-choices=1 470 | 471 | 472 | [IMPORTS] 473 | 474 | # Allow wildcard imports from modules that define __all__. 475 | allow-wildcard-with-all=no 476 | 477 | # Analyse import fallback blocks. 
This can be used to support both Python 2 and 478 | # 3 compatible code, which means that the block might have code that exists 479 | # only in one or another interpreter, leading to false positives when analysed. 480 | analyse-fallback-blocks=no 481 | 482 | # Deprecated modules which should not be used, separated by a comma. 483 | deprecated-modules=optparse,tkinter.tix 484 | 485 | # Create a graph of external dependencies in the given file (report RP0402 must 486 | # not be disabled). 487 | ext-import-graph= 488 | 489 | # Create a graph of every (i.e. internal and external) dependencies in the 490 | # given file (report RP0402 must not be disabled). 491 | import-graph= 492 | 493 | # Create a graph of internal dependencies in the given file (report RP0402 must 494 | # not be disabled). 495 | int-import-graph= 496 | 497 | # Force import order to recognize a module as part of the standard 498 | # compatibility libraries. 499 | known-standard-library= 500 | 501 | # Force import order to recognize a module as part of a third party library. 502 | known-third-party=enchant 503 | 504 | 505 | [CLASSES] 506 | 507 | # List of method names used to declare (i.e. assign) instance attributes. 508 | defining-attr-methods=__init__, 509 | __new__, 510 | setUp 511 | 512 | # List of member names, which should be excluded from the protected access 513 | # warning. 514 | exclude-protected=_asdict, 515 | _fields, 516 | _replace, 517 | _source, 518 | _make 519 | 520 | # List of valid names for the first argument in a class method. 521 | valid-classmethod-first-arg=cls 522 | 523 | # List of valid names for the first argument in a metaclass class method. 524 | valid-metaclass-classmethod-first-arg=cls 525 | 526 | 527 | [DESIGN] 528 | 529 | # Maximum number of arguments for function / method. 530 | max-args=5 531 | 532 | # Maximum number of attributes for a class (see R0902). 533 | max-attributes=7 534 | 535 | # Maximum number of boolean expressions in an if statement. 536 | max-bool-expr=5 537 | 538 | # Maximum number of branch for function / method body. 539 | max-branches=12 540 | 541 | # Maximum number of locals for function / method body. 542 | max-locals=15 543 | 544 | # Maximum number of parents for a class (see R0901). 545 | max-parents=7 546 | 547 | # Maximum number of public methods for a class (see R0904). 548 | max-public-methods=20 549 | 550 | # Maximum number of return / yield for function / method body. 551 | max-returns=6 552 | 553 | # Maximum number of statements in function / method body. 554 | max-statements=50 555 | 556 | # Minimum number of public methods for a class (see R0903). 557 | min-public-methods=2 558 | 559 | 560 | [EXCEPTIONS] 561 | 562 | # Exceptions that will emit a warning when being caught. Defaults to 563 | # "Exception". 564 | overgeneral-exceptions=Exception 565 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/06_HIGGS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 06: HIGGS boson \n", 11 | "\n", 12 | "This experiment uses the data from the [HIGGS dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) to predict the appearance of the Higgs boson. The dataset consists of 11 million of observations. More information about the data can be found in [loaders.py](libs/loaders.py). 
\n", 13 | "\n", 14 | "For details of virtual machine we used and the versions of LightGBM and XGBoost, please refer to [experiment 1](01_airline.ipynb)." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "System version: 3.5.3 |Anaconda 4.4.0 (64-bit)| (default, Mar 6 2017, 11:58:13) \n", 31 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 32 | "XGBoost version: 0.6\n", 33 | "LightGBM version: 0.2\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "import json\n", 39 | "import sys\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "import pkg_resources\n", 42 | "from libs.loaders import load_higgs\n", 43 | "from libs.timer import Timer\n", 44 | "from libs.utils import get_number_processors\n", 45 | "from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score\n", 46 | "from sklearn.model_selection import train_test_split\n", 47 | "from xgboost import XGBClassifier\n", 48 | "from lightgbm import LGBMClassifier\n", 49 | "import warnings\n", 50 | "warnings.filterwarnings('ignore')\n", 51 | "\n", 52 | "print(\"System version: {}\".format(sys.version))\n", 53 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 54 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": { 61 | "collapsed": false, 62 | "deletable": true, 63 | "editable": true 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stderr", 68 | "output_type": "stream", 69 | "text": [ 70 | "INFO:libs.loaders:MOUNT_POINT not found in environment. Defaulting to /fileshare\n" 71 | ] 72 | }, 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "(11000000, 29)\n", 78 | "CPU times: user 1min 12s, sys: 6.31 s, total: 1min 18s\n", 79 | "Wall time: 4min 15s\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "%%time\n", 85 | "df = load_higgs()\n", 86 | "print(df.shape)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "metadata": { 93 | "collapsed": false, 94 | "deletable": true, 95 | "editable": true 96 | }, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/html": [ 101 | "
\n", 102 | "\n", 115 | "\n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | "
bosonlepton_pTlepton_etalepton_phimissing_energy_magnitudemissing_energy_phijet_1_ptjet_1_etajet_1_phijet_1_b-tag...jet_4_etajet_4_phijet_4_b-tagm_jjm_jjjm_lvm_jlvm_bbm_wbbm_wwbb
01.00.869293-0.6350820.2256900.327470-0.6899930.754202-0.248573-1.0920640.000000...-0.010455-0.0457673.1019611.3537600.9795630.9780760.9200050.7216570.9887510.876678
11.00.9075420.3291470.3594121.497970-0.3130101.095531-0.557525-1.5882302.173076...-1.138930-0.0008190.0000000.3022200.8330480.9857000.9780980.7797320.9923560.798343
21.00.7988351.470639-1.6359750.4537730.4256291.1048751.2823221.3816640.000000...1.1288480.9004610.0000000.9097531.1083300.9856920.9513310.8032520.8659240.780118
30.01.344385-0.8766260.9359131.9920500.8824541.786066-1.646778-0.9423830.000000...-0.678379-1.3603560.0000000.9466521.0287040.9986560.7282810.8692001.0267360.957904
41.01.1050090.3213561.5224010.882808-1.2053490.681466-1.070464-0.9218710.000000...-0.3735660.1130410.0000000.7558561.3610570.9866100.8380851.1332950.8722450.808487
\n", 265 | "

5 rows × 29 columns

\n", 266 | "
" 267 | ], 268 | "text/plain": [ 269 | " boson lepton_pT lepton_eta lepton_phi missing_energy_magnitude \\\n", 270 | "0 1.0 0.869293 -0.635082 0.225690 0.327470 \n", 271 | "1 1.0 0.907542 0.329147 0.359412 1.497970 \n", 272 | "2 1.0 0.798835 1.470639 -1.635975 0.453773 \n", 273 | "3 0.0 1.344385 -0.876626 0.935913 1.992050 \n", 274 | "4 1.0 1.105009 0.321356 1.522401 0.882808 \n", 275 | "\n", 276 | " missing_energy_phi jet_1_pt jet_1_eta jet_1_phi jet_1_b-tag ... \\\n", 277 | "0 -0.689993 0.754202 -0.248573 -1.092064 0.000000 ... \n", 278 | "1 -0.313010 1.095531 -0.557525 -1.588230 2.173076 ... \n", 279 | "2 0.425629 1.104875 1.282322 1.381664 0.000000 ... \n", 280 | "3 0.882454 1.786066 -1.646778 -0.942383 0.000000 ... \n", 281 | "4 -1.205349 0.681466 -1.070464 -0.921871 0.000000 ... \n", 282 | "\n", 283 | " jet_4_eta jet_4_phi jet_4_b-tag m_jj m_jjj m_lv m_jlv \\\n", 284 | "0 -0.010455 -0.045767 3.101961 1.353760 0.979563 0.978076 0.920005 \n", 285 | "1 -1.138930 -0.000819 0.000000 0.302220 0.833048 0.985700 0.978098 \n", 286 | "2 1.128848 0.900461 0.000000 0.909753 1.108330 0.985692 0.951331 \n", 287 | "3 -0.678379 -1.360356 0.000000 0.946652 1.028704 0.998656 0.728281 \n", 288 | "4 -0.373566 0.113041 0.000000 0.755856 1.361057 0.986610 0.838085 \n", 289 | "\n", 290 | " m_bb m_wbb m_wwbb \n", 291 | "0 0.721657 0.988751 0.876678 \n", 292 | "1 0.779732 0.992356 0.798343 \n", 293 | "2 0.803252 0.865924 0.780118 \n", 294 | "3 0.869200 1.026736 0.957904 \n", 295 | "4 1.133295 0.872245 0.808487 \n", 296 | "\n", 297 | "[5 rows x 29 columns]" 298 | ] 299 | }, 300 | "execution_count": 4, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "df.head()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 5, 312 | "metadata": { 313 | "collapsed": false, 314 | "deletable": true, 315 | "editable": true 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "24\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "num_rounds = 200\n", 328 | "number_processors = get_number_processors()\n", 329 | "print(number_processors)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 6, 335 | "metadata": { 336 | "collapsed": false, 337 | "deletable": true, 338 | "editable": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "xgb_clf_pipeline = XGBClassifier(max_depth=5, \n", 343 | " learning_rate=0.1, \n", 344 | " scale_pos_weight=2,\n", 345 | " n_estimators=num_rounds,\n", 346 | " gamma=0.1,\n", 347 | " min_child_weight=1,\n", 348 | " reg_lambda=1,\n", 349 | " subsample=1,\n", 350 | " nthread=number_processors\n", 351 | " )" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 7, 357 | "metadata": { 358 | "collapsed": false, 359 | "deletable": true, 360 | "editable": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "xgb_hist_clf_pipeline = XGBClassifier(max_depth=0, \n", 365 | " learning_rate=0.1, \n", 366 | " scale_pos_weight=2,\n", 367 | " n_estimators=num_rounds,\n", 368 | " gamma=0.1,\n", 369 | " min_child_weight=1,\n", 370 | " reg_lambda=1,\n", 371 | " subsample=1,\n", 372 | " max_leaves=2**5,\n", 373 | " grow_policy='lossguide',\n", 374 | " tree_method='hist',\n", 375 | " nthread=number_processors\n", 376 | " )" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 8, 382 | "metadata": { 383 | "collapsed": false, 384 | "deletable": true, 385 | "editable": true 386 | }, 387 | "outputs": [], 
388 | "source": [ 389 | "lgbm_clf_pipeline = LGBMClassifier(num_leaves=2**5, \n", 390 | " learning_rate=0.1, \n", 391 | " scale_pos_weight=2,\n", 392 | " n_estimators=num_rounds,\n", 393 | " min_split_gain=0.1,\n", 394 | " min_child_weight=1,\n", 395 | " reg_lambda=1,\n", 396 | " subsample=1,\n", 397 | " nthread=number_processors\n", 398 | " )" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 9, 404 | "metadata": { 405 | "collapsed": false, 406 | "deletable": true, 407 | "editable": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "metrics_dict = {\n", 412 | " 'Accuracy': accuracy_score,\n", 413 | " 'Precision': precision_score,\n", 414 | " 'Recall': recall_score,\n", 415 | " 'AUC': roc_auc_score,\n", 416 | " 'F1': f1_score,\n", 417 | "}\n", 418 | "\n", 419 | "def classification_metrics(metrics, y_true, y_pred):\n", 420 | " return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 10, 426 | "metadata": { 427 | "collapsed": false, 428 | "deletable": true, 429 | "editable": true 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "def generate_feables(df):\n", 434 | " X = df[df.columns.difference(['boson'])]\n", 435 | " y = df['boson']\n", 436 | " return X,y" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 11, 442 | "metadata": { 443 | "collapsed": false, 444 | "deletable": true, 445 | "editable": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "X, y = generate_feables(df)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 12, 455 | "metadata": { 456 | "collapsed": false, 457 | "deletable": true, 458 | "editable": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=77, test_size=500000)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 13, 468 | "metadata": { 469 | "collapsed": false, 470 | "deletable": true, 471 | "editable": true 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "results_dict = dict()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "deletable": true, 482 | "editable": true 483 | }, 484 | "source": [ 485 | "### XGBoost" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 14, 491 | "metadata": { 492 | "collapsed": false, 493 | "deletable": true, 494 | "editable": true 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "with Timer() as train_t:\n", 499 | " xgb_clf_pipeline.fit(X_train,y_train)\n", 500 | " \n", 501 | "with Timer() as test_t:\n", 502 | " y_pred = xgb_clf_pipeline.predict(X_test)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 15, 508 | "metadata": { 509 | "collapsed": true, 510 | "deletable": true, 511 | "editable": true 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "results_dict['xgb']={\n", 516 | " 'train_time': train_t.interval,\n", 517 | " 'test_time': test_t.interval,\n", 518 | " 'performance': classification_metrics(metrics_dict, \n", 519 | " y_test, \n", 520 | " y_pred) \n", 521 | "}" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 16, 527 | "metadata": { 528 | "collapsed": true, 529 | "deletable": true, 530 | "editable": true 531 | }, 532 | "outputs": [], 533 | "source": [ 534 | "with Timer() as t_train:\n", 535 | " xgb_hist_clf_pipeline.fit(X_train,y_train)" 536 | ] 537 | }, 538 | { 539 | 
"cell_type": "code", 540 | "execution_count": 17, 541 | "metadata": { 542 | "collapsed": true, 543 | "deletable": true, 544 | "editable": true 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "with Timer() as t_test:\n", 549 | " y_pred = xgb_hist_clf_pipeline.predict(X_test)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": { 556 | "collapsed": true, 557 | "deletable": true, 558 | "editable": true 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "results_dict['xgb_hist']={\n", 563 | " 'train_time': t_train.interval,\n", 564 | " 'test_time': t_test.interval,\n", 565 | " 'performance': classification_metrics(metrics_dict, \n", 566 | " y_test, \n", 567 | " y_pred) \n", 568 | "}" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": { 574 | "deletable": true, 575 | "editable": true 576 | }, 577 | "source": [ 578 | "### LightGBM" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 19, 584 | "metadata": { 585 | "collapsed": true, 586 | "deletable": true, 587 | "editable": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "with Timer() as train_t:\n", 592 | " lgbm_clf_pipeline.fit(X_train, y_train)\n", 593 | " \n", 594 | "with Timer() as test_t:\n", 595 | " y_pred = lgbm_clf_pipeline.predict(X_test)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 20, 601 | "metadata": { 602 | "collapsed": true, 603 | "deletable": true, 604 | "editable": true 605 | }, 606 | "outputs": [], 607 | "source": [ 608 | "results_dict['lgbm']={\n", 609 | " 'train_time': train_t.interval,\n", 610 | " 'test_time': test_t.interval,\n", 611 | " 'performance': classification_metrics(metrics_dict, \n", 612 | " y_test, \n", 613 | " y_pred) \n", 614 | "}" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 21, 620 | "metadata": { 621 | "collapsed": false, 622 | "deletable": true, 623 | "editable": true 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "{\n", 631 | " \"lgbm\": {\n", 632 | " \"performance\": {\n", 633 | " \"AUC\": 0.694682949690134,\n", 634 | " \"Accuracy\": 0.707758,\n", 635 | " \"F1\": 0.7680747894958216,\n", 636 | " \"Precision\": 0.6627597069095391,\n", 637 | " \"Recall\": 0.9131831219806763\n", 638 | " },\n", 639 | " \"test_time\": 0.7120589099995414,\n", 640 | " \"train_time\": 119.34003880199998\n", 641 | " },\n", 642 | " \"xgb\": {\n", 643 | " \"performance\": {\n", 644 | " \"AUC\": 0.6859901403358623,\n", 645 | " \"Accuracy\": 0.699694,\n", 646 | " \"F1\": 0.7635493812093622,\n", 647 | " \"Precision\": 0.6551156676187414,\n", 648 | " \"Recall\": 0.9149984903381643\n", 649 | " },\n", 650 | " \"test_time\": 0.55617916600022,\n", 651 | " \"train_time\": 2996.1667750769993\n", 652 | " },\n", 653 | " \"xgb_hist\": {\n", 654 | " \"performance\": {\n", 655 | " \"AUC\": 0.6941216899970567,\n", 656 | " \"Accuracy\": 0.70721,\n", 657 | " \"F1\": 0.767674555527519,\n", 658 | " \"Precision\": 0.6623426413523601,\n", 659 | " \"Recall\": 0.9128434480676328\n", 660 | " },\n", 661 | " \"test_time\": 0.6464068210007099,\n", 662 | " \"train_time\": 121.21175534400027\n", 663 | " }\n", 664 | "}\n" 665 | ] 666 | } 667 | ], 668 | "source": [ 669 | "# Results\n", 670 | "print(json.dumps(results_dict, indent=4, sort_keys=True))" 671 | ] 672 | } 673 | ], 674 | "metadata": { 675 | "kernelspec": { 676 | "display_name": "Python Strata", 677 | "language": "python", 678 | "name": "strata" 679 | }, 680 | 
"language_info": { 681 | "codemirror_mode": { 682 | "name": "ipython", 683 | "version": 3 684 | }, 685 | "file_extension": ".py", 686 | "mimetype": "text/x-python", 687 | "name": "python", 688 | "nbconvert_exporter": "python", 689 | "pygments_lexer": "ipython3", 690 | "version": "3.5.3" 691 | } 692 | }, 693 | "nbformat": 4, 694 | "nbformat_minor": 0 695 | } 696 | --------------------------------------------------------------------------------