├── 3rdparty ├── fast_retraining │ ├── experiments │ │ ├── __init__.py │ │ ├── libs │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── metrics.py │ │ │ ├── timer.py │ │ │ ├── notebook_memory_management.py │ │ │ ├── planet_kaggle.py │ │ │ ├── conversion.py │ │ │ ├── loaders.py │ │ │ └── football.py │ │ ├── 02_BCI_GPU.ipynb │ │ ├── 04_PlanetKaggle.ipynb │ │ ├── 04_PlanetKaggle_GPU.ipynb │ │ └── 06_HIGGS.ipynb │ ├── environment │ │ ├── deactivate_env_vars.sh │ │ └── activate_env_vars.sh │ ├── requirements.txt │ ├── LICENSE │ ├── .gitignore │ ├── README.md │ └── INSTALL.md ├── README.md └── codebase │ └── python │ └── machine_learning │ └── metrics.py ├── .gitlab-ci.yml ├── .gitignore ├── LICENSE ├── metrics.py ├── json2csv.py ├── Dockerfile ├── README.md ├── runme.py ├── datasets.py ├── algorithms.py └── .pylintrc /3rdparty/fast_retraining/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | test: 2 | script: 3 | - apt-get update -qy 4 | - apt-get install -y python3 python3-pip 5 | - pip3 install pylint 6 | - pylint *.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.tsv 3 | *.json 4 | *.csv 5 | *.log 6 | test/ 7 | train/ 8 | catboost_info/ 9 | learn/ 10 | dask-worker-space/ 11 | file-*.model 12 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/environment/deactivate_env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export PYTHONPATH=$OLD_PYTHON_PATH 4 | export PATH=$OLD_PATH 5 | export MOUNT_POINT= 6 | export CACHE_DIR= 7 | echo Noooooooooooooooo -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import multiprocessing 3 | 4 | 5 | def get_number_processors(): 6 | try: 7 | num = os.cpu_count() 8 | except: 9 | num = multiprocessing.cpu_count() 10 | return num 11 | 12 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.18.1 2 | scipy>=0.19.0 3 | tqdm>=4.11.2 4 | bokeh>=0.12.6 5 | selenium>=3.4.3 6 | matplotlib>=1.5.3 7 | arff>=0.9 8 | glob2>=0.5 9 | ipython>=6.1.0 10 | tensorflow>=1.1.0 11 | Keras>=2.0.3 12 | memory_profiler>=0.47 13 | psutil>=5.2.2 14 | ipykernel>=4.6.1 15 | -------------------------------------------------------------------------------- /3rdparty/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This folder contains all the third-party codes that we borrowed for this 3 | project. They are basically a snapshot from their original locations, denoting 4 | when we borrowed their code. 
5 | 
6 | # fast_retraining
7 | Link: https://github.com/Azure/fast_retraining
8 | Commit ID when last borrowed: e43c9195213189ee0476c4a114dd8395ae11ed26
9 | 
10 | # metrics.py
11 | Link: https://github.com/miguelgfierro/codebase
12 | Commit ID when last borrowed: 1080ba63a97bb13b2d61ca8ad9b83c7593337e86
13 | 
-------------------------------------------------------------------------------- /3rdparty/fast_retraining/environment/activate_env_vars.sh: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # The location of the repository on the host machine
4 | REPOSPATH=$HOME'/repos/'
5 | export PROJECTPATH=$REPOSPATH'fast_retraining'
6 | 
7 | 
8 | # Add custom libraries to the python path
9 | export OLD_PYTHON_PATH=$PYTHONPATH
10 | export PYTHONPATH=$PYTHONPATH:$PROJECTPATH # Adds the repository to the python path
11 | 
12 | # Add scripts to path
13 | export OLD_PATH=$PATH
14 | export PATH=$PATH:$PROJECTPATH
15 | 
16 | # The mounting location for the data
17 | export MOUNT_POINT=/fileshare
18 | echo Me Gusta!
19 | 
-------------------------------------------------------------------------------- /3rdparty/fast_retraining/LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/metrics.py: -------------------------------------------------------------------------------- 1 | #Original source: https://github.com/miguelgfierro/codebase/blob/master/python/machine_learning/metrics.py 2 | import numpy as np 3 | from sklearn.metrics import roc_auc_score,accuracy_score, precision_score, recall_score, f1_score 4 | 5 | 6 | def classification_metrics_binary(y_true, y_pred): 7 | m_acc = accuracy_score(y_true, y_pred) 8 | m_f1 = f1_score(y_true, y_pred) 9 | m_precision = precision_score(y_true, y_pred) 10 | m_recall = recall_score(y_true, y_pred) 11 | report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1} 12 | return report 13 | 14 | 15 | def classification_metrics_binary_prob(y_true, y_prob): 16 | m_auc = roc_auc_score(y_true, y_prob) 17 | report = {'AUC':m_auc} 18 | return report 19 | 20 | 21 | def classification_metrics_multilabel(y_true, y_pred, labels): 22 | m_acc = accuracy_score(y_true, y_pred) 23 | m_f1 = f1_score(y_true, y_pred, labels, average='weighted') 24 | m_precision = precision_score(y_true, y_pred, labels, average='weighted') 25 | m_recall = recall_score(y_true, y_pred, labels, average='weighted') 26 | report = {'Accuracy':m_acc, 'Precision':m_precision, 'Recall':m_recall, 'F1':m_f1} 27 | return report 28 | 29 | 30 | def binarize_prediction(y, threshold=0.5): 31 | y_pred = np.where(y > threshold, 1, 0) 32 | return y_pred 33 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/timer.py: -------------------------------------------------------------------------------- 1 | #code based on https://github.com/miguelgfierro/codebase/ 2 | 3 | from timeit import default_timer 4 | 5 | class Timer(object): 6 | """Timer class. 7 | Examples: 8 | >>> big_num = 100000 9 | >>> t = Timer() 10 | >>> t.start() 11 | >>> for i in range(big_num): 12 | >>> r = 1 13 | >>> t.stop() 14 | >>> print(t.interval) 15 | 0.0946876304844 16 | >>> with Timer() as t: 17 | >>> for i in range(big_num): 18 | >>> r = 1 19 | >>> print(t.interval) 20 | 0.0766928562442 21 | >>> try: 22 | >>> with Timer() as t: 23 | >>> for i in range(big_num): 24 | >>> r = 1 25 | >>> raise(Exception("Get out!")) 26 | >>> finally: 27 | >>> print(t.interval) 28 | 0.0757778924471 29 | 30 | """ 31 | def __init__(self): 32 | self._timer = default_timer 33 | 34 | def __enter__(self): 35 | self.start() 36 | return self 37 | 38 | def __exit__(self, *args): 39 | self.stop() 40 | 41 | def start(self): 42 | """Start the timer.""" 43 | self.start = self._timer() 44 | 45 | def stop(self): 46 | """Stop the timer. Calculate the interval in seconds.""" 47 | self.end = self._timer() 48 | self.interval = self.end - self.start 49 | 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | ########## 92 | *-Copy*.ipynb 93 | experiments/*.svg 94 | experiments/*.pk 95 | Untitled.ipynb 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/notebook_memory_management.py: -------------------------------------------------------------------------------- 1 | #Source: https://github.com/ianozsvald/ipython_memory_usage 2 | """Profile mem usage envelope of IPython commands and report interactively""" 3 | from __future__ import division # 1/2 == 0.5, as in Py3 4 | from __future__ import absolute_import # avoid hiding global modules with locals 5 | from __future__ import print_function # force use of print("hello") 6 | from __future__ import unicode_literals # force unadorned strings "" to be unicode without prepending u"" 7 | import time 8 | import memory_profiler 9 | from IPython import get_ipython 10 | import threading 11 | 12 | 13 | # keep a global accounting for the last known memory usage 14 | # which is the reference point for the memory delta calculation 15 | previous_call_memory_usage = memory_profiler.memory_usage()[0] 16 | t1 = time.time() # will be set to current time later 17 | keep_watching = True 18 | watching_memory = True 19 | input_cells = get_ipython().user_ns['In'] 20 | 21 | 22 | def start_watching_memory(): 23 | """Register memory profiling tools to IPython instance.""" 24 | global watching_memory 25 | watching_memory = True 26 | ip = get_ipython() 27 | ip.events.register("post_run_cell", watch_memory) 28 | ip.events.register("pre_run_cell", pre_run_cell) 29 | 30 | 31 | def stop_watching_memory(): 32 | """Unregister memory profiling tools from IPython instance.""" 33 | global watching_memory 34 | watching_memory = False 35 | ip = get_ipython() 36 | try: 37 | ip.events.unregister("post_run_cell", watch_memory) 38 | except ValueError: 39 | pass 40 | try: 41 | ip.events.unregister("pre_run_cell", pre_run_cell) 42 | except ValueError: 43 | pass 44 | 45 | 46 | def watch_memory(): 47 | # bring in the global memory usage value from the previous iteration 48 | global previous_call_memory_usage, peak_memory_usage, keep_watching, \ 49 | watching_memory, input_cells 50 | new_memory_usage = memory_profiler.memory_usage()[0] 51 | memory_delta = new_memory_usage - previous_call_memory_usage 52 | keep_watching = False 53 | # calculate time delta using global t1 (from the pre-run event) and current 54 | # time 55 | time_delta_secs = time.time() - t1 56 | num_commands = len(input_cells) - 1 57 | cmd = "In [{}]".format(num_commands) 58 | # convert the results into a pretty string 59 | 
output_template = ("{cmd} used {memory_delta:0.4f} MiB RAM in " 60 | "{time_delta:0.2f}s, total RAM usage " 61 | "{memory_usage:0.2f} MiB") 62 | output = output_template.format(time_delta=time_delta_secs, 63 | cmd=cmd, 64 | memory_delta=memory_delta, 65 | memory_usage=new_memory_usage) 66 | if watching_memory: 67 | print(str(output)) 68 | previous_call_memory_usage = new_memory_usage 69 | 70 | 71 | def pre_run_cell(): 72 | """Capture current time before we execute the current command""" 73 | global t1 74 | t1 = time.time() 75 | 76 | 77 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/planet_kaggle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import glob 4 | from tqdm import tqdm 5 | import shutil 6 | from keras.preprocessing import image 7 | from keras.applications.imagenet_utils import preprocess_input 8 | 9 | 10 | def labels_from(labels_df): 11 | """ Extracts the unique labels from the labels dataframe 12 | """ 13 | # Build list with unique labels 14 | label_list = [] 15 | for tag_str in labels_df.tags.values: 16 | labels = tag_str.split(' ') 17 | for label in labels: 18 | if label not in label_list: 19 | label_list.append(label) 20 | return label_list 21 | 22 | 23 | def enrich_with_feature_encoding(labels_df): 24 | # Add onehot features for every label 25 | for label in labels_from(labels_df): 26 | labels_df[label] = labels_df['tags'].apply(lambda x: 1 if label in x.split(' ') else 0) 27 | return labels_df 28 | 29 | 30 | def to_multi_label_dict(enriched_labels_df): 31 | df = enriched_labels_df.set_index('image_name').drop('tags', axis=1) 32 | return dict((filename, encoded_array) for filename, encoded_array in zip(df.index, df.values)) 33 | 34 | 35 | def get_file_count(folderpath): 36 | """ Returns the number of files in a folder 37 | """ 38 | return len(glob.glob(folderpath)) 39 | 40 | 41 | def threshold_prediction(pred_y, threshold=0.5):# TODO: Needs to be tuned? 42 | return pred_y > threshold 43 | 44 | 45 | def read_images(filepath, filenames): 46 | """ Read images in batches 47 | """ 48 | img_data = list() 49 | for name in filenames: 50 | img_path = os.path.join(filepath, name+'.jpg') 51 | img = image.load_img(img_path, target_size=(224, 224)) 52 | x = image.img_to_array(img) 53 | x = np.expand_dims(x, axis=0) 54 | img_data.append(preprocess_input(x)) 55 | return np.concatenate(img_data) 56 | 57 | 58 | def chunks(l, n): 59 | for i in range(0, len(l), n): 60 | yield l[i:i + n] 61 | 62 | 63 | def featurise_images(model, filepath, nameformat, num_iter, batch_size=32, desc=None): 64 | """ Use DL model to featurise images 65 | """ 66 | features = list() 67 | img_names = list() 68 | num_list = list(num_iter) 69 | num_batches = np.ceil(len(num_list)/batch_size) 70 | 71 | for num_chunk in tqdm(chunks(num_list, batch_size), total=num_batches, desc=desc): 72 | filenames = [nameformat.format(index) for index in num_chunk] 73 | batch_images = read_images(filepath, filenames) 74 | img_names.extend(filenames) 75 | features.extend(model.predict_on_batch(batch_images).squeeze()) 76 | return np.array(features), img_names 77 | 78 | 79 | def generate_validation_files(train_path, val_path, num_train = 35000): 80 | """ Creates the validation files from the train files. 
81 | """ 82 | num_train_ini = get_file_count(os.path.join(train_path, '*.jpg')) 83 | assert num_train_ini > num_train 84 | 85 | order = 'mv ' + train_path + '/train_{' + str(num_train) + '..' + str(num_train_ini) + '}.jpg ' + val_path 86 | os.system(order) 87 | 88 | 89 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/conversion.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def _get_nominal_integer_dict(nominal_vals): 5 | """Convert nominal values in integers, starting at 0. 6 | Parameters: 7 | nominal_vals (pd.Series): A series. 8 | Returns: 9 | d (dict): An dictionary with numeric values. 10 | 11 | """ 12 | d = {} 13 | for val in nominal_vals: 14 | if val not in d: 15 | current_max = max(d.values()) if len(d) > 0 else -1 16 | d[val] = current_max+1 17 | return d 18 | 19 | 20 | def _convert_to_integer(srs, d): 21 | """Convert series to integer, given a dictionary. 22 | Parameters: 23 | srs (pd.Series): A series. 24 | d (dict): A dictionary mapping values to integers 25 | Returns: 26 | srs (pd.Series): An series with numeric values. 27 | 28 | """ 29 | return srs.map(lambda x: d[x]) 30 | 31 | 32 | def convert_cols_categorical_to_numeric(df, col_list=None): 33 | """Convert categorical columns to numeric and leave numeric columns 34 | as they are. You can force to convert a numerical column if it is 35 | included in col_list 36 | Parameters: 37 | df (pd.DataFrame): Dataframe. 38 | col_list (list): List of columns. 39 | Returns: 40 | ret (pd.DataFrame): An dataframe with numeric values. 41 | Examples: 42 | >>> df = pd.DataFrame({'letters':['a','b','c'],'numbers':[1,2,3]}) 43 | >>> df_numeric = convert_cols_categorical_to_numeric(df) 44 | >>> print(df_numeric) 45 | letters numbers 46 | 0 0 1 47 | 1 1 2 48 | 2 2 3 49 | 50 | """ 51 | if col_list is None: col_list = [] 52 | ret = pd.DataFrame() 53 | for column_name in df.columns: 54 | column = df[column_name] 55 | if column.dtype == 'object' or column_name in col_list: 56 | col_dict = _get_nominal_integer_dict(column) 57 | ret[column_name] = _convert_to_integer(column, col_dict) 58 | else: 59 | ret[column_name] = column 60 | return ret 61 | 62 | 63 | def convert_related_cols_categorical_to_numeric(df, col_list): 64 | """Convert categorical columns, that are related between each other, 65 | to numeric and leave numeric columns 66 | as they are. 67 | Parameters: 68 | df (pd.DataFrame): Dataframe. 69 | col_list (list): List of columns. 70 | Returns: 71 | ret (pd.DataFrame): An dataframe with numeric values. 
72 | Examples: 73 | >>> df = pd.DataFrame({'letters':['a','b','c'],'letters2':['c','d','e'],'numbers':[1,2,3]}) 74 | >>> df_numeric = convert_related_cols_categorical_to_numeric(df, col_list=['letters','letters2']) 75 | >>> print(df_numeric) 76 | letters letters2 numbers 77 | 0 0 2 1 78 | 1 1 3 2 79 | 2 2 4 3 80 | 81 | """ 82 | ret = pd.DataFrame() 83 | values=None 84 | for c in col_list: 85 | values = pd.concat([values,df[c]], axis=0) 86 | values = pd.Series(values.unique()) 87 | col_dict = _get_nominal_integer_dict(values) 88 | for column_name in df.columns: 89 | column = df[column_name] 90 | if column_name in col_list: 91 | ret[column_name] = _convert_to_integer(column, col_dict) 92 | else: 93 | ret[column_name] = column 94 | return ret 95 | 96 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | # BSD License 2 | # 3 | # Copyright (c) 2016-present, Miguel Gonzalez-Fierro. All rights reserved. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without modification, 7 | # are permitted provided that the following conditions are met: 8 | # 9 | # * Redistributions of source code must retain the above copyright notice, this 10 | # list of conditions and the following disclaimer. 11 | # 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # * Neither the name Miguel Gonzalez-Fierro nor the names of its contributors may be used to 17 | # endorse or promote products derived from this software without specific 18 | # prior written permission. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 21 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 22 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 24 | # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 | # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 27 | # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 29 | # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | import numpy as np 32 | import sklearn.metrics as sklm 33 | from datasets import LearningTask 34 | 35 | 36 | def get_metrics(data, pred): 37 | if data.learning_task == LearningTask.REGRESSION: 38 | return regression_metrics(data.y_test, pred) 39 | if data.learning_task == LearningTask.CLASSIFICATION: 40 | return classification_metrics(data.y_test, pred) 41 | if data.learning_task == LearningTask.MULTICLASS_CLASSIFICATION: 42 | return classification_metrics_multilabel(data.y_test, pred) 43 | raise ValueError("No metrics defined for learning task: " + str(data.learning_task)) 44 | 45 | 46 | def evaluate_metrics(y_true, y_pred, metrics): 47 | res = {} 48 | for metric_name, metric in metrics.items(): 49 | res[metric_name] = float(metric(y_true, y_pred)) 50 | return res 51 | 52 | 53 | def classification_metrics(y_true, y_prob, threshold=0.5): 54 | y_pred = np.where(y_prob > threshold, 1, 0) 55 | metrics = { 56 | "Accuracy": sklm.accuracy_score, 57 | "Log_Loss": lambda real, pred: sklm.log_loss(real, y_prob, eps=1e-5), 58 | # yes, I'm using y_prob here! 59 | "AUC": lambda real, pred: sklm.roc_auc_score(real, y_prob), 60 | "Precision": sklm.precision_score, 61 | "Recall": sklm.recall_score, 62 | } 63 | return evaluate_metrics(y_true, y_pred, metrics) 64 | 65 | 66 | def classification_metrics_multilabel(y_true, y_pred): 67 | metrics = { 68 | "Accuracy": sklm.accuracy_score, 69 | "Precision": lambda real, pred: sklm.precision_score(real, pred, 70 | average="weighted"), 71 | "Recall": lambda real, pred: sklm.recall_score(real, pred, 72 | average="weighted"), 73 | "F1": lambda real, pred: sklm.f1_score(real, pred, 74 | average="weighted"), 75 | } 76 | return evaluate_metrics(y_true, y_pred, metrics) 77 | 78 | 79 | def regression_metrics(y_true, y_pred): 80 | metrics = { 81 | "MeanAbsError": sklm.mean_absolute_error, 82 | "MeanSquaredError": sklm.mean_squared_error, 83 | "MedianAbsError": sklm.median_absolute_error, 84 | } 85 | return evaluate_metrics(y_true, y_pred, metrics) 86 | -------------------------------------------------------------------------------- /json2csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | import sys 29 | import json 30 | import os 31 | import csv 32 | 33 | TIMINGS = ["train_time", "test_time"] 34 | METRICS = ["AUC", "Accuracy", "F1", "Precision", "Recall", "MeanAbsError", "MeanSquaredError", 35 | "MedianAbsError"] 36 | ALLMETRICS = TIMINGS + METRICS 37 | 38 | 39 | def load_perf_data(json_file): 40 | file = open(json_file, "r") 41 | data = json.load(file) 42 | file.close() 43 | return data 44 | 45 | 46 | def load_all_perf_data(files): 47 | data = {} 48 | for json_file in files: 49 | dataset = os.path.basename(json_file) 50 | dataset = dataset.replace(".json", "") 51 | data[dataset] = load_perf_data(json_file) 52 | return data 53 | 54 | 55 | def get_all_datasets(data): 56 | return data.keys() 57 | 58 | 59 | def get_all_algos(data): 60 | algos = {} 61 | for dset in data.keys(): 62 | for algo in data[dset].keys(): 63 | algos[algo] = 1 64 | return algos.keys() 65 | 66 | 67 | def read_from_dict(hashmap, key, def_val="-na-"): 68 | return hashmap[key] if key in hashmap else def_val 69 | 70 | 71 | def combine_perf_data(data, datasets, algos): 72 | all_data = {} 73 | for dataset in datasets: 74 | out = [] 75 | dset = read_from_dict(data, dataset, {}) 76 | for algo in algos: 77 | algo_data = read_from_dict(dset, algo, {}) 78 | perf = [algo] 79 | for timing in TIMINGS: 80 | perf.append(read_from_dict(algo_data, timing)) 81 | metric_data = read_from_dict(algo_data, "accuracy", {}) 82 | for metric in METRICS: 83 | perf.append(read_from_dict(metric_data, metric)) 84 | out.append(perf) 85 | all_data[dataset] = out 86 | return all_data 87 | 88 | 89 | def write_csv(all_data, datasets): 90 | writer = csv.writer(sys.stdout) 91 | header = ['dataset', 'algorithm'] + ALLMETRICS 92 | writer.writerow(header) 93 | for dataset in sorted(datasets): 94 | for row in all_data[dataset]: 95 | writer.writerow([dataset] + row) 96 | 97 | 98 | def main(): 99 | data = load_perf_data(sys.argv[1]) 100 | datasets = get_all_datasets(data) 101 | algos = get_all_algos(data) 102 | table = combine_perf_data(data, datasets, algos) 103 | write_csv(table, datasets) 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION 2 | FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu18.04 3 | SHELL ["/bin/bash", "-c"] 4 | # Install conda (and use python 3.7) 5 | RUN apt-get update && \ 6 | apt-get install -y --no-install-recommends \ 7 | build-essential \ 8 | ca-certificates \ 9 | curl \ 10 | doxygen \ 11 | git \ 12 | graphviz \ 13 | libcurl4-openssl-dev \ 14 | libboost-all-dev \ 15 | make \ 16 | tar \ 17 | unzip \ 18 | wget \ 19 | zlib1g-dev && \ 20 | rm -rf /var/lib/apt/* 21 | 22 | RUN curl -o /opt/miniconda.sh \ 23 | https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 24 | chmod +x /opt/miniconda.sh && \ 25 | /opt/miniconda.sh -b -p /opt/conda && \ 26 
| /opt/conda/bin/conda update -n base conda && \ 27 | rm /opt/miniconda.sh 28 | ENV PATH /opt/conda/bin:$PATH 29 | RUN conda install -c conda-forge -c rapidsai -c nvidia -c defaults \ 30 | bokeh \ 31 | cmake>=3.14 \ 32 | h5py \ 33 | ipython \ 34 | ipywidgets \ 35 | jupyter \ 36 | kaggle \ 37 | matplotlib \ 38 | nose \ 39 | numpy \ 40 | pandas \ 41 | Pillow \ 42 | pydot \ 43 | pylint\ 44 | psutil\ 45 | scikit-learn \ 46 | scipy \ 47 | six \ 48 | dask \ 49 | distributed \ 50 | tqdm \ 51 | cudf=0.18.0 \ 52 | dask-cuda \ 53 | rmm \ 54 | librmm \ 55 | rapids-xgboost \ 56 | cuml=0.18 && \ 57 | conda clean -ya 58 | 59 | # lightgbm 60 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 61 | RUN apt-get update && \ 62 | apt-get install -y --no-install-recommends \ 63 | build-essential \ 64 | bzip2 \ 65 | ca-certificates \ 66 | curl \ 67 | git \ 68 | libblas-dev \ 69 | libboost-dev \ 70 | libboost-filesystem-dev \ 71 | libboost-system-dev \ 72 | libbz2-dev \ 73 | libc6 \ 74 | libglib2.0-0 \ 75 | liblapack-dev \ 76 | libsm6 \ 77 | libxext6 \ 78 | libxrender1 \ 79 | make \ 80 | tar \ 81 | unzip \ 82 | wget && \ 83 | rm -rf /var/lib/apt/* 84 | RUN mkdir -p /etc/OpenCL/vendors && \ 85 | echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd 86 | ENV OPENCL_LIBRARIES /usr/local/cuda/lib64 87 | ENV OPENCL_INCLUDE_DIR /usr/local/cuda/include 88 | RUN git config --global http.sslVerify false && \ 89 | git clone --recursive https://github.com/Microsoft/LightGBM /opt/LightGBM && \ 90 | cd /opt/LightGBM && \ 91 | mkdir build && \ 92 | cd build && \ 93 | cmake .. \ 94 | -DUSE_GPU=1 \ 95 | -DOpenCL_LIBRARY=$OPENCL_LIBRARIES/libOpenCL.so \ 96 | -DOpenCL_INCLUDE_DIR=$OPENCL_INCLUDE_DIR && \ 97 | make OPENCL_HEADERS="/usr/local/cuda/targets/x86_64-linux/include" \ 98 | LIBOPENCL="/usr/local/cuda/targets/x86_64-linux/lib" -j4 && \ 99 | cd ../python-package && \ 100 | python setup.py install --precompile 101 | 102 | # catboost 103 | RUN if [ "`echo $CUDA_VERSION | sed -e 's/[.].*//'`" -lt "11" ]; then git config --global http.sslVerify false && \ 104 | git clone --recursive "https://github.com/catboost/catboost" /opt/catboost && \ 105 | cd /opt/catboost && \ 106 | cd catboost/python-package/catboost && \ 107 | ../../../ya make \ 108 | -r \ 109 | -o ../../.. \ 110 | -DUSE_ARCADIA_PYTHON=no \ 111 | -DUSE_SYSTEM_PYTHON=3.7\ 112 | -DPYTHON_CONFIG=python3-config \ 113 | -DCUDA_ROOT=$(dirname $(dirname $(which nvcc))); \ 114 | fi 115 | ENV if [ "`echo $CUDA_VERSION | sed -e 's/[.].*//'`" -lt "11" ]; then PYTHONPATH=$PYTHONPATH:/opt/catboost/catboost/python-package; fi 116 | 117 | 118 | 119 | # xgboost 120 | RUN git config --global http.sslVerify false && \ 121 | git clone --recursive https://github.com/dmlc/xgboost /opt/xgboost && \ 122 | cd /opt/xgboost && \ 123 | mkdir build && \ 124 | cd build && \ 125 | RMM_ROOT=/opt/conda cmake .. \ 126 | -DUSE_CUDA=ON \ 127 | -DUSE_NCCL=ON \ 128 | -DPLUGIN_RMM=ON && \ 129 | make -j4 && \ 130 | cd ../python-package && \ 131 | pip uninstall -y xgboost && \ 132 | python setup.py install 133 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/README.md: -------------------------------------------------------------------------------- 1 | # Fast Retraining 2 | 3 | In this repo we compare two of the fastest boosted decision tree libraries: [XGBoost](https://github.com/dmlc/xgboost) and [LightGBM](https://github.com/microsoft/LightGBM). We will evaluate them across datasets of several domains and different sizes. 
4 | 
5 | On July 25, 2017, we published a blog post evaluating both libraries and discussing the benchmark results. The post is [Lessons Learned From Benchmarking Fast Machine Learning Algorithms](https://blogs.technet.microsoft.com/machinelearning/2017/07/25/lessons-learned-benchmarking-fast-machine-learning-algorithms/).
6 | 
7 | ## Installation and Setup
8 | 
9 | The installation instructions can be found [here](./INSTALL.md).
10 | 
11 | ## Project
12 | 
13 | In the folder [experiments](./experiments) you can find the different experiments of the project. We developed 6 experiments with the CPU and GPU versions of the libraries.
14 | 
15 | * Airline
16 | * BCI
17 | * Football
18 | * Planet Kaggle
19 | * Fraud Detection
20 | * HIGGS
21 | 
22 | In the folder [experiment/libs](./experiment/libs) there is the common code for the project.
23 | 
24 | ## Benchmark
25 | 
26 | In the following table there are summarized the time results (in seconds) and the ratio of the benchmarks performed in the experiments:
27 | 
28 | | Dataset | Experiment | Data size | Features | xgb time:<br/>CPU (GPU) | xgb_hist time:<br/>CPU (GPU) | lgb time:<br/>CPU (GPU) | ratio xgb/lgb:<br/>CPU (GPU) | ratio xgb_hist/lgb:<br/>CPU (GPU) |
29 | | --- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
30 | | Football | [Link CPU](./experiments/03_football.ipynb)<br/>[Link GPU](./experiments/03_football_GPU.ipynb) | 19673 | 46 | 2.27 (7.09) | 2.47 (4.58) | 0.58 (0.97) | 3.90 (7.26) | 4.25 (4.69) |
31 | | Fraud Detection | [Link CPU](./experiments/05_FraudDetection.ipynb)<br/>[Link GPU](./experiments/05_FraudDetection_GPU.ipynb) | 284807 | 30 | 4.34 (5.80) | 2.01 (1.64) | 0.66 (0.29) | 6.58 (19.74) | 3.04 (5.58) |
32 | | BCI | [Link CPU](./experiments/02_BCI.ipynb)<br/>[Link GPU](./experiments/02_BCI_GPU.ipynb) | 20497 | 2048 | 11.51 (12.93) | 41.84 (42.69) | 7.31 (2.76) | 1.57 (4.67) | 5.72 (15.43) |
33 | | Planet Kaggle | [Link CPU](./experiments/04_PlanetKaggle.ipynb)<br/>[Link GPU](./experiments/04_PlanetKaggle_GPU.ipynb) | 40479 | 2048 | 313.89 (-) | 2115.28 (2028.43) | 194.57 (317.68) | 1.61 (-) | 10.87 (6.38) |
34 | | HIGGS | [Link CPU](./experiments/06_HIGGS.ipynb)<br/>[Link GPU](./experiments/06_HIGGS_GPU.ipynb) | 11000000 | 28 | 2996.16 (-) | 121.21 (114.88) | 119.34 (71.87) | 25.10 (-) | 1.01 (1.59) |
35 | | Airline | [Link CPU](./experiments/01_airline.ipynb)<br/>[Link GPU](./experiments/01_airline_GPU.ipynb) | 115069017 | 13 | - (-) | 1242.09 (1271.91) | 1056.20 (645.40) | - (-) | 1.17 (1.97) |
36 | 
37 | 
38 | In the next table we summarize the performance results using the [F1-Score](https://en.wikipedia.org/wiki/F1_score).
39 | 
40 | | Dataset | Experiment | Data size | Features | xgb F1:<br/>CPU (GPU) | xgb_hist F1:<br/>CPU (GPU) | lgb F1:<br/>CPU (GPU) |
41 | | --- | :---: | :---: | :---: | :---: | :---: | :---: |
42 | | Football | [Link](./experiments/03_football.ipynb)<br/>[Link](./experiments/03_football_GPU.ipynb) | 19673 | 46 | 0.458 (0.470) | 0.460 (0.472) | 0.459 (0.470) |
43 | | Fraud Detection | [Link](./experiments/05_FraudDetection.ipynb)<br/>[Link](./experiments/05_FraudDetection_GPU.ipynb) | 284807 | 30 | 0.824 (0.821) | 0.802 (0.814) | 0.813 (0.811) |
44 | | BCI | [Link](./experiments/02_BCI.ipynb)<br/>[Link](./experiments/02_BCI_GPU.ipynb) | 20497 | 2048 | 0.110 (0.093) | 0.142 (0.120) | 0.137 (0.138) |
45 | | Planet Kaggle | [Link](./experiments/04_PlanetKaggle.ipynb)<br/>[Link](./experiments/04_PlanetKaggle_GPU.ipynb) | 40479 | 2048 | 0.805 (-) | 0.822 (0.822) | 0.822 (0.821) |
46 | | HIGGS | [Link](./experiments/06_HIGGS.ipynb)<br/>[Link](./experiments/06_HIGGS_GPU.ipynb) | 11000000 | 28 | 0.763 (-) | 0.767 (0.767) | 0.768 (0.767) |
47 | | Airline | [Link](./experiments/01_airline.ipynb)<br/>[Link](./experiments/01_airline_GPU.ipynb) | 115069017 | 13 | - (-) | 0.741 (0.745) | 0.732 (0.745) |
48 | 
49 | The experiments were run on an Azure NV24 VM with 24 cores and 224 GB memory. The machine has 4 NVIDIA M60 GPUs. In both cases we used Ubuntu 16.04.
50 | 
51 | 
52 | ## Contributing
53 | 
54 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
55 | 
56 | 
-------------------------------------------------------------------------------- /3rdparty/fast_retraining/INSTALL.md: --------------------------------------------------------------------------------
1 | # Installation and Setup
2 | 
3 | Here we present the instructions for setting up the project on an [Ubuntu Azure VM](https://azure.microsoft.com/en-us/services/virtual-machines/). The VM we used for the experiment was a NV24 with 4 NVIDIA M60 GPUs. The OS was Ubuntu 16.04. We recommend to use the [Azure Data Science VM](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/microsoft-ads.standard-data-science-vm) which comes with many machine learning tools already installed.
4 | 
5 | ## Setting up the environment
6 | 
7 | Clone this repo to your desired location
8 | ```bash
9 | git clone https://github.com/Azure/fast_retraining.git
10 | ```
11 | 
12 | Create a conda environment if you haven't already done so. The command below creates a python 3 environment called fast.
13 | ```bash
14 | conda create --name fast python=3.5 anaconda
15 | ```
16 | 
17 | Edit [activate_env_vars.sh](environment/activate_env_vars.sh ) and [deactivate_env_vars.sh](environment/deactivate_env_vars.sh )
18 | so that they contain the correct information.
19 | 
20 | Install command line json parser
21 | ```bash
22 | apt-get install jq
23 | ```
24 | 
25 | Activate the conda environment and install the requirements.
26 | ```bash
27 | source activate fast
28 | pip install -r requirements.txt
29 | ```
30 | 
31 | Get the currently activated environment and assign it to env_path.
32 | Get info of current env and output to json | look for default_prefix element in JSON | remove all quotes 33 | ```bash 34 | env_path=$(conda info --json | jq '.default_prefix' | tr -d '"') 35 | ``` 36 | 37 | Make sure you are in the environment folder of the project and run the following 38 | ```bash 39 | activate_script_path=$(readlink -f activate_env_vars.sh) 40 | deactivate_script_path=$(readlink -f deactivate_env_vars.sh) 41 | ``` 42 | 43 | Then we create the activation and deactivation scripts and make sure they point to our now modified activation 44 | and deactivation scripts in our environment folder 45 | ```bash 46 | mkdir -p $env_path/etc/conda/activate.d 47 | mkdir -p $env_path/etc/conda/deactivate.d 48 | echo 'source '$activate_script_path >> $env_path/etc/conda/activate.d/env_vars.sh 49 | echo 'source '$deactivate_script_path >> $env_path/etc/conda/deactivate.d/env_vars.sh 50 | ``` 51 | 52 | Exit the environment 53 | ```bash 54 | source deactivate 55 | ``` 56 | 57 | Enter the environment again 58 | ```bash 59 | source activate fast 60 | ``` 61 | 62 | Finally, to register the environment in the jupyter notebook: 63 | ```bash 64 | python -m ipykernel install --user --name fast --display-name "Python Fast" 65 | ``` 66 | 67 | ## Installation of boosted tree libraries 68 | 69 | We need to install [XGBoost](https://github.com/dmlc/xgboost) and [LightGBM](https://github.com/microsoft/LightGBM). Even though both libraries have pypi versions, for creating the experiments contained in this repo we compiled from source. 70 | 71 | To install XGBoost you can follow the [installation guide](https://xgboost.readthedocs.io/en/latest/build.html). To build in CPU, using the specific commit we used: 72 | 73 | git clone --recursive https://github.com/dmlc/xgboost 74 | cd xgboost 75 | git checkout 6776292951565c8cd72e69afd9d94de1474f00c0 76 | git submodule update --recursive 77 | make -j$(nproc) 78 | 79 | In case you want to use the last version, just skip the commands `git checkout` and `git submodule`. 80 | 81 | If you want to build in GPU, the instructions are [here](https://github.com/dmlc/xgboost/tree/master/plugin/updater_gpu). You first need to download and unzip [CUB 1.6.4](https://nvlabs.github.io/cub/). 82 | 83 | git clone --recursive https://github.com/dmlc/xgboost 84 | cd xgboost 85 | git checkout 6776292951565c8cd72e69afd9d94de1474f00c0 86 | git submodule update --recursive 87 | mkdir build 88 | cd build 89 | cmake .. -DPLUGIN_UPDATER_GPU=ON -DCUB_DIRECTORY=/path/to/cub-1.6.4 90 | make -j$(nproc) 91 | 92 | To install LighGBM you can follow the [installation guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide). To build on CPU: 93 | 94 | git clone --recursive https://github.com/Microsoft/LightGBM ; cd LightGBM 95 | git checkout 73968a96829e212b333c88cd44725c8c39c03ad1 96 | mkdir build ; cd build 97 | cmake .. 98 | make -j$(nproc) 99 | 100 | To install the GPU version: 101 | 102 | git clone --recursive https://github.com/Microsoft/LightGBM ; cd LightGBM 103 | git checkout 73968a96829e212b333c88cd44725c8c39c03ad1 104 | mkdir build ; cd build 105 | cmake .. -DUSE_GPU=1 106 | make -j$(nproc) 107 | 108 | To install the python biddings you have to compile in the python directory. 
Both libraries have the exact same name for the python package, so you just need to do the following step in both libraries: 109 | 110 | cd python-package 111 | python setup.py install 112 | 113 | Finally, to check that the libraries are correctly installed, try to load them from python: 114 | 115 | python -c "import xgboost; import lightgbm" 116 | 117 | 118 | ## Installation of bokeh functionality to export plots 119 | 120 | To generate png exports with bokeh you have to follow the instructions explained in [this link](http://bokeh.pydata.org/en/0.12.6/docs/user_guide/export.html). 121 | 122 | sudo apt-get install npm 123 | sudo npm install -g phantomjs-prebuilt 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repo tries to benchmark boosting frameworks against some of the popular 3 | ML datasets. This is a more scriptable version of Microsoft's work on comparing 4 | LightGBM and XGBoost: https://github.com/Azure/fast_retraining/. Most of the 5 | datasets used here are the same as in the above repo. 6 | 7 | # Dependencies 8 | - Cuda 9.2 or greater 9 | - Nvidia docker 2.0 10 | 11 | # Setting up this repo 12 | ```bash 13 | 14 | $ git clone https://github.com/NVIDIA/gbm-bench.git 15 | $ cd gbm-bench 16 | ``` 17 | Create a docker image for cuda 10.0 18 | ```bash 19 | $ docker build -t gbm-bench:10.0 . --build-arg CUDA_VERSION=10.0 20 | ``` 21 | You can create docker images with different cuda versions as below. You will not be able to create an image for a cuda version greater than what is installed on your system. The GBM libraries may not support very recent versions of cuda. 22 | ```bash 23 | $ docker build -t gbm-bench:9.2 . --build-arg CUDA_VERSION=9.2 24 | ``` 25 | 26 | # Datasets 27 | gbm-bench will automatically download datasets as needed using wget or the [Kaggle API](https://github.com/Kaggle/kaggle-api). To use the kaggle datasets you will need a valid kaggle account and API token. Create a folder 'gbm-datasets' in some location with sufficient space for large datasets. Mounting this folder on fast local storage as opposed to network storage is recommended. 28 | 29 | ```bash 30 | $ mkdir gbm-datasets 31 | ``` 32 | Upon launching docker you will pass this folder as well as the location of the kaggle API key as volumes to the container. 
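As a point of reference, the sketch below shows one way to put the Kaggle API token where the Kaggle client looks for it by default; the download path for `kaggle.json` is only an assumption, so adjust it to wherever you saved the token from your Kaggle account page.

```bash
# Assumes kaggle.json was downloaded from your Kaggle account page to ~/Downloads
$ mkdir -p ~/.kaggle
$ mv ~/Downloads/kaggle.json ~/.kaggle/kaggle.json
$ chmod 600 ~/.kaggle/kaggle.json   # the Kaggle API expects the token to be readable only by your user
```

The resulting `~/.kaggle` folder is what you then mount into the container in the docker run command below.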
33 | 34 | | Name | Rows | Columns | Task | 35 | |--------------------------------------------------------------------------------|--------|---------|----------------| 36 | | [airline](http://kt.ijs.si/elena_ikonomovska/data.html) | 115M | 13 | Classification | 37 | | [airline_regression](http://kt.ijs.si/elena_ikonomovska/data.html) | 115M | 13 | Regression | 38 | | [bosch](https://www.kaggle.com/c/bosch-production-line-performance) | 1.184M | 968 | Classification | 39 | | [fraud](https://www.kaggle.com/mlg-ulb/creditcardfraud) | 285K | 28 | Classification | 40 | | [higgs](https://archive.ics.uci.edu/ml/datasets/HIGGS) | 11M | 28 | Classification | 41 | | [year](https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd) | 515K | 90 | Regression | 42 | | [covtype](https://archive.ics.uci.edu/ml/datasets/covertype) | 581K | 54 | Multiclass | 43 | | [epsilon](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html) | 500K | 2000 | Classification | 44 | 45 | # Benchmarking 46 | This section assumes that one has elevated permissions on the system where this 47 | docker image will be run for benchmarking! In case this is not true, update 48 | your flow accordingly. 49 | 50 | ## Launching container 51 | ```bash 52 | docker run --runtime=nvidia -it --rm \ 53 | -w /opt/gbm-bench \ 54 | -v {YOUR-LOCATION/gbm-datasets}:/opt/gbm-datasets \ 55 | -v {YOUR-LOCATION/gbm-bench}:/opt/gbm-bench \ 56 | -v {KAGGLE-API-LOCATION/.kaggle}:/root/.kaggle \ 57 | gbm-bench:10.0 /bin/bash 58 | ``` 59 | The above command launches an interactive session and mounts the dataset folder, the gbm-bench repo and your kaggle API key inside the container. "gbm-bench:10.0" refers to the docker image, modify this if you are using a different cuda version. 60 | 61 | ## Running benchmarks 62 | Benchmarks are launched from the python runme.py script 63 | ```bash 64 | python runme.py --help 65 | usage: runme.py [-h] [-dataset DATASET] [-root ROOT] [-algorithm ALGORITHM] 66 | [-gpus GPUS] [-cpus CPUS] [-output OUTPUT] [-ntrees NTREES] 67 | [-nrows NROWS] [-warmup] [-verbose] [-extra EXTRA] 68 | 69 | Benchmark xgboost/lightgbm/catboost on real datasets 70 | 71 | optional arguments: 72 | -h, --help show this help message and exit 73 | -dataset DATASET The dataset to be used for benchmarking. 'all' for all 74 | datasets. 75 | -root ROOT The root datasets folder 76 | -algorithm ALGORITHM Comma-separated list of algorithms to run; 'all' run 77 | all 78 | -gpus GPUS #GPUs to use for the benchmarks; ignored when not 79 | supported. Default is to use all. 80 | -cpus CPUS #CPUs to use for the benchmarks; 0 means 81 | psutil.cpu_count(logical=False) 82 | -output OUTPUT Output json file with runtime/accuracy stats 83 | -ntrees NTREES Number of trees. Default is as specified in the 84 | respective dataset configuration 85 | -nrows NROWS Subset of rows in the datasets to use. Useful for test 86 | running benchmarks on small amounts of data. WARNING: 87 | Some datasets will give incorrect accuracy results if 88 | nrows is specified as they have predefined train/test 89 | splits. 90 | -warmup Whether to run a small benchmark (fraud) as a warmup 91 | -verbose Produce verbose output 92 | -extra EXTRA Extra arguments as a python dictionary 93 | ``` 94 | 95 | As an example, launch the xgb-gpu algorithm on the year dataset. 96 | ```bash 97 | python runme.py -dataset year -algorithm xgb-gpu 98 | ``` 99 | # Yet another boosting tree benchmark? 
100 | * This is more scriptable (and configurable) version (eg: for automated benchmarking) 101 | * Also adds CatBoost to the comparison list 102 | * Tries to keep the boosting hyper-params the same across frameworks for a fair 103 | comparison. Reference: [this paper](https://openreview.net/pdf?id=ryexWdLRtm) 104 | * Supports multi-GPU as well as multi-node benchmarking (assuming underlying framework allows) 105 | 106 | # Third party codes and licensing 107 | The third party codes which we borrowed from, and their license texts, are released 108 | "as-received" under the folder named "3rdparty". Refer to 3rdparty/README.md as to 109 | when they are borrowed and their respective licenses. 110 | 111 | # License for this project 112 | This project is released under BSD License. Refer to LICENSE for more details. 113 | -------------------------------------------------------------------------------- /runme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | 28 | import os 29 | import sys 30 | import argparse 31 | import json 32 | import ast 33 | import psutil 34 | import algorithms 35 | from metrics import get_metrics 36 | from datasets import prepare_dataset 37 | 38 | 39 | def get_number_processors(args): 40 | if args.cpus == 0: 41 | return psutil.cpu_count(logical=False) 42 | return args.cpus 43 | 44 | 45 | def print_sys_info(args): 46 | try: 47 | import xgboost # pylint: disable=import-outside-toplevel 48 | print("Xgboost : %s" % xgboost.__version__) 49 | except ImportError: 50 | pass 51 | try: 52 | import lightgbm # pylint: disable=import-outside-toplevel 53 | print("LightGBM: %s" % lightgbm.__version__) 54 | except (ImportError, OSError): 55 | pass 56 | try: 57 | import catboost # pylint: disable=import-outside-toplevel 58 | print("Catboost: %s" % catboost.__version__) 59 | except ImportError: 60 | pass 61 | print("System : %s" % sys.version) 62 | print("#jobs : %d" % args.cpus) 63 | 64 | 65 | def parse_args(): 66 | parser = argparse.ArgumentParser( 67 | description="Benchmark xgboost/lightgbm/catboost on real datasets") 68 | parser.add_argument("-dataset", default="all", type=str, 69 | help="The dataset to be used for benchmarking. 'all' for all datasets.") 70 | parser.add_argument("-root", default="/opt/gbm-datasets", 71 | type=str, help="The root datasets folder") 72 | parser.add_argument("-algorithm", default="all", type=str, 73 | help=("Comma-separated list of algorithms to run; " 74 | "'all' run all")) 75 | parser.add_argument("-gpus", default=-1, type=int, 76 | help=("#GPUs to use for the benchmarks; " 77 | "ignored when not supported. Default is to use all.")) 78 | parser.add_argument("-cpus", default=0, type=int, 79 | help=("#CPUs to use for the benchmarks; " 80 | "0 means psutil.cpu_count(logical=False)")) 81 | parser.add_argument("-output", default=sys.path[0] + "/results.json", type=str, 82 | help="Output json file with runtime/accuracy stats") 83 | parser.add_argument("-ntrees", default=500, type=int, 84 | help=("Number of trees. Default is as specified in " 85 | "the respective dataset configuration")) 86 | parser.add_argument("-nrows", default=None, type=int, 87 | help=( 88 | "Subset of rows in the datasets to use. Useful for test running " 89 | "benchmarks on small amounts of data. WARNING: Some datasets will " 90 | "give incorrect accuracy results if nrows is specified as they have " 91 | "predefined train/test splits.")) 92 | parser.add_argument("-warmup", action="store_true", 93 | help=("Whether to run a small benchmark (fraud) as a warmup")) 94 | parser.add_argument("-verbose", action="store_true", help="Produce verbose output") 95 | parser.add_argument("-extra", default='{}', help="Extra arguments as a python dictionary") 96 | args = parser.parse_args() 97 | # default value for output json file 98 | if not args.output: 99 | args.output = "%s.json" % args.dataset 100 | return args 101 | 102 | 103 | # benchmarks a single dataset 104 | def benchmark(args, dataset_folder, dataset): 105 | data = prepare_dataset(dataset_folder, dataset, args.nrows) 106 | results = {} 107 | # "all" runs all algorithms 108 | if args.algorithm == "all": 109 | args.algorithm = "xgb-gpu,xgb-cpu,xgb-gpu-dask,lgbm-cpu,lgbm-gpu,cat-cpu,cat-gpu" 110 | for alg in args.algorithm.split(","): 111 | print("Running '%s' ..." 
% alg) 112 | runner = algorithms.Algorithm.create(alg) 113 | with runner: 114 | train_time = runner.fit(data, args) 115 | pred = runner.test(data) 116 | results[alg] = { 117 | "train_time": train_time, 118 | "accuracy": get_metrics(data, pred), 119 | } 120 | 121 | return results 122 | 123 | 124 | def main(): 125 | args = parse_args() 126 | args.cpus = get_number_processors(args) 127 | args.extra = ast.literal_eval(args.extra) 128 | print_sys_info(args) 129 | if args.warmup: 130 | benchmark(args, os.path.join(args.root, "fraud"), "fraud") 131 | if args.dataset == 'all': 132 | args.dataset = 'airline,bosch,fraud,higgs,year,epsilon,covtype,newsgroups' 133 | results = {} 134 | for dataset in args.dataset.split(","): 135 | folder = os.path.join(args.root, dataset) 136 | results.update({dataset: benchmark(args, folder, dataset)}) 137 | print(json.dumps({dataset: results[dataset]}, indent=2, sort_keys=True)) 138 | output = json.dumps(results, indent=2, sort_keys=True) 139 | output_file = open(args.output, "w") 140 | output_file.write(output + "\n") 141 | output_file.close() 142 | print("Results written to file '%s'" % args.output) 143 | 144 | 145 | if __name__ == "__main__": 146 | main() 147 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import arff 4 | import numpy as np 5 | from functools import reduce 6 | import sqlite3 7 | import logging 8 | from libs.planet_kaggle import (to_multi_label_dict, get_file_count, enrich_with_feature_encoding, 9 | featurise_images, generate_validation_files) 10 | import tensorflow as tf 11 | from keras.applications.resnet50 import ResNet50 12 | 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | _FRAUD_PATH = 'fraud_detection', 'credit_card_fraud_kaggle', 'creditcard.csv' 19 | _IOT_PATH = 'iot', 'sensor_stream_berkeley', 'sensor.arff' 20 | _AIRLINE_PATH = 'airline', 'airline_14col.data' 21 | _FOOTBALL_PATH = 'football', 'database.sqlite' 22 | _BCI_PATH = 'bci', 'data.npz' 23 | _HIGGS_PATH = 'higgs', 'HIGGS.csv' 24 | _KAGGLE_ROOT = 'planet' 25 | _PLANET_KAGGLE_LABEL_CSV = 'train_v2.csv' 26 | _PLANET_KAGGLE_TRAIN_DIR = 'train-jpg' 27 | _PLANET_KAGGLE_VAL_DIR = 'validate-jpg' 28 | 29 | 30 | def _get_datapath(): 31 | try: 32 | datapath = os.environ['MOUNT_POINT'] 33 | except KeyError: 34 | logger.info("MOUNT_POINT not found in environment. Defaulting to /fileshare") 35 | datapath = '/fileshare' 36 | return datapath 37 | 38 | 39 | def load_fraud(): 40 | """ Loads the credit card fraud data 41 | 42 | The datasets contains transactions made by credit cards in September 2013 by european cardholders. 43 | This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. 44 | The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. 45 | It contains only numerical input variables which are the result of a PCA transformation. 46 | 47 | Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about 48 | the data. 49 | Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed 50 | with PCA are 'Time' and 'Amount'. 
Feature 'Time' contains the seconds elapsed between each transaction and the first 51 | transaction in the dataset. 52 | The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. 53 | Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise. 54 | Given the class imbalance ratio, we recommend measuring the accuracy using the Area Under the Precision-Recall Curve 55 | (AUPRC). 56 | Confusion matrix accuracy is not meaningful for unbalanced classification. 57 | 58 | The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group 59 | (http://mlg.ulb.ac.be) of ULB (Universite Libre de Bruxelles) on big data mining and fraud detection. More details 60 | on current and past projects on related topics are available on http://mlg.ulb.ac.be/BruFence 61 | and http://mlg.ulb.ac.be/ARTML 62 | Please cite: Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with 63 | Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015 64 | 65 | Returns 66 | ------- 67 | pandas DataFrame 68 | 69 | """ 70 | return pd.read_csv(reduce(os.path.join, _FRAUD_PATH, _get_datapath())) 71 | 72 | 73 | def load_iot(): 74 | """ Loads iot data 75 | 76 | Sensor stream contains information (temperature, humidity, light, and sensor voltage) collected from 54 sensors deployed 77 | in Intel Berkeley Research Lab. The whole stream contains consecutive information recorded over a 2 months 78 | period (1 reading per 1-3 minutes). I used the sensor ID as the class label, so the learning task of the stream is 79 | to correctly identify the sensor ID (1 out of 54 sensors) purely based on the sensor data and the corresponding recording 80 | time. 81 | 82 | While the data stream flow over time, so does the concepts underlying the stream. For example, the lighting during 83 | the working hours is generally stronger than the night, and the temperature of specific sensors (conference room) 84 | may regularly rise during the meetings. 85 | 86 | Returns 87 | ------- 88 | pandas DataFrame 89 | """ 90 | dataset = arff.load(open(reduce(os.path.join, _IOT_PATH, _get_datapath()))) 91 | columns = [i[0] for i in dataset['attributes']] 92 | return pd.DataFrame(dataset['data'], columns=columns) 93 | 94 | 95 | def load_airline(): 96 | """ Loads airline data 97 | The dataset consists of a large amount of records, containing flight arrival and departure details for all the 98 | commercial flights within the USA, from October 1987 to April 2008. Its size is around 116 million records and 99 | 5.76 GB of memory. 100 | There are 13 attributes, each represented in a separate column: Year (1987-2008), Month (1-12), Day of Month (1-31), 101 | Day of Week (1:Monday - 7:Sunday), CRS Departure Time (local time as hhmm), CRS Arrival Time (local time as hhmm), 102 | Unique Carrier, Flight Number, Actual Elapsed Time (in min), Origin, Destination, Distance (in miles), and Diverted 103 | (1=yes, 0=no). 104 | The target attribute is Arrival Delay, it is a positive or negative value measured in minutes. 
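    A minimal usage sketch (hypothetical, not part of the original docstring; assumes the data share is mounted at the MOUNT_POINT location described below):
        >>> df = load_airline()                            # doctest: +SKIP
        >>> label = (df['ArrDelay'] > 0).astype('int8')    # binarise the delay for classification  # doctest: +SKIP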
105 | Link to the source: http://kt.ijs.si/elena_ikonomovska/data.html 106 | 107 | Returns 108 | ------- 109 | pandas DataFrame 110 | """ 111 | cols = ['Year', 'Month', 'DayofMonth', 'DayofWeek', 'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'ActualElapsedTime', 'Origin', 'Dest', 'Distance', 'Diverted', 'ArrDelay'] 112 | return pd.read_csv(reduce(os.path.join, _AIRLINE_PATH, _get_datapath()), names=cols) 113 | 114 | 115 | def load_football(): 116 | """ Loads football data 117 | Dataset of football stats. +25,000 matches, +10,000 players from 11 European Countries with their lead championship 118 | Seasons 2008 to 2016. It also contains players attributes sourced from EA Sports' FIFA video game series, 119 | including the weekly updates, team line up with squad formation (X, Y coordinates), betting odds from up to 10 120 | providers and detailed match events (goal types, possession, corner, cross, fouls, cards etc...) for +10,000 matches. 121 | The meaning of the columns can be found here: http://www.football-data.co.uk/notes.txt 122 | Number of attributes in each table (size of the dataframe): 123 | countries (11, 2) 124 | matches (25979, 115) 125 | leagues (11, 3) 126 | teams (299, 5) 127 | players (183978, 42) 128 | Link to the source: https://www.kaggle.com/hugomathien/soccer 129 | 130 | Returns 131 | ------- 132 | list of pandas DataFrame 133 | """ 134 | database_path = reduce(os.path.join, _FOOTBALL_PATH, _get_datapath()) 135 | with sqlite3.connect(database_path) as con: 136 | countries = pd.read_sql_query("SELECT * from Country", con) 137 | matches = pd.read_sql_query("SELECT * from Match", con) 138 | leagues = pd.read_sql_query("SELECT * from League", con) 139 | teams = pd.read_sql_query("SELECT * from Team", con) 140 | players = pd.read_sql("SELECT * FROM Player_Attributes;", con) 141 | return countries, matches, leagues, teams, players 142 | 143 | 144 | def load_bci(): 145 | """ Loads BCI data 146 | 147 | Contains measurements from 64 EEG sensors on the scalp of a single participant. 148 | The purpose of the recording is to determine from the electrical brain activity when the participant is paying attention. 149 | 150 | Returns 151 | ------- 152 | A tuple containing four numpy arrays 153 | train features 154 | train labels 155 | test features 156 | test labels 157 | """ 158 | 159 | npzfile = np.load(reduce(os.path.join, _BCI_PATH, _get_datapath())) 160 | return npzfile['train_X'], npzfile['train_y'], npzfile['test_X'], npzfile['test_y'] 161 | 162 | 163 | 164 | def load_higgs(): 165 | """ Loads HIGGS data 166 | 167 | Dataset of atomic particles measurements. The total size of the data is 11 millions of observations. 168 | It can be used in a classification problem to distinguish between a signal process which produces Higgs 169 | bosons and a background process which does not. 170 | The data has been produced using Monte Carlo simulations. The first 21 features (columns 2-22) are kinematic 171 | properties measured by the particle detectors in the accelerator. The last seven features are functions of 172 | the first 21 features; these are high-level features derived by physicists to help discriminate between the 173 | two classes. 
The first column is the class label (1 for signal, 0 for background), followed by the 28 174 | features (21 low-level features then 7 high-level features): lepton pT, lepton eta, lepton phi, 175 | missing energy magnitude, missing energy phi, jet 1 pt, jet 1 eta, jet 1 phi, jet 1 b-tag, jet 2 pt, jet 2 eta, 176 | jet 2 phi, jet 2 b-tag, jet 3 pt, jet 3 eta, jet 3 phi, jet 3 b-tag, jet 4 pt, jet 4 eta, jet 4 phi, 177 | jet 4 b-tag, m_jj, m_jjj, m_lv, m_jlv, m_bb, m_wbb, m_wwbb. 178 | Link to the source: https://archive.ics.uci.edu/ml/datasets/HIGGS 179 | 180 | Returns 181 | ------- 182 | pandas DataFrame 183 | """ 184 | cols = ['boson','lepton_pT','lepton_eta','lepton_phi','missing_energy_magnitude','missing_energy_phi','jet_1_pt','jet_1_eta','jet_1_phi','jet_1_b-tag','jet_2_pt','jet_2_eta','jet_2_phi','jet_2_b-tag','jet_3_pt','jet_3_eta','jet_3_phi','jet_3_b-tag','jet_4_pt','jet_4_eta','jet_4_phi','jet_4_b-tag','m_jj','m_jjj','m_lv','m_jlv','m_bb','m_wbb','m_wwbb'] 185 | return pd.read_csv(reduce(os.path.join, _HIGGS_PATH, _get_datapath()), names=cols) 186 | 187 | 188 | def load_planet_kaggle(): 189 | """ Loads Planet Kaggle data 190 | 191 | Dataset of satellite images of the Amazon. The objective of this dataset is to label satellite image chips 192 | with atmospheric conditions and various classes of land cover/land use. Resulting algorithms will help the 193 | global community better understand where, how, and why deforestation happens all over the world. The images 194 | use the GeoTiff format and each contain four bands of data: red, green, blue, and near infrared. 195 | To treat the images we used transfer learning with the CNN ResNet50. The images are featurized with this 196 | deep neural network. Once the features are generated we can use a boosted tree to classify them. 
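    Usage sketch (hypothetical, editorial addition; requires the Kaggle images under the mount point and a working Keras/TensorFlow install):
        >>> X_train, y_train, X_val, y_val = load_planet_kaggle()    # doctest: +SKIP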
197 | Link to the source: https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/data 198 | 199 | Returns 200 | ------- 201 | A tuple containing four numpy arrays 202 | train_features 203 | y_train 204 | validation_features 205 | y_val 206 | """ 207 | csv_path = reduce(os.path.join, (_KAGGLE_ROOT, _PLANET_KAGGLE_LABEL_CSV), _get_datapath()) 208 | train_path = reduce(os.path.join, (_KAGGLE_ROOT, _PLANET_KAGGLE_TRAIN_DIR), _get_datapath()) 209 | val_path = reduce(os.path.join, (_KAGGLE_ROOT, _PLANET_KAGGLE_VAL_DIR), _get_datapath()) 210 | assert os.path.isfile(csv_path) 211 | assert os.path.exists(train_path) 212 | if not os.path.exists(val_path): os.mkdir(val_path) 213 | if not os.listdir(val_path): 214 | logger.info('Validation folder is empty, moving files...') 215 | generate_validation_files(train_path, val_path) 216 | 217 | logger.info('Reading in labels') 218 | labels_df = pd.read_csv(csv_path).pipe(enrich_with_feature_encoding) 219 | multi_label_dict = to_multi_label_dict(labels_df) 220 | 221 | nb_train_samples = get_file_count(os.path.join(train_path, '*.jpg')) 222 | nb_validation_samples = get_file_count(os.path.join(val_path, '*.jpg')) 223 | 224 | logger.debug('Number of training files {}'.format(nb_train_samples)) 225 | logger.debug('Number of validation files {}'.format(nb_validation_samples)) 226 | logger.debug('Loading model') 227 | 228 | model = ResNet50(include_top=False) 229 | train_features, train_names = featurise_images(model, 230 | train_path, 231 | 'train_{}', 232 | range(nb_train_samples), 233 | desc='Featurising training images') 234 | 235 | validation_features, validation_names = featurise_images(model, 236 | val_path, 237 | 'train_{}', 238 | range(nb_train_samples, nb_train_samples+nb_validation_samples), 239 | desc='Featurising validation images') 240 | 241 | # Prepare data 242 | y_train = np.array([multi_label_dict[name] for name in train_names]) 243 | y_val = np.array([multi_label_dict[name] for name in validation_names]) 244 | 245 | return train_features, y_train, validation_features, y_val 246 | -------------------------------------------------------------------------------- /datasets.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE 22 | 23 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
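# Editorial note (not part of the original sources): most prepare_<name>()
# helpers below follow the same pattern -- download the raw file if it is
# missing, load it with pandas/scikit-learn, split train/test with a fixed
# random_state (or the dataset's predefined split), and cache the resulting
# Data object as a pickle keyed by the dataset name and `nrows`.
# A minimal, hypothetical usage sketch:
#
#     from datasets import prepare_dataset
#     data = prepare_dataset("/opt/gbm-datasets/higgs", "higgs", nrows=100000)
#     print(data.X_train.shape, data.learning_task)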
24 | 25 | import os 26 | from enum import Enum 27 | import pickle 28 | from urllib.request import urlretrieve 29 | import numpy as np 30 | from sklearn.model_selection import train_test_split 31 | from sklearn import datasets 32 | import pandas as pd 33 | import tqdm 34 | 35 | pbar = None 36 | 37 | 38 | def show_progress(block_num, block_size, total_size): 39 | global pbar 40 | if pbar is None: 41 | pbar = tqdm.tqdm(total=total_size / 1024, unit='kB') 42 | 43 | downloaded = block_num * block_size 44 | if downloaded < total_size: 45 | pbar.update(block_size / 1024) 46 | else: 47 | pbar.close() 48 | pbar = None 49 | 50 | 51 | def retrieve(url, filename=None): 52 | return urlretrieve(url, filename, reporthook=show_progress) 53 | 54 | 55 | class LearningTask(Enum): 56 | REGRESSION = 1 57 | CLASSIFICATION = 2 58 | MULTICLASS_CLASSIFICATION = 3 59 | 60 | 61 | class Data: # pylint: disable=too-few-public-methods,too-many-arguments 62 | def __init__(self, X_train, X_test, y_train, y_test, learning_task, qid_train=None, 63 | qid_test=None): 64 | self.X_train = X_train 65 | self.X_test = X_test 66 | self.y_train = y_train 67 | self.y_test = y_test 68 | self.learning_task = learning_task 69 | # For ranking task 70 | self.qid_train = qid_train 71 | self.qid_test = qid_test 72 | 73 | 74 | def prepare_dataset(dataset_folder, dataset, nrows): 75 | if not os.path.exists(dataset_folder): 76 | os.makedirs(dataset_folder) 77 | prepare_function = globals()["prepare_" + dataset] 78 | return prepare_function(dataset_folder, nrows) 79 | 80 | 81 | def __prepare_airline(dataset_folder, nrows, regression=False): # pylint: disable=too-many-locals 82 | url = 'http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2' 83 | pkl_base_name = "airline" 84 | if regression: 85 | pkl_base_name += "-regression" 86 | local_url = os.path.join(dataset_folder, os.path.basename(url)) 87 | pickle_url = os.path.join(dataset_folder, 88 | pkl_base_name 89 | + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 90 | if os.path.exists(pickle_url): 91 | return pickle.load(open(pickle_url, "rb")) 92 | if not os.path.isfile(local_url): 93 | retrieve(url, local_url) 94 | 95 | cols = [ 96 | "Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", 97 | "CRSArrTime", "UniqueCarrier", "FlightNum", "ActualElapsedTime", 98 | "Origin", "Dest", "Distance", "Diverted", "ArrDelay" 99 | ] 100 | 101 | # load the data as int16 102 | dtype = np.int16 103 | 104 | dtype_columns = { 105 | "Year": dtype, "Month": dtype, "DayofMonth": dtype, "DayofWeek": dtype, 106 | "CRSDepTime": dtype, "CRSArrTime": dtype, "FlightNum": dtype, 107 | "ActualElapsedTime": dtype, "Distance": 108 | dtype, 109 | "Diverted": dtype, "ArrDelay": dtype, 110 | } 111 | 112 | df = pd.read_csv(local_url, 113 | names=cols, dtype=dtype_columns, nrows=nrows) 114 | 115 | # Encode categoricals as numeric 116 | for col in df.select_dtypes(['object']).columns: 117 | df[col] = df[col].astype("category").cat.codes 118 | 119 | # Turn into binary classification problem 120 | if not regression: 121 | df["ArrDelay"] = 1 * (df["ArrDelay"] > 0) 122 | 123 | X = df[df.columns.difference(["ArrDelay"])].to_numpy(dtype=np.float32) 124 | y = df["ArrDelay"].to_numpy(dtype=np.float32) 125 | del df 126 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 127 | test_size=0.2, 128 | ) 129 | if regression: 130 | task = LearningTask.REGRESSION 131 | else: 132 | task = LearningTask.CLASSIFICATION 133 | data = Data(X_train, X_test, y_train, y_test, task) 134 | 
pickle.dump(data, open(pickle_url, "wb"), protocol=4) 135 | return data 136 | 137 | 138 | def prepare_airline(dataset_folder, nrows): 139 | return __prepare_airline(dataset_folder, nrows, False) 140 | 141 | 142 | def prepare_airline_regression(dataset_folder, nrows): 143 | return __prepare_airline(dataset_folder, nrows, True) 144 | 145 | 146 | def prepare_bosch(dataset_folder, nrows): 147 | filename = "train_numeric.csv.zip" 148 | local_url = os.path.join(dataset_folder, filename) 149 | pickle_url = os.path.join(dataset_folder, 150 | "bosch" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 151 | if os.path.exists(pickle_url): 152 | return pickle.load(open(pickle_url, "rb")) 153 | 154 | os.system("kaggle competitions download -c bosch-production-line-performance -f " + 155 | filename + " -p " + dataset_folder) 156 | X = pd.read_csv(local_url, index_col=0, compression='zip', dtype=np.float32, 157 | nrows=nrows) 158 | y = X.iloc[:, -1].to_numpy(dtype=np.float32) 159 | X.drop(X.columns[-1], axis=1, inplace=True) 160 | X = X.to_numpy(dtype=np.float32) 161 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 162 | test_size=0.2, 163 | ) 164 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 165 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 166 | return data 167 | 168 | 169 | def prepare_fraud(dataset_folder, nrows): 170 | if not os.path.exists(dataset_folder): 171 | os.makedirs(dataset_folder) 172 | filename = "creditcard.csv" 173 | local_url = os.path.join(dataset_folder, filename) 174 | pickle_url = os.path.join(dataset_folder, 175 | "creditcard" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 176 | if os.path.exists(pickle_url): 177 | return pickle.load(open(pickle_url, "rb")) 178 | 179 | os.system("kaggle datasets download mlg-ulb/creditcardfraud -f" + 180 | filename + " -p " + dataset_folder) 181 | df = pd.read_csv(local_url + ".zip", dtype=np.float32, nrows=nrows) 182 | X = df[[col for col in df.columns if col.startswith('V')]].to_numpy(dtype=np.float32) 183 | y = df['Class'].to_numpy(dtype=np.float32) 184 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 185 | test_size=0.2, 186 | ) 187 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 188 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 189 | return data 190 | 191 | 192 | def prepare_higgs(dataset_folder, nrows): 193 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz' 194 | local_url = os.path.join(dataset_folder, os.path.basename(url)) 195 | pickle_url = os.path.join(dataset_folder, 196 | "higgs" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 197 | 198 | if os.path.exists(pickle_url): 199 | return pickle.load(open(pickle_url, "rb")) 200 | 201 | if not os.path.isfile(local_url): 202 | retrieve(url, local_url) 203 | higgs = pd.read_csv(local_url, nrows=nrows) 204 | X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32) 205 | y = higgs.iloc[:, 0].to_numpy(dtype=np.float32) 206 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 207 | test_size=0.2, 208 | ) 209 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 210 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 211 | return data 212 | 213 | 214 | def prepare_year(dataset_folder, nrows): 215 | url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt' \ 216 | '.zip' 217 | local_url = os.path.join(dataset_folder, 
os.path.basename(url)) 218 | pickle_url = os.path.join(dataset_folder, 219 | "year" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 220 | 221 | if os.path.exists(pickle_url): 222 | return pickle.load(open(pickle_url, "rb")) 223 | 224 | if not os.path.isfile(local_url): 225 | retrieve(url, local_url) 226 | year = pd.read_csv(local_url, nrows=nrows, header=None) 227 | X = year.iloc[:, 1:].to_numpy(dtype=np.float32) 228 | y = year.iloc[:, 0].to_numpy(dtype=np.float32) 229 | 230 | if nrows is None: 231 | # this dataset requires a specific train/test split, 232 | # with the specified number of rows at the start belonging to the train set, 233 | # and the rest being the test set 234 | X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, 235 | train_size=463715, 236 | test_size=51630) 237 | else: 238 | print( 239 | "Warning: nrows is specified, not using predefined test/train split for " 240 | "YearPredictionMSD.") 241 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 242 | test_size=0.2, 243 | ) 244 | 245 | data = Data(X_train, X_test, y_train, y_test, LearningTask.REGRESSION) 246 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 247 | return data 248 | 249 | 250 | def prepare_epsilon(dataset_folder, nrows): 251 | url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ 252 | '/epsilon_normalized.bz2' 253 | url_test = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \ 254 | '/epsilon_normalized.t.bz2' 255 | pickle_url = os.path.join(dataset_folder, 256 | "epsilon" + ("" if nrows is None else "-" + str(nrows)) + ".pkl") 257 | local_url_train = os.path.join(dataset_folder, os.path.basename(url_train)) 258 | local_url_test = os.path.join(dataset_folder, os.path.basename(url_test)) 259 | 260 | if os.path.exists(pickle_url): 261 | return pickle.load(open(pickle_url, "rb")) 262 | 263 | if not os.path.isfile(local_url_train): 264 | retrieve(url_train, local_url_train) 265 | if not os.path.isfile(local_url_test): 266 | retrieve(url_test, local_url_test) 267 | 268 | X_train, y_train = datasets.load_svmlight_file(local_url_train, 269 | dtype=np.float32) 270 | X_test, y_test = datasets.load_svmlight_file(local_url_test, 271 | dtype=np.float32) 272 | X_train = X_train.toarray() 273 | X_test = X_test.toarray() 274 | y_train[y_train <= 0] = 0 275 | y_test[y_test <= 0] = 0 276 | 277 | if nrows is not None: 278 | print("Warning: nrows is specified, not using predefined test/train split for epsilon.") 279 | 280 | X_train = np.vstack((X_train, X_test)) 281 | y_train = np.append(y_train, y_test) 282 | X_train = X_train[:nrows] 283 | y_train = y_train[:nrows] 284 | X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=77, 285 | test_size=0.2, 286 | ) 287 | 288 | data = Data(X_train, X_test, y_train, y_test, LearningTask.CLASSIFICATION) 289 | pickle.dump(data, open(pickle_url, "wb"), protocol=4) 290 | return data 291 | 292 | 293 | def prepare_covtype(dataset_folder, nrows): # pylint: disable=unused-argument 294 | X, y = datasets.fetch_covtype(return_X_y=True) # pylint: disable=unexpected-keyword-arg 295 | if nrows is not None: 296 | X = X[0:nrows] 297 | y = y[0:nrows] 298 | 299 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 300 | test_size=0.2, 301 | ) 302 | return Data(X_train, X_test, y_train, y_test, LearningTask.MULTICLASS_CLASSIFICATION) 303 | 304 | 305 | def prepare_newsgroups(dataset_folder, nrows): # pylint: disable=unused-argument 306 | X, y = 
datasets.fetch_20newsgroups_vectorized(subset='all',return_X_y=True) # pylint: disable=unexpected-keyword-arg 307 | if nrows is not None: 308 | X = X[0:nrows] 309 | y = y[0:nrows] 310 | 311 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, 312 | test_size=0.2, 313 | ) 314 | 315 | return Data(X_train, X_test, y_train, y_test, LearningTask.MULTICLASS_CLASSIFICATION) -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/libs/football.py: -------------------------------------------------------------------------------- 1 | #code from https://www.kaggle.com/airback/match-outcome-prediction-in-football 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def get_fifa_stats(match, player_stats): 8 | ''' Aggregates fifa stats for a given match. ''' 9 | #Define variables 10 | match_id = match.match_api_id 11 | date = match['date'] 12 | players = ['home_player_1', 'home_player_2', 'home_player_3', "home_player_4", "home_player_5", 13 | "home_player_6", "home_player_7", "home_player_8", "home_player_9", "home_player_10", 14 | "home_player_11", "away_player_1", "away_player_2", "away_player_3", "away_player_4", 15 | "away_player_5", "away_player_6", "away_player_7", "away_player_8", "away_player_9", 16 | "away_player_10", "away_player_11"] 17 | player_stats_new = pd.DataFrame() 18 | names = [] 19 | 20 | #Loop through all players 21 | for player in players: 22 | 23 | #Get player ID 24 | player_id = match[player] 25 | 26 | #Get player stats 27 | stats = player_stats[player_stats.player_api_id == player_id] 28 | 29 | #Identify current stats 30 | current_stats = stats[stats.date < date].sort_values(by = 'date', ascending = False)[:1] 31 | 32 | if np.isnan(player_id) == True: 33 | overall_rating = pd.Series(0) 34 | else: 35 | current_stats.reset_index(inplace = True, drop = True) 36 | overall_rating = pd.Series(current_stats.loc[0, "overall_rating"]) 37 | 38 | #Rename stat 39 | name = "{}_overall_rating".format(player) 40 | names.append(name) 41 | 42 | #Aggregate stats 43 | player_stats_new = pd.concat([player_stats_new, overall_rating], axis = 1) 44 | 45 | player_stats_new.columns = names 46 | player_stats_new['match_api_id'] = match_id 47 | 48 | player_stats_new.reset_index(inplace = True, drop = True) 49 | 50 | #Return player stats 51 | return player_stats_new.ix[0] 52 | 53 | def get_fifa_data(matches, player_stats): 54 | ''' Gets fifa data for all matches. ''' 55 | #Apply get_fifa_stats for each match 56 | fifa_data = matches.apply(lambda x :get_fifa_stats(x, player_stats), axis = 1) 57 | return fifa_data 58 | 59 | def get_match_label(match): 60 | ''' Derives a label for a given match. ''' 61 | 62 | #Define variables 63 | home_goals = match['home_team_goal'] 64 | away_goals = match['away_team_goal'] 65 | 66 | label = pd.DataFrame() 67 | label.loc[0,'match_api_id'] = match['match_api_id'] 68 | 69 | #Identify match label 70 | if home_goals > away_goals: 71 | label.loc[0,'label'] = "Win" 72 | if home_goals == away_goals: 73 | label.loc[0,'label'] = "Draw" 74 | if home_goals < away_goals: 75 | label.loc[0,'label'] = "Defeat" 76 | 77 | #Return label 78 | return label.loc[0] 79 | 80 | 81 | def get_overall_fifa_rankings(fifa, get_overall = False): 82 | ''' Get overall fifa rankings from fifa data. 
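    Args (editorial addition, inferred from the code below):
        fifa (pd.DataFrame): per-match FIFA ratings as produced by get_fifa_data().
        get_overall (bool): if True, keep only the overall_rating columns plus match_api_id;
            otherwise drop any date_stat columns and keep the rest.
    Returns:
        pd.DataFrame: one row of rating features per match.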
''' 83 | 84 | temp_data = fifa 85 | 86 | #Check if only overall player stats are desired 87 | if get_overall == True: 88 | 89 | #Get overall stats 90 | data = temp_data.loc[:,(fifa.columns.str.contains('overall_rating'))] 91 | data.loc[:,'match_api_id'] = temp_data.loc[:,'match_api_id'] 92 | else: 93 | 94 | #Get all stats except for stat date 95 | cols = fifa.loc[:,(fifa.columns.str.contains('date_stat'))] 96 | temp_data = fifa.drop(cols.columns, axis = 1) 97 | data = temp_data 98 | 99 | #Return data 100 | return data 101 | 102 | def get_last_matches(matches, date, team, x = 10): 103 | ''' Get the last x matches of a given team. ''' 104 | 105 | #Filter team matches from matches 106 | team_matches = matches[(matches['home_team_api_id'] == team) | (matches['away_team_api_id'] == team)] 107 | 108 | #Filter x last matches from team matches 109 | last_matches = team_matches[team_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:] 110 | 111 | #Return last matches 112 | return last_matches 113 | 114 | def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10): 115 | ''' Get the last x matches of two given teams. ''' 116 | 117 | #Find matches of both teams 118 | home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)] 119 | away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)] 120 | total_matches = pd.concat([home_matches, away_matches]) 121 | 122 | #Get last x matches 123 | try: 124 | last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:] 125 | except: 126 | last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:total_matches.shape[0],:] 127 | 128 | #Check for error in data 129 | if(last_matches.shape[0] > x): 130 | print("Error in obtaining matches") 131 | 132 | #Return data 133 | return last_matches 134 | 135 | def get_goals(matches, team): 136 | ''' Get the goals of a specfic team from a set of matches. ''' 137 | 138 | #Find home and away goals 139 | home_goals = int(matches.home_team_goal[matches.home_team_api_id == team].sum()) 140 | away_goals = int(matches.away_team_goal[matches.away_team_api_id == team].sum()) 141 | 142 | total_goals = home_goals + away_goals 143 | 144 | #Return total goals 145 | return total_goals 146 | 147 | def get_goals_conceided(matches, team): 148 | ''' Get the goals conceided of a specfic team from a set of matches. ''' 149 | 150 | #Find home and away goals 151 | home_goals = int(matches.home_team_goal[matches.away_team_api_id == team].sum()) 152 | away_goals = int(matches.away_team_goal[matches.home_team_api_id == team].sum()) 153 | 154 | total_goals = home_goals + away_goals 155 | 156 | #Return total goals 157 | return total_goals 158 | 159 | def get_wins(matches, team): 160 | ''' Get the number of wins of a specfic team from a set of matches. ''' 161 | 162 | #Find home and away wins 163 | home_wins = int(matches.home_team_goal[(matches.home_team_api_id == team) & (matches.home_team_goal > matches.away_team_goal)].count()) 164 | away_wins = int(matches.away_team_goal[(matches.away_team_api_id == team) & (matches.away_team_goal > matches.home_team_goal)].count()) 165 | 166 | total_wins = home_wins + away_wins 167 | 168 | #Return total wins 169 | return total_wins 170 | 171 | def get_match_features(match, matches, x = 10): 172 | ''' Create match specific features for a given match. 
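    Args (editorial addition, inferred from the code below):
        match (pd.Series): the match row to featurise.
        matches (pd.DataFrame): full match table used to derive recent-form statistics.
        x (int): intended window of past matches; note the calls below pass fixed values (10 and 3).
    Returns:
        pd.Series: engineered features for the given match.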
''' 173 | 174 | #Define variables 175 | date = match.date 176 | home_team = match.home_team_api_id 177 | away_team = match.away_team_api_id 178 | 179 | #Get last x matches of home and away team 180 | matches_home_team = get_last_matches(matches, date, home_team, x = 10) 181 | matches_away_team = get_last_matches(matches, date, away_team, x = 10) 182 | 183 | #Get last x matches of both teams against each other 184 | last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3) 185 | 186 | #Create goal variables 187 | home_goals = get_goals(matches_home_team, home_team) 188 | away_goals = get_goals(matches_away_team, away_team) 189 | home_goals_conceided = get_goals_conceided(matches_home_team, home_team) 190 | away_goals_conceided = get_goals_conceided(matches_away_team, away_team) 191 | 192 | #Define result data frame 193 | result = pd.DataFrame() 194 | 195 | #Define ID features 196 | result.loc[0, 'match_api_id'] = match.match_api_id 197 | result.loc[0, 'league_id'] = match.league_id 198 | 199 | #Create match features 200 | result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided 201 | result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided 202 | result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team) 203 | result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team) 204 | result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team) 205 | result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team) 206 | 207 | #Add season 208 | result.loc[0, 'season'] = int(match['season'].split('/')[0]) 209 | 210 | #Return match features 211 | return result.loc[0] 212 | 213 | def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, all_leagues = True, verbose = True): 214 | ''' Create and aggregate features and labels for all matches. 
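    Args (editorial addition, inferred from the code below):
        matches (pd.DataFrame): match table.
        fifa (pd.DataFrame): FIFA rating features from get_fifa_data().
        bookkeepers (list of str): bookmaker name prefixes whose odds columns should be used.
        get_overall, horizontal, x, all_leagues, verbose: behaviour switches; note that some are
            effectively fixed inside the function body (e.g. x=10 and horizontal=True in the calls below).
    Returns:
        pd.DataFrame: match features merged with bookmaker probabilities and labels, with NA rows dropped.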
''' 215 | 216 | #Get fifa stats features 217 | fifa_stats = get_overall_fifa_rankings(fifa, get_overall) 218 | 219 | 220 | if verbose == True: 221 | print("Generating match features...") 222 | 223 | #Get match features for all matches 224 | match_stats = matches.apply(lambda x: get_match_features(x, matches, x = 10), axis = 1) 225 | 226 | #Create dummies for league ID feature 227 | if all_leagues: 228 | dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda x: 'League_' + str(x)) 229 | match_stats = pd.concat([match_stats, dummies], axis = 1) 230 | match_stats.drop(['league_id'], inplace = True, axis = 1) 231 | 232 | 233 | if verbose == True: 234 | print("Generating match labels...") 235 | 236 | #Create match labels 237 | labels = matches.apply(get_match_label, axis = 1) 238 | 239 | if verbose == True: 240 | print("Generating bookkeeper data...") 241 | 242 | #Get bookkeeper quotas for all matches 243 | bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True) 244 | bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id'] 245 | 246 | #Merges features and labels into one frame 247 | features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left') 248 | features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left') 249 | feables = pd.merge(features, labels, on = 'match_api_id', how = 'left') 250 | 251 | #Drop NA values 252 | feables.dropna(inplace = True) 253 | 254 | #Return preprocessed data 255 | return feables 256 | 257 | 258 | def convert_odds_to_prob(match_odds): 259 | ''' Converts bookkeeper odds to probabilities. ''' 260 | 261 | #Define variables 262 | match_id = match_odds.loc[:,'match_api_id'] 263 | bookkeeper = match_odds.loc[:,'bookkeeper'] 264 | win_odd = match_odds.loc[:,'Win'] 265 | draw_odd = match_odds.loc[:,'Draw'] 266 | loss_odd = match_odds.loc[:,'Defeat'] 267 | 268 | #Converts odds to prob 269 | win_prob = 1 / win_odd 270 | draw_prob = 1 / draw_odd 271 | loss_prob = 1 / loss_odd 272 | 273 | total_prob = win_prob + draw_prob + loss_prob 274 | 275 | probs = pd.DataFrame() 276 | 277 | #Define output format and scale probs by sum over all probs 278 | probs.loc[:,'match_api_id'] = match_id 279 | probs.loc[:,'bookkeeper'] = bookkeeper 280 | probs.loc[:,'Win'] = win_prob / total_prob 281 | probs.loc[:,'Draw'] = draw_prob / total_prob 282 | probs.loc[:,'Defeat'] = loss_prob / total_prob 283 | 284 | #Return probs and meta data 285 | return probs 286 | 287 | def get_bookkeeper_data(matches, bookkeepers, horizontal = True): 288 | ''' Aggregates bookkeeper data for all matches and bookkeepers. 
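    Args (editorial addition, inferred from the code below):
        matches (pd.DataFrame): match table whose odds columns start with the bookmaker name.
        bookkeepers (list of str): bookmaker name prefixes used to select the odds columns.
        horizontal (bool): if True, convert each bookmaker's odds to probabilities and concatenate
            them column-wise per match; otherwise stack the raw odds row-wise with a 'bookkeeper' column.
    Returns:
        pd.DataFrame: aggregated bookmaker data.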
''' 289 | 290 | bk_data = pd.DataFrame() 291 | 292 | #Loop through bookkeepers 293 | for bookkeeper in bookkeepers: 294 | 295 | #Find columns containing data of bookkeeper 296 | temp_data = matches.loc[:,(matches.columns.str.contains(bookkeeper))] 297 | temp_data.loc[:, 'bookkeeper'] = str(bookkeeper) 298 | temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id'] 299 | 300 | #Rename odds columns and convert to numeric 301 | cols = temp_data.columns.values 302 | cols[:3] = ['Win','Draw','Defeat'] 303 | temp_data.columns = cols 304 | temp_data.loc[:,'Win'] = pd.to_numeric(temp_data['Win']) 305 | temp_data.loc[:,'Draw'] = pd.to_numeric(temp_data['Draw']) 306 | temp_data.loc[:,'Defeat'] = pd.to_numeric(temp_data['Defeat']) 307 | 308 | #Check if data should be aggregated horizontally 309 | if(horizontal == True): 310 | 311 | #Convert data to probs 312 | temp_data = convert_odds_to_prob(temp_data) 313 | temp_data.drop('match_api_id', axis = 1, inplace = True) 314 | temp_data.drop('bookkeeper', axis = 1, inplace = True) 315 | 316 | #Rename columns with bookkeeper names 317 | win_name = bookkeeper + "_" + "Win" 318 | draw_name = bookkeeper + "_" + "Draw" 319 | defeat_name = bookkeeper + "_" + "Defeat" 320 | temp_data.columns.values[:3] = [win_name, draw_name, defeat_name] 321 | 322 | #Aggregate data 323 | bk_data = pd.concat([bk_data, temp_data], axis = 1) 324 | else: 325 | #Aggregate vertically 326 | bk_data = bk_data.append(temp_data, ignore_index = True) 327 | 328 | #If horizontal add match api id to data 329 | if(horizontal == True): 330 | temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id'] 331 | 332 | #Return bookkeeper data 333 | return bk_data 334 | 335 | def get_bookkeeper_probs(matches, bookkeepers, horizontal = False): 336 | ''' Get bookkeeper data and convert to probabilities for vertical aggregation. ''' 337 | 338 | #Get bookkeeper data 339 | data = get_bookkeeper_data(matches, bookkeepers, horizontal = False) 340 | 341 | #Convert odds to probabilities 342 | probs = convert_odds_to_prob(data) 343 | 344 | #Return data 345 | return probs 346 | 347 | -------------------------------------------------------------------------------- /3rdparty/codebase/python/machine_learning/metrics.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import (confusion_matrix, accuracy_score, roc_auc_score, f1_score, log_loss, precision_score, 2 | recall_score, mean_squared_error, mean_absolute_error, r2_score) 3 | import numpy as np 4 | 5 | 6 | def classification_metrics_binary(y_true, y_pred): 7 | """Returns a report with different metrics for a binary classification problem. 8 | - Accuracy: Number of correct predictions made as a ratio of all predictions. Useful when there are equal number 9 | of observations in each class and all predictions and prediction errors are equally important. 10 | - Confusion matrix: C_ij where observations are known to be in group i but predicted to be in group j. In binary 11 | classification true negatives is C_00, false negatives is C_10, true positives is C_11 and false positives is C_01. 12 | - Precision: Number of true positives divided by the number of true and false positives. It is the ability of the 13 | classifier not to label as positive a sample that is negative. 14 | - Recall: Number of true positives divided by the number of true positives and false negatives. It is the ability 15 | of the classifier to find all the positive samples. 
16 | High Precision and low Recall will return few positive results but most of them will be correct. 17 | High Recall and low Precision will return many positive results but most of them will be incorrect. 18 | - F1 Score: 2*((precision*recall)/(precision+recall)). It measures the balance between precision and recall. 19 | Args: 20 | y_true (list or array): True labels. 21 | y_pred (list or array): Predicted labels (binary). 22 | Returns: 23 | report (dict): Dictionary with metrics. 24 | Examples: 25 | >>> from collections import OrderedDict 26 | >>> y_true = [0,1,0,0,1] 27 | >>> y_pred = [0,1,0,1,1] 28 | >>> result = classification_metrics_binary(y_true, y_pred) 29 | >>> OrderedDict(sorted(result.items())) 30 | OrderedDict([('Accuracy', 0.8), ('Confusion Matrix', array([[2, 1], 31 | [0, 2]])), ('F1', 0.8), ('Precision', 0.6666666666666666), ('Recall', 1.0)]) 32 | 33 | """ 34 | m_acc = accuracy_score(y_true, y_pred) 35 | m_f1 = f1_score(y_true, y_pred) 36 | m_precision = precision_score(y_true, y_pred) 37 | m_recall = recall_score(y_true, y_pred) 38 | m_conf = confusion_matrix(y_true, y_pred) 39 | report = {'Accuracy': m_acc, 'Precision': m_precision, 'Recall': m_recall, 'F1': m_f1, 'Confusion Matrix': m_conf} 40 | return report 41 | 42 | 43 | def classification_metrics_multilabel(y_true, y_pred, labels): 44 | """Returns a report with different metrics for a multilabel classification problem. 45 | - Accuracy: Number of correct predictions made as a ratio of all predictions. Useful when there are equal number 46 | of observations in each class and all predictions and prediction errors are equally important. 47 | - Confusion matrix: C_ij where observations are known to be in group i but predicted to be in group j. In multilabel 48 | classification true predictions are in the diagonal and false predictions outside the diagonal. 49 | - Precision: Number of true positives divided by the number of true and false positives. It is the ability of the 50 | classifier not to label as positive a sample that is negative. 51 | - Recall: Number of true positives divided by the number of true positives and false negatives. It is the ability 52 | of the classifier to find all the positive samples. 53 | High Precision and low Recall will return few positive results but most of them will be correct. 54 | High Recall and low Precision will return many positive results but most of them will be incorrect. 55 | - F1 Score: 2*((precision*recall)/(precision+recall)). It measures the balance between precision and recall. 56 | Args: 57 | y_true (list or array): True labels. 58 | y_pred (list or array): Predicted labels. 59 | labels (list): Label index or name. 60 | Returns: 61 | report (dict): Dictionary with metrics. 
62 | Examples: 63 | >>> from collections import OrderedDict 64 | >>> y_true = [0,1,2,0,1] 65 | >>> y_pred = [0,1,0,1,1] 66 | >>> result = classification_metrics_multilabel(y_true, y_pred, [0,1,2]) 67 | >>> OrderedDict(sorted(result.items())) 68 | OrderedDict([('Accuracy', 0.6), ('Confusion Matrix', array([[1, 1, 0], 69 | [0, 2, 0], 70 | [1, 0, 0]])), ('F1', 0.52), ('Precision', 0.4666666666666666), ('Recall', 0.6)]) 71 | 72 | """ 73 | m_acc = accuracy_score(y_true, y_pred) 74 | m_f1 = f1_score(y_true, y_pred, labels, average='weighted') 75 | m_precision = precision_score(y_true, y_pred, labels, average='weighted') 76 | m_recall = recall_score(y_true, y_pred, labels, average='weighted') 77 | m_conf = confusion_matrix(y_true, y_pred, labels) 78 | report = {'Accuracy': m_acc, 'Precision': m_precision, 'Recall': m_recall, 'F1': m_f1, 'Confusion Matrix': m_conf} 79 | return report 80 | 81 | 82 | def classification_metrics_binary_prob(y_true, y_prob): 83 | """Returns a report with different metrics for a binary classification problem. 84 | - AUC: The Area Under the Curve represents the ability to discriminate between positive and negative classes. An 85 | area of 1 represent perfect scoring and an area of 0.5 means random guessing. 86 | - Log loss: Also called logistic regression loss or cross-entropy loss. It quantifies the performance by 87 | penalizing false classifications. Minimizing the Log Loss is equivalent to minimizing the squared error but using 88 | probabilistic predictions. Log loss penalize heavily classifiers that are confident about incorrect classifications. 89 | Args: 90 | y_true (list or array): True labels. 91 | y_prob (list or array): Predicted labels (probability). 92 | Returns: 93 | report (dict): Dictionary with metrics. 94 | Examples: 95 | >>> from collections import OrderedDict 96 | >>> y_true = [0,1,0,0,1] 97 | >>> y_prob = [0.2,0.7,0.4,0.3,0.2] 98 | >>> result = classification_metrics_binary_prob(y_true, y_prob) 99 | >>> OrderedDict(sorted(result.items())) 100 | OrderedDict([('AUC', 0.5833333333333333), ('Log loss', 0.6113513950783531)]) 101 | >>> y_prob = [0.2,0.7,0.4,0.3,0.3] 102 | >>> result = classification_metrics_binary_prob(y_true, y_prob) 103 | >>> OrderedDict(sorted(result.items())) 104 | OrderedDict([('AUC', 0.75), ('Log loss', 0.5302583734567203)]) 105 | 106 | """ 107 | m_auc = roc_auc_score(y_true, y_prob) 108 | m_logloss = log_loss(y_true, y_prob) 109 | report = {'AUC': m_auc, 'Log loss': m_logloss} 110 | return report 111 | 112 | 113 | def regression_metrics(y_true, y_pred): 114 | """Returns a report with different metrics for a regression problem. 115 | - Mean Squared Error: MSE is a risk metric corresponding to the expected value of the squared (quadratic) error. 116 | It has the disadvantage of heavily weighting outliers. 117 | - Mean Absolute Error: MAE is a risk metric corresponding to the expected value of the absolute error or L1 loss. 118 | Not as sensitive to outliers. 119 | - R Square: R2 is statistical measure of how close the data are to the fitted regression line. It's best possible 120 | score is 1.0 and it can be negative (because the model can be arbitrarily worse). A score of 0 means that the 121 | variables are not linearly correlated. 122 | - Root Mean Squared Error: RMSE is the square root of MSE. It also gives a relatively high weight to large errors. 123 | Args: 124 | y_true (list or array): True values. 125 | y_pred (list or array): Predicted values. 126 | Returns: 127 | report (dict): Dictionary with metrics. 
128 | Examples: 129 | >>> from collections import OrderedDict 130 | >>> y_true = [5,1,0,7,1] 131 | >>> y_pred = [6,0.7,0.4,10,20] 132 | >>> result = regression_metrics(y_true, y_pred) 133 | >>> OrderedDict(sorted(result.items())) 134 | OrderedDict([('MAE', 4.74), ('MSE', 74.25), ('R2', -9.088315217391303), ('RMSE', 8.616843969807043)]) 135 | >>> y_true = [5,1,0,7,1] 136 | >>> y_pred = [6,0.7,0.4,10,2] 137 | >>> result = regression_metrics(y_true, y_pred) 138 | >>> OrderedDict(sorted(result.items())) 139 | OrderedDict([('MAE', 1.1400000000000001), ('MSE', 2.25), ('R2', 0.6942934782608696), ('RMSE', 1.5)]) 140 | 141 | """ 142 | mse = mean_squared_error(y_true, y_pred) 143 | mae = mean_absolute_error(y_true, y_pred) 144 | r2 = r2_score(y_true, y_pred) 145 | report = {'MSE': mse, 'MAE': mae, 'R2': r2, 'RMSE': np.sqrt(mse)} 146 | return report 147 | 148 | 149 | def precision_at_k(y_true, y_pred, k=None): 150 | """Precision at K. 151 | Args: 152 | y_true (list or array): True values. 153 | y_pred (list or array): Predicted values. 154 | k (int): Limit of predicted values. 155 | Returns: 156 | result (float): precision at k (max=1, min=0) 157 | Examples: 158 | >>> y_true = [5,1,0,7,2] 159 | >>> y_pred = [2,5,0,1,7] 160 | >>> precision_at_k(y_true, y_pred, k=3) 161 | 1.0 162 | >>> y_true = np.array([5,1,0,7,2]) 163 | >>> y_pred = np.array([9,0,8,1,7]) 164 | >>> precision_at_k(y_true, y_pred, k=3) 165 | 0.3333333333333333 166 | 167 | """ 168 | predictions = y_pred[:k] 169 | num_hit = len(set(predictions).intersection(set(y_true))) 170 | return float(num_hit) / len(predictions) 171 | 172 | 173 | def recall_at_k(y_true, y_pred, k=None): 174 | """Recall at K. 175 | Args: 176 | y_true (list or array): True values. 177 | y_pred (list or array): Predicted values. 178 | k (int): Limit of predicted values. 179 | Returns: 180 | result (float): recall at k (max=1, min=0) 181 | Examples: 182 | >>> y_true = [5,1,0,7,2] 183 | >>> y_pred = [2,5,0,1,7] 184 | >>> recall_at_k(y_true, y_pred, k=3) 185 | 0.6 186 | >>> y_true = np.array([5,1,0,7,2]) 187 | >>> y_pred = np.array([9,0,8,1,7]) 188 | >>> recall_at_k(y_true, y_pred, k=3) 189 | 0.2 190 | 191 | """ 192 | predictions = y_pred[:k] 193 | num_hit = len(set(predictions).intersection(set(y_true))) 194 | return float(num_hit) / len(y_true) 195 | 196 | 197 | def discounted_cumulative_gain(y_true, y_pred, k=None): 198 | """Discounted Cumulative Gain (DCG). 199 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 200 | Args: 201 | y_true (list or array): True values. 202 | y_pred (list or array): Predicted values. 203 | k (int): Limit of predicted values. 204 | Returns: 205 | result (float): DCG 206 | Examples: 207 | >>> y_true = [5,1,0,7,2] 208 | >>> y_pred = [2,5,0,1,7] 209 | >>> discounted_cumulative_gain(y_true, y_pred, k=3) 210 | 5.130929753571458 211 | >>> y_true = np.array([5,1,0,7,2]) 212 | >>> y_pred = np.array([9,0,8,1,7]) 213 | >>> discounted_cumulative_gain(y_true, y_pred, k=3) 214 | 6.0 215 | 216 | """ 217 | order = np.argsort(y_pred)[::-1] 218 | y_true = np.take(y_true, order[:k]) 219 | return (y_true / np.log2(np.arange(y_true.shape[0]) + 2)).sum() 220 | 221 | 222 | def exponential_discounted_cumulative_gain(y_true, y_pred, k=None): 223 | """Exponential Discounted Cumulative Gain (eDCG). 224 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 225 | Args: 226 | y_true (list or array): True values. 227 | y_pred (list or array): Predicted values. 228 | k (int): Limit of predicted values. 
229 | Returns: 230 | result (float): eDCG 231 | Examples: 232 | >>> y_true = [5,1,0,7,2] 233 | >>> y_pred = [2,5,0,1,7] 234 | >>> exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 235 | 19.130929753571458 236 | >>> y_true = np.array([5,1,0,7,2]) 237 | >>> y_pred = np.array([9,0,8,1,7]) 238 | >>> exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 239 | 32.0 240 | 241 | """ 242 | order = np.argsort(y_pred)[::-1] 243 | y_true = np.take(y_true, order[:k]) 244 | return ((2 ** y_true - 1) / np.log2(np.arange(y_true.shape[0]) + 2)).sum() 245 | 246 | 247 | def normalized_discounted_cumulative_gain(y_true, y_pred, k=None): 248 | """Normalized Discounted Cumulative Gain (nDCG). 249 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 250 | Args: 251 | y_true (list or array): True values. 252 | y_pred (list or array): Predicted values. 253 | k (int): Limit of predicted values. 254 | Returns: 255 | result (float): nDCG (max=1, min=0) 256 | Examples: 257 | >>> y_true = [5,1,0,7,2] 258 | >>> y_pred = [2,5,0,1,7] 259 | >>> normalized_discounted_cumulative_gain(y_true, y_pred, k=3) 260 | 0.4599812921368268 261 | >>> y_true = np.array([5,1,0,7,2]) 262 | >>> y_pred = np.array([9,0,8,1,7]) 263 | >>> normalized_discounted_cumulative_gain(y_true, y_pred, k=3) 264 | 0.537892328558952 265 | 266 | """ 267 | return discounted_cumulative_gain(y_true, y_pred, k) / discounted_cumulative_gain(y_true, y_true, k) 268 | 269 | 270 | def normalized_exponential_discounted_cumulative_gain(y, y_pred, k=None): 271 | """Normalized Exponential Discounted Cumulative Gain (neDCG). 272 | Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain 273 | Args: 274 | y_true (list or array): True values. 275 | y_pred (list or array): Predicted values. 276 | k (int): Limit of predicted values. 277 | Returns: 278 | result (float): neDCG (max=1, min=0) 279 | Examples: 280 | >>> y_true = [5,1,0,7,2] 281 | >>> y_pred = [2,5,0,1,7] 282 | >>> normalized_exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 283 | 0.1292116839006246 284 | >>> y_true = np.array([5,1,0,7,2]) 285 | >>> y_pred = np.array([9,0,8,1,7]) 286 | >>> normalized_exponential_discounted_cumulative_gain(y_true, y_pred, k=3) 287 | 0.21950735175253772 288 | 289 | """ 290 | return exponential_discounted_cumulative_gain(y, y_pred, k)/exponential_discounted_cumulative_gain(y, y, k) 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/02_BCI_GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 02: BCI (GPU version)\n", 11 | "\n", 12 | "This experiment uses a Brain Computer Interface dataset. The purpose is to try and predict when the participant is paying attention. The dataset consists of recordings from a number of electrodes placed over the scalp.\n", 13 | "\n", 14 | "The details of the machine we used and the version of the libraries can be found in [experiment 01](01_airline.ipynb)." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "Using TensorFlow backend.\n" 31 | ] 32 | }, 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n", 38 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 39 | "XGBoost version: 0.6\n", 40 | "LightGBM version: 0.2\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "import json\n", 46 | "import sys\n", 47 | "import warnings\n", 48 | "import numpy as np\n", 49 | "import pandas as pd\n", 50 | "import pkg_resources\n", 51 | "from libs.loaders import load_bci\n", 52 | "from libs.timer import Timer\n", 53 | "from libs.metrics import classification_metrics_binary, classification_metrics_binary_prob, binarize_prediction\n", 54 | "import xgboost as xgb\n", 55 | "import lightgbm as lgb\n", 56 | "\n", 57 | "warnings.filterwarnings('ignore')\n", 58 | "\n", 59 | "print(\"System version: {}\".format(sys.version))\n", 60 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 61 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "deletable": true, 68 | "editable": true 69 | }, 70 | "source": [ 71 | "## Data loading and management\n", 72 | "\n", 73 | "\n", 74 | "The dataset has been preprepared by extracting 800ms epochs from each channel. The data was then lowpass filtered at 18Hz and downsampled by a factor of 6. This results is a feature vector of 2048. " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 2, 80 | "metadata": { 81 | "collapsed": false, 82 | "deletable": true, 83 | "editable": true 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stderr", 88 | "output_type": "stream", 89 | "text": [ 90 | "INFO:libs.loaders:MOUNT_POINT not found in environment. 
Defaulting to /fileshare\n" 91 | ] 92 | }, 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "CPU times: user 2.1 s, sys: 472 ms, total: 2.57 s\n", 98 | "Wall time: 18.9 s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "%%time\n", 104 | "X, y, X_test, y_test = load_bci()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "metadata": { 111 | "collapsed": false, 112 | "deletable": true, 113 | "editable": true 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "(14519, 2048)\n", 121 | "(14519,)\n", 122 | "(5978, 2048)\n", 123 | "(5978,)\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "X_train = np.concatenate(X)\n", 129 | "y_train = np.concatenate(y)\n", 130 | "X_test = np.concatenate(X_test)\n", 131 | "y_test = np.concatenate(y_test)\n", 132 | "print(X_train.shape)\n", 133 | "print(y_train.shape)\n", 134 | "print(X_test.shape)\n", 135 | "print(y_test.shape)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 4, 141 | "metadata": { 142 | "collapsed": true, 143 | "deletable": true, 144 | "editable": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "dtrain = xgb.DMatrix(data=X_train, label=y_train)\n", 149 | "dtest = xgb.DMatrix(data=X_test, label=y_test)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "collapsed": false, 157 | "deletable": true, 158 | "editable": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)\n", 163 | "lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=False)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "deletable": true, 170 | "editable": true 171 | }, 172 | "source": [ 173 | "### XGBoost" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "collapsed": true, 181 | "deletable": true, 182 | "editable": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "results_dict = dict()\n", 187 | "num_rounds = 100" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "metadata": { 194 | "collapsed": true, 195 | "deletable": true, 196 | "editable": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "params = {'max_depth':3, \n", 201 | " 'objective':'binary:logistic', \n", 202 | " 'min_child_weight':1, \n", 203 | " 'eta':0.1, \n", 204 | " 'colsample_bytree':1, \n", 205 | " 'scale_pos_weight':2, \n", 206 | " 'gamma':0.1, \n", 207 | " 'reg_lamda':1, \n", 208 | " 'subsample':1,\n", 209 | " 'tree_method':'exact', \n", 210 | " 'updater':'grow_gpu'\n", 211 | " }\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 13, 217 | "metadata": { 218 | "collapsed": true, 219 | "deletable": true, 220 | "editable": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "with Timer() as t_train:\n", 225 | " xgb_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)\n", 226 | " \n", 227 | "with Timer() as t_test:\n", 228 | " y_prob_xgb = xgb_clf_pipeline.predict(dtest)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 14, 234 | "metadata": { 235 | "collapsed": true, 236 | "deletable": true, 237 | "editable": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "y_pred_xgb = binarize_prediction(y_prob_xgb)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | 
"execution_count": 15, 247 | "metadata": { 248 | "collapsed": true, 249 | "deletable": true, 250 | "editable": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "report_xgb = classification_metrics_binary(y_test, y_pred_xgb)\n", 255 | "report2_xgb = classification_metrics_binary_prob(y_test, y_prob_xgb)\n", 256 | "report_xgb.update(report2_xgb)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 17, 262 | "metadata": { 263 | "collapsed": false, 264 | "deletable": true, 265 | "editable": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "results_dict['xgb']={\n", 270 | " 'train_time': t_train.interval,\n", 271 | " 'test_time': t_test.interval,\n", 272 | " 'performance': report_xgb \n", 273 | "}" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 18, 279 | "metadata": { 280 | "collapsed": true, 281 | "deletable": true, 282 | "editable": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "del xgb_clf_pipeline" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "deletable": true, 293 | "editable": true 294 | }, 295 | "source": [ 296 | "Now let's try with XGBoost histogram." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 19, 302 | "metadata": { 303 | "collapsed": true, 304 | "deletable": true, 305 | "editable": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "params = {'max_depth':0, \n", 310 | " 'objective':'binary:logistic', \n", 311 | " 'min_child_weight':1, \n", 312 | " 'eta':0.1, \n", 313 | " 'colsample_bytree':0.80, \n", 314 | " 'scale_pos_weight':2, \n", 315 | " 'gamma':0.1, \n", 316 | " 'reg_lamda':1, \n", 317 | " 'subsample':1,\n", 318 | " 'tree_method':'hist', \n", 319 | " 'max_leaves':2**3, \n", 320 | " 'grow_policy':'lossguide', \n", 321 | " }\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 20, 327 | "metadata": { 328 | "collapsed": true, 329 | "deletable": true, 330 | "editable": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "with Timer() as t_train:\n", 335 | " xgb_hist_clf_pipeline = xgb.train(params, dtrain, num_boost_round=num_rounds)\n", 336 | " \n", 337 | "with Timer() as t_test:\n", 338 | " y_prob_xgb_hist = xgb_hist_clf_pipeline.predict(dtest)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 21, 344 | "metadata": { 345 | "collapsed": true, 346 | "deletable": true, 347 | "editable": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "y_pred_xgb_hist = binarize_prediction(y_prob_xgb_hist)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 22, 357 | "metadata": { 358 | "collapsed": true, 359 | "deletable": true, 360 | "editable": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "report_xgb_hist = classification_metrics_binary(y_test, y_pred_xgb_hist)\n", 365 | "report2_xgb_hist = classification_metrics_binary_prob(y_test, y_prob_xgb_hist)\n", 366 | "report_xgb_hist.update(report2_xgb_hist)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 23, 372 | "metadata": { 373 | "collapsed": true, 374 | "deletable": true, 375 | "editable": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "results_dict['xgb_hist']={\n", 380 | " 'train_time': t_train.interval,\n", 381 | " 'test_time': t_test.interval,\n", 382 | " 'performance': report_xgb_hist\n", 383 | "}" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 24, 389 | "metadata": { 390 | "collapsed": true, 391 | 
"deletable": true, 392 | "editable": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "del xgb_hist_clf_pipeline" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "deletable": true, 403 | "editable": true 404 | }, 405 | "source": [ 406 | "### LightGBM" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 25, 412 | "metadata": { 413 | "collapsed": true, 414 | "deletable": true, 415 | "editable": true 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "params = {'num_leaves': 2**3,\n", 420 | " 'learning_rate': 0.1,\n", 421 | " 'scale_pos_weight': 2,\n", 422 | " 'min_split_gain': 0.1,\n", 423 | " 'min_child_weight': 1,\n", 424 | " 'reg_lambda': 1,\n", 425 | " 'subsample': 1,\n", 426 | " 'objective':'binary',\n", 427 | " 'task': 'train'\n", 428 | " }" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 30, 434 | "metadata": { 435 | "collapsed": true, 436 | "deletable": true, 437 | "editable": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "with Timer() as t_train:\n", 442 | " lgbm_clf_pipeline = lgb.train(params, lgb_train, num_boost_round=num_rounds)\n", 443 | " \n", 444 | "with Timer() as t_test:\n", 445 | " y_prob_lgbm = lgbm_clf_pipeline.predict(X_test)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 31, 451 | "metadata": { 452 | "collapsed": true, 453 | "deletable": true, 454 | "editable": true 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "y_pred_lgbm = binarize_prediction(y_prob_lgbm)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 32, 464 | "metadata": { 465 | "collapsed": true, 466 | "deletable": true, 467 | "editable": true 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "report_lgbm = classification_metrics_binary(y_test, y_pred_lgbm)\n", 472 | "report2_lgbm = classification_metrics_binary_prob(y_test, y_prob_lgbm)\n", 473 | "report_lgbm.update(report2_lgbm)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 33, 479 | "metadata": { 480 | "collapsed": false, 481 | "deletable": true, 482 | "editable": true 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "results_dict['lgbm']={\n", 487 | " 'train_time': t_train.interval,\n", 488 | " 'test_time': t_test.interval,\n", 489 | " 'performance': report_lgbm \n", 490 | "}" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 34, 496 | "metadata": { 497 | "collapsed": true, 498 | "deletable": true, 499 | "editable": true 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "del lgbm_clf_pipeline" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": { 509 | "deletable": true, 510 | "editable": true 511 | }, 512 | "source": [ 513 | "Finally, we show the results" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 35, 519 | "metadata": { 520 | "collapsed": false, 521 | "deletable": true, 522 | "editable": true 523 | }, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "{\n", 530 | " \"lgbm\": {\n", 531 | " \"performance\": {\n", 532 | " \"AUC\": 0.7714542124542124,\n", 533 | " \"Accuracy\": 0.8813984610237537,\n", 534 | " \"F1\": 0.13851761846901578,\n", 535 | " \"Precision\": 0.6,\n", 536 | " \"Recall\": 0.0782967032967033\n", 537 | " },\n", 538 | " \"test_time\": 0.009907090001433971,\n", 539 | " \"train_time\": 2.7659428379993187\n", 540 | " },\n", 541 | " \"xgb\": {\n", 542 | " \"performance\": {\n", 543 | " 
\"AUC\": 0.7716584249084248,\n", 544 | " \"Accuracy\": 0.8798929407828705,\n", 545 | " \"F1\": 0.09343434343434343,\n", 546 | " \"Precision\": 0.578125,\n", 547 | " \"Recall\": 0.050824175824175824\n", 548 | " },\n", 549 | " \"test_time\": 0.0064387769998575095,\n", 550 | " \"train_time\": 12.934047714998997\n", 551 | " },\n", 552 | " \"xgb_hist\": {\n", 553 | " \"performance\": {\n", 554 | " \"AUC\": 0.7736170852956569,\n", 555 | " \"Accuracy\": 0.8805620608899297,\n", 556 | " \"F1\": 0.12068965517241378,\n", 557 | " \"Precision\": 0.5833333333333334,\n", 558 | " \"Recall\": 0.0673076923076923\n", 559 | " },\n", 560 | " \"test_time\": 0.00308577800024068,\n", 561 | " \"train_time\": 42.69890288699935\n", 562 | " }\n", 563 | "}\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "# Results\n", 569 | "print(json.dumps(results_dict, indent=4, sort_keys=True))" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "collapsed": true, 577 | "deletable": true, 578 | "editable": true 579 | }, 580 | "outputs": [], 581 | "source": [] 582 | } 583 | ], 584 | "metadata": { 585 | "kernelspec": { 586 | "display_name": "Python 3.5", 587 | "language": "python", 588 | "name": "python3" 589 | }, 590 | "language_info": { 591 | "codemirror_mode": { 592 | "name": "ipython", 593 | "version": 3 594 | }, 595 | "file_extension": ".py", 596 | "mimetype": "text/x-python", 597 | "name": "python", 598 | "nbconvert_exporter": "python", 599 | "pygments_lexer": "ipython3", 600 | "version": "3.5.2" 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 0 605 | } 606 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/04_PlanetKaggle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 04: Amazon Planet\n", 11 | "\n", 12 | "This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). Here we use a pretrained ResNet50 model to generate the features from the dataset.\n", 13 | "\n", 14 | "For details of virtual machine we used and the versions of LightGBM and XGBoost, please refer to [experiment 1](01_airline.ipynb)." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true, 24 | "scrolled": true 25 | }, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "Using TensorFlow backend.\n" 32 | ] 33 | }, 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n", 39 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 40 | "XGBoost version: 0.6\n", 41 | "LightGBM version: 0.2\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import sys\n", 47 | "from collections import defaultdict\n", 48 | "import numpy as np\n", 49 | "import pkg_resources\n", 50 | "from libs.loaders import load_planet_kaggle\n", 51 | "from libs.planet_kaggle import threshold_prediction\n", 52 | "from libs.timer import Timer\n", 53 | "from libs.utils import get_number_processors\n", 54 | "from lightgbm import LGBMClassifier\n", 55 | "import xgboost as xgb\n", 56 | "import lightgbm as lgb\n", 57 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 58 | "from tqdm import tqdm\n", 59 | "import json\n", 60 | "import warnings; warnings.simplefilter('ignore')\n", 61 | "\n", 62 | "print(\"System version: {}\".format(sys.version))\n", 63 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 64 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": { 71 | "collapsed": false, 72 | "deletable": true, 73 | "editable": true 74 | }, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "env: MOUNT_POINT=/datadrive\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "%env MOUNT_POINT=/datadrive" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "source": [ 95 | "The images are loaded and featurised using a pretrained ResNet50 model available from Keras" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 3, 101 | "metadata": { 102 | "collapsed": false, 103 | "deletable": true, 104 | "editable": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "X_train, y_train, X_test, y_test = load_planet_kaggle()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": { 115 | "collapsed": false, 116 | "deletable": true, 117 | "editable": true 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "(35000, 2048)\n", 125 | "(35000, 17)\n", 126 | "(5479, 2048)\n", 127 | "(5479, 17)\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "print(X_train.shape)\n", 133 | "print(y_train.shape)\n", 134 | "print(X_test.shape)\n", 135 | "print(y_test.shape)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "deletable": true, 142 | "editable": true 143 | }, 144 | "source": [ 145 | "## XGBoost \n", 146 | "\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 5, 152 | "metadata": { 153 | "collapsed": false, 154 | "deletable": true, 155 | "editable": true 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "Number of processors: 24\n" 163 | ] 164 | } 165 | ], 166 | 
"source": [ 167 | "number_processors = get_number_processors()\n", 168 | "print(\"Number of processors: \", number_processors)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "deletable": true, 175 | "editable": true 176 | }, 177 | "source": [ 178 | "We will use a one-v-rest. So each classifier will be responsible for determining whether the assigned tag applies to the image" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 6, 184 | "metadata": { 185 | "collapsed": false, 186 | "deletable": true, 187 | "editable": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "def train_and_validate_xgboost(params, train_features, train_labels, validation_features, num_boost_round):\n", 192 | " n_classes = train_labels.shape[1]\n", 193 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 194 | " time_results = defaultdict(list)\n", 195 | " for class_i in tqdm(range(n_classes)):\n", 196 | " dtrain = xgb.DMatrix(data=train_features, label=train_labels[:, class_i])\n", 197 | " dtest = xgb.DMatrix(data=validation_features)\n", 198 | " with Timer() as t:\n", 199 | " model = xgb.train(params, dtrain, num_boost_round=num_boost_round)\n", 200 | " time_results['train_time'].append(t.interval)\n", 201 | " \n", 202 | " with Timer() as t:\n", 203 | " y_val_pred[:, class_i] = model.predict(dtest)\n", 204 | " time_results['test_time'].append(t.interval)\n", 205 | " \n", 206 | " return y_val_pred, time_results" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 7, 212 | "metadata": { 213 | "collapsed": true, 214 | "deletable": true, 215 | "editable": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "def train_and_validate_lightgbm(params, train_features, train_labels, validation_features, num_boost_round):\n", 220 | " n_classes = train_labels.shape[1]\n", 221 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 222 | " time_results = defaultdict(list)\n", 223 | " for class_i in tqdm(range(n_classes)):\n", 224 | " lgb_train = lgb.Dataset(train_features, train_labels[:, class_i], free_raw_data=False)\n", 225 | " with Timer() as t:\n", 226 | " model = lgb.train(params, lgb_train, num_boost_round = num_boost_round)\n", 227 | " time_results['train_time'].append(t.interval)\n", 228 | " \n", 229 | " with Timer() as t:\n", 230 | " y_val_pred[:, class_i] = model.predict(validation_features)\n", 231 | " time_results['test_time'].append(t.interval)\n", 232 | " \n", 233 | " return y_val_pred, time_results" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 8, 239 | "metadata": { 240 | "collapsed": false, 241 | "deletable": true, 242 | "editable": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "metrics_dict = {\n", 247 | " 'Accuracy': accuracy_score,\n", 248 | " 'Precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average='samples'),\n", 249 | " 'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='samples'),\n", 250 | " 'F1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='samples'),\n", 251 | "}\n", 252 | "\n", 253 | "def classification_metrics(metrics, y_true, y_pred):\n", 254 | " return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 9, 260 | "metadata": { 261 | "collapsed": true, 262 | "deletable": true, 263 | "editable": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "results_dict 
= dict()\n", 268 | "num_rounds = 50" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "deletable": true, 275 | "editable": true 276 | }, 277 | "source": [ 278 | "Now we are going to define the different models." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 10, 284 | "metadata": { 285 | "collapsed": true, 286 | "deletable": true, 287 | "editable": true 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "xgb_params = {'max_depth':6, \n", 292 | " 'objective':'binary:logistic', \n", 293 | " 'min_child_weight':1, \n", 294 | " 'learning_rate':0.1, \n", 295 | " 'scale_pos_weight':2, \n", 296 | " 'gamma':0.1, \n", 297 | " 'reg_lamda':1, \n", 298 | " 'subsample':1,\n", 299 | " 'nthread':number_processors\n", 300 | " }" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 11, 306 | "metadata": { 307 | "collapsed": false, 308 | "deletable": true, 309 | "editable": true 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stderr", 314 | "output_type": "stream", 315 | "text": [ 316 | "100%|██████████| 17/17 [05:36<00:00, 19.88s/it]\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "y_pred, timing_results = train_and_validate_xgboost(xgb_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 12, 327 | "metadata": { 328 | "collapsed": false, 329 | "deletable": true, 330 | "editable": true 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "results_dict['xgb']={\n", 335 | " 'train_time': np.sum(timing_results['train_time']),\n", 336 | " 'test_time': np.sum(timing_results['test_time']),\n", 337 | " 'performance': classification_metrics(metrics_dict, \n", 338 | " y_test, \n", 339 | " threshold_prediction(y_pred, threshold=0.1)) \n", 340 | "}" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 13, 346 | "metadata": { 347 | "collapsed": true, 348 | "deletable": true, 349 | "editable": true 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "xgb_hist_params = {'max_depth':0, \n", 354 | " 'max_leaves':2**6, \n", 355 | " 'objective':'binary:logistic', \n", 356 | " 'min_child_weight':1, \n", 357 | " 'learning_rate':0.1, \n", 358 | " 'scale_pos_weight':2, \n", 359 | " 'gamma':0.1, \n", 360 | " 'reg_lamda':1, \n", 361 | " 'subsample':1,\n", 362 | " 'nthread':number_processors,\n", 363 | " 'tree_method':'hist', \n", 364 | " 'grow_policy':'lossguide',\n", 365 | " 'max_bins': 63\n", 366 | " }" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 14, 372 | "metadata": { 373 | "collapsed": false, 374 | "deletable": true, 375 | "editable": true 376 | }, 377 | "outputs": [ 378 | { 379 | "name": "stderr", 380 | "output_type": "stream", 381 | "text": [ 382 | "100%|██████████| 17/17 [35:26<00:00, 116.33s/it]\n" 383 | ] 384 | } 385 | ], 386 | "source": [ 387 | "y_pred, timing_results = train_and_validate_xgboost(xgb_hist_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 15, 393 | "metadata": { 394 | "collapsed": true, 395 | "deletable": true, 396 | "editable": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "results_dict['xgb_hist']={\n", 401 | " 'train_time': np.sum(timing_results['train_time']),\n", 402 | " 'test_time': np.sum(timing_results['test_time']),\n", 403 | " 'performance': classification_metrics(metrics_dict, \n", 404 | " y_test, \n", 405 | " threshold_prediction(y_pred, threshold=0.1)) 
\n", 406 | "}" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": { 412 | "deletable": true, 413 | "editable": true 414 | }, 415 | "source": [ 416 | "## LightGBM" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 16, 422 | "metadata": { 423 | "collapsed": false, 424 | "deletable": true, 425 | "editable": true 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "lgb_params = {'num_leaves': 2**6,\n", 430 | " 'learning_rate': 0.1,\n", 431 | " 'scale_pos_weight': 2,\n", 432 | " 'min_split_gain': 0.1,\n", 433 | " 'min_child_weight': 1,\n", 434 | " 'reg_lambda': 1,\n", 435 | " 'subsample': 1,\n", 436 | " 'objective':'binary',\n", 437 | " 'task': 'train',\n", 438 | " 'nthread':number_processors,\n", 439 | " 'max_bin': 63\n", 440 | " }" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 17, 446 | "metadata": { 447 | "collapsed": false, 448 | "deletable": true, 449 | "editable": true 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stderr", 454 | "output_type": "stream", 455 | "text": [ 456 | "100%|██████████| 17/17 [03:13<00:00, 7.91s/it]\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "y_pred, timing_results = train_and_validate_lightgbm(lgb_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 18, 467 | "metadata": { 468 | "collapsed": false, 469 | "deletable": true, 470 | "editable": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "results_dict['lgbm']={\n", 475 | " 'train_time': np.sum(timing_results['train_time']),\n", 476 | " 'test_time': np.sum(timing_results['test_time']),\n", 477 | " 'performance': classification_metrics(metrics_dict, \n", 478 | " y_test, \n", 479 | " threshold_prediction(y_pred, threshold=0.1)) \n", 480 | "}" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": { 486 | "deletable": true, 487 | "editable": true 488 | }, 489 | "source": [ 490 | "Finally, we show the results." 
491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 19, 496 | "metadata": { 497 | "collapsed": false, 498 | "deletable": true, 499 | "editable": true 500 | }, 501 | "outputs": [ 502 | { 503 | "name": "stdout", 504 | "output_type": "stream", 505 | "text": [ 506 | "{\n", 507 | " \"lgbm\": {\n", 508 | " \"performance\": {\n", 509 | " \"Accuracy\": 0.37233071728417594,\n", 510 | " \"F1\": 0.822258366139549,\n", 511 | " \"Precision\": 0.7439077632634851,\n", 512 | " \"Recall\": 0.9734099462015139\n", 513 | " },\n", 514 | " \"test_time\": 0.1641630920021271,\n", 515 | " \"train_time\": 194.57900593099475\n", 516 | " },\n", 517 | " \"xgb\": {\n", 518 | " \"performance\": {\n", 519 | " \"Accuracy\": 0.34057309728052565,\n", 520 | " \"F1\": 0.8048263053953228,\n", 521 | " \"Precision\": 0.7184218531362171,\n", 522 | " \"Recall\": 0.9766441564762427\n", 523 | " },\n", 524 | " \"test_time\": 0.1852665030019125,\n", 525 | " \"train_time\": 313.8951129560046\n", 526 | " },\n", 527 | " \"xgb_hist\": {\n", 528 | " \"performance\": {\n", 529 | " \"Accuracy\": 0.37871874429640445,\n", 530 | " \"F1\": 0.8220252909027159,\n", 531 | " \"Precision\": 0.7447899193746976,\n", 532 | " \"Recall\": 0.9720717197264013\n", 533 | " },\n", 534 | " \"test_time\": 0.19687007299944526,\n", 535 | " \"train_time\": 2115.2851170680005\n", 536 | " }\n", 537 | "}\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "# Results\n", 543 | "print(json.dumps(results_dict, indent=4, sort_keys=True))" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": { 549 | "deletable": true, 550 | "editable": true 551 | }, 552 | "source": [ 553 | "This dataset shows an interesting behavior. It is the only notebook where XGBoost hist behaves worse than XGBoost. The reason could be because the number of features is high, 2048, and that could be causing a memory overhead. LightGBM and the standard version of XGBoost can manage this high number of features, so there is no overhead. You can try to use a higher complexity to improve the performance. For example, setting `max_depth=8` in XGBoost, `max_leaves=2**8` in XGBoost hist and `num_leaves=2**6` in LightGBM. This will cause an overhead in XGBoost hist." 554 | ] 555 | } 556 | ], 557 | "metadata": { 558 | "kernelspec": { 559 | "display_name": "Python3.5 (Strata)", 560 | "language": "python", 561 | "name": "strata" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.5.2" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 0 578 | } 579 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/04_PlanetKaggle_GPU.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 04: Amazon Planet (GPU version)\n", 11 | "\n", 12 | "This experiment uses the data from the Kaggle competition [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/leaderboard). 
Here we use a pretrained ResNet50 model to generate the features from the dataset.\n", 13 | "\n", 14 | "The details of the machine we used and the version of the libraries can be found in [experiment 01](01_airline.ipynb)." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true, 24 | "scrolled": true 25 | }, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "Using TensorFlow backend.\n" 32 | ] 33 | }, 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "System version: 3.5.2 |Anaconda custom (64-bit)| (default, Jul 2 2016, 17:53:06) \n", 39 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 40 | "XGBoost version: 0.6\n", 41 | "LightGBM version: 0.2\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import sys, os\n", 47 | "from collections import defaultdict\n", 48 | "import numpy as np\n", 49 | "import pkg_resources\n", 50 | "from libs.loaders import load_planet_kaggle\n", 51 | "from libs.planet_kaggle import threshold_prediction\n", 52 | "from libs.timer import Timer\n", 53 | "import lightgbm as lgb\n", 54 | "import xgboost as xgb\n", 55 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 56 | "from tqdm import tqdm\n", 57 | "import tensorflow as tf\n", 58 | "from keras.backend.tensorflow_backend import set_session, get_session\n", 59 | "\n", 60 | "print(\"System version: {}\".format(sys.version))\n", 61 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 62 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 2, 68 | "metadata": { 69 | "collapsed": false, 70 | "deletable": true, 71 | "editable": true 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "env: MOUNT_POINT=/datadrive\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%env MOUNT_POINT=/datadrive" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": { 90 | "collapsed": true, 91 | "deletable": true, 92 | "editable": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "#Configure TF to use only one GPU, by default TF allocates memory in all GPUs\n", 97 | "config = tf.ConfigProto(device_count = {'GPU': 1})\n", 98 | "#Configure TF to limit the amount of GPU memory, by default TF takes all of them. 
\n", 99 | "config.gpu_options.per_process_gpu_memory_fraction = 0.3\n", 100 | "set_session(tf.Session(config=config))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "deletable": true, 107 | "editable": true 108 | }, 109 | "source": [ 110 | "The images are loaded and featurised using a pretrained ResNet50 model available from Keras" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": { 117 | "collapsed": false, 118 | "deletable": true, 119 | "editable": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "X_train, y_train, X_test, y_test = load_planet_kaggle()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "metadata": { 130 | "collapsed": false, 131 | "deletable": true, 132 | "editable": true 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "(35000, 2048)\n", 140 | "(35000, 17)\n", 141 | "(5479, 2048)\n", 142 | "(5479, 17)\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "print(X_train.shape)\n", 148 | "print(y_train.shape)\n", 149 | "print(X_test.shape)\n", 150 | "print(y_test.shape)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "deletable": true, 157 | "editable": true 158 | }, 159 | "source": [ 160 | "## XGBoost " 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "deletable": true, 167 | "editable": true 168 | }, 169 | "source": [ 170 | "We will use a one-v-rest. So each classifier will be responsible for determining whether the assigned tag applies to the image" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": { 177 | "collapsed": false, 178 | "deletable": true, 179 | "editable": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "def train_and_validate_xgboost(params, train_features, train_labels, validation_features, num_boost_round):\n", 184 | " n_classes = train_labels.shape[1]\n", 185 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 186 | " time_results = defaultdict(list)\n", 187 | " for class_i in tqdm(range(n_classes)):\n", 188 | " dtrain = xgb.DMatrix(data=train_features, label=train_labels[:, class_i])\n", 189 | " dtest = xgb.DMatrix(data=validation_features)\n", 190 | " with Timer() as t:\n", 191 | " model = xgb.train(params, dtrain, num_boost_round=num_boost_round)\n", 192 | " time_results['train_time'].append(t.interval)\n", 193 | " \n", 194 | " with Timer() as t:\n", 195 | " y_val_pred[:, class_i] = model.predict(dtest)\n", 196 | " time_results['test_time'].append(t.interval)\n", 197 | " \n", 198 | " return y_val_pred, time_results" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 7, 204 | "metadata": { 205 | "collapsed": true, 206 | "deletable": true, 207 | "editable": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "def train_and_validate_lightgbm(params, train_features, train_labels, validation_features, num_boost_round):\n", 212 | " n_classes = train_labels.shape[1]\n", 213 | " y_val_pred = np.zeros((validation_features.shape[0], n_classes))\n", 214 | " time_results = defaultdict(list)\n", 215 | " for class_i in tqdm(range(n_classes)):\n", 216 | " lgb_train = lgb.Dataset(train_features, train_labels[:, class_i], free_raw_data=False)\n", 217 | " with Timer() as t:\n", 218 | " model = lgb.train(params, lgb_train, num_boost_round = num_boost_round)\n", 219 | " 
time_results['train_time'].append(t.interval)\n", 220 | " \n", 221 | " with Timer() as t:\n", 222 | " y_val_pred[:, class_i] = model.predict(validation_features)\n", 223 | " time_results['test_time'].append(t.interval)\n", 224 | " \n", 225 | " return y_val_pred, time_results" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 8, 231 | "metadata": { 232 | "collapsed": false, 233 | "deletable": true, 234 | "editable": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "metrics_dict = {\n", 239 | " 'Accuracy': accuracy_score,\n", 240 | " 'Precision': lambda y_true, y_pred: precision_score(y_true, y_pred, average='samples'),\n", 241 | " 'Recall': lambda y_true, y_pred: recall_score(y_true, y_pred, average='samples'),\n", 242 | " 'F1': lambda y_true, y_pred: f1_score(y_true, y_pred, average='samples'),\n", 243 | "}\n", 244 | "\n", 245 | "def classification_metrics(metrics, y_true, y_pred):\n", 246 | " return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 9, 252 | "metadata": { 253 | "collapsed": true, 254 | "deletable": true, 255 | "editable": true 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "results_dict = dict()\n", 260 | "num_rounds = 50" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "deletable": true, 267 | "editable": true 268 | }, 269 | "source": [ 270 | "Now we are going to define the different models." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 10, 276 | "metadata": { 277 | "collapsed": true, 278 | "deletable": true, 279 | "editable": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "xgb_params = {'max_depth':2, #'max_depth':6 \n", 284 | " 'objective':'binary:logistic', \n", 285 | " 'min_child_weight':1, \n", 286 | " 'learning_rate':0.1, \n", 287 | " 'scale_pos_weight':2, \n", 288 | " 'gamma':0.1, \n", 289 | " 'reg_lamda':1, \n", 290 | " 'subsample':1,\n", 291 | " 'tree_method':'exact', \n", 292 | " 'updater':'grow_gpu',\n", 293 | " }" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "deletable": true, 300 | "editable": true 301 | }, 302 | "source": [ 303 | "*NOTE: We got an out of memory error with xgb. 
Please see the comments at the end of the notebook.*" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": false, 311 | "deletable": true, 312 | "editable": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "y_pred, timing_results = train_and_validate_xgboost(xgb_params, X_train, y_train, X_test, num_boost_round=num_rounds)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": false, 324 | "deletable": true, 325 | "editable": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "results_dict['xgb']={\n", 330 | " 'train_time': np.sum(timing_results['train_time']),\n", 331 | " 'test_time': np.sum(timing_results['test_time']),\n", 332 | " 'performance': classification_metrics(metrics_dict, \n", 333 | " y_test, \n", 334 | " threshold_prediction(y_pred, threshold=0.1)) \n", 335 | "}" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "deletable": true, 342 | "editable": true 343 | }, 344 | "source": [ 345 | "\n", 346 | "\n", 347 | "Now let's try with XGBoost histogram.\n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 12, 353 | "metadata": { 354 | "collapsed": false, 355 | "deletable": true, 356 | "editable": true 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "xgb_hist_params = {'max_depth':0, \n", 361 | " 'max_leaves':2**6, \n", 362 | " 'objective':'binary:logistic', \n", 363 | " 'min_child_weight':1, \n", 364 | " 'learning_rate':0.1, \n", 365 | " 'scale_pos_weight':2, \n", 366 | " 'gamma':0.1, \n", 367 | " 'reg_lamda':1, \n", 368 | " 'subsample':1,\n", 369 | " 'tree_method':'hist', \n", 370 | " 'grow_policy':'lossguide',\n", 371 | " 'updater':'grow_gpu_hist',\n", 372 | " 'max_bins': 63\n", 373 | " }" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": false, 381 | "deletable": true, 382 | "editable": true 383 | }, 384 | "outputs": [ 385 | { 386 | "name": "stderr", 387 | "output_type": "stream", 388 | "text": [ 389 | "\n", 390 | " 0%| | 0/17 [00:00= 0: 439 | params["devices"] = "0-" + str(args.gpus) 440 | 441 | if data.learning_task == LearningTask.REGRESSION: 442 | params["objective"] = "RMSE" 443 | elif data.learning_task == LearningTask.CLASSIFICATION: 444 | params["objective"] = "Logloss" 445 | params["scale_pos_weight"] = len(data.y_train) / np.count_nonzero(data.y_train) 446 | elif data.learning_task == LearningTask.MULTICLASS_CLASSIFICATION: 447 | params["objective"] = "MultiClassOneVsAll" 448 | params["classes_count"] = np.max(data.y_test) + 1 449 | params.update(args.extra) 450 | return params 451 | 452 | def fit(self, data, args): 453 | dtrain = cat.Pool(data.X_train, data.y_train) 454 | params = self.configure(data, args) 455 | params["iterations"] = args.ntrees 456 | self.model = cat.CatBoost(params) 457 | with Timer() as t: 458 | self.model.fit(dtrain) 459 | return t.interval 460 | 461 | def test(self, data): 462 | dtest = cat.Pool(data.X_test) 463 | if data.learning_task == LearningTask.MULTICLASS_CLASSIFICATION: 464 | prob = self.model.predict(dtest) 465 | return np.argmax(prob, axis=1) 466 | return self.model.predict(dtest) 467 | 468 | def __exit__(self, exc_type, exc_value, traceback): 469 | del self.model 470 | 471 | 472 | class CatCPUAlgorithm(CatAlgorithm): 473 | def configure(self, data, args): 474 | params = super(CatCPUAlgorithm, self).configure(data, args) 475 | 
params.update({"task_type": "CPU"}) 476 | return params 477 | 478 | 479 | class CatGPUAlgorithm(CatAlgorithm): 480 | def configure(self, data, args): 481 | params = super(CatGPUAlgorithm, self).configure(data, args) 482 | params.update({"task_type": "GPU"}) 483 | return params 484 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python modules names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 
63 | disable=print-statement, 64 | parameter-unpacking, 65 | unpacking-in-except, 66 | old-raise-syntax, 67 | backtick, 68 | long-suffix, 69 | old-ne-operator, 70 | old-octal-literal, 71 | import-star-module-level, 72 | non-ascii-bytes-literal, 73 | raw-checker-failed, 74 | bad-inline-option, 75 | locally-disabled, 76 | locally-enabled, 77 | file-ignored, 78 | suppressed-message, 79 | useless-suppression, 80 | deprecated-pragma, 81 | use-symbolic-message-instead, 82 | apply-builtin, 83 | basestring-builtin, 84 | buffer-builtin, 85 | cmp-builtin, 86 | coerce-builtin, 87 | execfile-builtin, 88 | file-builtin, 89 | long-builtin, 90 | raw_input-builtin, 91 | reduce-builtin, 92 | standarderror-builtin, 93 | unicode-builtin, 94 | xrange-builtin, 95 | coerce-method, 96 | delslice-method, 97 | getslice-method, 98 | setslice-method, 99 | no-absolute-import, 100 | old-division, 101 | dict-iter-method, 102 | dict-view-method, 103 | next-method-called, 104 | metaclass-assignment, 105 | indexing-exception, 106 | raising-string, 107 | reload-builtin, 108 | oct-method, 109 | hex-method, 110 | nonzero-method, 111 | cmp-method, 112 | input-builtin, 113 | round-builtin, 114 | intern-builtin, 115 | unichr-builtin, 116 | map-builtin-not-iterating, 117 | zip-builtin-not-iterating, 118 | range-builtin-not-iterating, 119 | filter-builtin-not-iterating, 120 | using-cmp-argument, 121 | eq-without-hash, 122 | div-method, 123 | idiv-method, 124 | rdiv-method, 125 | exception-message-attribute, 126 | invalid-str-codec, 127 | sys-max-int, 128 | bad-python3-import, 129 | deprecated-string-function, 130 | deprecated-str-translate-call, 131 | deprecated-itertools-function, 132 | deprecated-types-field, 133 | next-method-defined, 134 | dict-items-not-iterating, 135 | dict-keys-not-iterating, 136 | dict-values-not-iterating, 137 | deprecated-operator-function, 138 | deprecated-urllib-function, 139 | xreadlines-attribute, 140 | deprecated-sys-function, 141 | exception-escape, 142 | comprehension-escape, 143 | invalid-name, 144 | no-self-use, 145 | import-error, 146 | missing-docstring, 147 | unbalanced-tuple-unpacking 148 | 149 | 150 | # Enable the message, report, category or checker with the given id(s). You can 151 | # either give multiple identifier separated by comma (,) or put this option 152 | # multiple time (only on the command line, not in the configuration file where 153 | # it should appear only once). See also the "--disable" option for examples. 154 | enable=c-extension-no-member 155 | 156 | 157 | [REPORTS] 158 | 159 | # Python expression which should return a note less than 10 (10 is the highest 160 | # note). You have access to the variables errors warning, statement which 161 | # respectively contain the number of errors / warnings messages and the total 162 | # number of statements analyzed. This is used by the global evaluation report 163 | # (RP0004). 164 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 165 | 166 | # Template used to display messages. This is a python new-style format string 167 | # used to format the message information. See doc for all details. 168 | #msg-template= 169 | 170 | # Set the output format. Available formats are text, parseable, colorized, json 171 | # and msvs (visual studio). You can also give a reporter class, e.g. 172 | # mypackage.mymodule.MyReporterClass. 173 | output-format=text 174 | 175 | # Tells whether to display a full report or only the messages. 176 | reports=no 177 | 178 | # Activate the evaluation score. 
179 | score=yes 180 | 181 | 182 | [REFACTORING] 183 | 184 | # Maximum number of nested blocks for function / method body 185 | max-nested-blocks=5 186 | 187 | # Complete name of functions that never returns. When checking for 188 | # inconsistent-return-statements if a never returning function is called then 189 | # it will be considered as an explicit return statement and no message will be 190 | # printed. 191 | never-returning-functions=sys.exit 192 | 193 | 194 | [BASIC] 195 | 196 | # Naming style matching correct argument names. 197 | argument-naming-style=snake_case 198 | 199 | # Regular expression matching correct argument names. Overrides argument- 200 | # naming-style. 201 | #argument-rgx= 202 | 203 | # Naming style matching correct attribute names. 204 | attr-naming-style=snake_case 205 | 206 | # Regular expression matching correct attribute names. Overrides attr-naming- 207 | # style. 208 | #attr-rgx= 209 | 210 | # Bad variable names which should always be refused, separated by a comma. 211 | bad-names=foo, 212 | bar, 213 | baz, 214 | toto, 215 | tutu, 216 | tata 217 | 218 | # Naming style matching correct class attribute names. 219 | class-attribute-naming-style=any 220 | 221 | # Regular expression matching correct class attribute names. Overrides class- 222 | # attribute-naming-style. 223 | #class-attribute-rgx= 224 | 225 | # Naming style matching correct class names. 226 | class-naming-style=PascalCase 227 | 228 | # Regular expression matching correct class names. Overrides class-naming- 229 | # style. 230 | #class-rgx= 231 | 232 | # Naming style matching correct constant names. 233 | const-naming-style=UPPER_CASE 234 | 235 | # Regular expression matching correct constant names. Overrides const-naming- 236 | # style. 237 | #const-rgx= 238 | 239 | # Minimum line length for functions/classes that require docstrings, shorter 240 | # ones are exempt. 241 | docstring-min-length=-1 242 | 243 | # Naming style matching correct function names. 244 | function-naming-style=snake_case 245 | 246 | # Regular expression matching correct function names. Overrides function- 247 | # naming-style. 248 | #function-rgx= 249 | 250 | # Good variable names which should always be accepted, separated by a comma. 251 | good-names=i, 252 | j, 253 | k, 254 | ex, 255 | Run, 256 | _ 257 | 258 | # Include a hint for the correct naming format with invalid-name. 259 | include-naming-hint=no 260 | 261 | # Naming style matching correct inline iteration names. 262 | inlinevar-naming-style=any 263 | 264 | # Regular expression matching correct inline iteration names. Overrides 265 | # inlinevar-naming-style. 266 | #inlinevar-rgx= 267 | 268 | # Naming style matching correct method names. 269 | method-naming-style=snake_case 270 | 271 | # Regular expression matching correct method names. Overrides method-naming- 272 | # style. 273 | #method-rgx= 274 | 275 | # Naming style matching correct module names. 276 | module-naming-style=snake_case 277 | 278 | # Regular expression matching correct module names. Overrides module-naming- 279 | # style. 280 | #module-rgx= 281 | 282 | # Colon-delimited sets of names that determine each other's naming style when 283 | # the name regexes allow several styles. 284 | name-group= 285 | 286 | # Regular expression which should only match function or class names that do 287 | # not require a docstring. 288 | no-docstring-rgx=^_ 289 | 290 | # List of decorators that produce properties, such as abc.abstractproperty. 
Add 291 | # to this list to register other decorators that produce valid properties. 292 | # These decorators are taken in consideration only for invalid-name. 293 | property-classes=abc.abstractproperty 294 | 295 | # Naming style matching correct variable names. 296 | variable-naming-style=snake_case 297 | 298 | # Regular expression matching correct variable names. Overrides variable- 299 | # naming-style. 300 | #variable-rgx= 301 | 302 | 303 | [FORMAT] 304 | 305 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 306 | expected-line-ending-format= 307 | 308 | # Regexp for a line that is allowed to be longer than the limit. 309 | ignore-long-lines=^\s*(# )??$ 310 | 311 | # Number of spaces of indent required inside a hanging or continued line. 312 | indent-after-paren=4 313 | 314 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 315 | # tab). 316 | indent-string=' ' 317 | 318 | # Maximum number of characters on a single line. 319 | max-line-length=100 320 | 321 | # Maximum number of lines in a module. 322 | max-module-lines=1000 323 | 324 | # List of optional constructs for which whitespace checking is disabled. `dict- 325 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 326 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 327 | # `empty-line` allows space-only lines. 328 | no-space-check=trailing-comma, 329 | dict-separator 330 | 331 | # Allow the body of a class to be on the same line as the declaration if body 332 | # contains single statement. 333 | single-line-class-stmt=no 334 | 335 | # Allow the body of an if to be on the same line as the test if there is no 336 | # else. 337 | single-line-if-stmt=no 338 | 339 | 340 | [MISCELLANEOUS] 341 | 342 | # List of note tags to take in consideration, separated by a comma. 343 | notes=FIXME, 344 | XXX, 345 | TODO 346 | 347 | 348 | [SPELLING] 349 | 350 | # Limits count of emitted suggestions for spelling mistakes. 351 | max-spelling-suggestions=4 352 | 353 | # Spelling dictionary name. Available dictionaries: none. To make it working 354 | # install python-enchant package.. 355 | spelling-dict= 356 | 357 | # List of comma separated words that should not be checked. 358 | spelling-ignore-words= 359 | 360 | # A path to a file that contains private dictionary; one word per line. 361 | spelling-private-dict-file= 362 | 363 | # Tells whether to store unknown words to indicated private dictionary in 364 | # --spelling-private-dict-file option instead of raising a message. 365 | spelling-store-unknown-words=no 366 | 367 | 368 | [SIMILARITIES] 369 | 370 | # Ignore comments when computing similarities. 371 | ignore-comments=yes 372 | 373 | # Ignore docstrings when computing similarities. 374 | ignore-docstrings=yes 375 | 376 | # Ignore imports when computing similarities. 377 | ignore-imports=no 378 | 379 | # Minimum lines number of a similarity. 380 | min-similarity-lines=4 381 | 382 | 383 | [VARIABLES] 384 | 385 | # List of additional names supposed to be defined in builtins. Remember that 386 | # you should avoid to define new builtins when possible. 387 | additional-builtins= 388 | 389 | # Tells whether unused global variables should be treated as a violation. 390 | allow-global-unused-variables=yes 391 | 392 | # List of strings which can identify a callback function by name. A callback 393 | # name must start or end with one of those strings. 
394 | callbacks=cb_, 395 | _cb 396 | 397 | # A regular expression matching the name of dummy variables (i.e. expected to 398 | # not be used). 399 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 400 | 401 | # Argument names that match this expression will be ignored. Default to name 402 | # with leading underscore. 403 | ignored-argument-names=_.*|^ignored_|^unused_ 404 | 405 | # Tells whether we should check for unused import in __init__ files. 406 | init-import=no 407 | 408 | # List of qualified module names which can have objects that can redefine 409 | # builtins. 410 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 411 | 412 | 413 | [LOGGING] 414 | 415 | # Logging modules to check that the string format arguments are in logging 416 | # function parameter format. 417 | logging-modules=logging 418 | 419 | 420 | [TYPECHECK] 421 | 422 | # List of decorators that produce context managers, such as 423 | # contextlib.contextmanager. Add to this list to register other decorators that 424 | # produce valid context managers. 425 | contextmanager-decorators=contextlib.contextmanager 426 | 427 | # List of members which are set dynamically and missed by pylint inference 428 | # system, and so shouldn't trigger E1101 when accessed. Python regular 429 | # expressions are accepted. 430 | generated-members= 431 | 432 | # Tells whether missing members accessed in mixin class should be ignored. A 433 | # mixin class is detected if its name ends with "mixin" (case insensitive). 434 | ignore-mixin-members=yes 435 | 436 | # Tells whether to warn about missing members when the owner of the attribute 437 | # is inferred to be None. 438 | ignore-none=yes 439 | 440 | # This flag controls whether pylint should warn about no-member and similar 441 | # checks whenever an opaque object is returned when inferring. The inference 442 | # can return multiple potential results while evaluating a Python object, but 443 | # some branches might not be evaluated, which results in partial inference. In 444 | # that case, it might be useful to still emit no-member and other checks for 445 | # the rest of the inferred objects. 446 | ignore-on-opaque-inference=yes 447 | 448 | # List of class names for which member attributes should not be checked (useful 449 | # for classes with dynamically set attributes). This supports the use of 450 | # qualified names. 451 | ignored-classes=optparse.Values,thread._local,_thread._local 452 | 453 | # List of module names for which member attributes should not be checked 454 | # (useful for modules/projects where namespaces are manipulated during runtime 455 | # and thus existing member attributes cannot be deduced by static analysis. It 456 | # supports qualified module names, as well as Unix pattern matching. 457 | ignored-modules= 458 | 459 | # Show a hint with possible names when a member name was not found. The aspect 460 | # of finding the hint is based on edit distance. 461 | missing-member-hint=yes 462 | 463 | # The minimum edit distance a name should have in order to be considered a 464 | # similar match for a missing member name. 465 | missing-member-hint-distance=1 466 | 467 | # The total number of similar names that should be taken in consideration when 468 | # showing a hint for a missing member. 469 | missing-member-max-choices=1 470 | 471 | 472 | [IMPORTS] 473 | 474 | # Allow wildcard imports from modules that define __all__. 475 | allow-wildcard-with-all=no 476 | 477 | # Analyse import fallback blocks. 
This can be used to support both Python 2 and 478 | # 3 compatible code, which means that the block might have code that exists 479 | # only in one or another interpreter, leading to false positives when analysed. 480 | analyse-fallback-blocks=no 481 | 482 | # Deprecated modules which should not be used, separated by a comma. 483 | deprecated-modules=optparse,tkinter.tix 484 | 485 | # Create a graph of external dependencies in the given file (report RP0402 must 486 | # not be disabled). 487 | ext-import-graph= 488 | 489 | # Create a graph of every (i.e. internal and external) dependencies in the 490 | # given file (report RP0402 must not be disabled). 491 | import-graph= 492 | 493 | # Create a graph of internal dependencies in the given file (report RP0402 must 494 | # not be disabled). 495 | int-import-graph= 496 | 497 | # Force import order to recognize a module as part of the standard 498 | # compatibility libraries. 499 | known-standard-library= 500 | 501 | # Force import order to recognize a module as part of a third party library. 502 | known-third-party=enchant 503 | 504 | 505 | [CLASSES] 506 | 507 | # List of method names used to declare (i.e. assign) instance attributes. 508 | defining-attr-methods=__init__, 509 | __new__, 510 | setUp 511 | 512 | # List of member names, which should be excluded from the protected access 513 | # warning. 514 | exclude-protected=_asdict, 515 | _fields, 516 | _replace, 517 | _source, 518 | _make 519 | 520 | # List of valid names for the first argument in a class method. 521 | valid-classmethod-first-arg=cls 522 | 523 | # List of valid names for the first argument in a metaclass class method. 524 | valid-metaclass-classmethod-first-arg=cls 525 | 526 | 527 | [DESIGN] 528 | 529 | # Maximum number of arguments for function / method. 530 | max-args=5 531 | 532 | # Maximum number of attributes for a class (see R0902). 533 | max-attributes=7 534 | 535 | # Maximum number of boolean expressions in an if statement. 536 | max-bool-expr=5 537 | 538 | # Maximum number of branch for function / method body. 539 | max-branches=12 540 | 541 | # Maximum number of locals for function / method body. 542 | max-locals=15 543 | 544 | # Maximum number of parents for a class (see R0901). 545 | max-parents=7 546 | 547 | # Maximum number of public methods for a class (see R0904). 548 | max-public-methods=20 549 | 550 | # Maximum number of return / yield for function / method body. 551 | max-returns=6 552 | 553 | # Maximum number of statements in function / method body. 554 | max-statements=50 555 | 556 | # Minimum number of public methods for a class (see R0903). 557 | min-public-methods=2 558 | 559 | 560 | [EXCEPTIONS] 561 | 562 | # Exceptions that will emit a warning when being caught. Defaults to 563 | # "Exception". 564 | overgeneral-exceptions=Exception 565 | -------------------------------------------------------------------------------- /3rdparty/fast_retraining/experiments/06_HIGGS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Experiment 06: HIGGS boson \n", 11 | "\n", 12 | "This experiment uses the data from the [HIGGS dataset](https://archive.ics.uci.edu/ml/datasets/HIGGS) to predict the appearance of the Higgs boson. The dataset consists of 11 million of observations. More information about the data can be found in [loaders.py](libs/loaders.py). 
\n", 13 | "\n", 14 | "For details of virtual machine we used and the versions of LightGBM and XGBoost, please refer to [experiment 1](01_airline.ipynb)." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "System version: 3.5.3 |Anaconda 4.4.0 (64-bit)| (default, Mar 6 2017, 11:58:13) \n", 31 | "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n", 32 | "XGBoost version: 0.6\n", 33 | "LightGBM version: 0.2\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "import json\n", 39 | "import sys\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "import pkg_resources\n", 42 | "from libs.loaders import load_higgs\n", 43 | "from libs.timer import Timer\n", 44 | "from libs.utils import get_number_processors\n", 45 | "from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score\n", 46 | "from sklearn.model_selection import train_test_split\n", 47 | "from xgboost import XGBClassifier\n", 48 | "from lightgbm import LGBMClassifier\n", 49 | "import warnings\n", 50 | "warnings.filterwarnings('ignore')\n", 51 | "\n", 52 | "print(\"System version: {}\".format(sys.version))\n", 53 | "print(\"XGBoost version: {}\".format(pkg_resources.get_distribution('xgboost').version))\n", 54 | "print(\"LightGBM version: {}\".format(pkg_resources.get_distribution('lightgbm').version))" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": { 61 | "collapsed": false, 62 | "deletable": true, 63 | "editable": true 64 | }, 65 | "outputs": [ 66 | { 67 | "name": "stderr", 68 | "output_type": "stream", 69 | "text": [ 70 | "INFO:libs.loaders:MOUNT_POINT not found in environment. Defaulting to /fileshare\n" 71 | ] 72 | }, 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "(11000000, 29)\n", 78 | "CPU times: user 1min 12s, sys: 6.31 s, total: 1min 18s\n", 79 | "Wall time: 4min 15s\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "%%time\n", 85 | "df = load_higgs()\n", 86 | "print(df.shape)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "metadata": { 93 | "collapsed": false, 94 | "deletable": true, 95 | "editable": true 96 | }, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/html": [ 101 | "
\n", 102 | "\n", 115 | "\n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | "
bosonlepton_pTlepton_etalepton_phimissing_energy_magnitudemissing_energy_phijet_1_ptjet_1_etajet_1_phijet_1_b-tag...jet_4_etajet_4_phijet_4_b-tagm_jjm_jjjm_lvm_jlvm_bbm_wbbm_wwbb
01.00.869293-0.6350820.2256900.327470-0.6899930.754202-0.248573-1.0920640.000000...-0.010455-0.0457673.1019611.3537600.9795630.9780760.9200050.7216570.9887510.876678
11.00.9075420.3291470.3594121.497970-0.3130101.095531-0.557525-1.5882302.173076...-1.138930-0.0008190.0000000.3022200.8330480.9857000.9780980.7797320.9923560.798343
21.00.7988351.470639-1.6359750.4537730.4256291.1048751.2823221.3816640.000000...1.1288480.9004610.0000000.9097531.1083300.9856920.9513310.8032520.8659240.780118
30.01.344385-0.8766260.9359131.9920500.8824541.786066-1.646778-0.9423830.000000...-0.678379-1.3603560.0000000.9466521.0287040.9986560.7282810.8692001.0267360.957904
41.01.1050090.3213561.5224010.882808-1.2053490.681466-1.070464-0.9218710.000000...-0.3735660.1130410.0000000.7558561.3610570.9866100.8380851.1332950.8722450.808487
\n", 265 | "

5 rows × 29 columns

\n", 266 | "
" 267 | ], 268 | "text/plain": [ 269 | " boson lepton_pT lepton_eta lepton_phi missing_energy_magnitude \\\n", 270 | "0 1.0 0.869293 -0.635082 0.225690 0.327470 \n", 271 | "1 1.0 0.907542 0.329147 0.359412 1.497970 \n", 272 | "2 1.0 0.798835 1.470639 -1.635975 0.453773 \n", 273 | "3 0.0 1.344385 -0.876626 0.935913 1.992050 \n", 274 | "4 1.0 1.105009 0.321356 1.522401 0.882808 \n", 275 | "\n", 276 | " missing_energy_phi jet_1_pt jet_1_eta jet_1_phi jet_1_b-tag ... \\\n", 277 | "0 -0.689993 0.754202 -0.248573 -1.092064 0.000000 ... \n", 278 | "1 -0.313010 1.095531 -0.557525 -1.588230 2.173076 ... \n", 279 | "2 0.425629 1.104875 1.282322 1.381664 0.000000 ... \n", 280 | "3 0.882454 1.786066 -1.646778 -0.942383 0.000000 ... \n", 281 | "4 -1.205349 0.681466 -1.070464 -0.921871 0.000000 ... \n", 282 | "\n", 283 | " jet_4_eta jet_4_phi jet_4_b-tag m_jj m_jjj m_lv m_jlv \\\n", 284 | "0 -0.010455 -0.045767 3.101961 1.353760 0.979563 0.978076 0.920005 \n", 285 | "1 -1.138930 -0.000819 0.000000 0.302220 0.833048 0.985700 0.978098 \n", 286 | "2 1.128848 0.900461 0.000000 0.909753 1.108330 0.985692 0.951331 \n", 287 | "3 -0.678379 -1.360356 0.000000 0.946652 1.028704 0.998656 0.728281 \n", 288 | "4 -0.373566 0.113041 0.000000 0.755856 1.361057 0.986610 0.838085 \n", 289 | "\n", 290 | " m_bb m_wbb m_wwbb \n", 291 | "0 0.721657 0.988751 0.876678 \n", 292 | "1 0.779732 0.992356 0.798343 \n", 293 | "2 0.803252 0.865924 0.780118 \n", 294 | "3 0.869200 1.026736 0.957904 \n", 295 | "4 1.133295 0.872245 0.808487 \n", 296 | "\n", 297 | "[5 rows x 29 columns]" 298 | ] 299 | }, 300 | "execution_count": 4, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "df.head()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 5, 312 | "metadata": { 313 | "collapsed": false, 314 | "deletable": true, 315 | "editable": true 316 | }, 317 | "outputs": [ 318 | { 319 | "name": "stdout", 320 | "output_type": "stream", 321 | "text": [ 322 | "24\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "num_rounds = 200\n", 328 | "number_processors = get_number_processors()\n", 329 | "print(number_processors)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 6, 335 | "metadata": { 336 | "collapsed": false, 337 | "deletable": true, 338 | "editable": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "xgb_clf_pipeline = XGBClassifier(max_depth=5, \n", 343 | " learning_rate=0.1, \n", 344 | " scale_pos_weight=2,\n", 345 | " n_estimators=num_rounds,\n", 346 | " gamma=0.1,\n", 347 | " min_child_weight=1,\n", 348 | " reg_lambda=1,\n", 349 | " subsample=1,\n", 350 | " nthread=number_processors\n", 351 | " )" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 7, 357 | "metadata": { 358 | "collapsed": false, 359 | "deletable": true, 360 | "editable": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "xgb_hist_clf_pipeline = XGBClassifier(max_depth=0, \n", 365 | " learning_rate=0.1, \n", 366 | " scale_pos_weight=2,\n", 367 | " n_estimators=num_rounds,\n", 368 | " gamma=0.1,\n", 369 | " min_child_weight=1,\n", 370 | " reg_lambda=1,\n", 371 | " subsample=1,\n", 372 | " max_leaves=2**5,\n", 373 | " grow_policy='lossguide',\n", 374 | " tree_method='hist',\n", 375 | " nthread=number_processors\n", 376 | " )" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 8, 382 | "metadata": { 383 | "collapsed": false, 384 | "deletable": true, 385 | "editable": true 386 | }, 387 | "outputs": [], 
388 | "source": [ 389 | "lgbm_clf_pipeline = LGBMClassifier(num_leaves=2**5, \n", 390 | " learning_rate=0.1, \n", 391 | " scale_pos_weight=2,\n", 392 | " n_estimators=num_rounds,\n", 393 | " min_split_gain=0.1,\n", 394 | " min_child_weight=1,\n", 395 | " reg_lambda=1,\n", 396 | " subsample=1,\n", 397 | " nthread=number_processors\n", 398 | " )" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 9, 404 | "metadata": { 405 | "collapsed": false, 406 | "deletable": true, 407 | "editable": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "metrics_dict = {\n", 412 | " 'Accuracy': accuracy_score,\n", 413 | " 'Precision': precision_score,\n", 414 | " 'Recall': recall_score,\n", 415 | " 'AUC': roc_auc_score,\n", 416 | " 'F1': f1_score,\n", 417 | "}\n", 418 | "\n", 419 | "def classification_metrics(metrics, y_true, y_pred):\n", 420 | " return {metric_name:metric(y_true, y_pred) for metric_name, metric in metrics.items()}" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 10, 426 | "metadata": { 427 | "collapsed": false, 428 | "deletable": true, 429 | "editable": true 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "def generate_feables(df):\n", 434 | " X = df[df.columns.difference(['boson'])]\n", 435 | " y = df['boson']\n", 436 | " return X,y" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 11, 442 | "metadata": { 443 | "collapsed": false, 444 | "deletable": true, 445 | "editable": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "X, y = generate_feables(df)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 12, 455 | "metadata": { 456 | "collapsed": false, 457 | "deletable": true, 458 | "editable": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=77, test_size=500000)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 13, 468 | "metadata": { 469 | "collapsed": false, 470 | "deletable": true, 471 | "editable": true 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "results_dict = dict()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "deletable": true, 482 | "editable": true 483 | }, 484 | "source": [ 485 | "### XGBoost" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 14, 491 | "metadata": { 492 | "collapsed": false, 493 | "deletable": true, 494 | "editable": true 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "with Timer() as train_t:\n", 499 | " xgb_clf_pipeline.fit(X_train,y_train)\n", 500 | " \n", 501 | "with Timer() as test_t:\n", 502 | " y_pred = xgb_clf_pipeline.predict(X_test)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 15, 508 | "metadata": { 509 | "collapsed": true, 510 | "deletable": true, 511 | "editable": true 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "results_dict['xgb']={\n", 516 | " 'train_time': train_t.interval,\n", 517 | " 'test_time': test_t.interval,\n", 518 | " 'performance': classification_metrics(metrics_dict, \n", 519 | " y_test, \n", 520 | " y_pred) \n", 521 | "}" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 16, 527 | "metadata": { 528 | "collapsed": true, 529 | "deletable": true, 530 | "editable": true 531 | }, 532 | "outputs": [], 533 | "source": [ 534 | "with Timer() as t_train:\n", 535 | " xgb_hist_clf_pipeline.fit(X_train,y_train)" 536 | ] 537 | }, 538 | { 539 | 
"cell_type": "code", 540 | "execution_count": 17, 541 | "metadata": { 542 | "collapsed": true, 543 | "deletable": true, 544 | "editable": true 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "with Timer() as t_test:\n", 549 | " y_pred = xgb_hist_clf_pipeline.predict(X_test)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": { 556 | "collapsed": true, 557 | "deletable": true, 558 | "editable": true 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "results_dict['xgb_hist']={\n", 563 | " 'train_time': t_train.interval,\n", 564 | " 'test_time': t_test.interval,\n", 565 | " 'performance': classification_metrics(metrics_dict, \n", 566 | " y_test, \n", 567 | " y_pred) \n", 568 | "}" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": { 574 | "deletable": true, 575 | "editable": true 576 | }, 577 | "source": [ 578 | "### LightGBM" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 19, 584 | "metadata": { 585 | "collapsed": true, 586 | "deletable": true, 587 | "editable": true 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "with Timer() as train_t:\n", 592 | " lgbm_clf_pipeline.fit(X_train, y_train)\n", 593 | " \n", 594 | "with Timer() as test_t:\n", 595 | " y_pred = lgbm_clf_pipeline.predict(X_test)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 20, 601 | "metadata": { 602 | "collapsed": true, 603 | "deletable": true, 604 | "editable": true 605 | }, 606 | "outputs": [], 607 | "source": [ 608 | "results_dict['lgbm']={\n", 609 | " 'train_time': train_t.interval,\n", 610 | " 'test_time': test_t.interval,\n", 611 | " 'performance': classification_metrics(metrics_dict, \n", 612 | " y_test, \n", 613 | " y_pred) \n", 614 | "}" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 21, 620 | "metadata": { 621 | "collapsed": false, 622 | "deletable": true, 623 | "editable": true 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "{\n", 631 | " \"lgbm\": {\n", 632 | " \"performance\": {\n", 633 | " \"AUC\": 0.694682949690134,\n", 634 | " \"Accuracy\": 0.707758,\n", 635 | " \"F1\": 0.7680747894958216,\n", 636 | " \"Precision\": 0.6627597069095391,\n", 637 | " \"Recall\": 0.9131831219806763\n", 638 | " },\n", 639 | " \"test_time\": 0.7120589099995414,\n", 640 | " \"train_time\": 119.34003880199998\n", 641 | " },\n", 642 | " \"xgb\": {\n", 643 | " \"performance\": {\n", 644 | " \"AUC\": 0.6859901403358623,\n", 645 | " \"Accuracy\": 0.699694,\n", 646 | " \"F1\": 0.7635493812093622,\n", 647 | " \"Precision\": 0.6551156676187414,\n", 648 | " \"Recall\": 0.9149984903381643\n", 649 | " },\n", 650 | " \"test_time\": 0.55617916600022,\n", 651 | " \"train_time\": 2996.1667750769993\n", 652 | " },\n", 653 | " \"xgb_hist\": {\n", 654 | " \"performance\": {\n", 655 | " \"AUC\": 0.6941216899970567,\n", 656 | " \"Accuracy\": 0.70721,\n", 657 | " \"F1\": 0.767674555527519,\n", 658 | " \"Precision\": 0.6623426413523601,\n", 659 | " \"Recall\": 0.9128434480676328\n", 660 | " },\n", 661 | " \"test_time\": 0.6464068210007099,\n", 662 | " \"train_time\": 121.21175534400027\n", 663 | " }\n", 664 | "}\n" 665 | ] 666 | } 667 | ], 668 | "source": [ 669 | "# Results\n", 670 | "print(json.dumps(results_dict, indent=4, sort_keys=True))" 671 | ] 672 | } 673 | ], 674 | "metadata": { 675 | "kernelspec": { 676 | "display_name": "Python Strata", 677 | "language": "python", 678 | "name": "strata" 679 | }, 680 | 
"language_info": { 681 | "codemirror_mode": { 682 | "name": "ipython", 683 | "version": 3 684 | }, 685 | "file_extension": ".py", 686 | "mimetype": "text/x-python", 687 | "name": "python", 688 | "nbconvert_exporter": "python", 689 | "pygments_lexer": "ipython3", 690 | "version": "3.5.3" 691 | } 692 | }, 693 | "nbformat": 4, 694 | "nbformat_minor": 0 695 | } 696 | --------------------------------------------------------------------------------