├── src ├── reg_resampler.egg-info │ ├── dependency_links.txt │ ├── top_level.txt │ ├── requires.txt │ ├── SOURCES.txt │ └── PKG-INFO └── reg_resampler.py ├── dist ├── reg_resampler-1.0.tar.gz ├── reg_resampler-2.0.tar.gz ├── reg_resampler-1.0.1.tar.gz ├── reg_resampler-1.0.2.tar.gz ├── reg_resampler-1.0.3.tar.gz ├── reg_resampler-1.0.6.tar.gz ├── reg_resampler-1.0.7.tar.gz ├── reg_resampler-2.0.1.tar.gz ├── reg_resampler-2.1.0.tar.gz ├── reg_resampler-2.1.1.tar.gz ├── reg_resampler-1.0-py3-none-any.whl ├── reg_resampler-2.0-py3-none-any.whl ├── reg_resampler-1.0.1-py3-none-any.whl ├── reg_resampler-1.0.2-py3-none-any.whl ├── reg_resampler-1.0.3-py3-none-any.whl ├── reg_resampler-1.0.6-py3-none-any.whl ├── reg_resampler-1.0.7-py3-none-any.whl ├── reg_resampler-2.0.1-py3-none-any.whl ├── reg_resampler-2.1.0-py3-none-any.whl └── reg_resampler-2.1.1-py3-none-any.whl ├── setup.py ├── LICENSE ├── README.md └── tutorials ├── numpy_CV_tutorial.ipynb └── pandas_CV_tutorial.ipynb /src/reg_resampler.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/reg_resampler.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | reg_resampler 2 | -------------------------------------------------------------------------------- /src/reg_resampler.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | scikit-learn 3 | numpy 4 | -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.tar.gz -------------------------------------------------------------------------------- 
/dist/reg_resampler-2.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.1.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.2.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.3.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.3.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.6.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.6.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.7.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.7.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-2.0.1.tar.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0.1.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-2.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.0.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-2.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.1.tar.gz -------------------------------------------------------------------------------- /dist/reg_resampler-1.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-2.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.2-py3-none-any.whl 
-------------------------------------------------------------------------------- /dist/reg_resampler-1.0.3-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.3-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.6-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.6-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-1.0.7-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.7-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-2.0.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0.1-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-2.1.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.0-py3-none-any.whl -------------------------------------------------------------------------------- /dist/reg_resampler-2.1.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.1-py3-none-any.whl 
"""Packaging script for the ``reg_resampler`` single-module distribution."""
import setuptools

# Decode the README explicitly as UTF-8: without an encoding argument,
# open() falls back to the locale's preferred encoding, which makes the
# build result depend on the machine it runs on.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()


setuptools.setup(
    name='reg_resampler',
    version='2.1.1',
    author="Atif Hassan, Venkata Sai Krithik",
    author_email="atif.hit.hassan@gmail.com, pvsaikrithik@gmail.com",
    description="An interface to apply your favourite re-sampler on regression tasks.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/atif-hassan/Regression_ReSampling/",
    # The distribution ships one top-level module that lives under src/.
    py_modules=["reg_resampler"],
    package_dir={'': 'src'},
    install_requires=["pandas", "scikit-learn", "numpy"],
    include_package_data=True,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
    ],
)
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/) 2 | [![ForTheBadge built-with-love](http://ForTheBadge.com/images/badges/built-with-love.svg)](https://github.com/atif-hassan/) 3 | 4 | [![PyPI version shields.io](https://img.shields.io/pypi/v/reg-resampler.svg)](https://pypi.python.org/pypi/reg-resampler/) 5 | [![Downloads](https://pepy.tech/badge/reg-resampler)](https://pepy.tech/project/reg-resampler) 6 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/atif-hassan/Regression_ReSampling/commits/master) 7 | # Regression ReSampling 8 | A python library for repurposing traditional classification-based resampling (undersampling and/or oversampling) techniques for regression tasks. Currently supports all resampling techniques present in **imblearn** 9 | 10 | ## Why does this exist? 11 | While we were working on a regression task, we realized that the target variable was skewed, i.e., most samples were present in a particular range. One can easily solve the skew problem for classification tasks via a slew of resampling techniques (either under or over sampling) but this luxury is unavailable for regression tasks. We therefore decided to create an interface that can repurpose all resampling techniques for classification problems to regression problems! 12 | 13 | ## How to install? 14 | ```pip install reg_resampler``` 15 | 16 | ## Functions and parameters 17 | ```python 18 | # This returns a numpy list of classes for each corresponding sample. It also automatically merges classes when required 19 | fit(X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2) 20 | ``` 21 | - **X** - Either a pandas dataframe or numpy matrix. Complete data to be resampled. 
22 | - **target** - Either string (for pandas) or index (for numpy). The target variable to be resampled. 23 | - **bins=3** - The number of classes that the user wants to generate. (Default: 3) 24 | - **min_n_samples=6** - Minimum number of samples in each bin. Bins having less than this value will be merged with the closest bin. Has to be greater than the number of neighbours used by the imblearn sampler. (Default: 6) 25 | - **balanced_binning=False** - Decides whether samples are to be distributed roughly equally across all classes. (Default: False) 26 | - **verbose=2** - 0 will disable print by package, 1 will print info about class mergers and 2 will also print class distributions. 27 | 28 | ```python 29 | # Performs resampling and returns the resampled dataframe/numpy matrices in the form of data and target variable. 30 | resample(sampler_obj, trainX, trainY) 31 | ``` 32 | - **sampler_obj** - Your favourite resampling algorithm's object (currently supports imblearn) 33 | - **trainX** - Either a pandas dataframe or numpy matrix. Data to be resampled. It must also contain the target variable. 34 | - **trainY** - Numpy array of pseudo classes obtained from fit function. 35 | 36 | ### Important Note 37 | All functions return the same data type as provided in input. 38 | 39 | ## How to import? 
class resampler:
    """Apply classification re-samplers to regression data via pseudo-classes.

    ``fit`` bins the continuous target column into integer pseudo-classes
    (merging bins that hold too few samples) and ``resample`` feeds those
    classes to a sampler object exposing ``fit_resample`` and then splits the
    re-sampled rows back into features and target.
    """

    def __init__(self):
        # Dependencies are imported here and stored on the instance, as the
        # methods below access them through ``self``.
        import pandas as pd
        from sklearn.preprocessing import LabelEncoder
        from collections import Counter
        import numpy as np
        self.bins = 3
        self.pd = pd
        # LabelEncoder and Counter are no longer used internally; the
        # attributes are kept so existing code reading them keeps working.
        self.LabelEncoder = LabelEncoder
        self.Counter = Counter
        # Integer 0 is the "not fitted yet" sentinel checked by resample().
        self.X = 0
        self.Y_classes = 0
        self.target = 0
        self.np = np
        # True when fit() received a numpy matrix plus an integer column index.
        self._numpy_input = False

    def fit(self, X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2):
        """Bin the target column into pseudo-classes and return them.

        Args:
            X: pandas DataFrame or 2-D numpy array that includes the target
                column.
            target: Column label (DataFrame) or integer column index (numpy
                array; negative indices count from the end).
            bins: Number of bins requested from ``pandas.cut``/``qcut``.
            min_n_samples: Bins holding fewer samples than this are merged
                into an adjacent bin.
            balanced_binning: Use quantile bins (``qcut``) instead of
                equal-width bins (``cut``).
            verbose: 0 prints nothing, 1 prints merge messages, 2 also prints
                the final class distribution.

        Returns:
            1-D numpy integer array with one class id per row of ``X``; ids
            are contiguous and start at 0.

        Raises:
            IndexError: If an integer ``target`` is outside the columns of a
                numpy ``X``.
        """
        pd, np = self.pd, self.np
        self.bins = bins

        # bool is a subclass of int, so exclude it explicitly.
        is_index = isinstance(target, (int, np.integer)) and not isinstance(target, bool)
        # An integer target on a DataFrame is a column label, not a position.
        numpy_input = is_index and not isinstance(X, pd.DataFrame)

        if numpy_input:
            n_cols = X.shape[1]
            index = int(target)
            if index < 0:
                index += n_cols
            if not 0 <= index < n_cols:
                raise IndexError(
                    "target index " + str(target) + " is out of range for "
                    + str(n_cols) + " columns"
                )
            # Same layout as before: feature columns named after their
            # original position, followed by a final "target" column.
            names = [str(i) for i in range(n_cols) if i != index]
            frame = pd.DataFrame(np.delete(X, index, axis=1), columns=names)
            frame["target"] = X[:, index]
            column, stored_target = "target", index
        else:
            frame = X.copy()
            column, stored_target = target, target

        # labels=False yields integer bin positions directly, so no Interval
        # objects have to be label-encoded afterwards.
        if balanced_binning:
            bin_ids = pd.qcut(frame[column], q=self.bins, labels=False)
        else:
            bin_ids = pd.cut(frame[column], bins=self.bins, labels=False)

        # Encode to contiguous ids so empty bins do not leave gaps.
        _, codes = np.unique(np.asarray(bin_ids), return_inverse=True)
        codes = self._merge_small_classes(codes.reshape(-1), min_n_samples, verbose)
        if verbose > 0:
            print()

        # Encode once more: merging leaves holes in the id sequence.
        _, codes = np.unique(codes, return_inverse=True)
        codes = codes.reshape(-1)

        if verbose > 1:
            print("Class Distribution:\n-------------------")
            labels, counts = np.unique(codes, return_counts=True)
            for class_, count in zip(labels, counts):
                print(str(class_) + ": " + str(count))
            print()

        frame["classes"] = codes
        self.X = frame
        self.Y_classes = codes
        self.target = stored_target
        self._numpy_input = numpy_input
        return self.Y_classes

    def _merge_small_classes(self, codes, min_n_samples, verbose):
        """Return a copy of ``codes`` with under-populated classes merged."""
        np = self.np
        labels, counts = np.unique(codes, return_counts=True)
        groups = [[int(label), int(count)] for label, count in zip(labels, counts)]
        merged = codes.copy()

        def relabel(source, destination):
            merged[merged == source] = destination
            if verbose > 0:
                print("INFO: Class " + str(source) + " has been merged into Class "
                      + str(destination) + " due to low number of samples")

        # The lowest bin has no lower neighbour, so a too-small lowest bin is
        # folded forward into the next bin (cascading while the combined
        # group is still too small). Indexing i-1 here would wrap around to
        # the highest bin.
        first = 0
        while first < len(groups) - 1 and groups[first][1] < min_n_samples:
            relabel(groups[first][0], groups[first + 1][0])
            groups[first + 1][1] += groups[first][1]
            first += 1

        # Every later too-small bin joins the class of the bin just below it.
        for i in range(first + 1, len(groups)):
            if groups[i][1] < min_n_samples:
                relabel(groups[i][0], groups[i - 1][0])
                # Propagate the label so a run of small bins chains together.
                groups[i][0] = groups[i - 1][0]
        return merged

    def resample(self, sampler_obj, trainX, trainY):
        """Re-sample ``trainX`` using pseudo-classes and split off the target.

        Args:
            sampler_obj: Object exposing ``fit_resample(X, y)``.
            trainX: Data to re-sample; it must still contain the target
                column in the same position/label as the data given to
                ``fit``.
            trainY: Pseudo-classes for the rows of ``trainX``.

        Returns:
            ``(features, target)`` as numpy arrays when ``fit`` received a
            numpy matrix, otherwise as a DataFrame and a Series. Prints an
            error and returns ``None`` when ``fit`` has not been run.
        """
        # Y_classes is still the integer sentinel until fit() has run.
        if isinstance(self.Y_classes, int):
            print("Error! Run fit method first!!")
            return None

        resampled, _ = sampler_obj.fit_resample(trainX, trainY)

        if self._numpy_input:
            # Split on the stored column index: the target keeps its
            # original position in the re-sampled matrix.
            data = self.np.asarray(resampled)
            return self.np.delete(data, self.target, axis=1), data[:, self.target]

        # The sampler may hand back a bare array; restore the column labels
        # recorded by fit().
        if isinstance(resampled, self.np.ndarray):
            resampled = self.pd.DataFrame(
                resampled, columns=self.X.drop("classes", axis=1).columns
            )
        return resampled.drop(self.target, axis=1), resampled[self.target]
5 | Home-page: https://github.com/atif-hassan/Regression_ReSampling/ 6 | Author: Atif Hassan, Venkata Sai Krithik 7 | Author-email: atif.hit.hassan@gmail.com, pvsaikrithik@gmail.com 8 | License: UNKNOWN 9 | Description: [![forthebadge made-with-python](http://ForTheBadge.com/images/badges/made-with-python.svg)](https://www.python.org/) 10 | [![ForTheBadge built-with-love](http://ForTheBadge.com/images/badges/built-with-love.svg)](https://github.com/atif-hassan/) 11 | 12 | [![PyPI version shields.io](https://img.shields.io/pypi/v/reg-resampler.svg)](https://pypi.python.org/pypi/reg-resampler/) 13 | [![Downloads](https://pepy.tech/badge/reg-resampler)](https://pepy.tech/project/reg-resampler) 14 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/atif-hassan/Regression_ReSampling/commits/master) 15 | # Regression ReSampling 16 | A python library for repurposing traditional classification-based resampling (undersampling and/or oversampling) techniques for regression tasks. Currently supports all resampling techniques present in **imblearn** 17 | 18 | ## Why does this exist? 19 | While we were working on a regression task, we realized that the target variable was skewed, i.e., most samples were present in a particular range. One can easily solve the skew problem for classification tasks via a slew of resampling techniques (either under or over sampling) but this luxury is unavailable for regression tasks. We therefore decided to create an interface that can repurpose all resampling techniques for classification problems to regression problems! 20 | 21 | ## How to install? 22 | ```pip install reg_resampler``` 23 | 24 | ## Functions and parameters 25 | ```python 26 | # This returns a numpy list of classes for each corresponding sample. 
It also automatically merges classes when required 27 | fit(X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2) 28 | ``` 29 | - **X** - Either a pandas dataframe or numpy matrix. Complete data to be resampled. 30 | - **target** - Either string (for pandas) or index (for numpy). The target variable to be resampled. 31 | - **bins=3** - The number of classes that the user wants to generate. (Default: 3) 32 | - **min_n_samples=6** - Minimum number of samples in each bin. Bins having less than this value will be merged with the closest bin. Has to be more than neighbours in imblearn. (Default: 6) 33 | - **balanced_binning=False** - Decides whether samples are to be distributed roughly equally across all classes. (Default: False) 34 | - **verbose=2** - 0 will disable print by package, 1 will print info about class mergers and 2 will also print class distributions. 35 | 36 | ```python 37 | # Performs resampling and returns the resampled dataframe/numpy matrices in the form of data and target variable. 38 | resample(sampler_obj, trainX, trainY) 39 | ``` 40 | - **sampler_obj** - Your favourite resampling algorithm's object (currently supports imblearn) 41 | - **trainX** - Either a pandas dataframe or numpy matrix. Data to be resampled. Also, contains the target variable 42 | - **trainY** - Numpy array of psuedo classes obtained from fit function. 43 | 44 | ### Important Note 45 | All functions return the same data type as provided in input. 46 | 47 | ## How to import? 
48 | ```python 49 | from reg_resampler import resampler 50 | ``` 51 | 52 | ## Usage 53 | ```python 54 | # Initialize the resampler object 55 | rs = resampler() 56 | 57 | # You might recieve info about class merger for low sample classes 58 | # Generate classes 59 | Y_classes = rs.fit(df_train, target=target, bins=num_bins) 60 | # Create the actual target variable 61 | Y = df_train[target] 62 | 63 | # Create a smote (over-sampling) object from imblearn 64 | smote = SMOTE(random_state=27) 65 | 66 | # Now resample 67 | final_X, final_Y = rs.resample(smote, df_train, Y_classes) 68 | ``` 69 | 70 | ## Tutorials 71 | You can find further [tutorials](https://github.com/atif-hassan/Regression_ReSampling/tree/master/tutorials) on how to use this library for cross-validation 72 | 73 | ## Future Ideas 74 | - Support for more resampling techniques 75 | 76 | ## Feature Request 77 | Drop us an email at **atif.hit.hassan@gmail.com** or **pvsaikrithik@gmail.com** if you want any particular feature 78 | 79 | Platform: UNKNOWN 80 | Classifier: Programming Language :: Python :: 3 81 | Classifier: Programming Language :: Python :: 3.6 82 | Classifier: Programming Language :: Python :: 3.7 83 | Classifier: License :: OSI Approved :: BSD License 84 | Classifier: Operating System :: OS Independent 85 | Description-Content-Type: text/markdown 86 | -------------------------------------------------------------------------------- /tutorials/numpy_CV_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Get all imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 4, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from sklearn.preprocessing import LabelEncoder\n", 19 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 20 | "from lightgbm 
import LGBMRegressor\n", 21 | "from imblearn.over_sampling import SMOTE\n", 22 | "from collections import Counter\n", 23 | "from reg_resampler import resampler\n", 24 | "from sklearn.metrics import mean_squared_log_error\n", 25 | "import warnings\n", 26 | "warnings.filterwarnings(\"ignore\")" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "### Read and transform data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "scrolled": true 41 | }, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/html": [ 46 | "
\n", 47 | "\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
ProductProduct_BrandItem_CategorySubcategory_1Subcategory_2Item_RatingSelling_Price
07918627101374.3291
1760670101273293.1897
21746280291121013.5792
312146708371344.0837
4210467010802964.3470
\n", 126 | "
" 127 | ], 128 | "text/plain": [ 129 | " Product Product_Brand Item_Category Subcategory_1 Subcategory_2 \\\n", 130 | "0 791 862 7 10 137 \n", 131 | "1 760 670 10 127 329 \n", 132 | "2 1746 280 29 112 101 \n", 133 | "3 1214 670 8 37 134 \n", 134 | "4 2104 670 10 80 296 \n", 135 | "\n", 136 | " Item_Rating Selling_Price \n", 137 | "0 4.3 291 \n", 138 | "1 3.1 897 \n", 139 | "2 3.5 792 \n", 140 | "3 4.0 837 \n", 141 | "4 4.3 470 " 142 | ] 143 | }, 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "df_train = pd.read_csv(\"Train.csv\")\n", 151 | "\n", 152 | "lone = LabelEncoder()\n", 153 | "df_train[\"Product\"] = lone.fit_transform(df_train[\"Product\"])\n", 154 | "df_train[\"Product_Brand\"] = lone.fit_transform(df_train[\"Product_Brand\"])\n", 155 | "df_train[\"Item_Category\"] = lone.fit_transform(df_train[\"Item_Category\"])\n", 156 | "df_train[\"Subcategory_1\"] = lone.fit_transform(df_train[\"Subcategory_1\"])\n", 157 | "df_train[\"Subcategory_2\"] = lone.fit_transform(df_train[\"Subcategory_2\"])\n", 158 | "df_train = df_train.drop(\"Date\", axis=1)\n", 159 | "\n", 160 | "df_train.head()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### Perform K-Fold" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 3, 173 | "metadata": { 174 | "scrolled": true 175 | }, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n", 182 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n", 183 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n", 184 | "\n", 185 | "Class Distribution:\n", 186 | "-------------------\n", 187 | "0: 2350\n", 188 | "1: 102\n", 189 | "\n", 190 | "0.7399142441762613\n", 191 | "0.7027917932595997\n", 192 | 
"0.7526839471836929\n", 193 | "0.6559476353228568\n", 194 | "0.728284943818841\n", 195 | "\n", 196 | "Average: 0.7159245127522503\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "# Initialize the resampler object\n", 202 | "rs = resampler()\n", 203 | "\n", 204 | "# Generate classes\n", 205 | "Y_classes = rs.fit(df_train.values, target=-1, bins=5, verbose=2)\n", 206 | "# Create the actual target variable\n", 207 | "Y = df_train[\"Selling_Price\"]\n", 208 | "\n", 209 | "# Perform K-Fold\n", 210 | "kfold, scores = KFold(n_splits=5, shuffle=True, random_state=27), list()\n", 211 | "for train, test in kfold.split(df_train.values):\n", 212 | " # Split into train and test\n", 213 | " x_train, y_train = df_train.values[train], Y_classes[train]\n", 214 | " x_test, y_test = df_train.values[test], Y.values[test]\n", 215 | " \n", 216 | " # Remove the target variable from x_test\n", 217 | " x_test = x_test[:,:-1]\n", 218 | " \n", 219 | " # Get the class distriubtion for perfoming relative sampling in the next line\n", 220 | " xp = Counter(y_train)\n", 221 | " # Your favourite oversampler\n", 222 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n", 223 | " # Generate the over-sampled data\n", 224 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n", 225 | " \n", 226 | " # Fit the model\n", 227 | " model = LGBMRegressor(random_state=27)\n", 228 | " model.fit(x_train, np.log(y_train))\n", 229 | " preds = np.exp(model.predict(x_test))\n", 230 | " \n", 231 | " # Check the score\n", 232 | " score = np.sqrt(mean_squared_log_error(y_test, preds))\n", 233 | " print(score)\n", 234 | " scores.append(score)\n", 235 | "print(\"\\nAverage: \", sum(scores)/len(scores))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### Perform Stratified K-Fold" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | 
"name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n", 255 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n", 256 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n", 257 | "\n", 258 | "Class Distribution:\n", 259 | "-------------------\n", 260 | "0: 2350\n", 261 | "1: 102\n", 262 | "\n", 263 | "0.681783831084692\n", 264 | "0.6997998885546327\n", 265 | "0.7386469827713933\n", 266 | "0.6996689571871663\n", 267 | "0.7698032886366912\n", 268 | "\n", 269 | "Average: 0.7179405896469151\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "# Initialize the resampler object\n", 275 | "rs = resampler()\n", 276 | "\n", 277 | "# Generate classes\n", 278 | "Y_classes = rs.fit(df_train.values, target=-1, bins=5, verbose=2)\n", 279 | "# Create the actual target variable\n", 280 | "Y = df_train[\"Selling_Price\"]\n", 281 | "\n", 282 | "# Perform K-Fold\n", 283 | "kfold, scores = StratifiedKFold(n_splits=5, shuffle=True, random_state=27), list()\n", 284 | "for train, test in kfold.split(df_train.values, Y_classes):\n", 285 | " # Split into train and test\n", 286 | " x_train, y_train = df_train.values[train], Y_classes[train]\n", 287 | " x_test, y_test = df_train.values[test], Y.values[test]\n", 288 | " \n", 289 | " # Remove the target variable from x_test\n", 290 | " x_test = x_test[:,:-1]\n", 291 | " \n", 292 | " # Get the class distriubtion for perfoming relative sampling in the next line\n", 293 | " xp = Counter(y_train)\n", 294 | " # Your favourite oversampler\n", 295 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n", 296 | " # Generate the over-sampled data\n", 297 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n", 298 | " \n", 299 | " # Fit the model\n", 300 | " model = LGBMRegressor(random_state=27)\n", 301 | " model.fit(x_train, np.log(y_train))\n", 302 | " preds = 
np.exp(model.predict(x_test))\n", 303 | " \n", 304 | " # Check the score\n", 305 | " score = np.sqrt(mean_squared_log_error(y_test, preds))\n", 306 | " print(score)\n", 307 | " scores.append(score)\n", 308 | "print(\"\\nAverage: \", sum(scores)/len(scores))" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 3", 322 | "language": "python", 323 | "name": "python3" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.7.4" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 4 340 | } 341 | -------------------------------------------------------------------------------- /tutorials/pandas_CV_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Get all imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | "Using TensorFlow backend.\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from sklearn.preprocessing import LabelEncoder\n", 27 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 28 | "from lightgbm import LGBMRegressor\n", 29 | "from imblearn.over_sampling import SMOTE\n", 30 | "from collections import Counter\n", 31 | "from reg_resampler import resampler\n", 32 | "from sklearn.metrics import mean_squared_log_error\n", 33 | "import warnings\n", 34 | 
"warnings.filterwarnings(\"ignore\")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Read and transform data" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": { 48 | "scrolled": true 49 | }, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | "
ProductProduct_BrandItem_CategorySubcategory_1Subcategory_2Item_RatingSelling_Price
07918627101374.3291
1760670101273293.1897
21746280291121013.5792
312146708371344.0837
4210467010802964.3470
\n", 134 | "
" 135 | ], 136 | "text/plain": [ 137 | " Product Product_Brand Item_Category Subcategory_1 Subcategory_2 \\\n", 138 | "0 791 862 7 10 137 \n", 139 | "1 760 670 10 127 329 \n", 140 | "2 1746 280 29 112 101 \n", 141 | "3 1214 670 8 37 134 \n", 142 | "4 2104 670 10 80 296 \n", 143 | "\n", 144 | " Item_Rating Selling_Price \n", 145 | "0 4.3 291 \n", 146 | "1 3.1 897 \n", 147 | "2 3.5 792 \n", 148 | "3 4.0 837 \n", 149 | "4 4.3 470 " 150 | ] 151 | }, 152 | "execution_count": 2, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df_train = pd.read_csv(\"Train.csv\")\n", 159 | "\n", 160 | "lone = LabelEncoder()\n", 161 | "df_train[\"Product\"] = lone.fit_transform(df_train[\"Product\"])\n", 162 | "df_train[\"Product_Brand\"] = lone.fit_transform(df_train[\"Product_Brand\"])\n", 163 | "df_train[\"Item_Category\"] = lone.fit_transform(df_train[\"Item_Category\"])\n", 164 | "df_train[\"Subcategory_1\"] = lone.fit_transform(df_train[\"Subcategory_1\"])\n", 165 | "df_train[\"Subcategory_2\"] = lone.fit_transform(df_train[\"Subcategory_2\"])\n", 166 | "df_train = df_train.drop(\"Date\", axis=1)\n", 167 | "\n", 168 | "df_train.head()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Perform K-Fold" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "metadata": { 182 | "scrolled": true 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n", 190 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n", 191 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n", 192 | "\n", 193 | "Class Distribution:\n", 194 | "-------------------\n", 195 | "0: 2350\n", 196 | "1: 102\n", 197 | "\n", 198 | "0.7482941493114452\n", 199 | "0.6863161770553499\n", 200 | 
"0.7564989146242577\n", 201 | "0.6666193447469937\n", 202 | "0.7311120964801305\n", 203 | "\n", 204 | "Average: 0.7177681364436354\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "# Initialize the resampler object\n", 210 | "rs = resampler()\n", 211 | "\n", 212 | "# Generate classes\n", 213 | "Y_classes = rs.fit(df_train, target=\"Selling_Price\", bins=5, verbose=2)\n", 214 | "# Create the actual target variable\n", 215 | "Y = df_train[\"Selling_Price\"]\n", 216 | "\n", 217 | "# Perform K-Fold\n", 218 | "kfold, scores = KFold(n_splits=5, shuffle=True, random_state=27), list()\n", 219 | "for train, test in kfold.split(df_train):\n", 220 | " # Split into train and test\n", 221 | " x_train, y_train = df_train.iloc[train], Y_classes[train]\n", 222 | " x_test, y_test = df_train.iloc[test], Y.iloc[test]\n", 223 | " \n", 224 | " # Remove the target variable from x_test\n", 225 | " x_test = x_test.drop(\"Selling_Price\", axis=1)\n", 226 | " \n", 227 | " # Get the class distriubtion for perfoming relative sampling in the next line\n", 228 | " xp = Counter(y_train)\n", 229 | " # Your favourite oversampler\n", 230 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n", 231 | " # Generate the over-sampled data\n", 232 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n", 233 | " \n", 234 | " # Fit the model\n", 235 | " model = LGBMRegressor(random_state=27)\n", 236 | " model.fit(x_train.values, np.log(y_train.values))\n", 237 | " preds = np.exp(model.predict(x_test.values))\n", 238 | " \n", 239 | " # Check the score\n", 240 | " score = np.sqrt(mean_squared_log_error(y_test.values, preds))\n", 241 | " print(score)\n", 242 | " scores.append(score)\n", 243 | "print(\"\\nAverage: \", sum(scores)/len(scores))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Perform Stratified K-Fold" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 5, 256 | 
"metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n", 263 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n", 264 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n", 265 | "\n", 266 | "Class Distribution:\n", 267 | "-------------------\n", 268 | "0: 2350\n", 269 | "1: 102\n", 270 | "\n", 271 | "0.681783831084692\n", 272 | "0.6997998885546327\n", 273 | "0.7386469827713933\n", 274 | "0.6996689571871663\n", 275 | "0.7698032886366912\n", 276 | "\n", 277 | "Average: 0.7179405896469151\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "# Initialize the resampler object\n", 283 | "rs = resampler()\n", 284 | "\n", 285 | "# Generate classes\n", 286 | "Y_classes = rs.fit(df_train, target=\"Selling_Price\", bins=5, verbose=2)\n", 287 | "# Create the actual target variable\n", 288 | "Y = df_train[\"Selling_Price\"]\n", 289 | "\n", 290 | "# Perform K-Fold\n", 291 | "kfold, scores = StratifiedKFold(n_splits=5, shuffle=True, random_state=27), list()\n", 292 | "for train, test in kfold.split(df_train, Y_classes):\n", 293 | " # Split into train and test\n", 294 | " x_train, y_train = df_train.iloc[train], Y_classes[train]\n", 295 | " x_test, y_test = df_train.iloc[test], Y.iloc[test]\n", 296 | " \n", 297 | " # Remove the target variable from x_test\n", 298 | " x_test = x_test.drop(\"Selling_Price\", axis=1)\n", 299 | " \n", 300 | " # Get the class distriubtion for perfoming relative sampling in the next line\n", 301 | " xp = Counter(y_train)\n", 302 | " # Your favourite oversampler\n", 303 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n", 304 | " # Generate the over-sampled data\n", 305 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n", 306 | " \n", 307 | " # Fit the model\n", 308 | " model = LGBMRegressor(random_state=27)\n", 
309 | " model.fit(x_train.values, np.log(y_train.values))\n", 310 | " preds = np.exp(model.predict(x_test.values))\n", 311 | " \n", 312 | " # Check the score\n", 313 | " score = np.sqrt(mean_squared_log_error(y_test.values, preds))\n", 314 | " print(score)\n", 315 | " scores.append(score)\n", 316 | "print(\"\\nAverage: \", sum(scores)/len(scores))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Python 3", 330 | "language": "python", 331 | "name": "python3" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 3 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | "nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython3", 343 | "version": "3.7.4" 344 | } 345 | }, 346 | "nbformat": 4, 347 | "nbformat_minor": 4 348 | } 349 | --------------------------------------------------------------------------------