├── src
│   ├── reg_resampler.egg-info
│   │   ├── dependency_links.txt
│   │   ├── top_level.txt
│   │   ├── requires.txt
│   │   ├── SOURCES.txt
│   │   └── PKG-INFO
│   └── reg_resampler.py
├── dist
│   ├── reg_resampler-1.0.tar.gz
│   ├── reg_resampler-2.0.tar.gz
│   ├── reg_resampler-1.0.1.tar.gz
│   ├── reg_resampler-1.0.2.tar.gz
│   ├── reg_resampler-1.0.3.tar.gz
│   ├── reg_resampler-1.0.6.tar.gz
│   ├── reg_resampler-1.0.7.tar.gz
│   ├── reg_resampler-2.0.1.tar.gz
│   ├── reg_resampler-2.1.0.tar.gz
│   ├── reg_resampler-2.1.1.tar.gz
│   ├── reg_resampler-1.0-py3-none-any.whl
│   ├── reg_resampler-2.0-py3-none-any.whl
│   ├── reg_resampler-1.0.1-py3-none-any.whl
│   ├── reg_resampler-1.0.2-py3-none-any.whl
│   ├── reg_resampler-1.0.3-py3-none-any.whl
│   ├── reg_resampler-1.0.6-py3-none-any.whl
│   ├── reg_resampler-1.0.7-py3-none-any.whl
│   ├── reg_resampler-2.0.1-py3-none-any.whl
│   ├── reg_resampler-2.1.0-py3-none-any.whl
│   └── reg_resampler-2.1.1-py3-none-any.whl
├── setup.py
├── LICENSE
├── README.md
└── tutorials
    ├── numpy_CV_tutorial.ipynb
    └── pandas_CV_tutorial.ipynb
/src/reg_resampler.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/reg_resampler.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | reg_resampler
2 |
--------------------------------------------------------------------------------
/src/reg_resampler.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | scikit-learn
3 | numpy
4 |
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-2.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.1.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.2.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.3.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.3.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.6.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.6.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.7.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.7.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-2.0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0.1.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-2.1.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.0.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-2.1.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.1.tar.gz
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-2.0-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.1-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.2-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.3-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.3-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.6-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.6-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-1.0.7-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-1.0.7-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-2.0.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.0.1-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-2.1.0-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.0-py3-none-any.whl
--------------------------------------------------------------------------------
/dist/reg_resampler-2.1.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/atif-hassan/Regression_ReSampling/HEAD/dist/reg_resampler-2.1.1-py3-none-any.whl
--------------------------------------------------------------------------------
/src/reg_resampler.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | README.md
2 | setup.py
3 | src/reg_resampler.py
4 | src/reg_resampler.egg-info/PKG-INFO
5 | src/reg_resampler.egg-info/SOURCES.txt
6 | src/reg_resampler.egg-info/dependency_links.txt
7 | src/reg_resampler.egg-info/requires.txt
8 | src/reg_resampler.egg-info/top_level.txt
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r") as fh:
4 |     long_description = fh.read()
5 |
6 |
7 | setuptools.setup(
8 |     name = 'reg_resampler',
9 |     version = '2.1.1',
10 |     author = "Atif Hassan, Venkata Sai Krithik",
11 |     author_email = "atif.hit.hassan@gmail.com, pvsaikrithik@gmail.com",
12 |     description = "An interface to apply your favourite re-sampler on regression tasks.",
13 |     long_description = long_description,
14 |     long_description_content_type = "text/markdown",
15 |     url = "https://github.com/atif-hassan/Regression_ReSampling/",
16 |     py_modules = ["reg_resampler"],
17 |     package_dir = {'': 'src'},
18 |     install_requires = ["pandas", "scikit-learn", "numpy"],
19 |     include_package_data = True,
20 |     classifiers = [
21 |         "Programming Language :: Python :: 3",
22 |         "Programming Language :: Python :: 3.6",
23 |         "Programming Language :: Python :: 3.7",
24 |         "License :: OSI Approved :: BSD License",
25 |         "Operating System :: OS Independent",
26 |     ]
27 | )
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2020, Atif Hassan, Venkata Sai Krithik
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Python](https://www.python.org/)
2 | [GitHub](https://github.com/atif-hassan/)
3 |
4 | [PyPI version](https://pypi.python.org/pypi/reg-resampler/)
5 | [Downloads](https://pepy.tech/project/reg-resampler)
6 | [Last commit](https://github.com/atif-hassan/Regression_ReSampling/commits/master)
7 | # Regression ReSampling
8 | A Python library for repurposing traditional classification-based resampling techniques (undersampling and/or oversampling) for regression tasks. It currently supports all resampling techniques present in **imblearn**.
9 |
10 | ## Why does this exist?
11 | While working on a regression task, we realized that the target variable was skewed, i.e., most samples were concentrated in a particular range. For classification tasks, the skew problem is easily solved with a slew of resampling techniques (either under- or over-sampling), but this luxury is unavailable for regression tasks. We therefore decided to create an interface that repurposes resampling techniques built for classification problems for regression problems!
12 |
13 | ## How to install?
14 | ```pip install reg_resampler```
15 |
16 | ## Functions and parameters
17 | ```python
18 | # Returns a numpy array of pseudo-classes, one for each sample. It also automatically merges classes when required.
19 | fit(X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2)
20 | ```
21 | - **X** - Either a pandas dataframe or numpy matrix. Complete data to be resampled.
22 | - **target** - Either a column name (string, for pandas input) or a column index (int, for numpy input). The target variable to be resampled.
23 | - **bins=3** - The number of classes that the user wants to generate. (Default: 3)
24 | - **min_n_samples=6** - Minimum number of samples in each bin. Bins with fewer samples than this are merged into the closest bin. Must be larger than the number of neighbours used by your imblearn sampler (e.g. SMOTE's k_neighbors). (Default: 6)
25 | - **balanced_binning=False** - Decides whether samples are to be distributed roughly equally across all classes. (Default: False)
26 | - **verbose=2** - 0 disables all printing by the package, 1 prints info about class mergers, and 2 additionally prints the class distribution. (Default: 2)
27 |
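A minimal sketch of a `fit` call using these parameters (the dataframe `df` and the target column name `"price"` are hypothetical placeholders):

```python
from reg_resampler import resampler

rs = resampler()
# Bin the continuous target ("price") into 5 pseudo-classes; any bin with
# fewer than 8 samples is merged into the closest bin
Y_classes = rs.fit(df, target="price", bins=5, min_n_samples=8,
                   balanced_binning=False, verbose=1)
```
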
28 | ```python
29 | # Performs resampling and returns the resampled data and target variable as dataframes/numpy matrices.
30 | resample(sampler_obj, trainX, trainY)
31 | ```
32 | - **sampler_obj** - Your favourite resampling algorithm's object (currently supports imblearn)
33 | - **trainX** - Either a pandas dataframe or numpy matrix. The data to be resampled; it must also contain the target variable.
34 | - **trainY** - Numpy array of pseudo-classes obtained from the fit function.
35 |
36 | ### Important Note
37 | All functions return the same data type that was provided as input.
38 |
39 | ## How to import?
40 | ```python
41 | from reg_resampler import resampler
42 | ```
43 |
44 | ## Usage
45 | ```python
46 | # Initialize the resampler object
47 | rs = resampler()
48 |
49 | # You might receive info about class mergers for low-sample classes
50 | # Generate classes
51 | Y_classes = rs.fit(df_train, target=target, bins=num_bins)
52 | # Create the actual target variable
53 | Y = df_train[target]
54 |
55 | # Create a SMOTE (over-sampling) object from imblearn
56 | smote = SMOTE(random_state=27)
57 |
58 | # Now resample
59 | final_X, final_Y = rs.resample(smote, df_train, Y_classes)
60 | ```
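The same workflow with numpy input is a small variation; a sketch, assuming `X` is a numpy matrix whose last column is the target:

```python
from imblearn.over_sampling import SMOTE
from reg_resampler import resampler

rs = resampler()
# target=-1 points to the last column of the numpy matrix
Y_classes = rs.fit(X, target=-1, bins=3)

smote = SMOTE(random_state=27)
# Since the input was numpy, final_X and final_Y come back as numpy arrays
final_X, final_Y = rs.resample(smote, X, Y_classes)
```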
61 |
62 | ## Tutorials
63 | You can find further [tutorials](https://github.com/atif-hassan/Regression_ReSampling/tree/master/tutorials) on how to use this library for cross-validation.
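A condensed sketch of the stratified cross-validation pattern from those tutorials (the dataframe `df_train` and the `Selling_Price` column follow the tutorial data and are illustrative):

```python
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from reg_resampler import resampler

rs = resampler()
Y_classes = rs.fit(df_train, target="Selling_Price", bins=5)
Y = df_train["Selling_Price"]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=27)
for train, test in skf.split(df_train, Y_classes):
    x_train, y_train = df_train.iloc[train], Y_classes[train]
    x_test, y_test = df_train.iloc[test].drop("Selling_Price", axis=1), Y.iloc[test]

    # Over-sample only the training fold, relative to its class distribution
    xp = Counter(y_train)
    smote = SMOTE(random_state=27, sampling_strategy={0: int(xp[0] * 1.2), 1: int(xp[1] * 2.0)})
    x_train, y_train = rs.resample(smote, x_train, y_train)

    # Fit your favourite regressor on x_train / y_train and evaluate on x_test / y_test
```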
64 |
65 | ## Future Ideas
66 | - Support for more resampling techniques
67 |
68 | ## Feature Request
69 | Drop us an email at **atif.hit.hassan@gmail.com** or **pvsaikrithik@gmail.com** if you would like a particular feature.
70 |
--------------------------------------------------------------------------------
/src/reg_resampler.py:
--------------------------------------------------------------------------------
1 | class resampler:
2 |     def __init__(self):
3 |         import pandas as pd
4 |         from sklearn.preprocessing import LabelEncoder
5 |         from collections import Counter
6 |         import numpy as np
7 |         self.bins = 3
8 |         self.pd = pd
9 |         self.LabelEncoder = LabelEncoder
10 |         self.Counter = Counter
11 |         self.X = 0
12 |         self.Y_classes = 0
13 |         self.target = 0
14 |         self.np = np
15 |
16 |     # This function adds classes to each sample and returns the class list as a dataframe/numpy array (as per input)
17 |     # It also merges classes as and when required
18 |     def fit(self, X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2):
19 |         self.bins = bins
20 |         tmp = target
21 |
22 |         # If data is numpy, then convert it into pandas
23 |         if type(target) == int:
24 |             if target < 0:
25 |                 target = X.shape[1]+target
26 |                 tmp = target
27 |             self.X = self.pd.DataFrame()
28 |             for i in range(X.shape[1]):
29 |                 if i!=target:
30 |                     self.X[str(i)] = X[:,i]
31 |             self.X["target"] = X[:,target]
32 |             target = "target"
33 |         else:
34 |             self.X = X.copy()
35 |
36 |         # Use qcut if balanced binning is required
37 |         if balanced_binning:
38 |             self.Y_classes = self.pd.qcut(self.X[target], q=self.bins, precision=0)
39 |         else:
40 |             self.Y_classes = self.pd.cut(self.X[target], bins=self.bins)
41 |
42 |         # Pandas outputs ranges after binning. Convert ranges to classes
43 |         le = self.LabelEncoder()
44 |         self.Y_classes = le.fit_transform(self.Y_classes)
45 |
46 |         # Merge classes if number of neighbours is more than the number of samples
47 |         classes_count = list(map(list, self.Counter(self.Y_classes).items()))
48 |         classes_count = sorted(classes_count, key = lambda x: x[0])
49 |         mid_point = len(classes_count)
50 |         # Logic for merging
51 |         for i in range(len(classes_count)):
52 |             if classes_count[i][1] < min_n_samples:
53 |                 self.Y_classes[self.np.where(self.Y_classes == classes_count[i][0])[0]] = classes_count[i-1][0]
54 |                 if verbose > 0:
55 |                     print("INFO: Class " + str(classes_count[i][0]) + " has been merged into Class " + str(classes_count[i-1][0]) + " due to low number of samples")
56 |                 classes_count[i][0] = classes_count[i-1][0]
57 |         if verbose > 0:
58 |             print()
59 |
60 |         # Perform label-encoding once again
61 |         # Avoids class skipping after merging
62 |         le = self.LabelEncoder()
63 |         self.Y_classes = le.fit_transform(self.Y_classes)
64 |
65 |         # Pretty print
66 |         if verbose > 1:
67 |             print("Class Distribution:\n-------------------")
68 |             classes_count = list(map(list, self.Counter(self.Y_classes).items()))
69 |             classes_count = sorted(classes_count, key = lambda x: x[0])
70 |             for class_, count in classes_count:
71 |                 print(str(class_)+": "+str(count))
72 |             print()
73 |
74 |         # Finally concatenate and return as dataframe or numpy
75 |         # Based on what type of target was sent
76 |         self.X["classes"] = self.Y_classes
77 |         if type(tmp) == int:
78 |             self.target = tmp
79 |         else:
80 |             self.target = target
81 |         return self.Y_classes
82 |
83 |
84 |
85 |     # This function performs the re-sampling
86 |     def resample(self, sampler_obj, trainX, trainY):
87 |         # If classes haven't yet been created, then run the "fit" function
88 |         if type(self.Y_classes) == int:
89 |             print("Error! Run fit method first!!")
90 |             return None
91 |
92 |         # Finally, perform the re-sampling
93 |         resampled_data, _ = sampler_obj.fit_resample(trainX, trainY)
94 |         if type(resampled_data).__module__ == 'numpy':
95 |             resampled_data = self.pd.DataFrame(resampled_data, columns=self.X.drop("classes", axis=1).columns)
96 |
97 |         # Return the correct X and Y
98 |         if type(self.target) == int:
99 |             return resampled_data.drop("target", axis=1).values, resampled_data["target"].values
100 |         else:
101 |             return resampled_data.drop(self.target, axis=1), resampled_data[self.target]
102 |
--------------------------------------------------------------------------------
/src/reg_resampler.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: reg-resampler
3 | Version: 2.1.1
4 | Summary: An interface to apply your favourite re-sampler on regression tasks.
5 | Home-page: https://github.com/atif-hassan/Regression_ReSampling/
6 | Author: Atif Hassan, Venkata Sai Krithik
7 | Author-email: atif.hit.hassan@gmail.com, pvsaikrithik@gmail.com
8 | License: UNKNOWN
9 | Description: [](https://www.python.org/)
10 | [](https://github.com/atif-hassan/)
11 |
12 | [](https://pypi.python.org/pypi/reg-resampler/)
13 | [](https://pepy.tech/project/reg-resampler)
14 | [](https://github.com/atif-hassan/Regression_ReSampling/commits/master)
15 | # Regression ReSampling
16 | A python library for repurposing traditional classification-based resampling (undersampling and/or oversampling) techniques for regression tasks. Currently supports all resampling techniques present in **imblearn**
17 |
18 | ## Why does this exist?
19 | While we were working on a regression task, we realized that the target variable was skewed, i.e., most samples were present in a particular range. One can easily solve the skew problem for classification tasks via a slew of resampling techniques (either under or over sampling) but this luxury is unavailable for regression tasks. We therefore decided to create an interface that can repurpose all resampling techniques for classification problems to regression problems!
20 |
21 | ## How to install?
22 | ```pip install reg_resampler```
23 |
24 | ## Functions and parameters
25 | ```python
26 | # This returns a numpy list of classes for each corresponding sample. It also automatically merges classes when required
27 | fit(X, target, bins=3, min_n_samples=6, balanced_binning=False, verbose=2)
28 | ```
29 | - **X** - Either a pandas dataframe or numpy matrix. Complete data to be resampled.
30 | - **target** - Either string (for pandas) or index (for numpy). The target variable to be resampled.
31 | - **bins=3** - The number of classes that the user wants to generate. (Default: 3)
32 | - **min_n_samples=6** - Minimum number of samples in each bin. Bins having less than this value will be merged with the closest bin. Has to be more than neighbours in imblearn. (Default: 6)
33 | - **balanced_binning=False** - Decides whether samples are to be distributed roughly equally across all classes. (Default: False)
34 | - **verbose=2** - 0 will disable print by package, 1 will print info about class mergers and 2 will also print class distributions.
35 |
36 | ```python
37 | # Performs resampling and returns the resampled dataframe/numpy matrices in the form of data and target variable.
38 | resample(sampler_obj, trainX, trainY)
39 | ```
40 | - **sampler_obj** - Your favourite resampling algorithm's object (currently supports imblearn)
41 | - **trainX** - Either a pandas dataframe or numpy matrix. Data to be resampled. Also, contains the target variable
42 | - **trainY** - Numpy array of pseudo-classes obtained from the fit function.
43 |
44 | ### Important Note
45 | All functions return the same data type as provided in input.
46 |
47 | ## How to import?
48 | ```python
49 | from reg_resampler import resampler
50 | ```
51 |
52 | ## Usage
53 | ```python
54 | # Initialize the resampler object
55 | rs = resampler()
56 |
57 | # You might receive info about class mergers for low-sample classes
58 | # Generate classes
59 | Y_classes = rs.fit(df_train, target=target, bins=num_bins)
60 | # Create the actual target variable
61 | Y = df_train[target]
62 |
63 | # Create a smote (over-sampling) object from imblearn
64 | smote = SMOTE(random_state=27)
65 |
66 | # Now resample
67 | final_X, final_Y = rs.resample(smote, df_train, Y_classes)
68 | ```
69 |
70 | ## Tutorials
71 | You can find further [tutorials](https://github.com/atif-hassan/Regression_ReSampling/tree/master/tutorials) on how to use this library for cross-validation
72 |
73 | ## Future Ideas
74 | - Support for more resampling techniques
75 |
76 | ## Feature Request
77 | Drop us an email at **atif.hit.hassan@gmail.com** or **pvsaikrithik@gmail.com** if you want any particular feature
78 |
79 | Platform: UNKNOWN
80 | Classifier: Programming Language :: Python :: 3
81 | Classifier: Programming Language :: Python :: 3.6
82 | Classifier: Programming Language :: Python :: 3.7
83 | Classifier: License :: OSI Approved :: BSD License
84 | Classifier: Operating System :: OS Independent
85 | Description-Content-Type: text/markdown
86 |
--------------------------------------------------------------------------------
/tutorials/numpy_CV_tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Get all imports"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 4,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "from sklearn.preprocessing import LabelEncoder\n",
19 | "from sklearn.model_selection import KFold, StratifiedKFold\n",
20 | "from lightgbm import LGBMRegressor\n",
21 | "from imblearn.over_sampling import SMOTE\n",
22 | "from collections import Counter\n",
23 | "from reg_resampler import resampler\n",
24 | "from sklearn.metrics import mean_squared_log_error\n",
25 | "import warnings\n",
26 | "warnings.filterwarnings(\"ignore\")"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### Read and transform data"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {
40 | "scrolled": true
41 | },
42 | "outputs": [
43 | {
44 | "data": {
45 | "text/html": [
46 | "[HTML table preview of df_train.head() omitted; see the text/plain output below]"
127 | ],
128 | "text/plain": [
129 | " Product Product_Brand Item_Category Subcategory_1 Subcategory_2 \\\n",
130 | "0 791 862 7 10 137 \n",
131 | "1 760 670 10 127 329 \n",
132 | "2 1746 280 29 112 101 \n",
133 | "3 1214 670 8 37 134 \n",
134 | "4 2104 670 10 80 296 \n",
135 | "\n",
136 | " Item_Rating Selling_Price \n",
137 | "0 4.3 291 \n",
138 | "1 3.1 897 \n",
139 | "2 3.5 792 \n",
140 | "3 4.0 837 \n",
141 | "4 4.3 470 "
142 | ]
143 | },
144 | "execution_count": 2,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "df_train = pd.read_csv(\"Train.csv\")\n",
151 | "\n",
152 | "lone = LabelEncoder()\n",
153 | "df_train[\"Product\"] = lone.fit_transform(df_train[\"Product\"])\n",
154 | "df_train[\"Product_Brand\"] = lone.fit_transform(df_train[\"Product_Brand\"])\n",
155 | "df_train[\"Item_Category\"] = lone.fit_transform(df_train[\"Item_Category\"])\n",
156 | "df_train[\"Subcategory_1\"] = lone.fit_transform(df_train[\"Subcategory_1\"])\n",
157 | "df_train[\"Subcategory_2\"] = lone.fit_transform(df_train[\"Subcategory_2\"])\n",
158 | "df_train = df_train.drop(\"Date\", axis=1)\n",
159 | "\n",
160 | "df_train.head()"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "### Perform K-Fold"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 3,
173 | "metadata": {
174 | "scrolled": true
175 | },
176 | "outputs": [
177 | {
178 | "name": "stdout",
179 | "output_type": "stream",
180 | "text": [
181 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n",
182 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n",
183 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n",
184 | "\n",
185 | "Class Distribution:\n",
186 | "-------------------\n",
187 | "0: 2350\n",
188 | "1: 102\n",
189 | "\n",
190 | "0.7399142441762613\n",
191 | "0.7027917932595997\n",
192 | "0.7526839471836929\n",
193 | "0.6559476353228568\n",
194 | "0.728284943818841\n",
195 | "\n",
196 | "Average: 0.7159245127522503\n"
197 | ]
198 | }
199 | ],
200 | "source": [
201 | "# Initialize the resampler object\n",
202 | "rs = resampler()\n",
203 | "\n",
204 | "# Generate classes\n",
205 | "Y_classes = rs.fit(df_train.values, target=-1, bins=5, verbose=2)\n",
206 | "# Create the actual target variable\n",
207 | "Y = df_train[\"Selling_Price\"]\n",
208 | "\n",
209 | "# Perform K-Fold\n",
210 | "kfold, scores = KFold(n_splits=5, shuffle=True, random_state=27), list()\n",
211 | "for train, test in kfold.split(df_train.values):\n",
212 | " # Split into train and test\n",
213 | " x_train, y_train = df_train.values[train], Y_classes[train]\n",
214 | " x_test, y_test = df_train.values[test], Y.values[test]\n",
215 | " \n",
216 | " # Remove the target variable from x_test\n",
217 | " x_test = x_test[:,:-1]\n",
218 | " \n",
219 | " # Get the class distribution for performing relative sampling in the next line\n",
220 | " xp = Counter(y_train)\n",
221 | " # Your favourite oversampler\n",
222 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n",
223 | " # Generate the over-sampled data\n",
224 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n",
225 | " \n",
226 | " # Fit the model\n",
227 | " model = LGBMRegressor(random_state=27)\n",
228 | " model.fit(x_train, np.log(y_train))\n",
229 | " preds = np.exp(model.predict(x_test))\n",
230 | " \n",
231 | " # Check the score\n",
232 | " score = np.sqrt(mean_squared_log_error(y_test, preds))\n",
233 | " print(score)\n",
234 | " scores.append(score)\n",
235 | "print(\"\\nAverage: \", sum(scores)/len(scores))"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### Perform Stratified K-Fold"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 5,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "name": "stdout",
252 | "output_type": "stream",
253 | "text": [
254 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n",
255 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n",
256 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n",
257 | "\n",
258 | "Class Distribution:\n",
259 | "-------------------\n",
260 | "0: 2350\n",
261 | "1: 102\n",
262 | "\n",
263 | "0.681783831084692\n",
264 | "0.6997998885546327\n",
265 | "0.7386469827713933\n",
266 | "0.6996689571871663\n",
267 | "0.7698032886366912\n",
268 | "\n",
269 | "Average: 0.7179405896469151\n"
270 | ]
271 | }
272 | ],
273 | "source": [
274 | "# Initialize the resampler object\n",
275 | "rs = resampler()\n",
276 | "\n",
277 | "# Generate classes\n",
278 | "Y_classes = rs.fit(df_train.values, target=-1, bins=5, verbose=2)\n",
279 | "# Create the actual target variable\n",
280 | "Y = df_train[\"Selling_Price\"]\n",
281 | "\n",
282 | "# Perform Stratified K-Fold\n",
283 | "kfold, scores = StratifiedKFold(n_splits=5, shuffle=True, random_state=27), list()\n",
284 | "for train, test in kfold.split(df_train.values, Y_classes):\n",
285 | " # Split into train and test\n",
286 | " x_train, y_train = df_train.values[train], Y_classes[train]\n",
287 | " x_test, y_test = df_train.values[test], Y.values[test]\n",
288 | " \n",
289 | " # Remove the target variable from x_test\n",
290 | " x_test = x_test[:,:-1]\n",
291 | " \n",
292 | " # Get the class distribution for performing relative sampling in the next line\n",
293 | " xp = Counter(y_train)\n",
294 | " # Your favourite oversampler\n",
295 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n",
296 | " # Generate the over-sampled data\n",
297 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n",
298 | " \n",
299 | " # Fit the model\n",
300 | " model = LGBMRegressor(random_state=27)\n",
301 | " model.fit(x_train, np.log(y_train))\n",
302 | " preds = np.exp(model.predict(x_test))\n",
303 | " \n",
304 | " # Check the score\n",
305 | " score = np.sqrt(mean_squared_log_error(y_test, preds))\n",
306 | " print(score)\n",
307 | " scores.append(score)\n",
308 | "print(\"\\nAverage: \", sum(scores)/len(scores))"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": []
317 | }
318 | ],
319 | "metadata": {
320 | "kernelspec": {
321 | "display_name": "Python 3",
322 | "language": "python",
323 | "name": "python3"
324 | },
325 | "language_info": {
326 | "codemirror_mode": {
327 | "name": "ipython",
328 | "version": 3
329 | },
330 | "file_extension": ".py",
331 | "mimetype": "text/x-python",
332 | "name": "python",
333 | "nbconvert_exporter": "python",
334 | "pygments_lexer": "ipython3",
335 | "version": "3.7.4"
336 | }
337 | },
338 | "nbformat": 4,
339 | "nbformat_minor": 4
340 | }
341 |
--------------------------------------------------------------------------------
/tutorials/pandas_CV_tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Get all imports"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stderr",
17 | "output_type": "stream",
18 | "text": [
19 | "Using TensorFlow backend.\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "from sklearn.preprocessing import LabelEncoder\n",
27 | "from sklearn.model_selection import KFold, StratifiedKFold\n",
28 | "from lightgbm import LGBMRegressor\n",
29 | "from imblearn.over_sampling import SMOTE\n",
30 | "from collections import Counter\n",
31 | "from reg_resampler import resampler\n",
32 | "from sklearn.metrics import mean_squared_log_error\n",
33 | "import warnings\n",
34 | "warnings.filterwarnings(\"ignore\")"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "### Read and transform data"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "metadata": {
48 | "scrolled": true
49 | },
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/html": [
54 | "[HTML table preview of df_train.head() omitted; see the text/plain output below]"
135 | ],
136 | "text/plain": [
137 | " Product Product_Brand Item_Category Subcategory_1 Subcategory_2 \\\n",
138 | "0 791 862 7 10 137 \n",
139 | "1 760 670 10 127 329 \n",
140 | "2 1746 280 29 112 101 \n",
141 | "3 1214 670 8 37 134 \n",
142 | "4 2104 670 10 80 296 \n",
143 | "\n",
144 | " Item_Rating Selling_Price \n",
145 | "0 4.3 291 \n",
146 | "1 3.1 897 \n",
147 | "2 3.5 792 \n",
148 | "3 4.0 837 \n",
149 | "4 4.3 470 "
150 | ]
151 | },
152 | "execution_count": 2,
153 | "metadata": {},
154 | "output_type": "execute_result"
155 | }
156 | ],
157 | "source": [
158 | "df_train = pd.read_csv(\"Train.csv\")\n",
159 | "\n",
160 | "lone = LabelEncoder()\n",
161 | "df_train[\"Product\"] = lone.fit_transform(df_train[\"Product\"])\n",
162 | "df_train[\"Product_Brand\"] = lone.fit_transform(df_train[\"Product_Brand\"])\n",
163 | "df_train[\"Item_Category\"] = lone.fit_transform(df_train[\"Item_Category\"])\n",
164 | "df_train[\"Subcategory_1\"] = lone.fit_transform(df_train[\"Subcategory_1\"])\n",
165 | "df_train[\"Subcategory_2\"] = lone.fit_transform(df_train[\"Subcategory_2\"])\n",
166 | "df_train = df_train.drop(\"Date\", axis=1)\n",
167 | "\n",
168 | "df_train.head()"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "### Perform K-Fold"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 6,
181 | "metadata": {
182 | "scrolled": true
183 | },
184 | "outputs": [
185 | {
186 | "name": "stdout",
187 | "output_type": "stream",
188 | "text": [
189 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n",
190 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n",
191 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n",
192 | "\n",
193 | "Class Distribution:\n",
194 | "-------------------\n",
195 | "0: 2350\n",
196 | "1: 102\n",
197 | "\n",
198 | "0.7482941493114452\n",
199 | "0.6863161770553499\n",
200 | "0.7564989146242577\n",
201 | "0.6666193447469937\n",
202 | "0.7311120964801305\n",
203 | "\n",
204 | "Average: 0.7177681364436354\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "# Initialize the resampler object\n",
210 | "rs = resampler()\n",
211 | "\n",
212 | "# Generate classes\n",
213 | "Y_classes = rs.fit(df_train, target=\"Selling_Price\", bins=5, verbose=2)\n",
214 | "# Create the actual target variable\n",
215 | "Y = df_train[\"Selling_Price\"]\n",
216 | "\n",
217 | "# Perform K-Fold\n",
218 | "kfold, scores = KFold(n_splits=5, shuffle=True, random_state=27), list()\n",
219 | "for train, test in kfold.split(df_train):\n",
220 | " # Split into train and test\n",
221 | " x_train, y_train = df_train.iloc[train], Y_classes[train]\n",
222 | " x_test, y_test = df_train.iloc[test], Y.iloc[test]\n",
223 | " \n",
224 | " # Remove the target variable from x_test\n",
225 | " x_test = x_test.drop(\"Selling_Price\", axis=1)\n",
226 | " \n",
227 | " # Get the class distribution for performing relative sampling in the next line\n",
228 | " xp = Counter(y_train)\n",
229 | " # Your favourite oversampler\n",
230 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n",
231 | " # Generate the over-sampled data\n",
232 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n",
233 | " \n",
234 | " # Fit the model\n",
235 | " model = LGBMRegressor(random_state=27)\n",
236 | " model.fit(x_train.values, np.log(y_train.values))\n",
237 | " preds = np.exp(model.predict(x_test.values))\n",
238 | " \n",
239 | " # Check the score\n",
240 | " score = np.sqrt(mean_squared_log_error(y_test.values, preds))\n",
241 | " print(score)\n",
242 | " scores.append(score)\n",
243 | "print(\"\\nAverage: \", sum(scores)/len(scores))"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "### Perform Stratified K-Fold"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 5,
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "name": "stdout",
260 | "output_type": "stream",
261 | "text": [
262 | "INFO: Class 2 has been merged into Class 1 due to low number of samples\n",
263 | "INFO: Class 3 has been merged into Class 1 due to low number of samples\n",
264 | "INFO: Class 4 has been merged into Class 1 due to low number of samples\n",
265 | "\n",
266 | "Class Distribution:\n",
267 | "-------------------\n",
268 | "0: 2350\n",
269 | "1: 102\n",
270 | "\n",
271 | "0.681783831084692\n",
272 | "0.6997998885546327\n",
273 | "0.7386469827713933\n",
274 | "0.6996689571871663\n",
275 | "0.7698032886366912\n",
276 | "\n",
277 | "Average: 0.7179405896469151\n"
278 | ]
279 | }
280 | ],
281 | "source": [
282 | "# Initialize the resampler object\n",
283 | "rs = resampler()\n",
284 | "\n",
285 | "# Generate classes\n",
286 | "Y_classes = rs.fit(df_train, target=\"Selling_Price\", bins=5, verbose=2)\n",
287 | "# Create the actual target variable\n",
288 | "Y = df_train[\"Selling_Price\"]\n",
289 | "\n",
290 | "# Perform Stratified K-Fold\n",
291 | "kfold, scores = StratifiedKFold(n_splits=5, shuffle=True, random_state=27), list()\n",
292 | "for train, test in kfold.split(df_train, Y_classes):\n",
293 | " # Split into train and test\n",
294 | " x_train, y_train = df_train.iloc[train], Y_classes[train]\n",
295 | " x_test, y_test = df_train.iloc[test], Y.iloc[test]\n",
296 | " \n",
297 | " # Remove the target variable from x_test\n",
298 | " x_test = x_test.drop(\"Selling_Price\", axis=1)\n",
299 | " \n",
300 | " # Get the class distriubtion for perfoming relative sampling in the next line\n",
301 | " xp = Counter(y_train)\n",
302 | " # Your favourite oversampler\n",
303 | " smote = SMOTE(random_state=27, sampling_strategy={0:int(xp[0]*1.2), 1: int(xp[1]*2.0)})\n",
304 | " # Generate the over-sampled data\n",
305 | " x_train, y_train = rs.resample(smote, x_train, y_train)\n",
306 | " \n",
307 | " # Fit the model\n",
308 | " model = LGBMRegressor(random_state=27)\n",
309 | " model.fit(x_train.values, np.log(y_train.values))\n",
310 | " preds = np.exp(model.predict(x_test.values))\n",
311 | " \n",
312 | " # Check the score\n",
313 | " score = np.sqrt(mean_squared_log_error(y_test.values, preds))\n",
314 | " print(score)\n",
315 | " scores.append(score)\n",
316 | "print(\"\\nAverage: \", sum(scores)/len(scores))"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": []
325 | }
326 | ],
327 | "metadata": {
328 | "kernelspec": {
329 | "display_name": "Python 3",
330 | "language": "python",
331 | "name": "python3"
332 | },
333 | "language_info": {
334 | "codemirror_mode": {
335 | "name": "ipython",
336 | "version": 3
337 | },
338 | "file_extension": ".py",
339 | "mimetype": "text/x-python",
340 | "name": "python",
341 | "nbconvert_exporter": "python",
342 | "pygments_lexer": "ipython3",
343 | "version": "3.7.4"
344 | }
345 | },
346 | "nbformat": 4,
347 | "nbformat_minor": 4
348 | }
349 |
--------------------------------------------------------------------------------