├── data
├── raw
│ └── .gitkeep
├── external
│ └── .gitkeep
├── interim
│ └── .gitkeep
└── processed
│ └── .gitkeep
├── models
└── .gitkeep
├── notebooks
├── .gitkeep
└── reproduce_final_submission.ipynb
├── reports
├── .gitkeep
└── figures
│ └── .gitkeep
├── venv
└── README
├── tox.ini
├── learn
└── events.out.tfevents
├── requirements.txt
├── docs
├── getting-started.rst
├── commands.rst
├── index.rst
├── make.bat
├── Makefile
└── conf.py
├── test_environment.py
├── LICENSE
├── .gitignore
├── README.md
├── src
├── data
│ ├── make_dataset.py
│ └── data.py
└── models
│ ├── process.py
│ ├── models.py
│ └── predict_model.py
└── Makefile
/data/raw/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/models/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/notebooks/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/reports/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/external/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/interim/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/processed/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/reports/figures/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/venv/README:
--------------------------------------------------------------------------------
1 | Virtualenv directory
2 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | max-complexity = 10
4 |
--------------------------------------------------------------------------------
/learn/events.out.tfevents:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sagol/povert/HEAD/learn/events.out.tfevents
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | click
2 | Sphinx
3 | coverage
4 | awscli
5 | flake8
6 | python-dotenv>=0.5.1
7 | catboost==0.5.2
8 | lightgbm==2.0.10
9 | numpy==1.14.0
10 | pandas==0.22.0
11 | scikit-learn==0.19.1
12 | scipy==1.0.0
13 | xgboost==0.6a2
14 |
--------------------------------------------------------------------------------
/docs/getting-started.rst:
--------------------------------------------------------------------------------
1 | Getting started
2 | ===============
3 |
4 | This is where you describe how to get set up on a clean install, including the
5 | commands necessary to get the raw data (using the `sync_data_from_s3` command,
6 | for example), and then how to make the cleaned, final data sets.
7 |
--------------------------------------------------------------------------------
/docs/commands.rst:
--------------------------------------------------------------------------------
1 | Commands
2 | ========
3 |
4 | The Makefile contains the central entry points for common tasks related to this project.
5 |
6 | Syncing data to S3
7 | ^^^^^^^^^^^^^^^^^^
8 |
9 | * `make sync_data_to_s3` will use `aws s3 sync` to recursively sync files in `data/` up to `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/`.
10 | * `make sync_data_from_s3` will use `aws s3 sync` to recursively sync files from `s3://[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')/data/` to `data/`.
11 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. povert documentation master file, created by
2 | sphinx-quickstart.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | povert documentation!
7 | ==============================================
8 |
9 | Contents:
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 |
14 | getting-started
15 | commands
16 |
17 |
18 |
19 | Indices and tables
20 | ==================
21 |
22 | * :ref:`genindex`
23 | * :ref:`modindex`
24 | * :ref:`search`
25 |
--------------------------------------------------------------------------------
/test_environment.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
REQUIRED_PYTHON = "python3"


def main():
    """Verify that the running interpreter matches ``REQUIRED_PYTHON``.

    Prints a confirmation line when the major versions agree.

    Raises:
        ValueError: ``REQUIRED_PYTHON`` is neither "python" nor "python3".
        TypeError: the running interpreter's major version differs from the
            one implied by ``REQUIRED_PYTHON``.
    """
    # Interpreter name -> major version it stands for.
    major_by_interpreter = {"python": 2, "python3": 3}
    if REQUIRED_PYTHON not in major_by_interpreter:
        raise ValueError("Unrecognized python interpreter: {}".format(
            REQUIRED_PYTHON))

    required_major = major_by_interpreter[REQUIRED_PYTHON]
    if sys.version_info.major == required_major:
        print(">>> Development environment passes all tests!")
        return

    raise TypeError(
        "This project requires Python {}. Found: Python {}".format(
            required_major, sys.version))


if __name__ == '__main__':
    main()
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | The MIT License (MIT)
3 | Copyright (c) 2018, Taras Baranyuk
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 |
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | # DotEnv configuration
60 | .env
61 |
62 | # Database
63 | *.db
64 | *.rdb
65 |
66 | # Pycharm
67 | .idea
68 |
69 | # VS Code
70 | .vscode/
71 |
72 | # Spyder
73 | .spyproject/
74 |
75 | # Jupyter NB Checkpoints
76 | .ipynb_checkpoints/
77 |
78 | # exclude data from source control by default
79 | /data/raw/*.csv
80 | /data/processed/*.csv
81 | .vs/
82 | *.tsv
83 | /models/*
84 | train/
85 | venv/
86 | learn/
87 |
88 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | pover-T 2nd place
2 | ==============================
3 |
4 | Necessary tools and requirements:
5 | 1. Python3
6 | 2. You should install g++:
7 | ```sudo apt install g++=4:5.3.1-1ubuntu1```
8 | 3. Install the required Python packages in requirements.txt:
9 | ```make requirements```
10 | 4. Generate data:
11 | ```make data```
12 | 5. Create submission file:
13 | ```make submission```
14 | 6. The submission file ```submission_combine_20XX-XX-XX-XX-XX.csv``` will appear in ```/models```.
15 |
16 |
17 | Project Organization
18 | ------------
19 |
20 | ├── LICENSE
21 | ├── Makefile <- Makefile with commands like `make data` or `make submission`
22 | ├── README.md <- The top-level README for developers using this project.
23 | ├── data
24 | │ ├── external <- Data from third party sources.
25 | │ ├── interim <- Intermediate data that has been transformed.
26 | │ ├── processed <- The final, canonical data sets for modeling.
27 | │ └── raw <- The original, immutable data dump.
28 | │
29 | ├── docs <- A default Sphinx project; see sphinx-doc.org for details
30 | │
31 | ├── models <- Trained and serialized models, model predictions, or model summaries
32 | │
33 | ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering),
34 | │ the creator's initials, and a short `-` delimited description, e.g.
35 | │ `1.0-jqp-initial-data-exploration`.
36 | │
37 | ├── references <- Data dictionaries, manuals, and all other explanatory materials.
38 | │
39 | ├── reports <- Generated analysis as HTML, PDF, LaTeX, etc.
40 | │ └── figures <- Generated graphics and figures to be used in reporting
41 | │
42 | ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g.
43 | │ generated with `pip freeze > requirements.txt`
44 | │
45 | ├── src <- Source code for use in this project.
46 | │ ├── __init__.py <- Makes src a Python module
47 | │ │
48 | │ ├── data <- Scripts to download or generate data
49 | │ │ └── make_dataset.py
50 | │ │
51 | │ ├── features <- Scripts to turn raw data into features for modeling
52 | │ │ └── build_features.py
53 | │ │
54 | │ ├── models <- Scripts to train models and then use trained models to make
55 | │ │ │ predictions
56 | │ │ ├── predict_model.py
57 | │ │ └── train_model.py
58 | │ │
59 | │ └── visualization <- Scripts to create exploratory and results oriented visualizations
60 | │ └── visualize.py
61 | │
62 | └── tox.ini <- tox file with settings for running tox; see tox.testrun.org
63 |
64 |
65 | --------
66 |
67 |
Project based on the cookiecutter data science project template. #cookiecutterdatascience
68 |
--------------------------------------------------------------------------------
/src/data/make_dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import click
4 | import logging
5 | import data
6 | from dotenv import find_dotenv, load_dotenv
7 |
8 |
# Command-line entry point. All eight positional arguments are bare file
# names: inputs are looked up under data/raw/ and outputs are written under
# data/processed/. The first character of the household train file name is
# what gets passed to set_country() on every data object.
# NOTE: the docstring below is kept verbatim because click shows it as the
# command's --help text.
@click.command()
@click.argument('input_hh_train_filepath', type=click.Path())
@click.argument('input_ind_train_filepath', type=click.Path())
@click.argument('input_hh_test_filepath', type=click.Path())
@click.argument('input_ind_test_filepath', type=click.Path())
@click.argument('output_train_filepath', type=click.Path())
@click.argument('output_test_filepath', type=click.Path())
@click.argument('output_train_ind_filepath', type=click.Path())
@click.argument('output_test_ind_filepath', type=click.Path())
def main(input_hh_train_filepath,
         input_ind_train_filepath,
         input_hh_test_filepath,
         input_ind_test_filepath,
         output_train_filepath,
         output_test_filepath,
         output_train_ind_filepath,
         output_test_ind_filepath
         ):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    # Path builders for the two data directories.
    in_raw = 'data/raw/{0}'.format
    in_processed = 'data/processed/{0}'.format

    # Step 1: individual-level files, raw -> processed.
    individual = data.DataInd()
    individual_sources = {'train': in_raw(input_ind_train_filepath),
                          'test': in_raw(input_ind_test_filepath)}
    individual.set_country(input_hh_train_filepath[0])
    individual.set_file_names(files_dict=individual_sources)
    individual.load(load=False, cat_enc=False)
    individual_targets = {'train': in_processed(output_train_ind_filepath),
                          'test': in_processed(output_test_ind_filepath)}
    individual.save(files_dict=individual_targets, poor=False)

    # Step 2: raw household files combined with the individual files that
    # step 1 just wrote.
    combined_sources = {'train_hh': in_raw(input_hh_train_filepath),
                        'test_hh': in_raw(input_hh_test_filepath),
                        'train_ind': in_processed(output_train_ind_filepath),
                        'test_ind': in_processed(output_test_ind_filepath)}
    combined = data.DataConcat()
    combined.set_file_names(files_dict=combined_sources)
    combined.set_country(input_hh_train_filepath[0])
    combined.load(load=False, cat_enc=False)
    combined_targets = {'train': in_processed(output_train_filepath),
                        'test': in_processed(output_test_filepath)}
    combined.save(files_dict=combined_targets)

    # Step 3: household files alone, saved to data/processed under the same
    # file names they have in data/raw.
    household_sources = {'train': in_raw(input_hh_train_filepath),
                         'test': in_raw(input_hh_test_filepath)}
    household = data.Data()
    household.set_file_names(files_dict=household_sources)
    household.set_country(input_hh_train_filepath[0])
    household.load(load=False)
    household_targets = {'train': in_processed(input_hh_train_filepath),
                         'test': in_processed(input_hh_test_filepath)}
    household.save(files_dict=household_targets)

    logging.getLogger(__name__).info('making final data set from raw data')
65 |
66 |
if __name__ == '__main__':
    # Timestamped INFO-level logging for the whole run.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Two directories above this file; not used below, kept as a convenience
    # for locating other project files.
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Search upwards from here for a .env file and load its entries as
    # environment variables.
    load_dotenv(find_dotenv())

    main()
79 |
--------------------------------------------------------------------------------
/src/models/process.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import datetime
4 | import pandas as pd
5 | import numpy as np
6 | from sklearn.metrics.classification import log_loss
7 |
8 | src_dir = os.path.join(os.getcwd(), 'src')
9 | sys.path.append(src_dir)
10 |
11 |
class processing:
    """
    A helper class that searches for features of zero importance, runs the
    per-country prediction, and saves the combined prediction to a file.

    Attributes set by the constructor / setters:
        countries: iterable of country keys processed in order.
        balances: mapping country -> ``balance`` flag forwarded to the model
            and to the train/validation splitter.
        exclude_dict: mapping country -> list of column names to exclude.
        data_dict: mapping country -> data object handed to the model.
        model_dict: mapping country -> model object.
        vote_waights_dict: optional mapping country -> weights passed to the
            model's ``set_weights``.
    """
    def __init__(self, countries=None, balances=None):
        """Create the helper.

        Args:
            countries: country keys; defaults to ``['A', 'B', 'C']``.
            balances: per-country balance flags; defaults to ``False`` for
                'A', 'B' and 'C'.
        """
        # None sentinels instead of mutable defaults: a literal list/dict in
        # the signature would be one object shared by every instance.
        self.countries = ['A', 'B', 'C'] if countries is None else countries
        self.balances = ({'A': False, 'B': False, 'C': False}
                         if balances is None else balances)
        self.exclude_dict = {'A': [], 'B': [], 'C': []}
        # Give every requested country an (empty) exclude list so predict()
        # does not fail with KeyError for countries other than A/B/C.
        for c in self.countries:
            self.exclude_dict.setdefault(c, [])
        self.data_dict = None
        self.model_dict = None
        self.vote_waights_dict = None

    def set_vote_waights_dict(self, vote_waights_dict):
        """Store the per-country weights used by predict()."""
        self.vote_waights_dict = vote_waights_dict

    def set_data_dict(self, data_dict):
        """Store the per-country data objects."""
        self.data_dict = data_dict

    def set_model_dict(self, model_dict):
        """Store the per-country model objects."""
        self.model_dict = model_dict

    def set_exclude_dict(self, exclude_dict):
        """Replace the per-country lists of excluded columns."""
        self.exclude_dict = exclude_dict

    def save_csv(self, df, clf_model_name='_', path=''):
        """Write ``df`` to ``submission_<model>_<YYYY-mm-dd-HH-MM>.csv``.

        Args:
            df: DataFrame to save; its index is written too and floats are
                formatted with four decimals.
            clf_model_name: model name embedded in the file name.
            path: directory in which the file is created.
        """
        stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
        submission_file = os.path.join(
            path, 'submission_{0}_{1}.csv'.format(clf_model_name, stamp))
        print('submission file:', submission_file)
        df.to_csv(submission_file, index=True, float_format='%.4f')
        print(df.head())

    def _cv_logloss(self, c, exclude_list, n_splits):
        """Return the mean validation log loss of country ``c``'s model over
        ``n_splits`` train/validation splits with ``exclude_list`` applied."""
        model = self.model_dict[c]
        splits = model.data.get_train_valid(
            n_splits=n_splits, balance=self.balances[c])

        losses = []
        for i in range(n_splits):
            # The split index doubles as the random seed of that fit.
            model.set_random_seed(i)
            train, valid = splits[i]
            model.set_exclude_list(exclude_list)
            model.train(train[0], train[1])
            pred = model.predict(valid[0])
            losses.append(log_loss(valid[1].astype(int), pred['poor']))
        return np.mean(losses)

    def find_exclude(self, n_splits=5):
        """Search, per country, for the exclude list with the lowest log loss.

        For each country the model is trained repeatedly; after every fit the
        columns whose feature importance equals 0 are added to the exclude
        list, and the resulting list is scored with ``n_splits``
        train/validation splits. The loop ends after the first fit that
        yields no new zero-importance column. The best-scoring list is stored
        in ``self.exclude_dict[country]``.

        Args:
            n_splits: number of train/validation splits used for scoring.

        Returns:
            ``None`` when models or data are missing; otherwise the
            ``{log loss: exclude list}`` mapping of the last country
            processed (an empty dict when ``self.countries`` is empty).
        """
        if not self.model_dict or not self.data_dict:
            print('Stoped: no models or data')
            return None

        # Defined up front so an empty country list returns {} instead of
        # raising UnboundLocalError at the final return.
        logloss_dict = {}
        for c in self.countries:
            model = self.model_dict[c]
            model.load_data(data=self.data_dict[c],
                            balance=self.balances[c])
            exclude_list = []
            finish = False
            logloss_dict = {}
            while not finish:
                model.set_exclude_list(exclude_list)
                model.train()
                exclude_list_prev = exclude_list.copy()
                already_excluded = set(exclude_list_prev)
                columns = [x for x in model.get_train().columns
                           if x not in already_excluded]
                # Columns are paired positionally with the importances.
                newly_excluded = [
                    x for (x, y) in zip(columns,
                                        model.get_feature_importances())
                    if y == 0]
                if not newly_excluded:
                    # Nothing left to drop: score this list once, then stop.
                    finish = True
                exclude_list = exclude_list_prev + newly_excluded

                logloss = self._cv_logloss(c, exclude_list, n_splits)
                logloss_dict[logloss] = exclude_list
                print('loglos: {0} exclude length: {1}'.format(
                    logloss, len(exclude_list)))
            self.exclude_dict[c] = logloss_dict[min(logloss_dict)]
            print('Country: {0} exclude length: {1}'.format(
                c, len(self.exclude_dict.get(c))))

        return logloss_dict

    def predict(self, model_name, path=''):
        """Train every country's model, predict, and save the combined result.

        Args:
            model_name: name embedded in the submission file name.
            path: directory in which the submission file is written.

        Returns:
            ``None`` when models or data are missing; otherwise the
            concatenation of the per-country predictions (also saved through
            ``save_csv``).
        """
        if not self.model_dict or not self.data_dict:
            print('Stoped: no models or data')
            return None

        predictions = []
        for c in self.countries:
            model = self.model_dict[c]
            model.load_data(data=self.data_dict[c],
                            balance=self.balances[c])
            model.set_exclude_list(self.exclude_dict[c])
            if self.vote_waights_dict:
                model.set_weights(self.vote_waights_dict[c])
            print('exclude: \n', self.exclude_dict[c])
            model.train()
            predictions.append(model.predict())
        result = pd.concat(predictions)
        self.save_csv(result, clf_model_name=model_name, path=path)
        return result
112 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Command-style targets that never produce a file of their own name. `data`
# in particular must stay phony because a data/ directory exists.
.PHONY: clean data lint requirements submission create_environment test_environment sync_data_to_s3 sync_data_from_s3
2 |
3 | #################################################################################
4 | # GLOBALS #
5 | #################################################################################
6 |
7 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
8 | BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
9 | PROFILE = default
10 | PROJECT_NAME = povert
11 | PYTHON_INTERPRETER = python3
12 |
13 | ifeq (,$(shell which conda))
14 | HAS_CONDA=False
15 | else
16 | HAS_CONDA=True
17 | endif
18 |
19 | #################################################################################
20 | # COMMANDS #
21 | #################################################################################
22 |
23 | ## Install Python Dependencies
24 | requirements: test_environment
25 | pip install -U pip setuptools wheel
26 | pip install -r requirements.txt
27 |
28 | ## Make Dataset
29 | data: #requirements
30 | $(PYTHON_INTERPRETER) src/data/make_dataset.py \
31 | 'A_hhold_train.csv' \
32 | 'A_indiv_train.csv' \
33 | 'A_hhold_test.csv' \
34 | 'A_indiv_test.csv' \
35 | 'A_combine_train.csv' \
36 | 'A_combine_test.csv' \
37 | 'A_indiv_ext_train.csv' \
38 | 'A_indiv_ext_test.csv'
39 | $(PYTHON_INTERPRETER) src/data/make_dataset.py \
40 | 'B_hhold_train.csv' \
41 | 'B_indiv_train.csv' \
42 | 'B_hhold_test.csv' \
43 | 'B_indiv_test.csv' \
44 | 'B_combine_train.csv' \
45 | 'B_combine_test.csv' \
46 | 'B_indiv_ext_train.csv' \
47 | 'B_indiv_ext_test.csv'
48 | $(PYTHON_INTERPRETER) src/data/make_dataset.py \
49 | 'C_hhold_train.csv' \
50 | 'C_indiv_train.csv' \
51 | 'C_hhold_test.csv' \
52 | 'C_indiv_test.csv' \
53 | 'C_combine_train.csv' \
54 | 'C_combine_test.csv' \
55 | 'C_indiv_ext_train.csv' \
56 | 'C_indiv_ext_test.csv'
57 |
58 | ## Create submission
59 | submission:
60 | $(PYTHON_INTERPRETER) src/models/predict_model.py
61 |
62 |
63 | ## Delete all compiled Python files
64 | clean:
65 | find . -type f -name "*.py[co]" -delete
66 | find . -type d -name "__pycache__" -delete
67 |
68 | ## Lint using flake8
69 | lint:
70 | flake8 src
71 |
72 | ## Set up python interpreter environment
73 | create_environment:
74 | ifeq (True,$(HAS_CONDA))
75 | @echo ">>> Detected conda, creating conda environment."
76 | ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
77 | conda create --name $(PROJECT_NAME) python=3
78 | else
79 | conda create --name $(PROJECT_NAME) python=2.7
80 | endif
81 | @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
82 | else
83 | @pip install -q virtualenv virtualenvwrapper
84 | @echo ">>> Installing virtualenvwrapper if not already intalled.\nMake sure the following lines are in shell startup file\n\
85 | export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
86 | @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
87 | @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
88 | endif
89 |
90 | ## Test python environment is setup correctly
91 | test_environment:
92 | $(PYTHON_INTERPRETER) test_environment.py
93 |
94 | #################################################################################
95 | # PROJECT RULES #
96 | #################################################################################
97 |
98 |
99 |
100 | #################################################################################
101 | # Self Documenting Commands #
102 | #################################################################################
103 |
104 | .DEFAULT_GOAL := show-help
105 |
106 | # Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
107 | # sed script explained:
108 | # /^##/:
109 | # * save line in hold space
110 | # * purge line
111 | # * Loop:
112 | # * append newline + line to hold space
113 | # * go to next line
114 | # * if line starts with doc comment, strip comment character off and loop
115 | # * remove target prerequisites
116 | # * append hold space (+ newline) to line
117 | # * replace newline plus comments by `---`
118 | # * print line
119 | # Separate expressions are necessary because labels cannot be delimited by
120 | # semicolon; see <http://stackoverflow.com/a/11799865/1968>
121 | .PHONY: show-help
122 | show-help:
123 | @echo "$$(tput bold)Available rules:$$(tput sgr0)"
124 | @echo
125 | @sed -n -e "/^## / { \
126 | h; \
127 | s/.*//; \
128 | :doc" \
129 | -e "H; \
130 | n; \
131 | s/^## //; \
132 | t doc" \
133 | -e "s/:.*//; \
134 | G; \
135 | s/\\n## /---/; \
136 | s/\\n/ /g; \
137 | p; \
138 | }" ${MAKEFILE_LIST} \
139 | | LC_ALL='C' sort --ignore-case \
140 | | awk -F '---' \
141 | -v ncol=$$(tput cols) \
142 | -v indent=19 \
143 | -v col_on="$$(tput setaf 6)" \
144 | -v col_off="$$(tput sgr0)" \
145 | '{ \
146 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
147 | n = split($$2, words, " "); \
148 | line_length = ncol - indent; \
149 | for (i = 1; i <= n; i++) { \
150 | line_length -= length(words[i]) + 1; \
151 | if (line_length <= 0) { \
152 | line_length = ncol - indent - length(words[i]) - 1; \
153 | printf "\n%*s ", -indent, " "; \
154 | } \
155 | printf "%s ", words[i]; \
156 | } \
157 | printf "\n"; \
158 | }' \
159 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
160 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
REM Print the list of supported build targets. The angle brackets around
REM "target" are caret-escaped so cmd does not treat them as redirections.
if "%1" == "help" (
	:help
	echo.Please use `make ^<target^>` where ^<target^> is one of
	echo.  html       to make standalone HTML files
	echo.  dirhtml    to make HTML files named index.html in directories
	echo.  singlehtml to make a single large HTML file
	echo.  pickle     to make pickle files
	echo.  json       to make JSON files
	echo.  htmlhelp   to make HTML files and a HTML help project
	echo.  qthelp     to make HTML files and a qthelp project
	echo.  devhelp    to make HTML files and a Devhelp project
	echo.  epub       to make an epub
	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
	echo.  text       to make text files
	echo.  man        to make manual pages
	echo.  texinfo    to make Texinfo files
	echo.  gettext    to make PO message catalogs
	echo.  changes    to make an overview over all changed/added/deprecated items
	echo.  linkcheck  to check all external links for integrity
	echo.  doctest    to run all doctests embedded in the documentation if enabled
	goto end
)
40 |
41 | if "%1" == "clean" (
42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
43 | del /q /s %BUILDDIR%\*
44 | goto end
45 | )
46 |
47 | if "%1" == "html" (
48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
49 | if errorlevel 1 exit /b 1
50 | echo.
51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
52 | goto end
53 | )
54 |
55 | if "%1" == "dirhtml" (
56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
57 | if errorlevel 1 exit /b 1
58 | echo.
59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
60 | goto end
61 | )
62 |
63 | if "%1" == "singlehtml" (
64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
65 | if errorlevel 1 exit /b 1
66 | echo.
67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
68 | goto end
69 | )
70 |
71 | if "%1" == "pickle" (
72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
73 | if errorlevel 1 exit /b 1
74 | echo.
75 | echo.Build finished; now you can process the pickle files.
76 | goto end
77 | )
78 |
79 | if "%1" == "json" (
80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
81 | if errorlevel 1 exit /b 1
82 | echo.
83 | echo.Build finished; now you can process the JSON files.
84 | goto end
85 | )
86 |
87 | if "%1" == "htmlhelp" (
88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
89 | if errorlevel 1 exit /b 1
90 | echo.
91 | echo.Build finished; now you can run HTML Help Workshop with the ^
92 | .hhp project file in %BUILDDIR%/htmlhelp.
93 | goto end
94 | )
95 |
96 | if "%1" == "qthelp" (
97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
98 | if errorlevel 1 exit /b 1
99 | echo.
100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^
101 | .qhcp project file in %BUILDDIR%/qthelp, like this:
102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\povert.qhcp
103 | echo.To view the help file:
104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\povert.ghc
105 | goto end
106 | )
107 |
108 | if "%1" == "devhelp" (
109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
110 | if errorlevel 1 exit /b 1
111 | echo.
112 | echo.Build finished.
113 | goto end
114 | )
115 |
116 | if "%1" == "epub" (
117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
118 | if errorlevel 1 exit /b 1
119 | echo.
120 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
121 | goto end
122 | )
123 |
124 | if "%1" == "latex" (
125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
126 | if errorlevel 1 exit /b 1
127 | echo.
128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
129 | goto end
130 | )
131 |
132 | if "%1" == "text" (
133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
134 | if errorlevel 1 exit /b 1
135 | echo.
136 | echo.Build finished. The text files are in %BUILDDIR%/text.
137 | goto end
138 | )
139 |
140 | if "%1" == "man" (
141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
142 | if errorlevel 1 exit /b 1
143 | echo.
144 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
145 | goto end
146 | )
147 |
148 | if "%1" == "texinfo" (
149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
150 | if errorlevel 1 exit /b 1
151 | echo.
152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
153 | goto end
154 | )
155 |
156 | if "%1" == "gettext" (
157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
158 | if errorlevel 1 exit /b 1
159 | echo.
160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
161 | goto end
162 | )
163 |
164 | if "%1" == "changes" (
165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
166 | if errorlevel 1 exit /b 1
167 | echo.
168 | echo.The overview file is in %BUILDDIR%/changes.
169 | goto end
170 | )
171 |
172 | if "%1" == "linkcheck" (
173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
174 | if errorlevel 1 exit /b 1
175 | echo.
176 | echo.Link check complete; look for any errors in the above output ^
177 | or in %BUILDDIR%/linkcheck/output.txt.
178 | goto end
179 | )
180 |
181 | if "%1" == "doctest" (
182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
183 | if errorlevel 1 exit /b 1
184 | echo.
185 | echo.Testing of doctests in the sources finished, look at the ^
186 | results in %BUILDDIR%/doctest/output.txt.
187 | goto end
188 | )
189 |
190 | :end
191 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # Internal variables.
11 | PAPEROPT_a4 = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | # the i18n builder cannot share the environment and doctrees with the others
15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 |
17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
18 |
# Print the list of supported build targets.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
40 |
41 | clean:
42 | -rm -rf $(BUILDDIR)/*
43 |
44 | html:
45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
46 | @echo
47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
48 |
49 | dirhtml:
50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
51 | @echo
52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
53 |
54 | singlehtml:
55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
56 | @echo
57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
58 |
59 | pickle:
60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
61 | @echo
62 | @echo "Build finished; now you can process the pickle files."
63 |
64 | json:
65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
66 | @echo
67 | @echo "Build finished; now you can process the JSON files."
68 |
69 | htmlhelp:
70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
71 | @echo
72 | @echo "Build finished; now you can run HTML Help Workshop with the" \
73 | ".hhp project file in $(BUILDDIR)/htmlhelp."
74 |
75 | qthelp:
76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
77 | @echo
78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/povert.qhcp"
81 | @echo "To view the help file:"
82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/povert.qhc"
83 |
84 | devhelp:
85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
86 | @echo
87 | @echo "Build finished."
88 | @echo "To view the help file:"
89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/povert"
90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/povert"
91 | @echo "# devhelp"
92 |
93 | epub:
94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
95 | @echo
96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
97 |
98 | latex:
99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
100 | @echo
101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
103 | "(use \`make latexpdf' here to do that automatically)."
104 |
105 | latexpdf:
106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
107 | @echo "Running LaTeX files through pdflatex..."
108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
110 |
111 | text:
112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
113 | @echo
114 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
115 |
116 | man:
117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
118 | @echo
119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
120 |
121 | texinfo:
122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
123 | @echo
124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
125 | @echo "Run \`make' in that directory to run these through makeinfo" \
126 | "(use \`make info' here to do that automatically)."
127 |
# Build the Texinfo sources, then run them through makeinfo.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	# Use $(MAKE), as latexpdf does, so flags and the jobserver reach the sub-make.
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
133 |
134 | gettext:
135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
136 | @echo
137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
138 |
139 | changes:
140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
141 | @echo
142 | @echo "The overview file is in $(BUILDDIR)/changes."
143 |
144 | linkcheck:
145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
146 | @echo
147 | @echo "Link check complete; look for any errors in the above output " \
148 | "or in $(BUILDDIR)/linkcheck/output.txt."
149 |
150 | doctest:
151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
152 | @echo "Testing of doctests in the sources finished, look at the " \
153 | "results in $(BUILDDIR)/doctest/output.txt."
154 |
--------------------------------------------------------------------------------
/src/models/models.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from catboost import CatBoostClassifier
4 | import xgboost as xgb
5 | import lightgbm as lgb
6 | from sklearn.utils import class_weight
7 | from abc import ABC, abstractmethod
8 |
9 |
class predict_model(ABC):
    """
    Abstract class for working with classifiers.

    Subclasses create the concrete estimator in ``load_data`` and implement
    ``train`` and ``get_feature_importances``. The base class keeps the
    train/target/test frames, integer-encodes categorical columns and turns
    the estimator's ``predict_proba`` output into a result frame.
    """

    @abstractmethod
    def __init__(self, name='predict_model', categ_conv=True):
        """
        Initialise the shared state.

        Args:
            name: string, a label for the model
            categ_conv: bool, integer-encode categorical columns of frames
                passed explicitly to ``train``/``predict``
        """
        self.params = {}
        self.exclude_list = []
        self.name = name
        self.random = 1
        self.classifier = None
        self.categ_conv = categ_conv
        self.data_df = {}
        # Defined up front so predict()/train() never hit an AttributeError
        # when they are called before load_data().
        self.category_cols = []
        self.data = None

    def set_params(self, params=None):
        """Store the estimator parameters; a falsy value resets them to {}."""
        if not params:
            self.params = {}
        else:
            self.params = params

    def set_random_seed(self, random=1):
        """Store the random seed value."""
        self.random = random

    @abstractmethod
    def load_data(self, data, balance=False):
        """
        Pull train/target/test frames from ``data`` and encode categoricals.

        Args:
            data: object providing ``get_train(balance=...)``, ``get_test()``
                and ``get_cat_list()``
            balance: bool, forwarded to ``data.get_train``

        Returns:
            True once the frames are stored.
        """
        self.data = data

        train, target = self.data.get_train(balance=balance)
        # Copy so the encoding below can never leak into the data object's
        # frames and does not trigger chained-assignment warnings.
        self.data_df['train'] = train.copy()
        self.data_df['y'] = target
        self.data_df['test'] = self.data.get_test().copy()

        self.category_cols = self.data.get_cat_list()
        for header in self.category_cols:
            for part in ('train', 'test'):
                frame = self.data_df[part]
                # Plain column assignment replaces the column (and its dtype);
                # ``.loc[:, col] = ...`` writes in place on pandas >= 2 and
                # would leave an object-dtype column behind.
                # NOTE(review): train and test are encoded independently, so a
                # category is only guaranteed the same code in both frames
                # when both contain the same set of values.
                frame[header] = frame[header].astype('category').cat.codes
        return True

    def get_train(self):
        """Return the stored training features."""
        return self.data_df['train']

    def get_y(self):
        """Return the stored training target."""
        return self.data_df['y']

    def get_test(self):
        """Return the stored test features."""
        return self.data_df['test']

    def set_exclude_list(self, exclude_list):
        """Store a copy of the list of columns to drop before fit/predict."""
        self.exclude_list = exclude_list.copy()

    @abstractmethod
    def get_feature_importances(self):
        """Return the feature importances of the fitted classifier."""
        pass

    @abstractmethod
    def train(self, x_train=None, y_train=None):
        """Fit the classifier; the stored data is used when args are omitted."""
        pass

    def predict(self, test=None):
        """
        Predict the probability of the positive class.

        Args:
            test: DataFrame, data to score, optional (default=None, meaning
                the stored test frame). The passed frame is not modified.

        Returns:
            A dataframe indexed like ``test`` with columns 'country' and
            'poor', or None when no classifier has been created yet.
        """
        if self.classifier is None:
            print('error: classifier not defined')
            return None

        if not isinstance(test, pd.DataFrame):
            test = self.get_test()
        elif self.categ_conv:
            # Encode on a copy: the caller's frame must stay untouched.
            test = test.copy()
            for header in self.category_cols:
                if header in test.columns:
                    test[header] = test[header].astype('category').cat.codes

        test = test.drop(
            [x for x in self.exclude_list if x in test.columns], axis=1
        )
        res = pd.DataFrame(index=test.index)
        res['country'] = self.data.country
        # Column 1 of predict_proba is taken as the positive-class probability.
        res['poor'] = self.classifier.predict_proba(test)[:, 1]
        return res
87 |
88 |
class CB_model(predict_model):
    """
    Class for a CatBoost classifier.
    """

    def __init__(self, name='cat_boost', categ_conv=True):
        """
        Args:
            name: string, a label for the model
            categ_conv: bool, integer-encode categorical columns of frames
                passed explicitly to ``train``/``predict``
        """
        super().__init__(name=name, categ_conv=categ_conv)

    def load_data(self, data, balance=False):
        """
        Load the data and create a class-weighted CatBoostClassifier.

        Args:
            data: data object, forwarded to the base ``load_data``
            balance: bool, forwarded to the base ``load_data``

        Returns:
            True when the data was loaded and the classifier created,
            False otherwise.
        """
        if not super().load_data(data, balance):
            return False

        # 'balanced' weights compensate for the class imbalance of the target.
        c_w = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(self.data_df['y']),
            y=self.data_df['y']
        )
        # Hand CatBoost a plain list of floats rather than a numpy array.
        self.classifier = CatBoostClassifier(**self.params,
                                             class_weights=c_w.tolist())
        return True

    def train(self, x_train=None, y_train=None):
        """
        Fit the classifier.

        Args:
            x_train: DataFrame, features, optional (default=None, meaning the
                stored training frame). The passed frame is not modified.
            y_train: Series, target, optional (default=None, meaning the
                stored target)

        Returns:
            The fitted classifier.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame stays untouched; plain
            # column assignment also guarantees the column gets integer dtype.
            x_train = x_train.copy()
            for header in self.category_cols:
                if header in x_train.columns:
                    x_train[header] = (
                        x_train[header].astype('category').cat.codes)

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list
                                if x in x_train.columns], axis=1)

        self.category_cols = [x for x in self.category_cols
                              if x not in self.exclude_list]

        # Positional indices of the categorical columns; columns missing from
        # an explicitly passed frame are skipped instead of raising KeyError.
        cat_dims = [x_train.columns.get_loc(i) for i in self.category_cols
                    if i in x_train.columns]
        print(x_train.shape, y_train.shape, len(self.category_cols))

        self.classifier.fit(x_train, y_train, cat_features=cat_dims)
        return self.classifier

    def get_feature_importances(self):
        """Return the feature importances of the fitted classifier."""
        # Public attribute, consistent with the XGBoost/LightGBM wrappers,
        # instead of the private ``_feature_importance``.
        return self.classifier.feature_importances_
138 |
139 |
class XGB_model(predict_model):
    """
    Class for a XGBoost classifier.
    """

    def __init__(self, name='xg_boost', categ_conv=True):
        """
        Args:
            name: string, a label for the model
            categ_conv: bool, integer-encode categorical columns of frames
                passed explicitly to ``train``/``predict``
        """
        super().__init__(name=name, categ_conv=categ_conv)

    def load_data(self, data, balance=False):
        """
        Load the data and create an XGBClassifier.

        ``scale_pos_weight`` is set to the ratio of negative to positive
        samples in the loaded target.

        Args:
            data: data object, forwarded to the base ``load_data``
            balance: bool, forwarded to the base ``load_data``

        Returns:
            True when the data was loaded and the classifier created,
            False otherwise.
        """
        if not super().load_data(data, balance):
            return False

        target = self.data_df['y']
        positives = target.sum()
        # Work on a copy: the dict given to set_params() may be shared by the
        # caller between several models.
        params = dict(self.params)
        if positives:
            params['scale_pos_weight'] = (
                (target.shape[0] - positives) / positives
            )
        # With no positive samples the ratio is undefined; the parameter is
        # then left as supplied rather than set to inf.
        self.params = params
        self.classifier = xgb.XGBClassifier(**self.params)
        return True

    def train(self, x_train=None, y_train=None):
        """
        Fit the classifier.

        Args:
            x_train: DataFrame, features, optional (default=None, meaning the
                stored training frame). The passed frame is not modified.
            y_train: Series, target, optional (default=None, meaning the
                stored target)

        Returns:
            The fitted classifier.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame stays untouched; plain
            # column assignment also guarantees the column gets integer dtype.
            x_train = x_train.copy()
            for header in self.category_cols:
                if header in x_train.columns:
                    x_train[header] = (
                        x_train[header].astype('category').cat.codes)

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list
                                if x in x_train.columns], axis=1)
        print('x_train shape: ', x_train.shape)

        self.classifier.fit(x_train, y_train)

        return self.classifier

    def get_feature_importances(self):
        """Return the feature importances of the fitted classifier."""
        return self.classifier.feature_importances_
182 |
183 |
class LGBM_model(predict_model):
    """
    Class for LightGBM classifier.
    """

    def __init__(self, name='lgbm', categ_conv=True):
        """
        Args:
            name: string, a label for the model
            categ_conv: bool, integer-encode categorical columns of frames
                passed explicitly to ``train``/``predict``
        """
        super().__init__(name=name, categ_conv=categ_conv)

    def load_data(self, data, balance=False):
        """
        Load the data and create an LGBMClassifier from the stored params.

        Args:
            data: data object, forwarded to the base ``load_data``
            balance: bool, forwarded to the base ``load_data``

        Returns:
            True when the data was loaded and the classifier created,
            False otherwise.
        """
        if not super().load_data(data, balance):
            return False
        self.classifier = lgb.LGBMClassifier(**self.params)
        return True

    def train(self, x_train=None, y_train=None):
        """
        Fit the classifier.

        Args:
            x_train: DataFrame, features, optional (default=None, meaning the
                stored training frame). The passed frame is not modified.
            y_train: Series, target, optional (default=None, meaning the
                stored target)

        Returns:
            The fitted classifier.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame stays untouched; plain
            # column assignment also guarantees the column gets integer dtype.
            x_train = x_train.copy()
            for header in self.category_cols:
                if header in x_train.columns:
                    x_train[header] = (
                        x_train[header].astype('category').cat.codes)

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list
                                if x in x_train.columns], axis=1)
        print('x_train shape: ', x_train.shape)

        self.category_cols = [x for x in self.category_cols
                              if x not in self.exclude_list]

        # NOTE(review): the ``verbose`` fit argument is tied to the LightGBM
        # version this project pins; confirm before upgrading the dependency.
        self.classifier.fit(x_train, y_train, verbose=False)

        return self.classifier

    def get_feature_importances(self):
        """Return the feature importances of the fitted classifier."""
        return self.classifier.feature_importances_
225 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # povert documentation build configuration file, created by
4 | # sphinx-quickstart.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 |
14 | import os
15 | import sys
16 |
17 | # If extensions (or modules to document with autodoc) are in another directory,
18 | # add these directories to sys.path here. If the directory is relative to the
19 | # documentation root, use os.path.abspath to make it absolute, like shown here.
20 | # sys.path.insert(0, os.path.abspath('.'))
21 |
22 | # -- General configuration -----------------------------------------------------
23 |
24 | # If your documentation needs a minimal Sphinx version, state it here.
25 | # needs_sphinx = '1.0'
26 |
27 | # Add any Sphinx extension module names here, as strings. They can be extensions
28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
29 | extensions = []
30 |
31 | # Add any paths that contain templates here, relative to this directory.
32 | templates_path = ['_templates']
33 |
34 | # The suffix of source filenames.
35 | source_suffix = '.rst'
36 |
37 | # The encoding of source files.
38 | # source_encoding = 'utf-8-sig'
39 |
40 | # The master toctree document.
41 | master_doc = 'index'
42 |
43 | # General information about the project.
44 | project = u'povert'
45 |
46 | # The version info for the project you're documenting, acts as replacement for
47 | # |version| and |release|, also used in various other places throughout the
48 | # built documents.
49 | #
50 | # The short X.Y version.
51 | version = '0.1'
52 | # The full version, including alpha/beta/rc tags.
53 | release = '0.1'
54 |
55 | # The language for content autogenerated by Sphinx. Refer to documentation
56 | # for a list of supported languages.
57 | # language = None
58 |
59 | # There are two options for replacing |today|: either, you set today to some
60 | # non-false value, then it is used:
61 | # today = ''
62 | # Else, today_fmt is used as the format for a strftime call.
63 | # today_fmt = '%B %d, %Y'
64 |
65 | # List of patterns, relative to source directory, that match files and
66 | # directories to ignore when looking for source files.
67 | exclude_patterns = ['_build']
68 |
69 | # The reST default role (used for this markup: `text`) to use for all documents.
70 | # default_role = None
71 |
72 | # If true, '()' will be appended to :func: etc. cross-reference text.
73 | # add_function_parentheses = True
74 |
75 | # If true, the current module name will be prepended to all description
76 | # unit titles (such as .. function::).
77 | # add_module_names = True
78 |
79 | # If true, sectionauthor and moduleauthor directives will be shown in the
80 | # output. They are ignored by default.
81 | # show_authors = False
82 |
83 | # The name of the Pygments (syntax highlighting) style to use.
84 | pygments_style = 'sphinx'
85 |
86 | # A list of ignored prefixes for module index sorting.
87 | # modindex_common_prefix = []
88 |
89 |
90 | # -- Options for HTML output ---------------------------------------------------
91 |
92 | # The theme to use for HTML and HTML Help pages. See the documentation for
93 | # a list of builtin themes.
94 | html_theme = 'default'
95 |
96 | # Theme options are theme-specific and customize the look and feel of a theme
97 | # further. For a list of options available for each theme, see the
98 | # documentation.
99 | # html_theme_options = {}
100 |
101 | # Add any paths that contain custom themes here, relative to this directory.
102 | # html_theme_path = []
103 |
104 | # The name for this set of Sphinx documents. If None, it defaults to
105 | # "<project> v<release> documentation".
106 | # html_title = None
107 |
108 | # A shorter title for the navigation bar. Default is the same as html_title.
109 | # html_short_title = None
110 |
111 | # The name of an image file (relative to this directory) to place at the top
112 | # of the sidebar.
113 | # html_logo = None
114 |
115 | # The name of an image file (within the static path) to use as favicon of the
116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
117 | # pixels large.
118 | # html_favicon = None
119 |
120 | # Add any paths that contain custom static files (such as style sheets) here,
121 | # relative to this directory. They are copied after the builtin static files,
122 | # so a file named "default.css" will overwrite the builtin "default.css".
123 | html_static_path = ['_static']
124 |
125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
126 | # using the given strftime format.
127 | # html_last_updated_fmt = '%b %d, %Y'
128 |
129 | # If true, SmartyPants will be used to convert quotes and dashes to
130 | # typographically correct entities.
131 | # html_use_smartypants = True
132 |
133 | # Custom sidebar templates, maps document names to template names.
134 | # html_sidebars = {}
135 |
136 | # Additional templates that should be rendered to pages, maps page names to
137 | # template names.
138 | # html_additional_pages = {}
139 |
140 | # If false, no module index is generated.
141 | # html_domain_indices = True
142 |
143 | # If false, no index is generated.
144 | # html_use_index = True
145 |
146 | # If true, the index is split into individual pages for each letter.
147 | # html_split_index = False
148 |
149 | # If true, links to the reST sources are added to the pages.
150 | # html_show_sourcelink = True
151 |
152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
153 | # html_show_sphinx = True
154 |
155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
156 | # html_show_copyright = True
157 |
158 | # If true, an OpenSearch description file will be output, and all pages will
159 | # contain a <link> tag referring to it. The value of this option must be the
160 | # base URL from which the finished HTML is served.
161 | # html_use_opensearch = ''
162 |
163 | # This is the file name suffix for HTML files (e.g. ".xhtml").
164 | # html_file_suffix = None
165 |
166 | # Output file base name for HTML help builder.
167 | htmlhelp_basename = 'povertdoc'
168 |
169 |
170 | # -- Options for LaTeX output --------------------------------------------------
171 |
172 | latex_elements = {
173 | # The paper size ('letterpaper' or 'a4paper').
174 | # 'papersize': 'letterpaper',
175 |
176 | # The font size ('10pt', '11pt' or '12pt').
177 | # 'pointsize': '10pt',
178 |
179 | # Additional stuff for the LaTeX preamble.
180 | # 'preamble': '',
181 | }
182 |
183 | # Grouping the document tree into LaTeX files. List of tuples
184 | # (source start file, target name, title, author, documentclass [howto/manual]).
185 | latex_documents = [
186 | ('index',
187 | 'povert.tex',
188 | u'povert Documentation',
189 | u"Taras Baranyuk", 'manual'),
190 | ]
191 |
192 | # The name of an image file (relative to this directory) to place at the top of
193 | # the title page.
194 | # latex_logo = None
195 |
196 | # For "manual" documents, if this is true, then toplevel headings are parts,
197 | # not chapters.
198 | # latex_use_parts = False
199 |
200 | # If true, show page references after internal links.
201 | # latex_show_pagerefs = False
202 |
203 | # If true, show URL addresses after external links.
204 | # latex_show_urls = False
205 |
206 | # Documents to append as an appendix to all manuals.
207 | # latex_appendices = []
208 |
209 | # If false, no module index is generated.
210 | # latex_domain_indices = True
211 |
212 |
213 | # -- Options for manual page output --------------------------------------------
214 |
215 | # One entry per manual page. List of tuples
216 | # (source start file, name, description, authors, manual section).
217 | man_pages = [
218 | ('index', 'povert', u'povert Documentation',
219 | [u"Taras Baranyuk"], 1)
220 | ]
221 |
222 | # If true, show URL addresses after external links.
223 | # man_show_urls = False
224 |
225 |
226 | # -- Options for Texinfo output ------------------------------------------------
227 |
228 | # Grouping the document tree into Texinfo files. List of tuples
229 | # (source start file, target name, title, author,
230 | # dir menu entry, description, category)
231 | texinfo_documents = [
232 | ('index', 'povert', u'povert Documentation',
233 | u"Taras Baranyuk", 'povert',
234 | '2nd place 2018', 'Miscellaneous'),
235 | ]
236 |
237 | # Documents to append as an appendix to all manuals.
238 | # texinfo_appendices = []
239 |
240 | # If false, no module index is generated.
241 | # texinfo_domain_indices = True
242 |
243 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
244 | # texinfo_show_urls = 'footnote'
245 |
--------------------------------------------------------------------------------
/src/data/data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.utils import resample
3 | from collections import OrderedDict
4 | from sklearn.model_selection import StratifiedShuffleSplit
5 | from sklearn.preprocessing import StandardScaler
6 |
7 |
class Data():
    """
    Class for working with households data.

    Holds the train and test frames of one country, the list of columns
    common to both, and the detected categorical / non-categorical columns.
    """

    def __init__(self):
        self.country_df_train = None
        self.country_df_test = None
        self.categorical_list = []
        self.float_list = []
        self.train_file_name = None
        self.test_file_name = None
        # Set later by set_country() / load(); defined here so that reading
        # them early does not raise AttributeError.
        self.country = None
        self.col_common_list = []

    def split_data(self,
                   size=0.8,
                   n_splits=1,
                   random_state=1,
                   balance=False,
                   df=None):
        """
        Returns data partitions.

        Args:
            size: float, partition ratio, optional (default=0.8)
            n_splits: int, number of partitions, optional (default=1)
            random_state: int, RandomState instance, optional (default=1)
            balance: bool, resample data, optional (default=False)
            df: DataFrame, data for split, optional (default=None, meaning
                the loaded training frame); must contain a 'poor' column

        Returns:
            List of (train, validate) dataframe tuples.
        """

        if not isinstance(df, pd.DataFrame):
            train = self.country_df_train
        else:
            train = df
        sss = StratifiedShuffleSplit(n_splits=n_splits,
                                     test_size=1-size,
                                     random_state=random_state)
        splits = []
        for train_index, validate_index in sss.split(train, train.poor):
            df_train = train.iloc[train_index]
            if balance:
                # Only the training part is balanced; validation keeps the
                # original class distribution.
                df_train = self.resample(df_train)
            splits.append((df_train, train.iloc[validate_index]))
        return splits

    def _rename_col(self):
        """
        Rename columns to '<country>_<position>'.

        'poor' and 'country' keep their names. The mapping is derived from
        the training frame and applied to the test frame as well; both
        directions are kept in ``col_maping`` / ``col_maping_reverse``.
        """

        train_columns = self.country_df_train.columns
        train_new_columns = [
            x if (x == 'poor' or
                  x == 'country') else '{0}_{1}'.format(
                      self.country,
                      train_columns.get_loc(x)) for x in train_columns]
        self.country_df_train.columns = train_new_columns
        self.col_maping = dict(zip(train_columns, train_new_columns))
        self.col_maping_reverse = dict(zip(train_new_columns, train_columns))

        self.country_df_test.rename(columns=self.col_maping, inplace=True)

    def del_nonunique(self, df):
        """
        Delete constant columns, i.e. columns holding a single distinct value.

        Args:
            df: DataFrame, data for clean

        Returns:
            DataFrame without the columns that have exactly one unique value.
        """

        nunique = df.apply(pd.Series.nunique)
        cols_to_drop = nunique[nunique == 1].index
        print('Cols to drop:', cols_to_drop)
        return df.drop(cols_to_drop, axis=1)

    def category_float_search(self,
                              count=5,
                              countries=None,
                              cat_types=None,
                              fi_types=None):
        """
        Search for categorical features.

        Args:
            count: int, number of unique values for determining categoricity,
                optional (default=5)
            countries: list, list of countries for which to search not only for
                features with the type in cat_types,
                optional (default=['B'])
            cat_types: list, list of types for categorical features,
                optional (default=['object'])
            fi_types: list, A list of additional types for searching for category
                features, optional (default=['float64', 'int64'])

        Returns:
            Tuple with a list of categorical columns and a list of other columns
        """
        # None sentinels instead of mutable list defaults; the effective
        # defaults are unchanged.
        if countries is None:
            countries = ['B']
        if cat_types is None:
            cat_types = ['object']
        if fi_types is None:
            fi_types = ['float64', 'int64']

        categorical_list = list(
            self.country_df_train[self.col_common_list].select_dtypes(
                include=cat_types).columns)

        if self.country not in countries:
            return (categorical_list,
                    list(self.country_df_train[
                        self.col_common_list].select_dtypes(
                            include=fi_types).columns))

        float_list = []
        print('float list length: ', len(list(
            self.country_df_test.select_dtypes(include=fi_types).columns)))
        for i in list(self.country_df_test[
                self.col_common_list].select_dtypes(include=fi_types).columns):
            # A numeric column counts as categorical when train and test
            # together hold at most ``count`` distinct values.
            value_set = set(
                self.country_df_test[i].unique()).union(set(
                    self.country_df_train[i].unique()))
            if len(value_set) <= count:
                categorical_list.append(i)
            else:
                float_list.append(i)
        print('float list length: ', len(sorted(float_list)))
        return sorted(categorical_list), sorted(float_list)

    def scale(self):
        """
        Scale all non categorical values.

        Each column of ``float_list`` is standardised with a scaler fitted on
        the training column and applied to the test column.

        Returns:
            True when scaling was applied, False when ``float_list`` is empty.
        """
        if not self.float_list:
            print('There is no float list')
            return False
        scaler = StandardScaler()
        for i in self.float_list:
            self.country_df_train[i] = scaler.fit_transform(
                self.country_df_train[i].values.reshape(-1, 1))
            self.country_df_test[i] = scaler.transform(
                self.country_df_test[i].values.reshape(-1, 1))
        return True

    def fillna(self):
        """
        Replace `NaN` values with the median of the column and remove all the completely empty columns.

        Each frame is filled with its own column medians.
        """
        print('train data have NaNs: ', self.country_df_train.isnull().any().any())
        print('test data have NaNs: ', self.country_df_test.isnull().any().any())
        # numeric_only=True: medians exist only for numeric columns, and
        # recent pandas raises on object columns instead of skipping them.
        self.country_df_train = self.country_df_train.fillna(
            self.country_df_train.median(numeric_only=True)
        ).dropna(axis=1, how='all')
        self.country_df_test = self.country_df_test.fillna(
            self.country_df_test.median(numeric_only=True)
        ).dropna(axis=1, how='all')
        print('train data have NaNs: ', self.country_df_train.isnull().any().any())
        print('test data have NaNs: ', self.country_df_test.isnull().any().any())

    def set_file_names(self, files_dict):
        """
        Set file names for train and test dataframes

        Args:
            files_dict: dictionary, file names for 'train' and 'test'
        """
        self.train_file_name = files_dict.get('train')
        self.test_file_name = files_dict.get('test')

    def set_country(self, country):
        """
        Set country label.

        Args:
            country: string, a label for country
        """
        self.country = country
        print('Country: ', self.country)

    def load(self, load=True, with_bug=True):
        """
        Load data from files.

        Constant columns are always dropped. With ``load=False`` the columns
        are additionally renamed, NaNs are filled and the non-categorical
        columns may be scaled.

        Args:
            load: bool, load from file without postprocessing,
                optional (default=True)
            with_bug: bool, emulate a bug for final submission,
                optional, (default=True); while True, scaling is applied
                only for country 'B'

        Returns:
            True.
        """
        self.country_df_train = self.del_nonunique(
            pd.read_csv(self.train_file_name, index_col='id'))
        self.country_df_test = self.del_nonunique(
            pd.read_csv(self.test_file_name, index_col='id'))

        if not load:
            self._rename_col()
            self.fillna()
        # Only columns present in both frames are ever exposed as features.
        self.col_common_list = \
            sorted(list(set(self.country_df_train.columns).intersection(
                self.country_df_test.columns)))
        self.categorical_list, self.float_list = self.category_float_search()
        if not load:
            if self.country == 'B' or not with_bug:
                self.scale()
        print('dataind train shape: ', self.country_df_train.shape)
        return True

    def save(self, files_dict, poor=True):
        """
        Save data to files.

        Args:
            files_dict: dictionary, file names for 'train' and 'test'
            poor: bool, save poor column, optional (default=True)

        Returns:
            True.
        """
        train = self.get_train()
        if poor:
            train = pd.concat([train[0], train[1]], axis=1)
        else:
            train = train[0]
        train.to_csv(files_dict.get('train'), index=True, mode='w')
        test = self.get_test()
        test.to_csv(files_dict.get('test'), index=True, mode='w')
        return True

    def resample(self, df):
        """
        Resample dataframe.

        Rows with ``poor`` True are drawn with replacement until there are as
        many of them as rows with ``poor`` False.

        Args:
            df: DataFrame, dataframe for resample; must contain 'poor'

        Returns:
            Resampled dataframe.
        """
        # Build the mask from df itself: the full training frame's column
        # only lines up with df when df shares its (unique) index.
        is_poor = df['poor'].astype(bool)
        df_majority = df[~is_poor]
        df_minority = df[is_poor]

        df_minority_upsampled = resample(df_minority,
                                         replace=True,
                                         n_samples=df_majority.shape[0],
                                         random_state=1)
        return pd.concat([df_majority, df_minority_upsampled])

    def get_train(self, balance=False):
        """
        Get train data.

        Args:
            balance: bool, resample data, optional (default=False)

        Returns:
            Tuple with a train dataframe and a target dataframe.
        """
        if balance:
            train = self.resample(self.country_df_train)
            return train[self.col_common_list], train['poor']
        return (self.country_df_train[self.col_common_list],
                self.country_df_train['poor'])

    def get_train_valid(self, n_splits=1, balance=False):
        """
        Get train and valid sets.

        Args:
            n_splits: int, number of partitions, optional (default=1)
            balance: bool, resample data, optional (default=False)

        Returns:
            A list of ((train_x, train_y), (valid_x, valid_y)) tuples.
        """
        splits = self.split_data(n_splits=n_splits, balance=balance)
        return [((x[self.col_common_list], x.poor),
                 (y[self.col_common_list], y.poor)) for x, y in splits]

    def get_test(self):
        """
        Get test data.

        Returns:
            A test dataframe.
        """
        return self.country_df_test[self.col_common_list]

    def get_cat_list(self):
        """
        Get a list of categorical features.

        Returns:
            A list of columns.
        """
        return self.categorical_list

    def get_float_list(self):
        """
        Get a list of non-categorical features.

        Returns:
            A list of columns.
        """
        return self.float_list
304 |
305 | class DataInd(Data):
306 | """ Class for working with individual level data."""
307 |
    def __init__(self):
        """Initialise through the parent constructor; adds no state of its own."""
        super().__init__()
310 |
311 | def get_poor(self, df):
312 | """
313 | Get a dataframe with poor column.
314 |
315 | Returns:
316 | A dataframe with a poor column.
317 | """
318 | return df['poor'].reset_index()[['id', 'poor']].drop_duplicates().set_index('id')
319 |
320 | def summarize(self, df):
321 | """
322 | Get a dataframe with a summarized individual level data for household.
323 |
324 | Args:
325 | df: DataFrame, dataframe with an individual level data
326 |
327 | Returns:
328 | A dataframe with summarized columns.
329 | """
330 | count = df.copy().groupby(level=0).sum()
331 | res_df = pd.concat({'sum': count}, axis=1)
332 | res_df.columns = ['{0}_{1}'.format(i[0], i[1]) for i in res_df.columns]
333 | res_df = res_df.reindex(index=df.index.get_level_values(0))
334 | res_df = res_df[~res_df.index.duplicated(keep='first')]
335 | print('summarized size df: ', res_df.shape)
336 | return res_df
337 |
338 | def _get_id_list(self, df):
339 | """
340 | Get an ordered list of indeces.
341 |
342 | Args:
343 | df: DataFrame, dataframe with an individual level data
344 |
345 | Returns:
346 | An ordered list of indeces.
347 | """
348 | return list(OrderedDict.fromkeys(df.index.get_level_values(0)))
349 |
350 | def count_iid(self, df):
351 | """
352 | Get a dataframe with a count of individuals for households.
353 |
354 | Args:
355 | df: DataFrame, dataframe with an individual level data
356 |
357 | Returns:
358 | A dataframe with a count of individuals for households.
359 | """
360 | s = df.index.get_level_values(0).value_counts()
361 | return s.reindex(index=self._get_id_list(df)).to_frame('iid_cnt')
362 |
363 | def count_neg_poz(self, df):
364 | """
365 | Get a dataframe with a count of negative and positive values for
366 | an individual level data.
367 |
368 | Args:
369 | df: DataFrame, dataframe with an individual level data
370 |
371 | Returns:
372 | A dataframe with a count of negative and positive values for
373 | an individual level data.
374 | """
375 | res_df = df.select_dtypes(include=['float64', 'int64', 'int8'])
376 | res_df = res_df.groupby(level=0).apply(lambda c: c.apply(
377 | lambda x: pd.Series(
378 | [(x < 0).sum(), (x >= 0).sum()])).unstack())
379 | res_df.columns = ['{0}_{1}'.format(i[0], i[1])
380 | for i in res_df.columns]
381 | print('count_neg_poz size df: ', res_df.shape)
382 | return res_df.reindex(index=self._get_id_list(df))
383 |
384 | def count_unique_categories(self, df, iid=True):
385 | """
386 | Get a dataframe with a count of unique values for an individual
387 | level data.
388 |
389 | Args:
390 | df: DataFrame, dataframe with an individual level data
391 | iid: bool, add columns with the ratio of the number of unique
392 | values to the number of individuals in households,
393 | optional (default=True)
394 |
395 | Returns:
396 | A dataframe with a count of unique values for an individual
397 | level data.
398 | """
399 | res_df = df.groupby(level=0).apply(
400 | lambda c: c.apply(lambda x: pd.Series([len((x).unique())])))
401 | res_df.index = res_df.index.droplevel(1)
402 | res_df.columns = [
403 | '{0}_{1}'.format('cat_n', i) for i in res_df.columns]
404 | print('count_unique_categories size df: ', res_df.shape)
405 | res_df = res_df.reindex(index=self._get_id_list(df))
406 | if iid:
407 | div_df = res_df.div(self.count_iid(df)['iid_cnt'], axis=0)
408 | div_df.columns = ['{0}_{1}'.format('div_cat_iid', i)
409 | for i in res_df.columns]
410 | res_df = pd.concat([res_df, div_df], axis=1)
411 | return res_df
412 |
413 | def load(self, load=True, cat_enc=False):
414 | """
415 | Load data from files.
416 |
417 | Args:
418 | load: bool, load from file without postprocessing,
419 | optional (default=True)
420 | cat_enc: bool, encode categories to numeric values,
421 | optional, (default=False)
422 | """
423 |
424 | print('DataInd load')
425 | if load:
426 | self.country_df_train = self.del_nonunique(
427 | pd.read_csv(self.train_file_name, index_col=['id']))
428 | self.country_df_test = self.del_nonunique(
429 | pd.read_csv(self.test_file_name, index_col=['id']))
430 |
431 | if not load:
432 | print(self.train_file_name)
433 | print(self.test_file_name)
434 | self.country_df_train = self.del_nonunique(
435 | pd.read_csv(self.train_file_name, index_col=['id', 'iid']))
436 | self.country_df_test = self.del_nonunique(
437 | pd.read_csv(self.test_file_name, index_col=['id', 'iid']))
438 | self._rename_col()
439 | self.fillna()
440 | self.col_common_list = sorted(
441 | list(set(self.country_df_train.columns).intersection(
442 | self.country_df_test.columns)))
443 |
444 | self.categorical_list, self.float_list = self.category_float_search(
445 | countries=['A', 'B', 'C'])
446 |
447 | if cat_enc:
448 | for header in self.categorical_list:
449 | self.country_df_train[header] = self.country_df_train[header].astype('category').cat.codes
450 | self.country_df_test[header] = self.country_df_test[header].astype('category').cat.codes
451 | # To reproduce the result in the final submission.
452 | # In the general solution, this scale is not needed.
453 | self.scale()
454 | self.country_df_train = self.del_nonunique(pd.concat(
455 | [self.get_poor(self.country_df_train),
456 | self.count_iid(self.country_df_train),
457 | self.count_neg_poz(self.country_df_train),
458 | self.summarize(self.country_df_train),
459 | self.count_unique_categories(self.country_df_train)],
460 | axis=1))
461 |
462 | self.country_df_test = self.del_nonunique(pd.concat(
463 | [self.count_iid(self.country_df_test),
464 | self.count_neg_poz(self.country_df_test),
465 | self.summarize(self.country_df_test),
466 | self.count_unique_categories(self.country_df_test)],
467 | axis=1))
468 |
469 | self.col_common_list = sorted(
470 | list(set(self.country_df_train.columns).intersection(
471 | self.country_df_test.columns)))
472 | self.categorical_list, self.float_list = self.category_float_search(
473 | countries=['A', 'B', 'C'])
474 | if not load:
475 | self.scale()
476 | print('indiv train shape: ', self.country_df_train.shape)
477 | print('indiv test shape: ', self.country_df_test.shape)
478 | return True
479 |
480 |
class DataConcat(Data):
    """
    Class for working with concatenated data from individual and household
    levels.
    """

    def __init__(self):
        # Placeholders initialised before the parent constructor runs;
        # nothing else in this class reads or writes them.
        self.data_hh_train = None
        self.data_hh_test = None
        self.data_indiv_train = None
        self.data_indiv_test = None
        super().__init__()

    def set_file_names(self, files_dict):
        """
        Set file names for train and test dataframes

        Args:
            files_dict: dictionary of file names. The keys 'train_hh',
                'test_hh', 'train_ind' and 'test_ind' are read here (a
                missing key yields None); the whole dictionary is then
                passed on to the parent implementation.
        """
        self.hh_train_file_name = files_dict.get('train_hh')
        self.hh_test_file_name = files_dict.get('test_hh')
        self.ind_train_file_name = files_dict.get('train_ind')
        self.ind_test_file_name = files_dict.get('test_ind')
        super().set_file_names(files_dict)

    def load(self, load=True, cat_enc=False, with_bug=True):
        """
        Load data from files.

        Args:
            load: bool, load from file without postprocessing,
                optional (default=True). When false, household data is
                joined with individual data instead of reading the
                combined files.
            cat_enc: bool, encode categories to numeric values,
                optional, (default=False). Accepted for signature
                compatibility; this method does not use it.
            with_bug: bool, emulate a bug for final submission,
                optional, (default=True)

        Returns:
            True on success, False when loading the household or the
            individual data reports failure.
        """
        if with_bug or not load:
            data_hh = Data()
            data_hh.set_country(self.country)
            data_hh.set_file_names({'train': self.hh_train_file_name,
                                    'test': self.hh_test_file_name})
            if not data_hh.load(load=False, with_bug=with_bug):
                return False

        if load:
            print('DataConcat load')
            self.country_df_train = self.del_nonunique(pd.read_csv(
                self.train_file_name, index_col=['id']))
            self.country_df_test = self.del_nonunique(pd.read_csv(
                self.test_file_name, index_col=['id']))
        else:
            data_ind = DataInd()
            data_ind.set_country(self.country)
            data_ind.set_file_names({'train': self.ind_train_file_name,
                                     'test': self.ind_test_file_name})

            if not data_ind.load(load=True):
                # Report the failure like the household branch above does,
                # instead of falling through with frames that were never
                # assigned for this call.
                return False

            self.country_df_train = data_hh.country_df_train.join(
                data_ind.country_df_train)
            self.country_df_test = data_hh.country_df_test.join(
                data_ind.country_df_test)

        self.col_common_list = sorted(
            set(self.country_df_train.columns).intersection(
                self.country_df_test.columns))

        if with_bug:
            # Reproduces the final submission: only the household-level
            # categorical list is taken over; float_list is not set here.
            self.categorical_list = data_hh.categorical_list
        else:
            self.categorical_list, self.float_list = (
                self.category_float_search(countries=['B']))

        print('train:', self.country_df_train.shape)
        print('test:', self.country_df_test.shape)

        return True
559 |
--------------------------------------------------------------------------------
/src/models/predict_model.py:
--------------------------------------------------------------------------------
import os
import sys

import pandas as pd

# The project-local imports below can only be resolved from ``src`` once it
# is on sys.path, so the path is extended before they run (previously this
# happened after them and had no effect on these imports).
src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(src_dir)

import process  # noqa: E402
from data.data import Data, DataConcat  # noqa: E402
from models import LGBM_model, CB_model, XGB_model  # noqa: E402
10 |
11 |
12 | def predict(p_models={'xgboost': True,
13 | 'lightgbm': True,
14 | 'catboost': True}):
15 | filenames_dict = {
16 | 'A': {'train': 'data/processed/A_hhold_train.csv',
17 | 'test': 'data/processed/A_hhold_test.csv',
18 | 'train_hh': 'data/raw/A_hhold_train.csv',
19 | 'test_hh': 'data/raw/A_hhold_test.csv',
20 | 'train_ind': 'data/processed/A_indiv_train.csv',
21 | 'test_ind': 'data/processed/A_indiv_test.csv'
22 | },
23 | 'B': {'train': 'data/processed/B_combine_train.csv',
24 | 'test': 'data/processed/B_combine_test.csv',
25 | 'train_hh': 'data/raw/B_hhold_train.csv',
26 | 'test_hh': 'data/raw/B_hhold_test.csv',
27 | 'train_ind': 'data/processed/B_indiv_ext_train.csv',
28 | 'test_ind': 'data/processed/B_indiv_ext_test.csv'
29 | },
30 | 'C': {'train': 'data/processed/C_combine_train.csv',
31 | 'test': 'data/processed/C_combine_test.csv',
32 | 'train_hh': 'data/raw/C_hhold_train.csv',
33 | 'test_hh': 'data/raw/C_hhold_test.csv',
34 | 'train_ind': 'data/processed/C_indiv_ext_train.csv',
35 | 'test_ind': 'data/processed/C_indiv_ext_test.csv'
36 | },
37 | }
38 |
39 | data_A = Data()
40 | data_B = DataConcat()
41 | data_C = DataConcat()
42 |
43 | data_A.set_country('A')
44 | data_B.set_country('B')
45 | data_C.set_country('C')
46 |
47 | data_A.set_file_names(files_dict=filenames_dict['A'])
48 | data_B.set_file_names(files_dict=filenames_dict['B'])
49 | data_C.set_file_names(files_dict=filenames_dict['C'])
50 |
51 | data_A.load(load=True)
52 | # To reproduce the result in the final submission.
53 | # Saving data to a file changes this data due to rounding of numbers.
54 | data_B.load(load=False)
55 | data_C.load(load=False)
56 |
57 | data_dict = {'A': data_A, 'B': data_B, 'C': data_C}
58 | balances = {'A': False, 'B': False, 'C': True}
59 |
60 | # XGBoost prediction
61 | if p_models['xgboost']:
62 | params_XGB_A = {
63 | 'learning_rate': 0.03,
64 | 'max_depth': 3,
65 | 'n_estimators': 1500,
66 | 'silent': True,
67 | 'objective': 'binary:logistic',
68 | 'gamma': 0.3,
69 | 'subsample': 0.7,
70 | 'reg_alpha': 0.05
71 | }
72 |
73 | params_XGB_B = {
74 | 'learning_rate': 0.03,
75 | 'max_depth': 5,
76 | 'n_estimators': 400,
77 | 'silent': True,
78 | 'objective': 'binary:logistic',
79 | 'gamma': 0.2,
80 | 'subsample': 0.7,
81 | 'reg_alpha': 0.05,
82 | }
83 |
84 | params_XGB_C = {
85 | 'learning_rate': 0.03,
86 | 'max_depth': 3,
87 | 'n_estimators': 500,
88 | 'silent': True,
89 | 'objective': 'binary:logistic',
90 | 'gamma': 0.2,
91 | 'subsample': 0.6,
92 | 'reg_alpha': 0.05,
93 | }
94 |
95 | model_xgb_A = XGB_model(categ_conv=True)
96 | model_xgb_A.set_params(params=params_XGB_A)
97 | model_xgb_B = XGB_model(categ_conv=True)
98 | model_xgb_B.set_params(params=params_XGB_B)
99 | model_xgb_C = XGB_model(categ_conv=True)
100 | model_xgb_C.set_params(params=params_XGB_C)
101 | model_xgb_dict = {'A': model_xgb_A, 'B': model_xgb_B, 'C': model_xgb_C}
102 |
103 | # List of columns to delete obtained via find_exclude function and cross-validation
104 | exclude_XGB_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_106', 'A_11', 'A_113', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_140', 'A_146', 'A_147', 'A_148', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_17', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_179', 'A_18', 'A_181', 'A_185', 'A_186', 'A_191', 'A_195', 'A_197', 'A_2', 'A_202', 'A_203', 'A_206', 'A_213', 'A_215', 'A_216', 'A_218', 'A_219', 'A_22', 'A_223', 'A_225', 'A_226', 'A_227', 'A_232', 'A_234', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_277', 'A_282', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_302', 'A_305', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_315', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_45', 'A_46', 'A_49', 'A_57', 'A_59', 'A_60', 'A_61', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_76', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93', 'A_97', 'cat_n_A_25', 'cat_n_A_3', 'cat_n_A_36', 'cat_n_A_4', 'iid_cnt', 'A_105', 'A_114', 'A_229', 'cat_n_A_20', 'div_cat_iid_cat_n_A_25', 'A_14', 'A_6_1', 'cat_n_A_39'],
105 | 'B': ['B_0', 'B_1', 'B_106', 'B_109', 'B_112', 'B_12', 'B_120', 'B_121', 'B_128', 'B_135', 'B_14', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_145', 'B_148', 'B_149', 'B_151', 'B_152', 'B_152_1', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_15_1', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_167', 'B_17', 'B_172', 'B_173', 'B_174_1', 'B_175_0', 'B_175_1', 'B_176', 'B_18', 'B_180_1', 'B_187', 'B_188', 'B_188_1', 'B_191', 'B_196', 'B_196_0', 'B_196_1', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_20_1', 'B_210', 'B_210_0', 'B_210_1', 'B_211', 'B_212', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_23', 'B_230', 'B_234', 'B_237', 'B_238', 'B_239', 'B_24', 'B_241', 'B_243', 'B_244', 'B_247', 'B_248', 'B_250', 'B_251', 'B_252', 'B_254', 'B_256', 'B_259', 'B_260', 'B_264', 'B_265', 'B_266', 'B_269', 'B_271', 'B_275', 'B_278', 'B_279', 'B_28', 'B_284', 'B_29', 'B_3', 'B_302', 'B_303', 'B_304', 'B_307', 'B_313', 'B_314', 'B_320', 'B_334', 'B_337', 'B_340', 'B_342', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_353', 'B_354', 'B_355', 'B_356', 'B_359', 'B_35_0', 'B_35_1', 'B_36', 'B_361', 'B_362', 'B_363', 'B_364', 'B_365', 'B_366', 'B_368', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_375', 'B_379', 'B_385', 'B_386', 'B_389', 'B_390', 'B_391', 'B_392', 'B_394', 'B_395', 'B_397', 'B_400', 'B_402', 'B_405', 'B_406', 'B_407', 'B_41', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_423', 'B_427', 'B_428', 'B_44', 'B_47', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_0', 'B_60_1', 'B_61', 'B_62', 'B_63', 'B_64', 'B_65', 'B_66', 'B_67', 'B_68_0', 'B_68_1', 'B_7', 'B_71_1', 'B_72', 'B_76', 'B_80', 'B_83', 'B_86', 'B_89', 'B_8_0', 'B_8_1', 'B_9', 'B_94', 'B_95', 'B_96', 'B_99', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_102', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_111', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_120', 
'cat_n_B_122', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_165', 'cat_n_B_166', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_17', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_181', 'cat_n_B_182', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_35', 'cat_n_B_36', 'cat_n_B_37', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_42', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_49', 'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_8', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_86', 'cat_n_B_88', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_114', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_127', 
'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_216', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_43', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_75', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_77', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 
'sum_B_174', 'sum_B_188', 'B_10', 'B_101', 'B_104', 'B_107', 'B_11', 'B_111', 'B_116', 'B_123_0', 'B_156', 'B_159_1', 'B_164', 'B_170', 'B_171', 'B_174_0', 'B_182', 'B_192', 'B_194', 'B_19_0', 'B_216', 'B_223', 'B_224', 'B_229', 'B_235', 'B_25', 'B_272', 'B_282', 'B_283', 'B_288', 'B_290', 'B_293', 'B_297', 'B_317', 'B_318', 'B_322', 'B_325', 'B_343', 'B_352', 'B_373', 'B_384', 'B_403', 'B_51', 'B_68', 'B_73', 'B_92', 'cat_n_B_12', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_131', 'cat_n_B_132', 'cat_n_B_136', 'cat_n_B_159', 'cat_n_B_167', 'cat_n_B_19', 'cat_n_B_191', 'cat_n_B_194', 'cat_n_B_2', 'cat_n_B_200', 'cat_n_B_202', 'cat_n_B_207', 'cat_n_B_210', 'cat_n_B_217', 'cat_n_B_44', 'cat_n_B_59', 'cat_n_B_67', 'cat_n_B_75', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_91', 'cat_n_B_96', 'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_167', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_198', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_203', 'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_211', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_5', 'B_123_1', 'B_146', 'B_147', 'B_174', 'B_198_1', 'B_218', 'B_222_1', 'B_285', 'B_296', 'B_339', 'B_414', 'B_85', 'B_91', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_123', 'cat_n_B_151', 'cat_n_B_178', 'cat_n_B_180', 'cat_n_B_183', 'cat_n_B_195', 'cat_n_B_199', 'cat_n_B_29', 'cat_n_B_43', 'cat_n_B_48', 'cat_n_B_74', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_55', 'sum_B_35', 'B_103', 'B_107_1', 'B_123', 'B_155', 'B_178', 'B_183', 'B_2', 'B_233', 'B_268', 'B_270', 'B_295', 'B_319', 'B_321', 'B_328', 'B_33', 'B_360', 'B_382', 'B_383', 'B_387', 'B_388', 'B_46_0', 'B_75', 'cat_n_B_119', 'cat_n_B_128', 'cat_n_B_146', 
'cat_n_B_173', 'cat_n_B_40', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_128', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_64', 'sum_B_180', 'B_115', 'B_124', 'B_19', 'B_19_1', 'B_330', 'B_357', 'B_409', 'cat_n_B_103', 'cat_n_B_121', 'cat_n_B_164', 'cat_n_B_186', 'cat_n_B_54', 'cat_n_B_73', 'cat_n_B_80', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_44', 'B_163', 'B_165', 'B_180_0', 'B_236', 'B_277', 'B_292', 'B_329', 'B_34', 'B_46_1', 'cat_n_B_57', 'div_cat_iid_cat_n_B_130', 'div_cat_iid_cat_n_B_57'],
106 | 'C': ['C_1', 'C_10', 'C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_141', 'C_143', 'C_146', 'C_151', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_19', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_3', 'C_31', 'C_32', 'C_44', 'C_45', 'C_47', 'C_54', 'C_55', 'C_59', 'C_6', 'C_63', 'C_65', 'C_67', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_84', 'C_85', 'C_87', 'C_89', 'C_9', 'C_92', 'C_94', 'C_96', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_18', 'cat_n_C_20', 'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_32', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_13', 'div_cat_iid_cat_n_C_23', 'div_cat_iid_cat_n_C_26', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_32', 'div_cat_iid_cat_n_C_6', 'div_cat_iid_cat_n_C_7', 'iid_cnt', 'C_13', 'C_144', 'C_161', 'C_2', 'C_29', 'C_33', 'C_41', 'C_79', 'C_90', 'C_98', 'cat_n_C_25', 'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_37', 'cat_n_C_6', 'div_cat_iid_cat_n_C_0', 'div_cat_iid_cat_n_C_14', 'div_cat_iid_cat_n_C_20', 'C_145', 'C_60', 'C_69', 'cat_n_C_13', 'cat_n_C_40', 'div_cat_iid_cat_n_C_15', 'div_cat_iid_cat_n_C_5', 'C_142', 'C_50', 'C_62', 'C_103', 'C_121', 'C_24', 'C_30', 'C_39', 'C_40', 'C_112', 'C_123']}
107 |
108 | process_xgb = process.processing(countries=['A', 'B', 'C'],
109 | balances=balances)
110 | process_xgb.set_data_dict(data_dict=data_dict)
111 | process_xgb.set_model_dict(model_dict=model_xgb_dict)
112 | # process_xgb.find_exclude()
113 | process_xgb.set_exclude_dict(exclude_XGB_dict)
114 | result_xgb = process_xgb.predict(model_name='xgboost', path='models/')
115 |
116 | # LightGBM prediction
117 | if p_models['lightgbm']:
118 | params_LGBM_A = {
119 | 'learning_rate': 0.02,
120 | 'max_depth': 6,
121 | 'n_estimators': 942,
122 | 'silent': True,
123 | 'objective': 'binary',
124 | 'subsample': 0.6,
125 | 'reg_alpha': 0.02,
126 | 'is_unbalance': True,
127 | 'boosting_type': 'gbdt',
128 | 'reg_lambda': 0.01,
129 | 'random_state': 1
130 | }
131 |
132 | params_LGBM_B = {
133 | 'learning_rate': 0.03,
134 | 'max_depth': 6,
135 | 'n_estimators': 232,
136 | 'silent': True,
137 | 'objective': 'binary',
138 | 'subsample': 0.8,
139 | 'reg_alpha': 0.05,
140 | 'is_unbalance': True,
141 | 'boosting_type': 'gbdt',
142 | 'reg_lambda': 0.00,
143 | 'random_state': 1
144 | }
145 |
146 | params_LGBM_C = {
147 | 'learning_rate': 0.05,
148 | 'max_depth': 3,
149 | 'n_estimators': 520,
150 | 'silent': True,
151 | 'objective': 'binary',
152 | 'subsample': 0.7,
153 | 'reg_alpha': 0.05,
154 | 'is_unbalance': True,
155 | 'boosting_type': 'gbdt',
156 | 'reg_lambda': 0.03,
157 | 'random_state': 1
158 | }
159 |
160 | model_lgbm_A = LGBM_model(categ_conv=True)
161 | model_lgbm_A.set_params(params=params_LGBM_A)
162 | model_lgbm_B = LGBM_model(categ_conv=True)
163 | model_lgbm_B.set_params(params=params_LGBM_B)
164 | model_lgbm_C = LGBM_model(categ_conv=True)
165 | model_lgbm_C.set_params(params=params_LGBM_C)
166 | model_lgbm_dict = {'A': model_lgbm_A, 'B': model_lgbm_B, 'C': model_lgbm_C}
167 |
168 | # List of columns to delete obtained via find_exclude function and cross-validation
169 | exclude_LGBM_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_105', 'A_106', 'A_11', 'A_112', 'A_113', 'A_115', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_141', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_18', 'A_181', 'A_185', 'A_191', 'A_195', 'A_197', 'A_202', 'A_203', 'A_206', 'A_215', 'A_216', 'A_218', 'A_219', 'A_223', 'A_225', 'A_232', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_275', 'A_282', 'A_292', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_338', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_46', 'A_47', 'A_49', 'A_57', 'A_59', 'A_60', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93'],
170 | 'B': ['B_0', 'B_1', 'B_106', 'B_106_0', 'B_107', 'B_11', 'B_115', 'B_120', 'B_121', 'B_128', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_151', 'B_152', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_159_1', 'B_15_0', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_164', 'B_165', 'B_167', 'B_17', 'B_172', 'B_174', 'B_174_0', 'B_174_1', 'B_176', 'B_18', 'B_180_0', 'B_187', 'B_188', 'B_188_1', 'B_191', 'B_194', 'B_196', 'B_196_0', 'B_196_1', 'B_19_0', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_210_0', 'B_210_1', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_229', 'B_23', 'B_230', 'B_234', 'B_236', 'B_238', 'B_24', 'B_241', 'B_242', 'B_243', 'B_244', 'B_247', 'B_25', 'B_250', 'B_254', 'B_256', 'B_264', 'B_266', 'B_269', 'B_271', 'B_272', 'B_275', 'B_279', 'B_283', 'B_284', 'B_288', 'B_29', 'B_293', 'B_296', 'B_3', 'B_302', 'B_303', 'B_307', 'B_314', 'B_317', 'B_318', 'B_325', 'B_329', 'B_330', 'B_334', 'B_337', 'B_340', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_354', 'B_355', 'B_356', 'B_35_0', 'B_35_1', 'B_36', 'B_361', 'B_366', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_373', 'B_385', 'B_386', 'B_389', 'B_390', 'B_394', 'B_397', 'B_399', 'B_400', 'B_402', 'B_405', 'B_406', 'B_407', 'B_408', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_427', 'B_428', 'B_432', 'B_436', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_1', 'B_63', 'B_64', 'B_65', 'B_67', 'B_68_0', 'B_71_0', 'B_72', 'B_73', 'B_75', 'B_80', 'B_83', 'B_89', 'B_8_0', 'B_9', 'B_91', 'B_94', 'B_95', 'B_99', 'cat_n_B_1', 'cat_n_B_102', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_119', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_131', 'cat_n_B_134', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 
'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 'cat_n_B_184', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_213', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_42', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_64', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_174', 
'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_55', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 'sum_B_188'],
171 | 'C': ['C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_121', 'C_123', 'C_125', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_140', 'C_141', 'C_143', 'C_146', 'C_150', 'C_151', 'C_152', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_18', 'C_19', 'C_2', 'C_20', 'C_21', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_29', 'C_3', 'C_32', 'C_33', 'C_39', 'C_40', 'C_41', 'C_54', 'C_55', 'C_59', 'C_62', 'C_63', 'C_64', 'C_65', 'C_67', 'C_69', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_82', 'C_84', 'C_85', 'C_87', 'C_9', 'C_90', 'C_92', 'C_94', 'C_96', 'C_98', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_17', 'cat_n_C_18', 'cat_n_C_2', 'cat_n_C_20', 'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_30', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_2', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_4', 'div_cat_iid_cat_n_C_40', 'div_cat_iid_cat_n_C_7', 'iid_cnt']}
172 |
173 | process_lgbm = process.processing(countries=['A', 'B', 'C'],
174 | balances=balances)
175 | process_lgbm.set_data_dict(data_dict=data_dict)
176 | process_lgbm.set_model_dict(model_dict=model_lgbm_dict)
177 | process_lgbm.set_exclude_dict(exclude_LGBM_dict)
178 | # process_lgbm.find_exclude()
179 | result_lgbm = process_lgbm.predict(model_name='lightgbm', path='models/')
180 |
181 | # Catboost prediction
182 | if p_models['catboost']:
183 | params_CB_A = {
184 | 'iterations': 5000,
185 | 'learning_rate': 0.03,
186 | 'depth': 6,
187 | 'l2_leaf_reg': 3,
188 | 'loss_function': 'Logloss',
189 | 'random_seed': 1,
190 | 'logging_level': 'Silent',
191 | }
192 |
193 | params_CB_B = {
194 | 'iterations': 5000,
195 | 'learning_rate': 0.03,
196 | 'depth': 6,
197 | 'l2_leaf_reg': 3,
198 | 'loss_function': 'Logloss',
199 | 'random_seed': 1,
200 | 'logging_level': 'Silent',
201 | }
202 |
203 | params_CB_C = {
204 | 'iterations': 500,
205 | 'learning_rate': 0.03,
206 | 'depth': 6,
207 | 'l2_leaf_reg': 3,
208 | 'loss_function': 'Logloss',
209 | 'random_seed': 1,
210 | 'logging_level': 'Silent',
211 | }
212 |
213 | model_cb_A = CB_model(categ_conv=True)
214 | model_cb_A.set_params(params=params_CB_A)
215 | model_cb_B = CB_model(categ_conv=True)
216 | model_cb_B.set_params(params=params_CB_B)
217 | model_cb_C = CB_model(categ_conv=True)
218 | model_cb_C.set_params(params=params_CB_C)
219 | model_cb_dict = {'A': model_cb_A, 'B': model_cb_B, 'C': model_cb_C}
220 |
221 | # List of columns to delete obtained via find_exclude function and cross-validation
222 | exclude_CB_dict = {'A': ['A_0', 'A_10', 'A_106', 'A_113', 'A_114', 'A_115', 'A_120', 'A_138', 'A_15', 'A_173', 'A_174', 'A_175', 'A_181', 'A_185', 'A_191', 'A_195', 'A_202', 'A_206', 'A_215', 'A_216', 'A_218', 'A_223', 'A_245', 'A_250', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_263', 'A_272', 'A_277', 'A_295', 'A_299', 'A_308', 'A_309', 'A_32', 'A_33', 'A_330', 'A_39', 'A_43', 'A_44', 'A_57', 'A_59', 'A_63', 'A_69', 'A_6_1', 'A_70', 'A_72', 'A_77', 'A_81', 'A_88', 'A_89', 'A_93', 'cat_n_A_10', 'cat_n_A_15', 'cat_n_A_20', 'cat_n_A_22', 'cat_n_A_25', 'cat_n_A_33', 'cat_n_A_35', 'cat_n_A_39', 'cat_n_A_4', 'cat_n_A_5', 'cat_n_A_6', 'cat_n_A_8', 'cat_n_A_9', 'A_101', 'A_11', 'A_147', 'A_155', 'A_170', 'A_18', 'A_203', 'A_338', 'A_35', 'A_49', 'A_67', 'cat_n_A_11', 'cat_n_A_21', 'cat_n_A_30', 'cat_n_A_37', 'div_cat_iid_cat_n_A_16', 'A_105', 'A_14', 'A_149', 'A_197', 'A_26', 'A_261', 'A_302', 'A_312', 'A_319', 'A_328', 'A_32_1', 'A_341', 'A_9', 'cat_n_A_28', 'div_cat_iid_cat_n_A_32', 'A_121', 'A_125', 'A_131', 'A_161', 'A_17', 'A_192', 'A_229', 'A_259', 'A_60', 'A_80', 'cat_n_A_1', 'cat_n_A_29', 'A_13', 'A_134', 'A_176', 'A_182', 'A_213', 'A_22', 'A_267', 'A_301', 'A_31', 'A_146', 'A_162', 'A_27', 'A_152', 'A_189', 'A_292', 'A_3', 'A_65'],
223 | 'B': ['B_1', 'B_106_0', 'B_106_1', 'B_107_1', 'B_113', 'B_121', 'B_123_1', 'B_139', 'B_144_0', 'B_144_1', 'B_152_0', 'B_157_0', 'B_157_1', 'B_159_0', 'B_159_1', 'B_15_0', 'B_15_1', 'B_161_0', 'B_161_1', 'B_167', 'B_174_0', 'B_174_1', 'B_175_0', 'B_176', 'B_18', 'B_180_0', 'B_183', 'B_188_0', 'B_188_1', 'B_196', 'B_196_1', 'B_198_0', 'B_198_1', 'B_20', 'B_203', 'B_204', 'B_205', 'B_207', 'B_208', 'B_20_0', 'B_20_1', 'B_210_0', 'B_210_1', 'B_218_0', 'B_219', 'B_219_1', 'B_222_1', 'B_227', 'B_238', 'B_243', 'B_244', 'B_256', 'B_258', 'B_264', 'B_265', 'B_272', 'B_274', 'B_29', 'B_3', 'B_303', 'B_307', 'B_316', 'B_320', 'B_329', 'B_349', 'B_34_0', 'B_34_1', 'B_35', 'B_355', 'B_35_0', 'B_35_1', 'B_361', 'B_36_0', 'B_36_1', 'B_370', 'B_371', 'B_385', 'B_389', 'B_405', 'B_407', 'B_412', 'B_422', 'B_46_0', 'B_46_1', 'B_5', 'B_55', 'B_60_0', 'B_60_1', 'B_68_0', 'B_68_1', 'B_6_0', 'B_71_1', 'B_72', 'B_77', 'B_83', 'B_8_1', 'cat_n_B_0', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_100', 'cat_n_B_101', 'cat_n_B_102', 'cat_n_B_103', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_119', 'cat_n_B_12', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_131', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_135', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_14', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_143', 'cat_n_B_145', 'cat_n_B_146', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_15', 'cat_n_B_150', 'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_156', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_162', 'cat_n_B_163', 'cat_n_B_164', 'cat_n_B_165', 
'cat_n_B_166', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_173', 'cat_n_B_174', 'cat_n_B_175', 'cat_n_B_176', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_186', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_191', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_2', 'cat_n_B_20', 'cat_n_B_200', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_22', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_222', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_40', 'cat_n_B_41', 'cat_n_B_42', 'cat_n_B_43', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_46', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_53', 'cat_n_B_54', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_57', 'cat_n_B_58', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_61', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_66', 'cat_n_B_67', 'cat_n_B_68', 'cat_n_B_69', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_73', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_79', 'cat_n_B_8', 'cat_n_B_80', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_84', 'cat_n_B_86', 'cat_n_B_87', 'cat_n_B_88', 'cat_n_B_89', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_91', 'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_96', 'cat_n_B_97', 'cat_n_B_98', 'cat_n_B_99', 
'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_10', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_103', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_106', 'div_cat_iid_cat_n_B_107', 'div_cat_iid_cat_n_B_108', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_115', 'div_cat_iid_cat_n_B_117', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_123', 'div_cat_iid_cat_n_B_127', 'div_cat_iid_cat_n_B_129', 'div_cat_iid_cat_n_B_13', 'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_133', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_135', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_140', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_146', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_149', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_153', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_156', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_162', 'div_cat_iid_cat_n_B_165', 'div_cat_iid_cat_n_B_166', 'div_cat_iid_cat_n_B_168', 'div_cat_iid_cat_n_B_173', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_182', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_194', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_198', 'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_201', 
'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_203', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_205', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_212', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_218', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_44', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_46', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_48', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_53', 'div_cat_iid_cat_n_B_57', 'div_cat_iid_cat_n_B_58', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_66', 'div_cat_iid_cat_n_B_68', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_73', 'div_cat_iid_cat_n_B_74', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_79', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_87', 'div_cat_iid_cat_n_B_88', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_91', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_93', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_106', 'sum_B_123', 'sum_B_144', 'sum_B_157', 'sum_B_159', 'sum_B_161', 'sum_B_174', 'sum_B_180', 'sum_B_188', 'sum_B_19', 'sum_B_198', 'sum_B_20', 'sum_B_36', 'sum_B_6', 'B_11', 'B_127', 'B_173', 'B_180_1', 'B_196_0', 'B_19_0', 'B_206', 'B_219_0', 'B_221', 'B_269', 'B_280', 
'B_287', 'B_314', 'B_328', 'B_334', 'B_337', 'B_397', 'B_400', 'B_402', 'B_413', 'B_418', 'B_45', 'B_71', 'B_71_0', 'B_80', 'B_8_0', 'cat_n_B_144', 'cat_n_B_155', 'cat_n_B_17', 'cat_n_B_182', 'cat_n_B_219', 'cat_n_B_37', 'cat_n_B_74', 'cat_n_B_81', 'cat_n_B_85', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_144', 'div_cat_iid_cat_n_B_15', 'div_cat_iid_cat_n_B_150', 'div_cat_iid_cat_n_B_163', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_176', 'div_cat_iid_cat_n_B_19', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_22', 'div_cat_iid_cat_n_B_24', 'div_cat_iid_cat_n_B_25', 'div_cat_iid_cat_n_B_28', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_5', 'div_cat_iid_cat_n_B_54', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_8', 'div_cat_iid_cat_n_B_80', 'div_cat_iid_cat_n_B_96', 'div_cat_iid_cat_n_B_98', 'sum_B_107', 'sum_B_175', 'sum_B_196', 'sum_B_60', 'sum_B_68', 'B_140', 'B_142', 'B_160', 'B_239', 'B_302', 'B_352', 'B_353', 'B_366', 'B_372', 'B_386', 'B_392', 'B_420', 'B_97_1', 'cat_n_B_109', 'cat_n_B_35', 'cat_n_B_6', 'div_cat_iid_cat_n_B_101', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_175', 'div_cat_iid_cat_n_B_183', 'div_cat_iid_cat_n_B_185', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_4', 'div_cat_iid_cat_n_B_41', 'div_cat_iid_cat_n_B_89', 'sum_B_35', 'sum_B_46', 'B_107', 'B_107_0', 'B_123_0', 'B_147', 'B_161', 'B_175_1', 'B_248', 'B_250', 'B_251', 'B_317', 'B_33', 'B_356', 'B_64', 'B_86', 'div_cat_iid_cat_n_B_17', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_214', 'sum_B_71', 'B_112', 'B_120', 'B_132_1', 'B_19_1', 'B_236', 'B_427', 'B_57', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_97', 'B_115', 'B_12', 'B_141', 'B_180', 'B_222_0', 'B_230', 'B_241', 'B_266', 'B_288', 'B_312', 'B_335', 'B_394', 'B_79', 'B_95', 'B_99', 'cat_n_B_132', 'div_cat_iid_cat_n_B_100', 
'div_cat_iid_cat_n_B_164', 'div_cat_iid_cat_n_B_200', 'B_129', 'B_6_1', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_155', 'div_cat_iid_cat_n_B_43', 'sum_B_210', 'B_126', 'B_21', 'B_339', 'B_65', 'div_cat_iid_cat_n_B_125', 'sum_B_132', 'sum_B_219', 'B_128', 'B_8', 'div_cat_iid_cat_n_B_130', 'sum_B_222', 'B_191', 'B_30', 'B_4', 'sum_B_8', 'B_275', 'B_290', 'div_cat_iid_cat_n_B_195', 'B_325', 'B_63', 'B_157', 'B_260', 'B_423', 'B_91', 'div_cat_iid_cat_n_B_37', 'div_cat_iid_cat_n_B_55', 'B_430', 'div_cat_iid_cat_n_B_75', 'B_395', 'B_73', 'B_0', 'div_cat_iid_cat_n_B_86', 'B_23', 'B_268', 'B_27', 'B_306', 'B_348', 'B_6', 'B_92', 'div_cat_iid_cat_n_B_222', 'B_168', 'div_cat_iid_cat_n_B_56', 'B_318', 'B_340', 'B_301', 'B_164', 'B_271', 'B_417', 'B_111', 'B_285', 'B_350', 'B_187', 'B_246', 'B_401', 'B_89'],
224 | 'C': ['C_118', 'C_135', 'C_17_0', 'C_39', 'C_55', 'C_7', 'C_89', 'C_91', 'cat_n_C_0', 'cat_n_C_1', 'cat_n_C_10', 'cat_n_C_18', 'cat_n_C_21', 'cat_n_C_22', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_28', 'cat_n_C_3', 'cat_n_C_32', 'cat_n_C_37', 'cat_n_C_4', 'cat_n_C_40', 'cat_n_C_5', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_39', 'iid_cnt', 'C_14_1', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_2', 'cat_n_C_23', 'cat_n_C_27', 'cat_n_C_30', 'cat_n_C_38', 'cat_n_C_9', 'div_cat_iid_cat_n_C_26', 'C_129', 'C_57', 'C_76', 'cat_n_C_17', 'cat_n_C_20', 'cat_n_C_19', 'cat_n_C_6', 'div_cat_iid_cat_n_C_33', 'C_10_0', 'C_146', 'C_46', 'cat_n_C_39', 'div_cat_iid_cat_n_C_17']}
225 |
226 | process_cb = process.processing(countries=['A', 'B', 'C'],
227 | balances=balances)
228 | process_cb.set_data_dict(data_dict=data_dict)
229 | process_cb.set_model_dict(model_dict=model_cb_dict)
230 | # process_cb.find_exclude()
231 | process_cb.set_exclude_dict(exclude_CB_dict)
232 | result_cb = process_cb.predict(model_name='catboost', path='models/')
233 |
234 | # Create submission
235 | submission = pd.DataFrame(index=result_cb.index)
236 | submission['country'] = result_cb.country
237 | submission['poor'] = (result_xgb.poor * 0.4 +
238 | result_cb.poor * 0.4 +
239 | result_lgbm.poor * 0.2)
240 |
241 | process_cb.save_csv(submission, clf_model_name='combine', path='models/')
242 |
243 |
244 | if __name__ == '__main__':
245 | predict()
246 |
--------------------------------------------------------------------------------
/notebooks/reproduce_final_submission.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "ExecuteTime": {
8 | "end_time": "2018-03-20T12:48:12.113072",
9 | "start_time": "2018-03-20T12:48:11.495212"
10 | },
11 | "collapsed": false
12 | },
13 | "outputs": [],
14 | "source": [
15 | "import os\n",
16 | "import datetime\n",
17 | "import pandas as pd\n",
18 | "import numpy as np\n",
19 | "import xgboost as xgb\n",
20 | "import lightgbm as lgb\n",
21 | "from sklearn.utils import resample\n",
22 | "from sklearn.utils import class_weight\n",
23 | "from sklearn.model_selection import StratifiedShuffleSplit\n",
24 | "from sklearn.preprocessing import StandardScaler\n",
25 | "from sklearn.metrics import accuracy_score, log_loss\n",
26 | "from collections import OrderedDict\n",
27 | "from abc import ABC, abstractmethod, abstractproperty\n",
28 | "from catboost import CatBoostClassifier\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "ExecuteTime": {
36 | "end_time": "2018-03-20T12:48:13.937848",
37 | "start_time": "2018-03-20T12:48:13.698473"
38 | },
39 | "code_folding": [],
40 | "collapsed": true
41 | },
42 | "outputs": [],
43 | "source": [
44 | "class Data():\n",
45 | "\n",
46 | " def __init__(self):\n",
47 | " self.country_df_train = pd.DataFrame()\n",
48 | " self.country_df_test = pd.DataFrame()\n",
49 | " self.categorical_list = []\n",
50 | " self.float_list = []\n",
51 | " self.file_name = 'hhold'\n",
52 | "\n",
53 | " def split_data(self, size=0.8, n_splits=1, random_state=1, balance=False, df=None):\n",
54 | " if not isinstance(df, pd.DataFrame):\n",
55 | " train = self.country_df_train\n",
56 | " else:\n",
57 | " train = df \n",
58 | " sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1-size, random_state=random_state)\n",
59 | " splits = []\n",
60 | " for train_index, validate_index in sss.split(train, train.poor):\n",
61 | " df_train = train.iloc[train_index]\n",
62 | " if balance:\n",
63 | " df_train = self.resample(df_train)\n",
64 | " splits.append((df_train, train.iloc[validate_index]))\n",
65 | " return splits\n",
66 | "\n",
67 | " def _rename_col(self):\n",
68 | " train_columns = self.country_df_train.columns\n",
69 | " train_new_columns = [x if x == 'poor' or x == 'country' else '{0}_{1}'.format(self.country, \n",
70 | " train_columns.get_loc(x)) for x in train_columns]\n",
71 | " self.country_df_train.columns=train_new_columns\n",
72 | " self.col_maping = dict(zip(train_columns, train_new_columns))\n",
73 | " self.col_maping_reverse = dict(zip(train_new_columns, train_columns))\n",
74 | "\n",
75 | " self.country_df_test.rename(columns=self.col_maping, inplace=True) \n",
76 | " \n",
77 | " def del_nonunique(self, df):\n",
78 | " cols = list(df)\n",
79 | " nunique = df.apply(pd.Series.nunique)\n",
80 | " cols_to_drop = nunique[nunique == 1].index\n",
81 | " print('Cols to drop:', cols_to_drop)\n",
82 | " return df.drop(cols_to_drop, axis=1)\n",
83 | "\n",
84 | " def _category_float_search(self, countries=['B'], cat_types=['object'], fi_types=['float64', 'int64']):\n",
85 | " categorical_list = list(self.country_df_train[self.col_common_list].select_dtypes(\n",
86 | " include=cat_types).columns)\n",
87 | " if self.country not in countries:\n",
88 | " return categorical_list, list(\n",
89 | " self.country_df_train[self.col_common_list].select_dtypes(include=fi_types).columns)\n",
90 | " float_list = []\n",
91 | " scaler = StandardScaler()\n",
92 | " print('float list length: ', len(list(self.country_df_test.select_dtypes(include=fi_types).columns)))\n",
93 | " for i in list(self.country_df_test[self.col_common_list].select_dtypes(include=fi_types).columns):\n",
94 | " self.country_df_train[i].fillna(self.country_df_train[i].median(), inplace=True)\n",
95 | " self.country_df_test[i].fillna(self.country_df_test[i].median(), inplace=True)\n",
96 | " value_set = set(self.country_df_test[i].unique()).union(set(self.country_df_train[i].unique()))\n",
97 | " if len(value_set) <= 5:\n",
98 | " categorical_list.append(i)\n",
99 | " else:\n",
100 | " self.country_df_train[i] = scaler.fit_transform(self.country_df_train[i].values.reshape(-1, 1))\n",
101 | " self.country_df_test[i] = scaler.transform(self.country_df_test[i].values.reshape(-1, 1))\n",
102 | " float_list.append(i)\n",
103 | " print('float list length: ', len(sorted(float_list)))\n",
104 | " return sorted(categorical_list), sorted(float_list)\n",
105 | " \n",
106 | " \n",
107 | " def set_file_name(self, file_name):\n",
108 | " self.file_name = file_name\n",
109 | " \n",
110 | " def load(self, country=None, file_name='hhold'):\n",
111 | " self.file_name = file_name\n",
112 | " self.country = country\n",
113 | " if country in ['A', 'B', 'C']:\n",
114 | " self.country_df_train = self.del_nonunique(\n",
115 | " pd.read_csv(\"../data/raw/{0}_{1}_train.csv\".format(country, self.file_name),\n",
116 | " index_col='id'))\n",
117 | " self.country_df_test = self.del_nonunique(\n",
118 | " pd.read_csv(\"../data/raw/{0}_{1}_test.csv\".format(country, self.file_name),\n",
119 | " index_col='id'))\n",
120 | " \n",
121 | " self._rename_col()\n",
122 | " self.col_common_list = \\\n",
123 | " sorted(list(set(self.country_df_train.columns).intersection(self.country_df_test.columns)))\n",
124 | " self.categorical_list, self.float_list = self._category_float_search()\n",
125 | " return True\n",
126 | " else:\n",
127 | " return False\n",
128 | "\n",
129 | " def save(self, ext='_ext', poor=True):\n",
130 | " train = self.get_train()\n",
131 | " if poor:\n",
132 | " train = pd.concat([train[0], train[1]], axis=1)\n",
133 | " else:\n",
134 | " train = train[0]\n",
135 | " file_name = \"../data/processed/{0}_{1}{2}_train.csv\".format(self.country, self.file_name, ext) \n",
136 | " train.to_csv(file_name, index=True, mode='w')\n",
137 | " test = self.get_test()\n",
138 | " file_name = \"../data/processed/{0}_{1}{2}_test.csv\".format(self.country, self.file_name, ext) \n",
139 | " test.to_csv(file_name, index=True, mode='w') \n",
140 | " return True\n",
141 | "\n",
142 | " def resample(self, df):\n",
143 | " df_majority = df[df.poor==False]\n",
144 | " df_minority = df[df.poor==True]\n",
145 | "\n",
146 | " df_minority_upsampled = resample(df_minority, \n",
147 | " replace=True,\n",
148 | " n_samples=df_majority.shape[0],\n",
149 | " random_state=1) \n",
150 | " return pd.concat([df_majority, df_minority_upsampled]) \n",
151 | " \n",
152 | " \n",
153 | " def get_train(self, balance=False):\n",
154 | " if balance:\n",
155 | " train = self.resample(self.country_df_train)\n",
156 | " return train[self.col_common_list], train['poor'] \n",
157 | " return self.country_df_train[self.col_common_list], self.country_df_train['poor']\n",
158 | " \n",
159 | " def get_train_valid(self, n_splits=1, balance=False):\n",
160 | " splits = self.split_data(n_splits=n_splits, balance=balance)\n",
161 | " return [((x[self.col_common_list], x.poor),(y[self.col_common_list], y.poor)) for x,y in splits]\n",
162 | " \n",
163 | " def get_test(self):\n",
164 | " return self.country_df_test[self.col_common_list]\n",
165 | " \n",
166 | " def get_cat_list(self): \n",
167 | " return self.categorical_list\n",
168 | " \n",
169 | " def get_float_list(self):\n",
170 | " return self.float_list"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "ExecuteTime": {
178 | "end_time": "2018-03-21T19:43:58.907936",
179 | "start_time": "2018-03-21T19:43:58.724366"
180 | },
181 | "code_folding": [],
182 | "collapsed": false
183 | },
184 | "outputs": [],
185 | "source": [
186 | "class DataInd(Data):\n",
187 | "\n",
188 | " def __init__(self):\n",
189 | " super().__init__()\n",
190 | " self.file_name = 'indiv'\n",
191 | " \n",
192 | " def get_poor(self, df):\n",
193 | " return df['poor'].reset_index()[['id', 'poor']].drop_duplicates().set_index('id')\n",
194 | " \n",
195 | " def summarize(self, df):\n",
196 | " count = df.copy().groupby(level=0).sum()\n",
197 | " res_df = pd.concat({'sum': count}, axis=1)\n",
198 | " res_df.columns = ['{0}_{1}'.format(i[0], i[1]) for i in res_df.columns]\n",
199 | " res_df = res_df.reindex(index=df.index.get_level_values(0))\n",
200 | " res_df = res_df[~res_df.index.duplicated(keep='first')]\n",
201 | " print('summarized size df: ', res_df.shape)\n",
202 | " return res_df\n",
203 | " \n",
204 | " def _get_id_list(self, df):\n",
205 | " return list(OrderedDict.fromkeys(df.index.get_level_values(0)))\n",
206 | " \n",
207 | " def count_iid(self, df):\n",
208 | " s = df.index.get_level_values(0).value_counts()\n",
209 | " return s.reindex(index = self._get_id_list(df)).to_frame('iid_cnt')\n",
210 | " \n",
211 | " def count_neg_poz(self, df):\n",
212 | " print('count_neg_poz input df shape', df.shape)\n",
213 | " res_df = df.select_dtypes(include=['float64','int64','int8'])\n",
214 | " print('count_neg_poz res_df shape', res_df.shape)\n",
215 | " res_df = res_df.groupby(level=0).apply(lambda c: c.apply(\n",
216 | " lambda x: pd.Series([(x < 0).sum(), (x >= 0).sum()])).unstack())\n",
217 | " res_df.columns = ['{0}_{1}'.format(i[0], i[1]) for i in res_df.columns] \n",
218 | " print('count_neg_poz size df: ', res_df.shape)\n",
219 | " return res_df.reindex(index = self._get_id_list(df))\n",
220 | " \n",
221 | " def count_unique_categories(self, df, iid=True):\n",
222 | " res_df = df.groupby(level=0).apply(lambda c: c.apply(lambda x: pd.Series([len((x).unique())])))\n",
223 | " res_df.index = res_df.index.droplevel(1)\n",
224 | " res_df.columns = ['{0}_{1}'.format('cat_n', i) for i in res_df.columns]\n",
225 | " print('count_unique_categories size df: ', res_df.shape)\n",
226 | " res_df = res_df.reindex(index = self._get_id_list(df))\n",
227 | " if iid:\n",
228 | " div_df = res_df.div(self.count_iid(df)['iid_cnt'], axis=0)\n",
229 | " div_df.columns = ['{0}_{1}'.format('div_cat_iid', i) for i in res_df.columns]\n",
230 | " res_df = pd.concat([res_df, div_df], axis=1)\n",
231 | " return res_df\n",
232 | " \n",
233 | " def load(self, country=None, obj_enc=False, cat_enc=False): \n",
234 | " self.country = country\n",
235 | " if country in ['A', 'B', 'C']:\n",
236 | " self.country_df_train = self.del_nonunique(\n",
237 | " pd.read_csv(\"../data/raw/{0}_{1}_train.csv\".format(country, self.file_name), \n",
238 | " index_col=['id','iid']))\n",
239 | " self.country_df_test = self.del_nonunique(\n",
240 | " pd.read_csv(\"../data/raw/{0}_{1}_test.csv\".format(country, self.file_name), \n",
241 | " index_col=['id','iid']))\n",
242 | "\n",
243 | " self._rename_col()\n",
244 | " print(self.country_df_train.head())\n",
245 | " print(self.country_df_test.head())\n",
246 | " self.col_common_list = sorted(list(set(self.country_df_train.columns).intersection(\n",
247 | " self.country_df_test.columns)))\n",
248 | "\n",
249 | " self.categorical_list, self.float_list = self._category_float_search(countries=['A', 'B', 'C'])\n",
250 | "\n",
251 | " if cat_enc:\n",
252 | " for header in self.categorical_list:\n",
253 | " self.country_df_train[header] = self.country_df_train[header].astype('category').cat.codes\n",
254 | " self.country_df_test[header] = self.country_df_test[header].astype('category').cat.codes\n",
255 | " \n",
256 | " self.country_df_train = pd.concat([self.get_poor(self.country_df_train),\n",
257 | " self.count_iid(self.country_df_train),\n",
258 | " self.count_neg_poz(self.country_df_train),\n",
259 | " self.summarize(self.country_df_train),\n",
260 | " self.count_unique_categories(self.country_df_train)\n",
261 | " ], axis=1)\n",
262 | " self.country_df_test = pd.concat([self.count_iid(self.country_df_test),\n",
263 | " self.count_neg_poz(self.country_df_test),\n",
264 | " self.summarize(self.country_df_test),\n",
265 | " self.count_unique_categories(self.country_df_test)\n",
266 | " ], axis=1)\n",
267 | " print(self.country_df_train.head())\n",
268 | " print(self.country_df_test.head())\n",
269 | " self.col_common_list = sorted(list(set(self.country_df_train.columns).intersection(\n",
270 | " self.country_df_test.columns)))\n",
271 | " self.categorical_list, self.float_list = self._category_float_search(countries=['A', 'B', 'C'])\n",
272 | "\n",
273 | " if obj_enc:\n",
274 | " self.country_df_train = self.object_encode(self.country_df_train, self.categorical_list)\n",
275 | " self.country_df_test = self.object_encode(self.country_df_test, self.categorical_list)\n",
276 | " self.col_common_list = \\\n",
277 | " sorted(list(set(self.country_df_train.columns).intersection(self.country_df_test.columns)))\n",
278 | "\n",
279 | " print('dataind train shape: ', self.country_df_train.shape)\n",
280 | " print('dataind test shape: ', self.country_df_test.shape)\n",
281 | " return True\n",
282 | " else:\n",
283 | " return False"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {
290 | "ExecuteTime": {
291 | "end_time": "2018-03-21T19:22:13.042508",
292 | "start_time": "2018-03-21T19:22:13.022175"
293 | },
294 | "code_folding": [],
295 | "collapsed": true
296 | },
297 | "outputs": [],
298 | "source": [
299 | "class DataConcat(Data):\n",
300 | "\n",
301 | " def __init__(self):\n",
302 | " self.data_hh_train = pd.DataFrame()\n",
303 | " self.data_hh_test = pd.DataFrame() \n",
304 | " self.data_indiv_train = pd.DataFrame()\n",
305 | " self.data_indiv_test = pd.DataFrame()\n",
306 | " super().__init__()\n",
307 | " self.file_name = 'combine'\n",
308 | " \n",
309 | " def load(self, country=None, file_name_hh='hhold', file_name_ind='indiv_ext'):\n",
310 | " self.country = country\n",
311 | " if country in ['A', 'B', 'C']:\n",
312 | " self.data_indiv_train = self.del_nonunique(\n",
313 | " pd.read_csv(\"../data/processed/{0}_{1}_train.csv\".format(country, file_name_ind), \n",
314 | " index_col='id'))\n",
315 | " self.data_indiv_test = self.del_nonunique(\n",
316 | " pd.read_csv(\"../data/processed/{0}_{1}_test.csv\".format(country, file_name_ind), \n",
317 | " index_col='id'))\n",
318 | " data_hh = Data()\n",
319 | " if data_hh.load(country, file_name=file_name_hh):\n",
320 | " self.country_df_train = data_hh.country_df_train\n",
321 | " self.country_df_test = data_hh.country_df_test\n",
322 | "\n",
323 | " self.categorical_list = data_hh.categorical_list\n",
324 | " \n",
325 | " self.country_df_train = self.country_df_train.join(self.data_indiv_train) \n",
326 | " self.country_df_test = self.country_df_test.join(self.data_indiv_test) \n",
327 | " self.col_common_list = sorted(list(set(self.country_df_train.columns).intersection(\n",
328 | " self.country_df_test.columns))) \n",
329 | " return True\n",
330 | " else:\n",
331 | " return False"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {
338 | "ExecuteTime": {
339 | "end_time": "2018-03-21T19:44:05.619495",
340 | "start_time": "2018-03-21T19:44:05.614755"
341 | },
342 | "code_folding": [],
343 | "collapsed": true
344 | },
345 | "outputs": [],
346 | "source": [
347 | "def combine_csv():\n",
348 | " data = DataInd()\n",
349 | " data_concat = DataConcat()\n",
350 | " for c in ['A']:\n",
351 | " data.load(c)\n",
352 | " data.save(ext='_ext', poor=False)\n",
353 | " data_concat.load(c)\n",
354 | " data_concat.save(ext='')"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": null,
360 | "metadata": {
361 | "ExecuteTime": {
362 | "end_time": "2018-03-20T12:48:24.432502",
363 | "start_time": "2018-03-20T12:48:24.362500"
364 | },
365 | "code_folding": [
366 | 0
367 | ],
368 | "collapsed": true
369 | },
370 | "outputs": [],
371 | "source": [
372 | "class predict_model:\n",
373 | "    \"\"\"Common base for the classifier wrappers: parameter storage, data loading, prediction.\"\"\"\n",
374 | "\n",
375 | "    def __init__(self, name='predict_model', categ_conv=True):\n",
376 | "        \"\"\"Set defaults; `categ_conv` enables encoding of frames passed to train()/predict().\"\"\"\n",
377 | "        self.params = {}\n",
378 | "        self.exclude_list = []\n",
379 | "        self.name = name\n",
380 | "        self.random = 1\n",
381 | "        self.classifier = None\n",
382 | "        self.categ_conv = categ_conv\n",
383 | "        self.data_df = {}\n",
384 | "\n",
385 | "    def set_params(self, params=None):\n",
386 | "        \"\"\"Store classifier keyword arguments; a falsy value resets them to {}.\"\"\"\n",
387 | "        self.params = params if params else {}\n",
388 | "\n",
389 | "    def set_random_seed(self, random=1):\n",
390 | "        self.random = random\n",
391 | "\n",
392 | "    def load_data(self, data, balance=False):\n",
393 | "        \"\"\"Pull train/target/test frames from `data` and integer-encode the categorical columns.\"\"\"\n",
394 | "        self.data = data\n",
395 | "        self.data_df['train'], self.data_df['y'] = self.data.get_train(balance=balance)\n",
396 | "        self.data_df['test'] = self.data.get_test()\n",
397 | "        self.category_cols = self.data.get_cat_list()\n",
398 | "        # NOTE(review): train and test are encoded independently, so equal codes may stand\n",
399 | "        # for different raw categories when their value sets differ - confirm this is intended.\n",
400 | "        for header in self.category_cols:\n",
401 | "            for part in ('train', 'test'):\n",
402 | "                frame = self.data_df[part]\n",
403 | "                frame[header] = frame[header].astype('category').cat.codes\n",
404 | "        return True\n",
405 | "\n",
406 | "    def get_train(self):\n",
407 | "        return self.data_df['train']\n",
408 | "\n",
409 | "    def get_y(self):\n",
410 | "        return self.data_df['y']\n",
411 | "\n",
412 | "    def get_test(self):\n",
413 | "        return self.data_df['test']\n",
414 | "\n",
415 | "    def set_exclude_list(self, exclude_list):\n",
416 | "        # Copy so later changes to the caller's list do not leak into the model.\n",
417 | "        self.exclude_list = exclude_list.copy()\n",
418 | "\n",
419 | "    @abstractmethod\n",
420 | "    def get_feature_importances(self):\n",
421 | "        \"\"\"Return per-feature importances of the fitted classifier (subclass hook).\"\"\"\n",
422 | "\n",
423 | "    @abstractmethod\n",
424 | "    def train(self, x_train=None, y_train=None):\n",
425 | "        \"\"\"Fit the classifier (subclass hook).\"\"\"\n",
426 | "\n",
427 | "    def predict(self, test=None):\n",
428 | "        \"\"\"Return a frame with `country` and `poor` columns, or None if no classifier is set.\n",
429 | "\n",
430 | "        The loaded test set is used unless `test` is a DataFrame. A passed frame is\n",
431 | "        copied before encoding so the caller's data is left untouched.\n",
432 | "        \"\"\"\n",
433 | "        if not self.classifier:\n",
434 | "            print('error: classifier not defined')\n",
435 | "            return None\n",
436 | "        if not isinstance(test, pd.DataFrame):\n",
437 | "            test = self.get_test()\n",
438 | "        elif self.categ_conv:\n",
439 | "            test = test.copy()\n",
440 | "            for header in (x for x in self.category_cols if x in test.columns):\n",
441 | "                test[header] = test[header].astype('category').cat.codes\n",
442 | "        test = test.drop([x for x in self.exclude_list if x in test.columns], axis=1)\n",
443 | "        res = pd.DataFrame(index=test.index)\n",
444 | "        res['country'] = self.data.country\n",
445 | "        # Column 1 of predict_proba is stored as the `poor` score.\n",
446 | "        res['poor'] = self.classifier.predict_proba(test)[:, 1]\n",
447 | "        return res"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {
454 | "ExecuteTime": {
455 | "end_time": "2018-03-20T12:48:24.747505",
456 | "start_time": "2018-03-20T12:48:24.698358"
457 | },
458 | "code_folding": [
459 | 0
460 | ],
461 | "collapsed": true
462 | },
463 | "outputs": [],
464 | "source": [
465 | "class CB_model(predict_model):\n",
466 | "    \"\"\"CatBoost wrapper: class-weighted classifier fed positional categorical features.\"\"\"\n",
467 | "\n",
468 | "    def __init__(self, name='catboost', categ_conv=True):\n",
469 | "        super().__init__(name=name, categ_conv=categ_conv)\n",
470 | "\n",
471 | "    def load_data(self, data, balance=False):\n",
472 | "        \"\"\"Load data, then build the classifier with 'balanced' class weights.\"\"\"\n",
473 | "        if not super().load_data(data, balance):\n",
474 | "            return False\n",
475 | "        c_w = class_weight.compute_class_weight(class_weight='balanced',\n",
476 | "                                                classes=np.unique(self.data_df['y']),\n",
477 | "                                                y=self.data_df['y'])\n",
478 | "        print('class_weight: ', c_w)\n",
479 | "        self.classifier = CatBoostClassifier(**self.params, class_weights=c_w)\n",
480 | "        return True\n",
481 | "\n",
482 | "    def train(self, x_train=None, y_train=None):\n",
483 | "        \"\"\"Fit on the given frame/target, defaulting to the loaded training data.\"\"\"\n",
484 | "        if not isinstance(x_train, pd.DataFrame):\n",
485 | "            x_train = self.get_train()\n",
486 | "        elif self.categ_conv:\n",
487 | "            x_train = x_train.copy()  # encode a copy, not the caller's frame\n",
488 | "            for header in (x for x in self.category_cols if x in x_train.columns):\n",
489 | "                x_train[header] = x_train[header].astype('category').cat.codes\n",
490 | "        if not isinstance(y_train, pd.Series):\n",
491 | "            y_train = self.get_y()\n",
492 | "\n",
493 | "        x_train = x_train.drop([x for x in self.exclude_list if x in x_train.columns], axis=1)\n",
494 | "        self.category_cols = [x for x in self.category_cols if x not in self.exclude_list]\n",
495 | "        # Categorical features are passed as column positions; skip columns absent from the frame.\n",
496 | "        cat_dims = [x_train.columns.get_loc(i) for i in self.category_cols if i in x_train.columns]\n",
497 | "        print(x_train.shape, y_train.shape, len(self.category_cols))\n",
498 | "        self.classifier.fit(x_train, y_train, cat_features=cat_dims)\n",
499 | "        return self.classifier\n",
500 | "\n",
501 | "    def get_feature_importances(self):\n",
502 | "        # Public attribute, mirroring XGB_model/LGBM_model (was the private _feature_importance).\n",
503 | "        return self.classifier.feature_importances_"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": null,
509 | "metadata": {
510 | "ExecuteTime": {
511 | "end_time": "2018-03-20T12:48:25.165680",
512 | "start_time": "2018-03-20T12:48:25.137715"
513 | },
514 | "code_folding": [
515 | 0
516 | ],
517 | "collapsed": true
518 | },
519 | "outputs": [],
520 | "source": [
521 | "class XGB_model(predict_model):\n",
522 | "    \"\"\"XGBoost wrapper; class imbalance is handled through scale_pos_weight.\"\"\"\n",
523 | "\n",
524 | "    def __init__(self, name='xgboost', categ_conv=True):\n",
525 | "        super().__init__(name=name, categ_conv=categ_conv)\n",
526 | "\n",
527 | "    def load_data(self, data, balance=False):\n",
528 | "        \"\"\"Load data, then build the classifier with scale_pos_weight = (rows - sum(y)) / sum(y).\"\"\"\n",
529 | "        if not super().load_data(data, balance):\n",
530 | "            return False\n",
531 | "        positives = self.data_df['y'].sum()\n",
532 | "        if positives:  # without positives the ratio would be a division by zero\n",
533 | "            self.params['scale_pos_weight'] = (self.data_df['y'].shape[0] - positives) / positives\n",
534 | "        self.classifier = xgb.XGBClassifier(**self.params)\n",
535 | "        return True\n",
536 | "\n",
537 | "    def train(self, x_train=None, y_train=None):\n",
538 | "        \"\"\"Fit on the given frame/target, defaulting to the loaded training data.\"\"\"\n",
539 | "        if not isinstance(x_train, pd.DataFrame):\n",
540 | "            x_train = self.get_train()\n",
541 | "        elif self.categ_conv:\n",
542 | "            x_train = x_train.copy()  # encode a copy, not the caller's frame\n",
543 | "            for header in (x for x in self.category_cols if x in x_train.columns):\n",
544 | "                x_train[header] = x_train[header].astype('category').cat.codes\n",
545 | "        if not isinstance(y_train, pd.Series):\n",
546 | "            y_train = self.get_y()\n",
547 | "\n",
548 | "        x_train = x_train.drop([x for x in self.exclude_list if x in x_train.columns], axis=1)\n",
549 | "        print('x_train shape: ', x_train.shape)\n",
550 | "        self.classifier.fit(x_train, y_train)\n",
551 | "        return self.classifier\n",
552 | "\n",
553 | "    def get_feature_importances(self):\n",
554 | "        \"\"\"Return the classifier's feature_importances_ attribute.\"\"\"\n",
555 | "        return self.classifier.feature_importances_"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": null,
561 | "metadata": {
562 | "ExecuteTime": {
563 | "end_time": "2018-03-20T12:48:25.591673",
564 | "start_time": "2018-03-20T12:48:25.564906"
565 | },
566 | "code_folding": [
567 | 0
568 | ],
569 | "collapsed": true
570 | },
571 | "outputs": [],
572 | "source": [
573 | "class LGBM_model(predict_model):\n",
574 | "    \"\"\"LightGBM wrapper around lgb.LGBMClassifier.\"\"\"\n",
575 | "\n",
576 | "    def __init__(self, name='lightgbm', categ_conv=True):\n",
577 | "        super().__init__(name=name, categ_conv=categ_conv)\n",
578 | "\n",
579 | "    def load_data(self, data, balance=False):\n",
580 | "        \"\"\"Load data, then build the classifier from the stored params.\"\"\"\n",
581 | "        if not super().load_data(data, balance):\n",
582 | "            return False\n",
583 | "        self.classifier = lgb.LGBMClassifier(**self.params)\n",
584 | "        return True\n",
585 | "\n",
586 | "    def train(self, x_train=None, y_train=None):\n",
587 | "        \"\"\"Fit on the given frame/target, defaulting to the loaded training data.\"\"\"\n",
588 | "        if not isinstance(x_train, pd.DataFrame):\n",
589 | "            x_train = self.get_train()\n",
590 | "        elif self.categ_conv:\n",
591 | "            x_train = x_train.copy()  # encode a copy, not the caller's frame\n",
592 | "            for header in (x for x in self.category_cols if x in x_train.columns):\n",
593 | "                x_train[header] = x_train[header].astype('category').cat.codes\n",
594 | "        if not isinstance(y_train, pd.Series):\n",
595 | "            y_train = self.get_y()\n",
596 | "\n",
597 | "        x_train = x_train.drop([x for x in self.exclude_list if x in x_train.columns], axis=1)\n",
598 | "        print('x_train shape: ', x_train.shape)\n",
599 | "        # Excluded columns are removed from the stored categorical list as well.\n",
600 | "        self.category_cols = [x for x in self.category_cols if x not in self.exclude_list]\n",
601 | "        self.classifier.fit(x_train, y_train, verbose=False)\n",
602 | "        return self.classifier\n",
603 | "\n",
604 | "    def get_feature_importances(self):\n",
605 | "        \"\"\"Return the classifier's feature_importances_ attribute.\"\"\"\n",
606 | "        return self.classifier.feature_importances_"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": null,
612 | "metadata": {
613 | "ExecuteTime": {
614 | "end_time": "2018-03-20T13:45:52.487688",
615 | "start_time": "2018-03-20T13:45:52.362851"
616 | },
617 | "code_folding": [
618 | 0
619 | ],
620 | "collapsed": true
621 | },
622 | "outputs": [],
623 | "source": [
624 | "class processing:\n",
625 | "    \"\"\"Per-country pipeline: feature-exclusion search, training and submission output.\"\"\"\n",
626 | "\n",
627 | "    def __init__(self, countries=None, balances=None):\n",
628 | "        self.countries = ['A', 'B', 'C'] if countries is None else countries  # fresh list per instance\n",
629 | "        self.balances = {c: False for c in self.countries} if balances is None else balances\n",
630 | "        self.exclude_dict = {'A': [], 'B': [], 'C': []}\n",
631 | "        for c in self.countries:\n",
632 | "            self.exclude_dict.setdefault(c, [])\n",
633 | "        self.data_dict = None\n",
634 | "        self.model_dict = None\n",
635 | "        self.vote_waights_dict = None\n",
636 | "\n",
637 | "    def set_data_dict(self, data_dict):\n",
638 | "        self.data_dict = data_dict\n",
639 | "\n",
640 | "    def set_model_dict(self, model_dict):\n",
641 | "        self.model_dict = model_dict\n",
642 | "\n",
643 | "    def set_exclude_dict(self, exclude_dict):\n",
644 | "        self.exclude_dict = exclude_dict\n",
645 | "\n",
646 | "    def save_csv(self, df, clf_model_name='_', path=''):\n",
647 | "        \"\"\"Write `df` to a timestamped submission CSV under `path` and print its head.\"\"\"\n",
648 | "        stamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n",
649 | "        submission_file = os.path.join(path, 'submission_{0}_{1}.csv'.format(clf_model_name, stamp))\n",
650 | "        print('submission file:', submission_file)\n",
651 | "        df.to_csv(submission_file, index=True, float_format='%.4f')\n",
652 | "        print(df.head())\n",
653 | "\n",
654 | "    def find_exclude(self, e_count=5):\n",
655 | "        \"\"\"Iteratively drop zero-importance features, keeping the best-scoring list per country.\n",
656 | "\n",
657 | "        Rounds are scored by mean log loss over `e_count` splits. Returns the last country's\n",
658 | "        {logloss: exclude_list} dict ({} without countries), or None without models/data.\n",
659 | "        \"\"\"\n",
660 | "        if not self.model_dict or not self.data_dict:\n",
661 | "            print('Stoped: no models or data')\n",
662 | "            return None\n",
663 | "        logloss_dict = {}  # defined up front so an empty country list cannot raise NameError\n",
664 | "        for c in self.countries:\n",
665 | "            self.data_dict[c].load(c)\n",
666 | "            model = self.model_dict[c]\n",
667 | "            model.load_data(data=self.data_dict[c], balance=self.balances[c])\n",
668 | "            exclude_list, finish = [], False\n",
669 | "            logloss_dict = {}\n",
670 | "            while not finish:\n",
671 | "                model.set_exclude_list(exclude_list)\n",
672 | "                model.train()\n",
673 | "                columns = [x for x in model.get_train().columns if x not in exclude_list]\n",
674 | "                zero_cols = [x for (x, y) in zip(columns, model.get_feature_importances()) if y == 0]\n",
675 | "                finish = not zero_cols  # last round: nothing new to drop, but still score it\n",
676 | "                exclude_list = exclude_list + zero_cols\n",
677 | "                logloss_iter = []\n",
678 | "                splits = model.data.get_train_valid(n_splits=e_count, balance=self.balances[c])\n",
679 | "                for i in range(e_count):\n",
680 | "                    model.set_random_seed(i)\n",
681 | "                    train, valid = splits[i]\n",
682 | "                    model.set_exclude_list(exclude_list)\n",
683 | "                    model.train(train[0], train[1])\n",
684 | "                    pred = model.predict(valid[0])\n",
685 | "                    logloss_iter.append(log_loss(valid[1].astype(int), pred['poor']))\n",
686 | "                logloss = np.mean(logloss_iter)\n",
687 | "                logloss_dict[logloss] = exclude_list\n",
688 | "                print('loglos: {0} exclude length: {1}'.format(logloss, len(exclude_list)))\n",
689 | "            self.exclude_dict[c] = logloss_dict[min(logloss_dict)]\n",
690 | "            print('Country: {0} exclude length: {1}'.format(c, len(self.exclude_dict.get(c))))\n",
691 | "        return logloss_dict\n",
692 | "\n",
693 | "    def predict(self):\n",
694 | "        \"\"\"Train each country model; save (named after the first country's model) and return all predictions.\"\"\"\n",
695 | "        if not self.model_dict or not self.data_dict:\n",
696 | "            print('Stoped: no models or data')\n",
697 | "            return None\n",
698 | "        predictions = []\n",
699 | "        for c in self.countries:\n",
700 | "            self.data_dict[c].load(c)\n",
701 | "            model = self.model_dict[c]\n",
702 | "            model.load_data(data=self.data_dict[c], balance=self.balances[c])\n",
703 | "            model.set_exclude_list(self.exclude_dict[c])\n",
704 | "            if self.vote_waights_dict:\n",
705 | "                model.set_weights(self.vote_waights_dict[c])\n",
706 | "            print('exclude: \\n', self.exclude_dict[c])\n",
707 | "            model.train()\n",
708 | "            predictions.append(model.predict())\n",
709 | "        result = pd.concat(predictions)\n",
710 | "        self.save_csv(result, clf_model_name=self.model_dict[self.countries[0]].name, path='../models/')\n",
711 | "        return result"
712 | ]
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": null,
717 | "metadata": {
718 | "ExecuteTime": {
719 | "end_time": "2018-03-21T19:47:54.622674",
720 | "start_time": "2018-03-21T19:44:11.601545"
721 | },
722 | "collapsed": false,
723 | "scrolled": true
724 | },
725 | "outputs": [],
726 | "source": [
727 | "#creating data sets\n",
728 | "combine_csv()"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": null,
734 | "metadata": {
735 | "ExecuteTime": {
736 | "end_time": "2018-03-20T18:35:01.475566",
737 | "start_time": "2018-03-20T18:34:01.697339"
738 | },
739 | "code_folding": [
740 | 17,
741 | 27,
742 | 37,
743 | 64,
744 | 75,
745 | 86,
746 | 110,
747 | 125,
748 | 140
749 | ],
750 | "collapsed": false
751 | },
752 | "outputs": [],
753 | "source": [
754 | "data_1 = Data()\n",
755 | "data_2 = DataConcat()\n",
756 | "data_dict = {'A': data_1, 'B': data_2,'C': data_2}\n",
757 | "balances={'A':False, 'B':False, 'C':True}\n",
758 | "\n",
759 | "exclude_CB_dict = {'A': ['A_0', 'A_10', 'A_106', 'A_113', 'A_114', 'A_115', 'A_120', 'A_138', 'A_15', 'A_173', 'A_174', 'A_175', 'A_181', 'A_185', 'A_191', 'A_195', 'A_202', 'A_206', 'A_215', 'A_216', 'A_218', 'A_223', 'A_245', 'A_250', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_263', 'A_272', 'A_277', 'A_295', 'A_299', 'A_308', 'A_309', 'A_32', 'A_33', 'A_330', 'A_39', 'A_43', 'A_44', 'A_57', 'A_59', 'A_63', 'A_69', 'A_6_1', 'A_70', 'A_72', 'A_77', 'A_81', 'A_88', 'A_89', 'A_93', 'cat_n_A_10', 'cat_n_A_15', 'cat_n_A_20', 'cat_n_A_22', 'cat_n_A_25', 'cat_n_A_33', 'cat_n_A_35', 'cat_n_A_39', 'cat_n_A_4', 'cat_n_A_5', 'cat_n_A_6', 'cat_n_A_8', 'cat_n_A_9', 'A_101', 'A_11', 'A_147', 'A_155', 'A_170', 'A_18', 'A_203', 'A_338', 'A_35', 'A_49', 'A_67', 'cat_n_A_11', 'cat_n_A_21', 'cat_n_A_30', 'cat_n_A_37', 'div_cat_iid_cat_n_A_16', 'A_105', 'A_14', 'A_149', 'A_197', 'A_26', 'A_261', 'A_302', 'A_312', 'A_319', 'A_328', 'A_32_1', 'A_341', 'A_9', 'cat_n_A_28', 'div_cat_iid_cat_n_A_32', 'A_121', 'A_125', 'A_131', 'A_161', 'A_17', 'A_192', 'A_229', 'A_259', 'A_60', 'A_80', 'cat_n_A_1', 'cat_n_A_29', 'A_13', 'A_134', 'A_176', 'A_182', 'A_213', 'A_22', 'A_267', 'A_301', 'A_31', 'cat_n_A_18', 'div_cat_iid_cat_n_A_38', 'A_146', 'A_162', 'A_27', 'A_152', 'A_189', 'A_292', 'A_3', 'A_65'],\n",
760 | " 'B': ['B_1', 'B_106_0', 'B_106_1', 'B_107_1', 'B_113', 'B_121', 'B_123_1', 'B_139', 'B_144_0', 'B_144_1', 'B_152_0', 'B_157_0', 'B_157_1', 'B_159_0', 'B_159_1', 'B_15_0', 'B_15_1', 'B_161_0', 'B_161_1', 'B_167', 'B_174_0', 'B_174_1', 'B_175_0', 'B_176', 'B_18', 'B_180_0', 'B_183', 'B_188_0', 'B_188_1', 'B_196', 'B_196_1', 'B_198_0', 'B_198_1', 'B_20', 'B_203', 'B_204', 'B_205', 'B_207', 'B_208', 'B_20_0', 'B_20_1', 'B_210_0', 'B_210_1', 'B_218_0', 'B_219', 'B_219_1', 'B_222_1', 'B_227', 'B_238', 'B_243', 'B_244', 'B_256', 'B_258', 'B_264', 'B_265', 'B_272', 'B_274', 'B_29', 'B_3', 'B_303', 'B_307', 'B_316', 'B_320', 'B_329', 'B_349', 'B_34_0', 'B_34_1', 'B_35', 'B_355', 'B_35_0', 'B_35_1', 'B_361', 'B_36_0', 'B_36_1', 'B_370', 'B_371', 'B_385', 'B_389', 'B_405', 'B_407', 'B_412', 'B_422', 'B_46_0', 'B_46_1', 'B_5', 'B_55', 'B_60_0', 'B_60_1', 'B_68_0', 'B_68_1', 'B_6_0', 'B_71_1', 'B_72', 'B_77', 'B_83', 'B_8_1', 'cat_n_B_0', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_100', 'cat_n_B_101', 'cat_n_B_102', 'cat_n_B_103', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_119', 'cat_n_B_12', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_131', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_135', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_14', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_143', 'cat_n_B_145', 'cat_n_B_146', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_15', 'cat_n_B_150', 'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_156', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_162', 'cat_n_B_163', 'cat_n_B_164', 'cat_n_B_165', 
'cat_n_B_166', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_173', 'cat_n_B_174', 'cat_n_B_175', 'cat_n_B_176', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_186', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_191', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_2', 'cat_n_B_20', 'cat_n_B_200', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_22', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_222', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_40', 'cat_n_B_41', 'cat_n_B_42', 'cat_n_B_43', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_46', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_53', 'cat_n_B_54', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_57', 'cat_n_B_58', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_61', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_66', 'cat_n_B_67', 'cat_n_B_68', 'cat_n_B_69', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_73', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_79', 'cat_n_B_8', 'cat_n_B_80', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_84', 'cat_n_B_86', 'cat_n_B_87', 'cat_n_B_88', 'cat_n_B_89', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_91', 'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_96', 'cat_n_B_97', 'cat_n_B_98', 'cat_n_B_99', 
'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_10', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_103', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_106', 'div_cat_iid_cat_n_B_107', 'div_cat_iid_cat_n_B_108', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_115', 'div_cat_iid_cat_n_B_117', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_123', 'div_cat_iid_cat_n_B_127', 'div_cat_iid_cat_n_B_129', 'div_cat_iid_cat_n_B_13', 'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_133', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_135', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_140', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_146', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_149', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_153', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_156', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_162', 'div_cat_iid_cat_n_B_165', 'div_cat_iid_cat_n_B_166', 'div_cat_iid_cat_n_B_168', 'div_cat_iid_cat_n_B_173', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_182', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_194', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_198', 'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_201', 
'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_203', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_205', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_212', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_218', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_44', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_46', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_48', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_53', 'div_cat_iid_cat_n_B_57', 'div_cat_iid_cat_n_B_58', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_66', 'div_cat_iid_cat_n_B_68', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_73', 'div_cat_iid_cat_n_B_74', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_79', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_87', 'div_cat_iid_cat_n_B_88', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_91', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_93', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_106', 'sum_B_123', 'sum_B_144', 'sum_B_157', 'sum_B_159', 'sum_B_161', 'sum_B_174', 'sum_B_180', 'sum_B_188', 'sum_B_19', 'sum_B_198', 'sum_B_20', 'sum_B_36', 'sum_B_6', 'B_11', 'B_127', 'B_173', 'B_180_1', 'B_196_0', 'B_19_0', 'B_206', 'B_219_0', 'B_221', 'B_269', 'B_280', 
'B_287', 'B_314', 'B_328', 'B_334', 'B_337', 'B_397', 'B_400', 'B_402', 'B_413', 'B_418', 'B_45', 'B_71', 'B_71_0', 'B_80', 'B_8_0', 'cat_n_B_144', 'cat_n_B_155', 'cat_n_B_17', 'cat_n_B_182', 'cat_n_B_219', 'cat_n_B_37', 'cat_n_B_74', 'cat_n_B_81', 'cat_n_B_85', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_144', 'div_cat_iid_cat_n_B_15', 'div_cat_iid_cat_n_B_150', 'div_cat_iid_cat_n_B_163', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_176', 'div_cat_iid_cat_n_B_19', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_22', 'div_cat_iid_cat_n_B_24', 'div_cat_iid_cat_n_B_25', 'div_cat_iid_cat_n_B_28', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_5', 'div_cat_iid_cat_n_B_54', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_8', 'div_cat_iid_cat_n_B_80', 'div_cat_iid_cat_n_B_96', 'div_cat_iid_cat_n_B_98', 'sum_B_107', 'sum_B_175', 'sum_B_196', 'sum_B_60', 'sum_B_68', 'B_140', 'B_142', 'B_160', 'B_239', 'B_302', 'B_352', 'B_353', 'B_366', 'B_372', 'B_386', 'B_392', 'B_420', 'B_97_1', 'cat_n_B_109', 'cat_n_B_35', 'cat_n_B_6', 'div_cat_iid_cat_n_B_101', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_175', 'div_cat_iid_cat_n_B_183', 'div_cat_iid_cat_n_B_185', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_4', 'div_cat_iid_cat_n_B_41', 'div_cat_iid_cat_n_B_89', 'sum_B_35', 'sum_B_46', 'B_107', 'B_107_0', 'B_123_0', 'B_147', 'B_161', 'B_175_1', 'B_248', 'B_250', 'B_251', 'B_317', 'B_33', 'B_356', 'B_64', 'B_86', 'div_cat_iid_cat_n_B_17', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_214', 'sum_B_71', 'B_112', 'B_120', 'B_132_1', 'B_19_1', 'B_236', 'B_427', 'B_57', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_97', 'B_115', 'B_12', 'B_141', 'B_180', 'B_222_0', 'B_230', 'B_241', 'B_266', 'B_288', 'B_312', 'B_335', 'B_394', 'B_79', 'B_95', 'B_99', 'cat_n_B_132', 'div_cat_iid_cat_n_B_100', 
'div_cat_iid_cat_n_B_164', 'div_cat_iid_cat_n_B_200', 'B_129', 'B_6_1', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_155', 'div_cat_iid_cat_n_B_43', 'sum_B_210', 'B_126', 'B_21', 'B_339', 'B_65', 'div_cat_iid_cat_n_B_125', 'sum_B_132', 'sum_B_219', 'B_128', 'B_8', 'div_cat_iid_cat_n_B_130', 'sum_B_222', 'B_191', 'B_30', 'B_4', 'sum_B_8', 'B_275', 'B_290', 'div_cat_iid_cat_n_B_195', 'B_325', 'B_63', 'B_157', 'B_260', 'B_423', 'B_91', 'div_cat_iid_cat_n_B_37', 'div_cat_iid_cat_n_B_55', 'B_430', 'div_cat_iid_cat_n_B_75', 'B_395', 'B_73', 'B_0', 'div_cat_iid_cat_n_B_86', 'B_23', 'B_268', 'B_27', 'B_306', 'B_348', 'B_6', 'B_92', 'div_cat_iid_cat_n_B_222', 'B_168', 'div_cat_iid_cat_n_B_56', 'B_318', 'B_340', 'B_301', 'B_164', 'B_271', 'B_417', 'B_111', 'B_285', 'B_350', 'B_187', 'B_246', 'B_401', 'B_89'],\n",
761 | " 'C': ['C_118', 'C_135', 'C_17_0', 'C_39', 'C_55', 'C_7', 'C_89', 'C_91', 'cat_n_C_0', 'cat_n_C_1', 'cat_n_C_10', 'cat_n_C_18', 'cat_n_C_21', 'cat_n_C_22', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_28', 'cat_n_C_3', 'cat_n_C_32', 'cat_n_C_37', 'cat_n_C_4', 'cat_n_C_40', 'cat_n_C_5', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_39', 'iid_cnt', 'C_14_1', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_2', 'cat_n_C_23', 'cat_n_C_27', 'cat_n_C_30', 'cat_n_C_38', 'cat_n_C_9', 'div_cat_iid_cat_n_C_26', 'C_129', 'C_57', 'C_76', 'cat_n_C_17', 'cat_n_C_20', 'cat_n_C_19', 'cat_n_C_6', 'div_cat_iid_cat_n_C_33', 'C_10_0', 'C_146', 'C_46', 'cat_n_C_39', 'div_cat_iid_cat_n_C_17']}\n",
762 | "exclude_XGB_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_106', 'A_11', 'A_113', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_140', 'A_146', 'A_147', 'A_148', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_17', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_179', 'A_18', 'A_181', 'A_185', 'A_186', 'A_191', 'A_195', 'A_197', 'A_2', 'A_202', 'A_203', 'A_206', 'A_213', 'A_215', 'A_216', 'A_218', 'A_219', 'A_22', 'A_223', 'A_225', 'A_226', 'A_227', 'A_232', 'A_234', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_277', 'A_282', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_302', 'A_305', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_315', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_45', 'A_46', 'A_49', 'A_57', 'A_59', 'A_60', 'A_61', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_76', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93', 'A_97', 'cat_n_A_25', 'cat_n_A_3', 'cat_n_A_36', 'cat_n_A_4', 'iid_cnt', 'A_105', 'A_114', 'A_229', 'cat_n_A_20', 'div_cat_iid_cat_n_A_25', 'A_14', 'A_6_1', 'cat_n_A_39'],\n",
763 | " 'B': ['B_0', 'B_1', 'B_106', 'B_109', 'B_112', 'B_12', 'B_120', 'B_121', 'B_128', 'B_135', 'B_14', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_145', 'B_148', 'B_149', 'B_151', 'B_152', 'B_152_1', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_15_1', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_167', 'B_17', 'B_172', 'B_173', 'B_174_1', 'B_175_0', 'B_175_1', 'B_176', 'B_18', 'B_180_1', 'B_187', 'B_188', 'B_188_1', 'B_191', 'B_196', 'B_196_0', 'B_196_1', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_20_1', 'B_210', 'B_210_0', 'B_210_1', 'B_211', 'B_212', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_23', 'B_230', 'B_234', 'B_237', 'B_238', 'B_239', 'B_24', 'B_241', 'B_243', 'B_244', 'B_247', 'B_248', 'B_250', 'B_251', 'B_252', 'B_254', 'B_256', 'B_259', 'B_260', 'B_264', 'B_265', 'B_266', 'B_269', 'B_271', 'B_275', 'B_278', 'B_279', 'B_28', 'B_284', 'B_29', 'B_3', 'B_302', 'B_303', 'B_304', 'B_307', 'B_313', 'B_314', 'B_320', 'B_334', 'B_337', 'B_340', 'B_342', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_353', 'B_354', 'B_355', 'B_356', 'B_359', 'B_35_0', 'B_35_1', 'B_36', 'B_361', 'B_362', 'B_363', 'B_364', 'B_365', 'B_366', 'B_368', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_375', 'B_379', 'B_385', 'B_386', 'B_389', 'B_390', 'B_391', 'B_392', 'B_394', 'B_395', 'B_397', 'B_400', 'B_402', 'B_405', 'B_406', 'B_407', 'B_41', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_423', 'B_427', 'B_428', 'B_44', 'B_47', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_0', 'B_60_1', 'B_61', 'B_62', 'B_63', 'B_64', 'B_65', 'B_66', 'B_67', 'B_68_0', 'B_68_1', 'B_7', 'B_71_1', 'B_72', 'B_76', 'B_80', 'B_83', 'B_86', 'B_89', 'B_8_0', 'B_8_1', 'B_9', 'B_94', 'B_95', 'B_96', 'B_99', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_102', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_111', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_120', 
'cat_n_B_122', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_165', 'cat_n_B_166', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_17', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_181', 'cat_n_B_182', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_35', 'cat_n_B_36', 'cat_n_B_37', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_42', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_49', 'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_8', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_86', 'cat_n_B_88', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_114', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_127', 
'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_216', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_43', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_75', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_77', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 
'sum_B_174', 'sum_B_188', 'B_10', 'B_101', 'B_104', 'B_107', 'B_11', 'B_111', 'B_116', 'B_123_0', 'B_156', 'B_159_1', 'B_164', 'B_170', 'B_171', 'B_174_0', 'B_182', 'B_192', 'B_194', 'B_19_0', 'B_216', 'B_223', 'B_224', 'B_229', 'B_235', 'B_25', 'B_272', 'B_282', 'B_283', 'B_288', 'B_290', 'B_293', 'B_297', 'B_317', 'B_318', 'B_322', 'B_325', 'B_343', 'B_352', 'B_373', 'B_384', 'B_403', 'B_51', 'B_68', 'B_73', 'B_92', 'cat_n_B_12', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_131', 'cat_n_B_132', 'cat_n_B_136', 'cat_n_B_159', 'cat_n_B_167', 'cat_n_B_19', 'cat_n_B_191', 'cat_n_B_194', 'cat_n_B_2', 'cat_n_B_200', 'cat_n_B_202', 'cat_n_B_207', 'cat_n_B_210', 'cat_n_B_217', 'cat_n_B_44', 'cat_n_B_59', 'cat_n_B_67', 'cat_n_B_75', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_91', 'cat_n_B_96', 'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_167', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_198', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_203', 'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_211', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_5', 'B_123_1', 'B_146', 'B_147', 'B_174', 'B_198_1', 'B_218', 'B_222_1', 'B_285', 'B_296', 'B_339', 'B_414', 'B_85', 'B_91', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_123', 'cat_n_B_151', 'cat_n_B_178', 'cat_n_B_180', 'cat_n_B_183', 'cat_n_B_195', 'cat_n_B_199', 'cat_n_B_29', 'cat_n_B_43', 'cat_n_B_48', 'cat_n_B_74', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_55', 'sum_B_35', 'B_103', 'B_107_1', 'B_123', 'B_155', 'B_178', 'B_183', 'B_2', 'B_233', 'B_268', 'B_270', 'B_295', 'B_319', 'B_321', 'B_328', 'B_33', 'B_360', 'B_382', 'B_383', 'B_387', 'B_388', 'B_46_0', 'B_75', 'cat_n_B_119', 'cat_n_B_128', 'cat_n_B_146', 
'cat_n_B_173', 'cat_n_B_40', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_128', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_64', 'sum_B_180', 'B_115', 'B_124', 'B_19', 'B_19_1', 'B_330', 'B_357', 'B_409', 'cat_n_B_103', 'cat_n_B_121', 'cat_n_B_164', 'cat_n_B_186', 'cat_n_B_54', 'cat_n_B_73', 'cat_n_B_80', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_44', 'B_163', 'B_165', 'B_180_0', 'B_236', 'B_277', 'B_292', 'B_329', 'B_34', 'B_46_1', 'cat_n_B_57', 'div_cat_iid_cat_n_B_130', 'div_cat_iid_cat_n_B_57'],\n",
764 | " 'C': ['C_1', 'C_10', 'C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_141', 'C_143', 'C_146', 'C_151', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_19', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_3', 'C_31', 'C_32', 'C_44', 'C_45', 'C_47', 'C_54', 'C_55', 'C_59', 'C_6', 'C_63', 'C_65', 'C_67', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_84', 'C_85', 'C_87', 'C_89', 'C_9', 'C_92', 'C_94', 'C_96', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_18', 'cat_n_C_20', 'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_32', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_13', 'div_cat_iid_cat_n_C_23', 'div_cat_iid_cat_n_C_26', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_32', 'div_cat_iid_cat_n_C_6', 'div_cat_iid_cat_n_C_7', 'iid_cnt', 'C_13', 'C_144', 'C_161', 'C_2', 'C_29', 'C_33', 'C_41', 'C_79', 'C_90', 'C_98', 'cat_n_C_25', 'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_37', 'cat_n_C_6', 'div_cat_iid_cat_n_C_0', 'div_cat_iid_cat_n_C_14', 'div_cat_iid_cat_n_C_20', 'C_145', 'C_60', 'C_69', 'cat_n_C_13', 'cat_n_C_40', 'div_cat_iid_cat_n_C_15', 'div_cat_iid_cat_n_C_5', 'C_142', 'C_50', 'C_62', 'C_103', 'C_121', 'C_24', 'C_30', 'C_39', 'C_40', 'C_112', 'C_123']}\n",
765 | "\n",
766 | "exclude_LGBM_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_105', 'A_106', 'A_11', 'A_112', 'A_113', 'A_115', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_141', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_18', 'A_181', 'A_185', 'A_191', 'A_195', 'A_197', 'A_202', 'A_203', 'A_206', 'A_215', 'A_216', 'A_218', 'A_219', 'A_223', 'A_225', 'A_232', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_275', 'A_282', 'A_292', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_338', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_46', 'A_47', 'A_49', 'A_57', 'A_59', 'A_60', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93'],\n",
767 | " 'B': ['B_0', 'B_1', 'B_106', 'B_106_0', 'B_107', 'B_11', 'B_115', 'B_120', 'B_121', 'B_128', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_151', 'B_152', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_159_1', 'B_15_0', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_164', 'B_165', 'B_167', 'B_17', 'B_172', 'B_174', 'B_174_0', 'B_174_1', 'B_176', 'B_18', 'B_180_0', 'B_187', 'B_188', 'B_188_1', 'B_191', 'B_194', 'B_196', 'B_196_0', 'B_196_1', 'B_19_0', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_210_0', 'B_210_1', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_229', 'B_23', 'B_230', 'B_234', 'B_236', 'B_238', 'B_24', 'B_241', 'B_242', 'B_243', 'B_244', 'B_247', 'B_25', 'B_250', 'B_254', 'B_256', 'B_264', 'B_266', 'B_269', 'B_271', 'B_272', 'B_275', 'B_279', 'B_283', 'B_284', 'B_288', 'B_29', 'B_293', 'B_296', 'B_3', 'B_302', 'B_303', 'B_307', 'B_314', 'B_317', 'B_318', 'B_325', 'B_329', 'B_330', 'B_334', 'B_337', 'B_340', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_354', 'B_355', 'B_356', 'B_35_0', 'B_35_1', 'B_36', 'B_361', 'B_366', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_373', 'B_385', 'B_386', 'B_389', 'B_390', 'B_394', 'B_397', 'B_399', 'B_400', 'B_402', 'B_405', 'B_406', 'B_407', 'B_408', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_427', 'B_428', 'B_432', 'B_436', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_1', 'B_63', 'B_64', 'B_65', 'B_67', 'B_68_0', 'B_71_0', 'B_72', 'B_73', 'B_75', 'B_80', 'B_83', 'B_89', 'B_8_0', 'B_9', 'B_91', 'B_94', 'B_95', 'B_99', 'cat_n_B_1', 'cat_n_B_102', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_119', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_131', 'cat_n_B_134', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 
'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 'cat_n_B_184', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_213', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_42', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_64', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_174', 
'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_55', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 'sum_B_188'],\n",
768 | " 'C': ['C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_121', 'C_123', 'C_125', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_140', 'C_141', 'C_143', 'C_146', 'C_150', 'C_151', 'C_152', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_18', 'C_19', 'C_2', 'C_20', 'C_21', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_29', 'C_3', 'C_32', 'C_33', 'C_39', 'C_40', 'C_41', 'C_54', 'C_55', 'C_59', 'C_62', 'C_63', 'C_64', 'C_65', 'C_67', 'C_69', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_82', 'C_84', 'C_85', 'C_87', 'C_9', 'C_90', 'C_92', 'C_94', 'C_96', 'C_98', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_17', 'cat_n_C_18', 'cat_n_C_2', 'cat_n_C_20', 'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_30', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_2', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_4', 'div_cat_iid_cat_n_C_40', 'div_cat_iid_cat_n_C_7', 'iid_cnt']\n",
769 | " }\n",
770 | "\n",
771 | "params_CB_A = dict(  # CB_model settings for country A\n",
772 | "    iterations=5000,\n",
773 | "    learning_rate=0.03,\n",
774 | "    depth=6,\n",
775 | "    l2_leaf_reg=3,\n",
776 | "    loss_function='Logloss',\n",
777 | "    random_seed=1,\n",
778 | "    logging_level='Silent',\n",
779 | ")\n",
780 | "\n",
781 | "params_CB_B = dict(  # CB_model settings for country B\n",
782 | "    iterations=5000,\n",
783 | "    learning_rate=0.03,\n",
784 | "    depth=6,\n",
785 | "    l2_leaf_reg=3,\n",
786 | "    loss_function='Logloss',\n",
787 | "    random_seed=1,\n",
788 | "    logging_level='Silent',\n",
789 | ")\n",
790 | "\n",
791 | "params_CB_C = dict(  # CB_model settings for country C (500 iterations, not 5000)\n",
792 | "    iterations=500,\n",
793 | "    learning_rate=0.03,\n",
794 | "    depth=6,\n",
795 | "    l2_leaf_reg=3,\n",
796 | "    loss_function='Logloss',\n",
797 | "    random_seed=1,\n",
798 | "    logging_level='Silent',\n",
799 | ")\n",
800 | "\n",
801 | "# One CB_model wrapper per country, each given its own parameter set\n",
802 | "model_cb_A, model_cb_B, model_cb_C = (CB_model(categ_conv=True) for _ in 'ABC')\n",
803 | "model_cb_A.set_params(params=params_CB_A)\n",
804 | "model_cb_B.set_params(params=params_CB_B)\n",
805 | "model_cb_C.set_params(params=params_CB_C)\n",
806 | "\n",
807 | "# Country code -> configured model, the mapping handed to process.set_model_dict\n",
808 | "model_cb_dict = dict(zip('ABC', (model_cb_A, model_cb_B, model_cb_C)))\n",
809 | "\n",
810 | "process = processing(countries=['A', 'B', 'C'], balances=balances)\n",
811 | "process.set_data_dict(data_dict=data_dict)\n",
812 | "\n",
813 | "process.set_model_dict(model_dict=model_cb_dict)\n",
814 | "# Reuse exclude_CB_dict instead of recomputing exclusions with process.find_exclude()\n",
815 | "process.set_exclude_dict(exclude_CB_dict)\n",
816 | "result_cb = process.predict()  # required by the submission cell (index, country, poor)\n",
817 | "\n",
818 | "params_XGB_A = {  # XGB_model settings for country A\n",
819 | "    'learning_rate' : 0.03,\n",
820 | "    'max_depth' : 3,\n",
821 | "    'n_estimators' : 1500,\n",
822 | "    'silent' : True,\n",
823 | "    'objective' : 'binary:logistic',\n",
824 | "    'gamma' : 0.3,\n",
825 | "    'subsample' : 0.7,\n",
826 | "    'reg_alpha' : 0.05,\n",
827 | "}\n",
828 | "\n",
829 | "params_XGB_B = {  # XGB_model settings for country B\n",
830 | "    'learning_rate' : 0.03,\n",
831 | "    'max_depth' : 5,\n",
832 | "    'n_estimators' : 400,\n",
833 | "    'silent' : True,\n",
834 | "    'objective' : 'binary:logistic',\n",
835 | "    'gamma' : 0.2,\n",
836 | "    'subsample' : 0.7,\n",
837 | "    'reg_alpha' : 0.05,\n",
838 | "}\n",
839 | "\n",
840 | "params_XGB_C = {  # XGB_model settings for country C\n",
841 | "    'learning_rate' : 0.03,\n",
842 | "    'max_depth' : 3,\n",
843 | "    'n_estimators' : 500,\n",
844 | "    'silent' : True,\n",
845 | "    'objective' : 'binary:logistic',\n",
846 | "    'gamma' : 0.2,\n",
847 | "    'subsample' : 0.6,\n",
848 | "    'reg_alpha' : 0.05,\n",
849 | "}\n",
850 | "\n",
851 | "model_xgb_A = XGB_model(categ_conv=True)\n",
852 | "model_xgb_A.set_params(params=params_XGB_A)\n",
853 | "model_xgb_B = XGB_model(categ_conv=True)\n",
854 | "model_xgb_B.set_params(params=params_XGB_B)\n",
855 | "model_xgb_C = XGB_model(categ_conv=True)\n",
856 | "model_xgb_C.set_params(params=params_XGB_C)\n",
857 | "model_xgb_dict = {'A': model_xgb_A, 'B': model_xgb_B, 'C': model_xgb_C}\n",
858 | "\n",
859 | "process.set_model_dict(model_dict=model_xgb_dict)\n",
860 | "# Reuse exclude_XGB_dict instead of recomputing exclusions with process.find_exclude()\n",
861 | "process.set_exclude_dict(exclude_XGB_dict)\n",
862 | "result_xgb = process.predict()  # required by the submission cell (weighted blend)\n",
863 | "\n",
864 | "params_LGBM_A = dict(  # LGBM_model settings for country A\n",
865 | "    learning_rate=0.02,\n",
866 | "    max_depth=6,\n",
867 | "    n_estimators=942,\n",
868 | "    silent=True,\n",
869 | "    objective='binary',\n",
870 | "    gamma=0.3,\n",
871 | "    subsample=0.6,\n",
872 | "    reg_alpha=0.02,\n",
873 | "    is_unbalance=True,\n",
874 | "    boosting_type='gbdt',\n",
875 | "    reg_lambda=0.01,\n",
876 | "    random_state=1,\n",
877 | ")\n",
878 | "\n",
879 | "params_LGBM_B = dict(  # LGBM_model settings for country B\n",
880 | "    learning_rate=0.03,\n",
881 | "    max_depth=6,\n",
882 | "    n_estimators=232,\n",
883 | "    silent=True,\n",
884 | "    objective='binary',\n",
885 | "    gamma=0.3,\n",
886 | "    subsample=0.8,\n",
887 | "    reg_alpha=0.05,\n",
888 | "    is_unbalance=True,\n",
889 | "    boosting_type='gbdt',\n",
890 | "    reg_lambda=0.00,\n",
891 | "    random_state=1,\n",
892 | ")\n",
893 | "\n",
894 | "params_LGBM_C = dict(  # LGBM_model settings for country C\n",
895 | "    learning_rate=0.05,\n",
896 | "    max_depth=3,\n",
897 | "    n_estimators=520,\n",
898 | "    silent=True,\n",
899 | "    objective='binary',\n",
900 | "    gamma=0.3,\n",
901 | "    subsample=0.7,\n",
902 | "    reg_alpha=0.05,\n",
903 | "    is_unbalance=True,\n",
904 | "    boosting_type='gbdt',\n",
905 | "    reg_lambda=0.03,\n",
906 | "    random_state=1,\n",
907 | ")\n",
908 | "\n",
909 | "# One LGBM_model wrapper per country, each given its own parameter set\n",
910 | "model_lgbm_A, model_lgbm_B, model_lgbm_C = (LGBM_model(categ_conv=True) for _ in 'ABC')\n",
911 | "model_lgbm_A.set_params(params=params_LGBM_A)\n",
912 | "model_lgbm_B.set_params(params=params_LGBM_B)\n",
913 | "model_lgbm_C.set_params(params=params_LGBM_C)\n",
914 | "model_lgbm_dict = dict(zip('ABC', (model_lgbm_A, model_lgbm_B, model_lgbm_C)))\n",
915 | "\n",
916 | "# Point the existing process object at the LGBM models and exclusion lists, then predict\n",
917 | "process.set_model_dict(model_dict=model_lgbm_dict)\n",
918 | "process.set_exclude_dict(exclude_LGBM_dict)\n",
919 | "result_lgbm = process.predict()\n"
920 | ]
921 | },
922 | {
923 | "cell_type": "code",
924 | "execution_count": null,
925 | "metadata": {
926 | "ExecuteTime": {
927 | "end_time": "2018-03-20T18:37:05.252546",
928 | "start_time": "2018-03-20T18:37:05.212396"
929 | },
930 | "collapsed": false
931 | },
932 | "outputs": [],
933 | "source": [
934 | "# Build the submission: country labels plus a 0.4 / 0.4 / 0.2 blend of the xgb, cb and lgbm 'poor' scores\n",
935 | "submission = pd.DataFrame(index=result_cb.index).assign(\n",
936 | "    country=result_cb.country,\n",
937 | "    poor=(result_xgb.poor * 0.4 +\n",
938 | "          result_cb.poor * 0.4 +\n",
939 | "          result_lgbm.poor * 0.2))\n",
940 | "process.save_csv(submission, clf_model_name='combine', path='../models/')"
941 | ]
942 | }
943 | ],
944 | "metadata": {
945 | "hide_input": false,
946 | "kernelspec": {
947 | "display_name": "Python 3",
948 | "language": "python",
949 | "name": "python3"
950 | },
951 | "language_info": {
952 | "codemirror_mode": {
953 | "name": "ipython",
954 | "version": 3
955 | },
956 | "file_extension": ".py",
957 | "mimetype": "text/x-python",
958 | "name": "python",
959 | "nbconvert_exporter": "python",
960 | "pygments_lexer": "ipython3",
961 | "version": "3.5.4"
962 | }
963 | },
964 | "nbformat": 4,
965 | "nbformat_minor": 1
966 | }
967 |
--------------------------------------------------------------------------------