├── data ├── raw │ └── .gitkeep ├── external │ └── .gitkeep ├── interim │ └── .gitkeep └── processed │ └── .gitkeep ├── models └── .gitkeep ├── notebooks ├── .gitkeep └── reproduce_final_submission.ipynb ├── reports ├── .gitkeep └── figures │ └── .gitkeep ├── venv └── README ├── tox.ini ├── learn └── events.out.tfevents ├── requirements.txt ├── docs ├── getting-started.rst ├── commands.rst ├── index.rst ├── make.bat ├── Makefile └── conf.py ├── test_environment.py ├── LICENSE ├── .gitignore ├── README.md ├── src ├── data │ ├── make_dataset.py │ └── data.py └── models │ ├── process.py │ ├── models.py │ └── predict_model.py └── Makefile /data/raw/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/external/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/interim/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/processed/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reports/figures/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /venv/README: -------------------------------------------------------------------------------- 1 | Virtualenv directory 2 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | max-complexity = 10 4 | -------------------------------------------------------------------------------- /learn/events.out.tfevents: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sagol/povert/HEAD/learn/events.out.tfevents -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | Sphinx 3 | coverage 4 | awscli 5 | flake8 6 | python-dotenv>=0.5.1 7 | catboost==0.5.2 8 | lightgbm==2.0.10 9 | numpy==1.14.0 10 | pandas==0.22.0 11 | scikit-learn==0.19.1 12 | scipy==1.0.0 13 | xgboost==0.6a2 14 | -------------------------------------------------------------------------------- /docs/getting-started.rst: -------------------------------------------------------------------------------- 1 | Getting started 2 | =============== 3 | 4 | This is where you describe how to get set up on a clean install, including the 5 | commands necessary to get the raw data (using the `sync_data_from_s3` command, 6 | for example), and then how to make the cleaned, final data sets. 7 | -------------------------------------------------------------------------------- /docs/commands.rst: -------------------------------------------------------------------------------- 1 | Commands 2 | ======== 3 | 4 | The Makefile contains the central entry points for common tasks related to this project. 
import sys

# Name of the interpreter family this project expects ("python" or "python3").
REQUIRED_PYTHON = "python3"


def main():
    """Check that the running interpreter matches ``REQUIRED_PYTHON``.

    Prints a confirmation message when the major version of the running
    interpreter is the one required by the project.

    Raises:
        ValueError: if ``REQUIRED_PYTHON`` is neither "python" nor "python3".
        TypeError: if the running interpreter has a different major version.
    """
    # Map the interpreter name onto the major version it stands for.
    major_by_name = {"python": 2, "python3": 3}
    if REQUIRED_PYTHON not in major_by_name:
        raise ValueError("Unrecognized python interpreter: {}".format(
            REQUIRED_PYTHON))
    required_major = major_by_name[REQUIRED_PYTHON]

    if sys.version_info.major == required_major:
        print(">>> Development environment passes all tests!")
        return

    raise TypeError(
        "This project requires Python {}. Found: Python {}".format(
            required_major, sys.version))


if __name__ == '__main__':
    main()
10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /data/raw/*.csv 80 | /data/processed/*.csv 81 | .vs/ 82 | *.tsv 83 | /models/* 84 | train/ 85 | venv/ 86 | learn/ 87 | 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pover-T 2nd place 2 | ============================== 3 | 4 | Necessary tools and requirements: 5 | 1. Python3 6 | 2. You should install g++: 7 | ```sudo apt install g++=4:5.3.1-1ubuntu1``` 8 | 3. 
Install the required Python packages in requirements.txt: 9 | ```make requirements``` 10 | 4. Generate data: 11 | ```make data``` 12 | 5. Create submission file: 13 | ```make submission``` 14 | 6. The submission file ```submission_combine_20XX-XX-XX-XX-XX.csv```will appear in ```/models``` 15 | 16 | 17 | Project Organization 18 | ------------ 19 | 20 | ├── LICENSE 21 | ├── Makefile <- Makefile with commands like `make data` or `make train` 22 | ├── README.md <- The top-level README for developers using this project. 23 | ├── data 24 | │   ├── external <- Data from third party sources. 25 | │   ├── interim <- Intermediate data that has been transformed. 26 | │   ├── processed <- The final, canonical data sets for modeling. 27 | │   └── raw <- The original, immutable data dump. 28 | │ 29 | ├── docs <- A default Sphinx project; see sphinx-doc.org for details 30 | │ 31 | ├── models <- Trained and serialized models, model predictions, or model summaries 32 | │ 33 | ├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), 34 | │ the creator's initials, and a short `-` delimited description, e.g. 35 | │ `1.0-jqp-initial-data-exploration`. 36 | │ 37 | ├── references <- Data dictionaries, manuals, and all other explanatory materials. 38 | │ 39 | ├── reports <- Generated analysis as HTML, PDF, LaTeX, etc. 40 | │   └── figures <- Generated graphics and figures to be used in reporting 41 | │ 42 | ├── requirements.txt <- The requirements file for reproducing the analysis environment, e.g. 43 | │ generated with `pip freeze > requirements.txt` 44 | │ 45 | ├── src <- Source code for use in this project. 
46 | │   ├── __init__.py <- Makes src a Python module 47 | │ │ 48 | │   ├── data <- Scripts to download or generate data 49 | │   │   └── make_dataset.py 50 | │ │ 51 | │   ├── features <- Scripts to turn raw data into features for modeling 52 | │   │   └── build_features.py 53 | │ │ 54 | │   ├── models <- Scripts to train models and then use trained models to make 55 | │ │ │ predictions 56 | │   │   ├── predict_model.py 57 | │   │   └── train_model.py 58 | │ │ 59 | │   └── visualization <- Scripts to create exploratory and results oriented visualizations 60 | │   └── visualize.py 61 | │ 62 | └── tox.ini <- tox file with settings for running tox; see tox.testrun.org 63 | 64 | 65 | -------- 66 | 67 |

Project based on the cookiecutter data science project template. #cookiecutterdatascience

@click.command()
@click.argument('input_hh_train_filepath', type=click.Path())
@click.argument('input_ind_train_filepath', type=click.Path())
@click.argument('input_hh_test_filepath', type=click.Path())
@click.argument('input_ind_test_filepath', type=click.Path())
@click.argument('output_train_filepath', type=click.Path())
@click.argument('output_test_filepath', type=click.Path())
@click.argument('output_train_ind_filepath', type=click.Path())
@click.argument('output_test_ind_filepath', type=click.Path())
def main(input_hh_train_filepath,
         input_ind_train_filepath,
         input_hh_test_filepath,
         input_ind_test_filepath,
         output_train_filepath,
         output_test_filepath,
         output_train_ind_filepath,
         output_test_ind_filepath
         ):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    # NOTE: click shows the docstring above as the command's help text, so it
    # is kept verbatim; the implementation notes live in comments instead.
    #
    # All arguments are bare file names: inputs are looked up under data/raw/
    # and outputs are written under data/processed/. The country code handed
    # to every loader is the first character of the household train file name.
    raw_template = 'data/raw/{0}'
    processed_template = 'data/processed/{0}'

    # Step 1: individual-level data, raw -> processed.
    individual = data.DataInd()
    raw_individual_files = {
        'train': raw_template.format(input_ind_train_filepath),
        'test': raw_template.format(input_ind_test_filepath)}
    individual.set_country(input_hh_train_filepath[0])
    individual.set_file_names(files_dict=raw_individual_files)
    individual.load(load=False, cat_enc=False)
    processed_individual_files = {
        'train': processed_template.format(output_train_ind_filepath),
        'test': processed_template.format(output_test_ind_filepath)}
    individual.save(files_dict=processed_individual_files, poor=False)

    # Step 2: raw household data combined with the individual-level files
    # that step 1 has just written.
    combined_inputs = {
        'train_hh': raw_template.format(input_hh_train_filepath),
        'test_hh': raw_template.format(input_hh_test_filepath),
        'train_ind': processed_template.format(output_train_ind_filepath),
        'test_ind': processed_template.format(output_test_ind_filepath)}
    combined = data.DataConcat()
    combined.set_file_names(files_dict=combined_inputs)
    combined.set_country(input_hh_train_filepath[0])
    combined.load(load=False, cat_enc=False)
    combined_outputs = {
        'train': processed_template.format(output_train_filepath),
        'test': processed_template.format(output_test_filepath)}
    combined.save(files_dict=combined_outputs)

    # Step 3: household-only data, saved under the same file names in
    # data/processed/.
    household_inputs = {
        'train': raw_template.format(input_hh_train_filepath),
        'test': raw_template.format(input_hh_test_filepath)}
    household = data.Data()
    household.set_file_names(files_dict=household_inputs)
    household.set_country(input_hh_train_filepath[0])
    household.load(load=False)
    household_outputs = {
        'train': processed_template.format(input_hh_train_filepath),
        'test': processed_template.format(input_hh_test_filepath)}
    household.save(files_dict=household_outputs)

    logging.getLogger(__name__).info('making final data set from raw data')
class processing:
    """
    A helper class with a search function of not very significant features,
    prediction, and saving the prediction to a file.

    The per-country dictionaries (``data_dict``, ``model_dict``,
    ``vote_waights_dict``, ``exclude_dict``) are keyed by the country codes
    listed in ``countries`` and are supplied through the ``set_*`` methods.
    """
    def __init__(self, countries=None, balances=None):
        """Create a helper for the given countries.

        Args:
            countries: list of country codes to process; defaults to
                ``['A', 'B', 'C']``.
            balances: dict mapping a country code to the ``balance`` flag
                passed to the model/data objects; defaults to ``False`` for
                'A', 'B' and 'C'.
        """
        # Build the defaults per instance: a mutable default argument would be
        # shared (and mutated) across every instance created without arguments.
        self.countries = ['A', 'B', 'C'] if countries is None else countries
        self.balances = ({'A': False, 'B': False, 'C': False}
                         if balances is None else balances)
        self.exclude_dict = {'A': [], 'B': [], 'C': []}
        self.data_dict = None
        self.model_dict = None
        self.vote_waights_dict = None

    def set_vote_waights_dict(self, vote_waights_dict):
        """Set the per-country weights passed to ``set_weights`` in predict."""
        self.vote_waights_dict = vote_waights_dict

    def set_data_dict(self, data_dict):
        """Set the per-country data objects handed to the models."""
        self.data_dict = data_dict

    def set_model_dict(self, model_dict):
        """Set the per-country model objects."""
        self.model_dict = model_dict

    def set_exclude_dict(self, exclude_dict):
        """Set the per-country lists of feature names to exclude."""
        self.exclude_dict = exclude_dict

    def save_csv(self, df, clf_model_name='_', path=''):
        """Write ``df`` to ``submission_<model>_<YYYY-mm-dd-HH-MM>.csv``.

        Args:
            df: object with ``to_csv`` and ``head`` (a pandas DataFrame in
                ``predict``).
            clf_model_name: model name embedded in the file name.
            path: directory the file is written to.
        """
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
        submission_file = os.path.join(
            path, 'submission_{0}_{1}.csv'.format(clf_model_name, timestamp))
        print('submission file:', submission_file)
        df.to_csv(submission_file, index=True, float_format='%.4f')
        print(df.head())

    def find_exclude(self, n_splits=5):
        """Search, per country, for the exclude list with the best log-loss.

        For every country the model is trained repeatedly; after each
        training the features whose importance is exactly 0 are appended to
        the exclude list, and the mean log-loss over ``n_splits``
        train/validation splits is recorded for that list. The search stops
        after the first iteration that finds no new zero-importance feature.
        The list with the lowest mean log-loss is stored in
        ``self.exclude_dict[country]``.

        Args:
            n_splits: number of train/validation splits used for scoring.

        Returns:
            The ``{mean log-loss: exclude list}`` dict of the last country
            processed (empty if ``countries`` is empty), or ``None`` when no
            models or data have been set.
        """
        if not self.model_dict or not self.data_dict:
            print('Stoped: no models or data')
            return None

        # Defined up front so an empty ``countries`` returns {} instead of
        # raising UnboundLocalError at the final return.
        logloss_dict = {}
        for c in self.countries:
            model = self.model_dict[c]
            model.load_data(data=self.data_dict[c],
                            balance=self.balances[c])
            exclude_list = []
            finish = False
            logloss_dict = {}
            while not finish:
                model.set_exclude_list(exclude_list)
                model.train()
                exclude_list_prev = exclude_list.copy()
                # NOTE(review): assumes get_feature_importances() is aligned
                # with the train columns that are not yet excluded - confirm
                # against the model implementation.
                columns = [x for x in model.get_train().columns
                           if x not in exclude_list_prev]
                exclude_list = [x for (x, y) in zip(
                    columns, model.get_feature_importances()
                    ) if y == 0]
                if not exclude_list:
                    # Nothing new to drop: score this list once, then stop.
                    finish = True
                exclude_list = exclude_list_prev + exclude_list

                logloss_iter = []
                splits = model.data.get_train_valid(
                    n_splits=n_splits, balance=self.balances[c])

                for i in range(n_splits):
                    model.set_random_seed(i)
                    train, valid = splits[i]
                    model.set_exclude_list(exclude_list)
                    model.train(train[0], train[1])
                    pred = model.predict(valid[0])
                    logloss_iter.append(log_loss(valid[1].astype(int),
                                                 pred['poor']))
                logloss = np.mean(logloss_iter)
                logloss_dict[logloss] = exclude_list
                print('loglos: {0} exclude length: {1}'.format(
                    logloss, len(exclude_list)))
            self.exclude_dict[c] = logloss_dict[np.min(
                list(logloss_dict.keys()))]
            print('Country: {0} exclude length: {1}'.format(
                c, len(self.exclude_dict[c])))

        return logloss_dict

    def predict(self, model_name, path=''):
        """Train every country's model, predict, and save the combined result.

        Args:
            model_name: name embedded in the submission file name.
            path: directory the submission file is written to.

        Returns:
            The concatenated predictions of all countries, or ``None`` when
            no models or data have been set.
        """
        if not self.model_dict or not self.data_dict:
            print('Stoped: no models or data')
            return None

        predictions = []
        for c in self.countries:
            model = self.model_dict[c]
            model.load_data(data=self.data_dict[c],
                            balance=self.balances[c])
            model.set_exclude_list(self.exclude_dict[c])
            if self.vote_waights_dict:
                model.set_weights(self.vote_waights_dict[c])
            print('exclude: \n', self.exclude_dict[c])
            model.train()
            predictions.append(model.predict())
        result = pd.concat(predictions)
        self.save_csv(result, clf_model_name=model_name, path=path)
        return result
| 'B_indiv_train.csv' \ 42 | 'B_hhold_test.csv' \ 43 | 'B_indiv_test.csv' \ 44 | 'B_combine_train.csv' \ 45 | 'B_combine_test.csv' \ 46 | 'B_indiv_ext_train.csv' \ 47 | 'B_indiv_ext_test.csv' 48 | $(PYTHON_INTERPRETER) src/data/make_dataset.py \ 49 | 'C_hhold_train.csv' \ 50 | 'C_indiv_train.csv' \ 51 | 'C_hhold_test.csv' \ 52 | 'C_indiv_test.csv' \ 53 | 'C_combine_train.csv' \ 54 | 'C_combine_test.csv' \ 55 | 'C_indiv_ext_train.csv' \ 56 | 'C_indiv_ext_test.csv' 57 | 58 | ## Create submission 59 | submission: 60 | $(PYTHON_INTERPRETER) src/models/predict_model.py 61 | 62 | 63 | ## Delete all compiled Python files 64 | clean: 65 | find . -type f -name "*.py[co]" -delete 66 | find . -type d -name "__pycache__" -delete 67 | 68 | ## Lint using flake8 69 | lint: 70 | flake8 src 71 | 72 | ## Set up python interpreter environment 73 | create_environment: 74 | ifeq (True,$(HAS_CONDA)) 75 | @echo ">>> Detected conda, creating conda environment." 76 | ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) 77 | conda create --name $(PROJECT_NAME) python=3 78 | else 79 | conda create --name $(PROJECT_NAME) python=2.7 80 | endif 81 | @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" 82 | else 83 | @pip install -q virtualenv virtualenvwrapper 84 | @echo ">>> Installing virtualenvwrapper if not already intalled.\nMake sure the following lines are in shell startup file\n\ 85 | export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" 86 | @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" 87 | @echo ">>> New virtualenv created. 
Activate with:\nworkon $(PROJECT_NAME)" 88 | endif 89 | 90 | ## Test python environment is setup correctly 91 | test_environment: 92 | $(PYTHON_INTERPRETER) test_environment.py 93 | 94 | ################################################################################# 95 | # PROJECT RULES # 96 | ################################################################################# 97 | 98 | 99 | 100 | ################################################################################# 101 | # Self Documenting Commands # 102 | ################################################################################# 103 | 104 | .DEFAULT_GOAL := show-help 105 | 106 | # Inspired by 107 | # sed script explained: 108 | # /^##/: 109 | # * save line in hold space 110 | # * purge line 111 | # * Loop: 112 | # * append newline + line to hold space 113 | # * go to next line 114 | # * if line starts with doc comment, strip comment character off and loop 115 | # * remove target prerequisites 116 | # * append hold space (+ newline) to line 117 | # * replace newline plus comments by `---` 118 | # * print line 119 | # Separate expressions are necessary because labels cannot be delimited by 120 | # semicolon; see 121 | .PHONY: show-help 122 | show-help: 123 | @echo "$$(tput bold)Available rules:$$(tput sgr0)" 124 | @echo 125 | @sed -n -e "/^## / { \ 126 | h; \ 127 | s/.*//; \ 128 | :doc" \ 129 | -e "H; \ 130 | n; \ 131 | s/^## //; \ 132 | t doc" \ 133 | -e "s/:.*//; \ 134 | G; \ 135 | s/\\n## /---/; \ 136 | s/\\n/ /g; \ 137 | p; \ 138 | }" ${MAKEFILE_LIST} \ 139 | | LC_ALL='C' sort --ignore-case \ 140 | | awk -F '---' \ 141 | -v ncol=$$(tput cols) \ 142 | -v indent=19 \ 143 | -v col_on="$$(tput setaf 6)" \ 144 | -v col_off="$$(tput sgr0)" \ 145 | '{ \ 146 | printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ 147 | n = split($$2, words, " "); \ 148 | line_length = ncol - indent; \ 149 | for (i = 1; i <= n; i++) { \ 150 | line_length -= length(words[i]) + 1; \ 151 | if (line_length <= 0) { \ 152 | 
line_length = ncol - indent - length(words[i]) - 1; \ 153 | printf "\n%*s ", -indent, " "; \ 154 | } \ 155 | printf "%s ", words[i]; \ 156 | } \ 157 | printf "\n"; \ 158 | }' \ 159 | | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') 160 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. 
doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 
100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\povert.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\povert.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 
161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
# Every target below is a command name rather than a file, so all of them are
# declared phony (including `texinfo` and `info`, which the generated list
# had left out).
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info gettext changes linkcheck doctest

# Print the list of available documentation targets.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

# Remove everything under the build directory; the leading `-` ignores errors.
clean:
	-rm -rf $(BUILDDIR)/*

html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/povert.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/povert.qhc"

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/povert"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/povert"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	# Use $(MAKE), as latexpdf does, so command-line flags and the jobserver
	# are passed on to the sub-make.
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

# gettext uses I18NSPHINXOPTS because the i18n builder cannot share the
# environment and doctrees with the other builders.
gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."

doctest:
	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."
class predict_model(ABC):
    """
    Abstract base class for working with classifiers.

    A concrete subclass creates ``self.classifier`` in ``load_data`` and
    implements ``train`` and ``get_feature_importances``. This class keeps
    the train/target/test frames, the list of categorical columns, the list
    of excluded columns, and implements ``predict``.
    """

    @abstractmethod
    def __init__(self, name='predict_model', categ_conv=True):
        """
        Initialise the shared state.

        Args:
            name: string, a label for the model,
                optional (default='predict_model')
            categ_conv: bool, convert categorical columns of frames passed
                to ``predict``/``train`` to integer codes,
                optional (default=True)
        """
        self.params = {}
        self.exclude_list = []
        self.name = name
        self.random = 1
        self.classifier = None
        self.categ_conv = categ_conv
        self.data_df = {}
        # Filled in by load_data(); initialised here so that using the model
        # before load_data() does not fail with AttributeError.
        self.data = None
        self.category_cols = []

    def set_params(self, params=None):
        """
        Set the keyword arguments used to build the classifier.

        Args:
            params: dict, classifier parameters; a falsy value resets the
                parameters to an empty dict, optional (default=None)
        """
        self.params = params if params else {}

    def set_random_seed(self, random=1):
        """
        Store a random seed.

        Args:
            random: int, the seed, optional (default=1)
        """
        self.random = random

    @abstractmethod
    def load_data(self, data, balance=False):
        """
        Take train/target/test frames from ``data`` and encode the
        categorical columns as integer codes.

        Args:
            data: object providing ``get_train(balance=...)``, ``get_test()``
                and ``get_cat_list()``
            balance: bool, passed through to ``data.get_train``,
                optional (default=False)

        Returns:
            True.
        """
        self.data = data

        train, y = self.data.get_train(balance=balance)
        # Work on private copies so the frames owned by `data` stay untouched.
        self.data_df['train'] = train.copy()
        self.data_df['y'] = y
        self.data_df['test'] = self.data.get_test().copy()

        self.category_cols = self.data.get_cat_list()
        for header in self.category_cols:
            for part in ('train', 'test'):
                frame = self.data_df[part]
                # Plain column assignment replaces the column (and its dtype);
                # `.loc[:, header] = ...` writes in place on recent pandas and
                # would leave the integer codes in an object-dtype column.
                # NOTE(review): train and test are encoded independently, so
                # codes only agree when both hold the same set of categories.
                frame[header] = frame[header].astype('category').cat.codes
        return True

    def get_train(self):
        """Return the train dataframe stored by ``load_data``."""
        return self.data_df['train']

    def get_y(self):
        """Return the target stored by ``load_data``."""
        return self.data_df['y']

    def get_test(self):
        """Return the test dataframe stored by ``load_data``."""
        return self.data_df['test']

    def set_exclude_list(self, exclude_list):
        """
        Set the columns that are dropped before training and prediction.

        Args:
            exclude_list: list, column names; a copy is stored
        """
        self.exclude_list = exclude_list.copy()

    @abstractmethod
    def get_feature_importances(self):
        """Return the feature importances of the trained classifier."""
        pass

    @abstractmethod
    def train(self, x_train=None, y_train=None):
        """Fit the classifier and return it."""
        pass

    def predict(self, test=None):
        """
        Predict the probability of the positive class.

        Args:
            test: DataFrame, data to predict on; anything that is not a
                DataFrame means "use the stored test frame",
                optional (default=None)

        Returns:
            A dataframe indexed like the input with the columns 'country'
            and 'poor', or None when no classifier has been created.
        """
        # Compare with None explicitly: an estimator object may define
        # __len__/__bool__ and evaluate as false even though it exists.
        if self.classifier is None:
            print('error: classifier not defined')
            return None

        if not isinstance(test, pd.DataFrame):
            test = self.get_test()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame is not modified.
            test = test.copy()
            for header in self.category_cols:
                if header in test.columns:
                    test[header] = test[header].astype('category').cat.codes
        test = test.drop(
            [x for x in self.exclude_list if x in test.columns], axis=1
        )
        res = pd.DataFrame(index=test.index)
        res['country'] = self.data.country
        res['poor'] = self.classifier.predict_proba(test)[:, 1]
        return res
class CB_model(predict_model):
    """
    Class for a CatBoost classifier.
    """

    def __init__(self, name='cat_boost', categ_conv=True):
        """
        Args:
            name: string, a label for the model, optional (default='cat_boost')
            categ_conv: bool, convert categorical columns of frames passed
                to ``train``/``predict`` to integer codes,
                optional (default=True)
        """
        super().__init__(name=name, categ_conv=categ_conv)

    def load_data(self, data, balance=False):
        """
        Load the data and create the classifier with balanced class weights.

        Args:
            data: data object, see ``predict_model.load_data``
            balance: bool, resample the train data, optional (default=False)

        Returns:
            True when the data was loaded and the classifier created,
            False otherwise.
        """
        if not super().load_data(data, balance):
            return False

        c_w = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(self.data_df['y']),
            y=self.data_df['y']
        )
        params = dict(self.params)
        # An explicit 'class_weights' entry in the user parameters wins;
        # passing it twice would raise "multiple values for keyword argument".
        params.setdefault('class_weights', c_w)
        self.classifier = CatBoostClassifier(**params)
        return True

    def train(self, x_train=None, y_train=None):
        """
        Fit the classifier.

        Args:
            x_train: DataFrame, train data; anything that is not a DataFrame
                means "use the stored train frame", optional (default=None)
            y_train: Series, target; anything that is not a Series means
                "use the stored target", optional (default=None)

        Returns:
            The fitted classifier.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame is not modified.
            x_train = x_train.copy()
            for header in self.category_cols:
                if header in x_train.columns:
                    x_train[header] = x_train[header].astype('category').cat.codes

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list
                                if x in x_train.columns], axis=1)

        self.category_cols = [x for x in self.category_cols
                              if x not in self.exclude_list]

        # Positional indices of the categorical columns; columns missing from
        # a user supplied frame are skipped instead of raising KeyError.
        cat_dims = [x_train.columns.get_loc(i) for i in self.category_cols
                    if i in x_train.columns]
        print(x_train.shape, y_train.shape, len(self.category_cols))

        self.classifier.fit(x_train, y_train, cat_features=cat_dims)
        return self.classifier

    def get_feature_importances(self):
        """
        Return the feature importances of the fitted classifier.

        The private ``_feature_importance`` attribute is used when present,
        as before; otherwise the public ``feature_importances_`` attribute
        is read.
        NOTE(review): which attribute exists depends on the installed
        catboost version - confirm against the pinned release.
        """
        try:
            return self.classifier._feature_importance
        except AttributeError:
            return self.classifier.feature_importances_
class XGB_model(predict_model):
    """
    Class for a XGBoost classifier.
    """

    def __init__(self, name='xg_boost', categ_conv=True):
        """
        Args:
            name: string, a label for the model, optional (default='xg_boost')
            categ_conv: bool, convert categorical columns of frames passed
                to ``train``/``predict`` to integer codes,
                optional (default=True)
        """
        super().__init__(name=name, categ_conv=categ_conv)

    def load_data(self, data, balance=False):
        """
        Load the data and create the classifier.

        ``scale_pos_weight`` is set to the ratio of negative to positive
        samples of the loaded target.

        Args:
            data: data object, see ``predict_model.load_data``
            balance: bool, resample the train data, optional (default=False)

        Returns:
            True when the data was loaded and the classifier created,
            False otherwise.
        """
        if not super().load_data(data, balance):
            return False

        positives = self.data_df['y'].sum()
        # Without positive samples the ratio is undefined (division by zero);
        # leave the parameter as configured in that case.
        if positives:
            self.params['scale_pos_weight'] = (
                (self.data_df['y'].shape[0] - positives) / positives
            )
        self.classifier = xgb.XGBClassifier(**self.params)
        return True

    def train(self, x_train=None, y_train=None):
        """
        Fit the classifier.

        Args:
            x_train: DataFrame, train data; anything that is not a DataFrame
                means "use the stored train frame", optional (default=None)
            y_train: Series, target; anything that is not a Series means
                "use the stored target", optional (default=None)

        Returns:
            The fitted classifier.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame is not modified.
            x_train = x_train.copy()
            for header in self.category_cols:
                if header in x_train.columns:
                    x_train[header] = x_train[header].astype('category').cat.codes

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list
                                if x in x_train.columns], axis=1)
        print('x_train shape: ', x_train.shape)

        self.classifier.fit(x_train, y_train)

        return self.classifier

    def get_feature_importances(self):
        """Return ``feature_importances_`` of the classifier."""
        return self.classifier.feature_importances_
class LGBM_model(predict_model):
    """
    Class for LightGBM classifier.
    """

    def __init__(self, name='lgbm', categ_conv=True):
        """
        Args:
            name: string, a label for the model, optional (default='lgbm')
            categ_conv: bool, convert categorical columns of frames passed
                to ``train``/``predict`` to integer codes,
                optional (default=True)
        """
        super().__init__(name=name, categ_conv=categ_conv)

    def load_data(self, data, balance=False):
        """
        Load the data and create the classifier.

        Args:
            data: data object, see ``predict_model.load_data``
            balance: bool, resample the train data, optional (default=False)

        Returns:
            True when the data was loaded and the classifier created,
            False otherwise.
        """
        if not super().load_data(data, balance):
            return False
        self.classifier = lgb.LGBMClassifier(**self.params)
        return True

    def train(self, x_train=None, y_train=None):
        """
        Fit the classifier.

        Args:
            x_train: DataFrame, train data; anything that is not a DataFrame
                means "use the stored train frame", optional (default=None)
            y_train: Series, target; anything that is not a Series means
                "use the stored target", optional (default=None)

        Returns:
            The fitted classifier.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame is not modified.
            x_train = x_train.copy()
            for header in self.category_cols:
                if header in x_train.columns:
                    x_train[header] = x_train[header].astype('category').cat.codes

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list
                                if x in x_train.columns], axis=1)
        print('x_train shape: ', x_train.shape)

        # Kept for parity with the other models: predict() consults this list
        # when encoding a user supplied frame.
        self.category_cols = [x for x in self.category_cols
                              if x not in self.exclude_list]

        self.classifier.fit(x_train, y_train, verbose=False)

        return self.classifier

    def get_feature_importances(self):
        """Return ``feature_importances_`` of the classifier."""
        return self.classifier.feature_importances_
13 | 14 | import os 15 | import sys 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | # sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ----------------------------------------------------- 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | # needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be extensions 28 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 29 | extensions = [] 30 | 31 | # Add any paths that contain templates here, relative to this directory. 32 | templates_path = ['_templates'] 33 | 34 | # The suffix of source filenames. 35 | source_suffix = '.rst' 36 | 37 | # The encoding of source files. 38 | # source_encoding = 'utf-8-sig' 39 | 40 | # The master toctree document. 41 | master_doc = 'index' 42 | 43 | # General information about the project. 44 | project = u'povert' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | # language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | # today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | # today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 
67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | # default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | # add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | # add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | # show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | # modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | # html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | # html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # "<project> v<release> documentation". 106 | # html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | # html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | # html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large.
118 | # html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | # html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | # html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | # html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | # html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | # html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | # html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | # html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | # html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | # html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | # html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a <link> tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | # html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | # html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder.
167 | htmlhelp_basename = 'povertdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | # 'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | # 'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | # 'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 187 | 'povert.tex', 188 | u'povert Documentation', 189 | u"Taras Baranyuk", 'manual'), 190 | ] 191 | 192 | # The name of an image file (relative to this directory) to place at the top of 193 | # the title page. 194 | # latex_logo = None 195 | 196 | # For "manual" documents, if this is true, then toplevel headings are parts, 197 | # not chapters. 198 | # latex_use_parts = False 199 | 200 | # If true, show page references after internal links. 201 | # latex_show_pagerefs = False 202 | 203 | # If true, show URL addresses after external links. 204 | # latex_show_urls = False 205 | 206 | # Documents to append as an appendix to all manuals. 207 | # latex_appendices = [] 208 | 209 | # If false, no module index is generated. 210 | # latex_domain_indices = True 211 | 212 | 213 | # -- Options for manual page output -------------------------------------------- 214 | 215 | # One entry per manual page. List of tuples 216 | # (source start file, name, description, authors, manual section). 217 | man_pages = [ 218 | ('index', 'povert', u'povert Documentation', 219 | [u"Taras Baranyuk"], 1) 220 | ] 221 | 222 | # If true, show URL addresses after external links. 223 | # man_show_urls = False 224 | 225 | 226 | # -- Options for Texinfo output ------------------------------------------------ 227 | 228 | # Grouping the document tree into Texinfo files. 
class Data():
    """
    Class for working with households data.

    Holds the train and test dataframes of one country together with the
    list of columns common to both and the split of those columns into
    categorical and non-categorical ones.
    """

    def __init__(self):
        self.country_df_train = None
        self.country_df_test = None
        self.categorical_list = []
        self.float_list = []
        self.train_file_name = None
        self.test_file_name = None
        # Assigned later by set_country(), load() and _rename_col();
        # initialised here so that reading them early yields an empty value
        # instead of AttributeError.
        self.country = None
        self.col_common_list = []
        self.col_maping = {}
        self.col_maping_reverse = {}

    def split_data(self,
                   size=0.8,
                   n_splits=1,
                   random_state=1,
                   balance=False,
                   df=None):
        """
        Returns data partitions.

        Args:
            size: float, partition ratio, optional (default=0.8)
            n_splits: int, number of partitions, optional (default=1)
            random_state: int, RandomState instance, optional (default=1)
            balance: bool, resample data, optional (default=False)
            df: DataFrame, data for split, optional (default=None)

        Returns:
            List of (train, validate) dataframe tuples, stratified on the
            'poor' column.
        """
        train = df if isinstance(df, pd.DataFrame) else self.country_df_train
        sss = StratifiedShuffleSplit(n_splits=n_splits,
                                     test_size=1-size,
                                     random_state=random_state)
        splits = []
        for train_index, validate_index in sss.split(train, train.poor):
            df_train = train.iloc[train_index]
            if balance:
                # Only the training part is upsampled; validation keeps the
                # original class ratio.
                df_train = self.resample(df_train)
            splits.append((df_train, train.iloc[validate_index]))
        return splits

    def _rename_col(self):
        """
        Rename columns to '<country>_<position in the train frame>'.

        The 'poor' and 'country' columns keep their names. The same mapping
        is applied to the test frame, and both directions of the mapping are
        stored in ``col_maping`` / ``col_maping_reverse``.
        """
        train_columns = self.country_df_train.columns
        # enumerate() yields the same position Index.get_loc() returned for
        # unique column names, without a lookup per column.
        train_new_columns = [
            x if x in ('poor', 'country') else '{0}_{1}'.format(
                self.country, pos)
            for pos, x in enumerate(train_columns)]
        self.country_df_train.columns = train_new_columns
        self.col_maping = dict(zip(train_columns, train_new_columns))
        self.col_maping_reverse = dict(zip(train_new_columns, train_columns))

        self.country_df_test.rename(columns=self.col_maping, inplace=True)

    def del_nonunique(self, df):
        """
        Delete constant columns, i.e. columns with exactly one distinct
        (non-NaN) value.

        Args:
            df: DataFrame, data for clean

        Returns:
            DataFrame without the constant columns.
        """
        nunique = df.apply(pd.Series.nunique)
        cols_to_drop = nunique[nunique == 1].index
        print('Cols to drop:', cols_to_drop)
        return df.drop(cols_to_drop, axis=1)

    def category_float_search(self,
                              count=5,
                              countries=None,
                              cat_types=None,
                              fi_types=None):
        """
        Search for categorical features.

        Args:
            count: int, number of unique values for determining categoricity,
                optional (default=5)
            countries: list, list of countries for which to search not only for
                features with the type in cat_types,
                optional (default=['B'])
            cat_types: list, list of types for categorical features,
                optional (default=['object'])
            fi_types: list, A list of additional types for searching for category
                features, optional (default=['float64', 'int64'])

        Returns:
            Tuple with a list of categorical columns and a list of other columns
        """
        # None stands for the documented defaults; this avoids sharing
        # mutable default lists between calls.
        countries = ['B'] if countries is None else countries
        cat_types = ['object'] if cat_types is None else cat_types
        fi_types = ['float64', 'int64'] if fi_types is None else fi_types

        common_train = self.country_df_train[self.col_common_list]
        categorical_list = list(
            common_train.select_dtypes(include=cat_types).columns)

        if self.country not in countries:
            return (categorical_list,
                    list(common_train.select_dtypes(
                        include=fi_types).columns))

        float_list = []
        print('float list length: ', len(list(
            self.country_df_test.select_dtypes(include=fi_types).columns)))
        for i in list(self.country_df_test[
                self.col_common_list].select_dtypes(include=fi_types).columns):
            # A numeric column with few distinct values across train and
            # test is treated as categorical.
            value_set = set(
                self.country_df_test[i].unique()).union(set(
                    self.country_df_train[i].unique()))
            if len(value_set) <= count:
                categorical_list.append(i)
            else:
                float_list.append(i)
        print('float list length: ', len(sorted(float_list)))
        return sorted(categorical_list), sorted(float_list)

    def scale(self):
        """
        Scale all non categorical values.

        Each column in ``float_list`` is standardised with statistics fitted
        on the train frame and applied to the test frame.

        Returns:
            False when ``float_list`` is empty, True otherwise.
        """
        if not self.float_list:
            print('There is no float list')
            return False
        scaler = StandardScaler()
        for i in self.float_list:
            # The scaler returns an (n, 1) array; ravel() hands pandas a
            # plain 1-D column.
            self.country_df_train[i] = scaler.fit_transform(
                self.country_df_train[i].values.reshape(-1, 1)).ravel()
            self.country_df_test[i] = scaler.transform(
                self.country_df_test[i].values.reshape(-1, 1)).ravel()
        return True

    def fillna(self):
        """
        Replace `NaN` values with the median of the column and remove all the completely empty columns.

        Each frame is filled with its own medians.
        """
        print('train data have NaNs: ', self.country_df_train.isnull().any().any())
        print('test data have NaNs: ', self.country_df_test.isnull().any().any())
        # numeric_only=True: only numeric columns have a median. Leaving it
        # unset raises TypeError on recent pandas when the frame holds object
        # columns; older pandas silently skipped them.
        self.country_df_train = self.country_df_train.fillna(
            self.country_df_train.median(numeric_only=True)).dropna(
                axis=1, how='all')
        self.country_df_test = self.country_df_test.fillna(
            self.country_df_test.median(numeric_only=True)).dropna(
                axis=1, how='all')
        print('train data have NaNs: ', self.country_df_train.isnull().any().any())
        print('test data have NaNs: ', self.country_df_test.isnull().any().any())

    def set_file_names(self, files_dict):
        """
        Set file names for train and test dataframes

        Args:
            files_dict: dictionary, file names for 'train' and 'test'
        """
        self.train_file_name = files_dict.get('train')
        self.test_file_name = files_dict.get('test')

    def set_country(self, country):
        """
        Set country label.

        Args:
            country: string, a label for country
        """
        self.country = country
        print('Country: ', self.country)

    def load(self, load=True, with_bug=True):
        """
        Load data from files.

        Args:
            load: bool, load from file without postprocessing,
                optional (default=True)
            with_bug: bool, emulate a bug for final submission,
                optional, (default=True)

        Returns:
            True.
        """
        self.country_df_train = self.del_nonunique(
            pd.read_csv(self.train_file_name, index_col='id'))
        self.country_df_test = self.del_nonunique(
            pd.read_csv(self.test_file_name, index_col='id'))

        if not load:
            self._rename_col()
            self.fillna()
        self.col_common_list = \
            sorted(list(set(self.country_df_train.columns).intersection(
                self.country_df_test.columns)))
        self.categorical_list, self.float_list = self.category_float_search()
        if not load:
            # With with_bug=True only country 'B' is scaled, which reproduces
            # the behaviour of the final submission.
            if self.country == 'B' or not with_bug:
                self.scale()
        print('dataind train shape: ', self.country_df_train.shape)
        return True

    def save(self, files_dict, poor=True):
        """
        Save data to files.

        Args:
            files_dict: dictionary, file names for 'train' and 'test'
            poor: bool, save poor column, optional (default=True)

        Returns:
            True.
        """
        train = self.get_train()
        if poor:
            train = pd.concat([train[0], train[1]], axis=1)
        else:
            train = train[0]
        train.to_csv(files_dict.get('train'), index=True, mode='w')
        test = self.get_test()
        test.to_csv(files_dict.get('test'), index=True, mode='w')
        return True

    def resample(self, df):
        """
        Resample dataframe: upsample the rows with a true 'poor' value, with
        replacement, to the number of rows with a false 'poor' value.

        Args:
            df: DataFrame, dataframe for resample

        Returns:
            Resampled dataframe.
        """
        # Build the mask from `df` itself. Using the full training frame's
        # 'poor' column only works while pandas can align it to `df`, which
        # is not guaranteed for the subsets produced by split_data().
        is_poor = df['poor'].astype(bool)
        df_majority = df[~is_poor]
        df_minority = df[is_poor]

        df_minority_upsampled = resample(df_minority,
                                         replace=True,
                                         n_samples=df_majority.shape[0],
                                         random_state=1)
        return pd.concat([df_majority, df_minority_upsampled])

    def get_train(self, balance=False):
        """
        Get train data.

        Args:
            balance: bool, resample data, optional (default=False)

        Returns:
            Tuple with a train dataframe and a target dataframe.
        """
        train = self.country_df_train
        if balance:
            train = self.resample(train)
        return train[self.col_common_list], train['poor']

    def get_train_valid(self, n_splits=1, balance=False):
        """
        Get train and valid sets.

        Args:
            n_splits: int, number of partitions, optional (default=1)
            balance: bool, resample data, optional (default=False)

        Returns:
            A list of ((x_train, y_train), (x_valid, y_valid)) tuples.
        """
        splits = self.split_data(n_splits=n_splits, balance=balance)
        return [((x[self.col_common_list], x.poor),
                 (y[self.col_common_list], y.poor)) for x, y in splits]

    def get_test(self):
        """
        Get test data.

        Returns:
            A test dataframe.
        """
        return self.country_df_test[self.col_common_list]

    def get_cat_list(self):
        """
        Get a list of categorical features.

        Returns:
            A list of columns.
        """
        return self.categorical_list

    def get_float_list(self):
        """
        Get a list of non-categorical features.

        Returns:
            A list of columns.
        """
        return self.float_list
class DataInd(Data):
    """
    Class for working with individual level data.

    The raw individual level files are indexed by ('id', 'iid'); the methods
    below aggregate them to one row per first index level (the household
    'id').
    """

    def __init__(self):
        super().__init__()

    def get_poor(self, df):
        """
        Get a dataframe with poor column.

        Args:
            df: DataFrame, dataframe with an individual level data whose
                index contains an 'id' level and which has a 'poor' column

        Returns:
            A dataframe with a poor column, indexed by 'id', holding one row
            per distinct ('id', 'poor') pair.
        """
        return df['poor'].reset_index()[['id', 'poor']].drop_duplicates().set_index('id')

    def summarize(self, df):
        """
        Get a dataframe with a summarized individual level data for household.

        Columns are summed per value of the first index level and named
        'sum_<column>'.

        Args:
            df: DataFrame, dataframe with an individual level data

        Returns:
            A dataframe with summarized columns, one row per value of the
            first index level in order of first appearance.
        """
        count = df.copy().groupby(level=0).sum()
        # Wrapping the frame in a dict adds a 'sum' column level, which is
        # flattened into the 'sum_<column>' names on the next line.
        res_df = pd.concat({'sum': count}, axis=1)
        res_df.columns = ['{0}_{1}'.format(i[0], i[1]) for i in res_df.columns]
        # Restore the input order of the ids (groupby returns its own
        # order), then keep a single row per id.
        res_df = res_df.reindex(index=df.index.get_level_values(0))
        res_df = res_df[~res_df.index.duplicated(keep='first')]
        print('summarized size df: ', res_df.shape)
        return res_df

    def _get_id_list(self, df):
        """
        Get an ordered list of indices.

        Args:
            df: DataFrame, dataframe with an individual level data

        Returns:
            The distinct values of the first index level, in order of first
            appearance.
        """
        return list(OrderedDict.fromkeys(df.index.get_level_values(0)))

    def count_iid(self, df):
        """
        Get a dataframe with a count of individuals for households.

        Args:
            df: DataFrame, dataframe with an individual level data

        Returns:
            A dataframe with a single 'iid_cnt' column: the number of rows
            per value of the first index level, in order of first appearance.
        """
        s = df.index.get_level_values(0).value_counts()
        return s.reindex(index=self._get_id_list(df)).to_frame('iid_cnt')

    def count_neg_poz(self, df):
        """
        Get a dataframe with a count of negative and positive values for
        an individual level data.

        Only columns of dtype float64, int64 and int8 are considered. Each
        of them yields two columns: '<column>_0' counts the values < 0 and
        '<column>_1' counts the values >= 0.

        Args:
            df: DataFrame, dataframe with an individual level data

        Returns:
            A dataframe with a count of negative and positive values for
            an individual level data, one row per value of the first index
            level in order of first appearance.
        """
        res_df = df.select_dtypes(include=['float64', 'int64', 'int8'])
        # Per group and per column build the pair [negatives, non-negatives];
        # unstack() flattens the pairs into (column, 0/1) entries.
        res_df = res_df.groupby(level=0).apply(lambda c: c.apply(
            lambda x: pd.Series(
                [(x < 0).sum(), (x >= 0).sum()])).unstack())
        res_df.columns = ['{0}_{1}'.format(i[0], i[1])
                          for i in res_df.columns]
        print('count_neg_poz size df: ', res_df.shape)
        return res_df.reindex(index=self._get_id_list(df))

    def count_unique_categories(self, df, iid=True):
        """
        Get a dataframe with a count of unique values for an individual
        level data.

        Columns are named 'cat_n_<column>'; the ratio columns are named
        'div_cat_iid_cat_n_<column>'.

        Args:
            df: DataFrame, dataframe with an individual level data
            iid: bool, add columns with the ratio of the number of unique
                values to the number of individuals in households,
                optional (default=True)

        Returns:
            A dataframe with a count of unique values for an individual
            level data.
        """
        res_df = df.groupby(level=0).apply(
            lambda c: c.apply(lambda x: pd.Series([len((x).unique())])))
        # The inner one-element Series adds a second index level; drop it.
        res_df.index = res_df.index.droplevel(1)
        res_df.columns = [
            '{0}_{1}'.format('cat_n', i) for i in res_df.columns]
        print('count_unique_categories size df: ', res_df.shape)
        res_df = res_df.reindex(index=self._get_id_list(df))
        if iid:
            div_df = res_df.div(self.count_iid(df)['iid_cnt'], axis=0)
            div_df.columns = ['{0}_{1}'.format('div_cat_iid', i)
                              for i in res_df.columns]
            res_df = pd.concat([res_df, div_df], axis=1)
        return res_df

    def load(self, load=True, cat_enc=False):
        """
        Load data from files.

        With ``load=True`` the files are read with 'id' as index and used as
        they are. With ``load=False`` the files are read with ('id', 'iid')
        as index, cleaned, and aggregated to one row per 'id'.

        Args:
            load: bool, load from file without postprocessing,
                optional (default=True)
            cat_enc: bool, encode categories to numeric values,
                optional, (default=False)

        Returns:
            True.
        """

        print('DataInd load')
        if load:
            self.country_df_train = self.del_nonunique(
                pd.read_csv(self.train_file_name, index_col=['id']))
            self.country_df_test = self.del_nonunique(
                pd.read_csv(self.test_file_name, index_col=['id']))

        if not load:
            print(self.train_file_name)
            print(self.test_file_name)
            self.country_df_train = self.del_nonunique(
                pd.read_csv(self.train_file_name, index_col=['id', 'iid']))
            self.country_df_test = self.del_nonunique(
                pd.read_csv(self.test_file_name, index_col=['id', 'iid']))
            self._rename_col()
            self.fillna()
            self.col_common_list = sorted(
                list(set(self.country_df_train.columns).intersection(
                    self.country_df_test.columns)))

            self.categorical_list, self.float_list = self.category_float_search(
                countries=['A', 'B', 'C'])

            if cat_enc:
                # Train and test are encoded independently of each other.
                for header in self.categorical_list:
                    self.country_df_train[header] = self.country_df_train[header].astype('category').cat.codes
                    self.country_df_test[header] = self.country_df_test[header].astype('category').cat.codes
            # To reproduce the result in the final submission.
            # In the general solution, this scale is not needed.
            self.scale()
            # Aggregate the individual rows; only the train frame has the
            # 'poor' target.
            self.country_df_train = self.del_nonunique(pd.concat(
                [self.get_poor(self.country_df_train),
                 self.count_iid(self.country_df_train),
                 self.count_neg_poz(self.country_df_train),
                 self.summarize(self.country_df_train),
                 self.count_unique_categories(self.country_df_train)],
                axis=1))

            self.country_df_test = self.del_nonunique(pd.concat(
                [self.count_iid(self.country_df_test),
                 self.count_neg_poz(self.country_df_test),
                 self.summarize(self.country_df_test),
                 self.count_unique_categories(self.country_df_test)],
                axis=1))

        # Recomputed on the final frames (aggregated when load is False).
        self.col_common_list = sorted(
            list(set(self.country_df_train.columns).intersection(
                self.country_df_test.columns)))
        self.categorical_list, self.float_list = self.category_float_search(
            countries=['A', 'B', 'C'])
        if not load:
            self.scale()
        print('indiv train shape: ', self.country_df_train.shape)
        print('indiv test shape: ', self.country_df_test.shape)
        return True
class DataConcat(Data):
    """
    Class for working with concatenated data from individual and household
    levels.
    """

    def __init__(self):
        # NOTE(review): these four attributes are initialised here but are
        # not read or written by any other method of this class.
        self.data_hh_train = None
        self.data_hh_test = None
        self.data_indiv_train = None
        self.data_indiv_test = None
        super().__init__()

    def set_file_names(self, files_dict):
        """
        Set file names for the household, individual and combined data.

        Args:
            files_dict: dictionary, file names under the keys 'train_hh',
                'test_hh', 'train_ind' and 'test_ind'; a missing key is
                stored as None. The whole dictionary is also passed on to
                the parent class.
        """
        self.hh_train_file_name = files_dict.get('train_hh')
        self.hh_test_file_name = files_dict.get('test_hh')
        self.ind_train_file_name = files_dict.get('train_ind')
        self.ind_test_file_name = files_dict.get('test_ind')
        super().set_file_names(files_dict)

    def load(self, load=True, cat_enc=False, with_bug=True):
        """
        Load data from files.

        Args:
            load: bool, read the already combined train/test files;
                when False the combined frames are rebuilt by joining
                the household data with the individual data,
                optional (default=True)
            cat_enc: bool, accepted for signature compatibility; this
                method does not use it, optional (default=False)
            with_bug: bool, emulate a bug for final submission: the
                categorical list is taken from the household data and
                float_list is left untouched,
                optional (default=True)

        Returns:
            True on success, False when the household or the individual
            loader reports a failure.
        """
        # The household data is needed either as a join source or as
        # the origin of the categorical list in the with_bug mode.
        data_hh = None
        if with_bug or not load:
            data_hh = Data()
            data_hh.set_country(self.country)
            data_hh.set_file_names({'train': self.hh_train_file_name,
                                    'test': self.hh_test_file_name})
            if not data_hh.load(load=False, with_bug=with_bug):
                return False

        if load:
            print('DataConcat load')
            self.country_df_train = self.del_nonunique(pd.read_csv(
                self.train_file_name, index_col=['id']))
            self.country_df_test = self.del_nonunique(pd.read_csv(
                self.test_file_name, index_col=['id']))
        else:
            data_ind = DataInd()
            data_ind.set_country(self.country)
            data_ind.set_file_names({'train': self.ind_train_file_name,
                                     'test': self.ind_test_file_name})

            # Report a failed individual-level load the same way as a
            # failed household load instead of continuing with frames
            # that were never assigned.
            if not data_ind.load(load=True):
                return False

            self.country_df_train = data_hh.country_df_train.join(
                data_ind.country_df_train)
            self.country_df_test = data_hh.country_df_test.join(
                data_ind.country_df_test)

        self.col_common_list = sorted(
            set(self.country_df_train.columns).intersection(
                self.country_df_test.columns))

        if with_bug:
            self.categorical_list = data_hh.categorical_list
        else:
            # NOTE(review): the search is restricted to ['B'] whatever
            # self.country is - confirm this is intended.
            self.categorical_list, self.float_list = self.category_float_search(
                countries=['B'])

        print('train:', self.country_df_train.shape)
        print('test:', self.country_df_test.shape)

        return True
import Data, DataConcat 6 | from models import LGBM_model, CB_model, XGB_model 7 | 8 | src_dir = os.path.join(os.getcwd(), 'src') 9 | sys.path.append(src_dir) 10 | 11 | 12 | def predict(p_models={'xgboost': True, 13 | 'lightgbm': True, 14 | 'catboost': True}): 15 | filenames_dict = { 16 | 'A': {'train': 'data/processed/A_hhold_train.csv', 17 | 'test': 'data/processed/A_hhold_test.csv', 18 | 'train_hh': 'data/raw/A_hhold_train.csv', 19 | 'test_hh': 'data/raw/A_hhold_test.csv', 20 | 'train_ind': 'data/processed/A_indiv_train.csv', 21 | 'test_ind': 'data/processed/A_indiv_test.csv' 22 | }, 23 | 'B': {'train': 'data/processed/B_combine_train.csv', 24 | 'test': 'data/processed/B_combine_test.csv', 25 | 'train_hh': 'data/raw/B_hhold_train.csv', 26 | 'test_hh': 'data/raw/B_hhold_test.csv', 27 | 'train_ind': 'data/processed/B_indiv_ext_train.csv', 28 | 'test_ind': 'data/processed/B_indiv_ext_test.csv' 29 | }, 30 | 'C': {'train': 'data/processed/C_combine_train.csv', 31 | 'test': 'data/processed/C_combine_test.csv', 32 | 'train_hh': 'data/raw/C_hhold_train.csv', 33 | 'test_hh': 'data/raw/C_hhold_test.csv', 34 | 'train_ind': 'data/processed/C_indiv_ext_train.csv', 35 | 'test_ind': 'data/processed/C_indiv_ext_test.csv' 36 | }, 37 | } 38 | 39 | data_A = Data() 40 | data_B = DataConcat() 41 | data_C = DataConcat() 42 | 43 | data_A.set_country('A') 44 | data_B.set_country('B') 45 | data_C.set_country('C') 46 | 47 | data_A.set_file_names(files_dict=filenames_dict['A']) 48 | data_B.set_file_names(files_dict=filenames_dict['B']) 49 | data_C.set_file_names(files_dict=filenames_dict['C']) 50 | 51 | data_A.load(load=True) 52 | # To reproduce the result in the final submission. 53 | # Saving data to a file changes this data due to rounding of numbers. 
54 | data_B.load(load=False) 55 | data_C.load(load=False) 56 | 57 | data_dict = {'A': data_A, 'B': data_B, 'C': data_C} 58 | balances = {'A': False, 'B': False, 'C': True} 59 | 60 | # XGBoost prediction 61 | if p_models['xgboost']: 62 | params_XGB_A = { 63 | 'learning_rate': 0.03, 64 | 'max_depth': 3, 65 | 'n_estimators': 1500, 66 | 'silent': True, 67 | 'objective': 'binary:logistic', 68 | 'gamma': 0.3, 69 | 'subsample': 0.7, 70 | 'reg_alpha': 0.05 71 | } 72 | 73 | params_XGB_B = { 74 | 'learning_rate': 0.03, 75 | 'max_depth': 5, 76 | 'n_estimators': 400, 77 | 'silent': True, 78 | 'objective': 'binary:logistic', 79 | 'gamma': 0.2, 80 | 'subsample': 0.7, 81 | 'reg_alpha': 0.05, 82 | } 83 | 84 | params_XGB_C = { 85 | 'learning_rate': 0.03, 86 | 'max_depth': 3, 87 | 'n_estimators': 500, 88 | 'silent': True, 89 | 'objective': 'binary:logistic', 90 | 'gamma': 0.2, 91 | 'subsample': 0.6, 92 | 'reg_alpha': 0.05, 93 | } 94 | 95 | model_xgb_A = XGB_model(categ_conv=True) 96 | model_xgb_A.set_params(params=params_XGB_A) 97 | model_xgb_B = XGB_model(categ_conv=True) 98 | model_xgb_B.set_params(params=params_XGB_B) 99 | model_xgb_C = XGB_model(categ_conv=True) 100 | model_xgb_C.set_params(params=params_XGB_C) 101 | model_xgb_dict = {'A': model_xgb_A, 'B': model_xgb_B, 'C': model_xgb_C} 102 | 103 | # List of columns to delete obtained via find_exclude function and cross-validation 104 | exclude_XGB_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_106', 'A_11', 'A_113', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_140', 'A_146', 'A_147', 'A_148', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_17', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_179', 'A_18', 'A_181', 'A_185', 'A_186', 'A_191', 'A_195', 'A_197', 'A_2', 'A_202', 'A_203', 'A_206', 'A_213', 'A_215', 'A_216', 'A_218', 'A_219', 'A_22', 'A_223', 'A_225', 'A_226', 'A_227', 'A_232', 'A_234', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 
'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_277', 'A_282', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_302', 'A_305', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_315', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_45', 'A_46', 'A_49', 'A_57', 'A_59', 'A_60', 'A_61', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_76', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93', 'A_97', 'cat_n_A_25', 'cat_n_A_3', 'cat_n_A_36', 'cat_n_A_4', 'iid_cnt', 'A_105', 'A_114', 'A_229', 'cat_n_A_20', 'div_cat_iid_cat_n_A_25', 'A_14', 'A_6_1', 'cat_n_A_39'], 105 | 'B': ['B_0', 'B_1', 'B_106', 'B_109', 'B_112', 'B_12', 'B_120', 'B_121', 'B_128', 'B_135', 'B_14', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_145', 'B_148', 'B_149', 'B_151', 'B_152', 'B_152_1', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_15_1', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_167', 'B_17', 'B_172', 'B_173', 'B_174_1', 'B_175_0', 'B_175_1', 'B_176', 'B_18', 'B_180_1', 'B_187', 'B_188', 'B_188_1', 'B_191', 'B_196', 'B_196_0', 'B_196_1', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_20_1', 'B_210', 'B_210_0', 'B_210_1', 'B_211', 'B_212', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_23', 'B_230', 'B_234', 'B_237', 'B_238', 'B_239', 'B_24', 'B_241', 'B_243', 'B_244', 'B_247', 'B_248', 'B_250', 'B_251', 'B_252', 'B_254', 'B_256', 'B_259', 'B_260', 'B_264', 'B_265', 'B_266', 'B_269', 'B_271', 'B_275', 'B_278', 'B_279', 'B_28', 'B_284', 'B_29', 'B_3', 'B_302', 'B_303', 'B_304', 'B_307', 'B_313', 'B_314', 'B_320', 'B_334', 'B_337', 'B_340', 'B_342', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_353', 'B_354', 'B_355', 'B_356', 'B_359', 'B_35_0', 'B_35_1', 'B_36', 'B_361', 'B_362', 'B_363', 'B_364', 'B_365', 'B_366', 'B_368', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_375', 'B_379', 'B_385', 'B_386', 'B_389', 'B_390', 'B_391', 'B_392', 'B_394', 'B_395', 'B_397', 'B_400', 'B_402', 
'B_405', 'B_406', 'B_407', 'B_41', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_423', 'B_427', 'B_428', 'B_44', 'B_47', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_0', 'B_60_1', 'B_61', 'B_62', 'B_63', 'B_64', 'B_65', 'B_66', 'B_67', 'B_68_0', 'B_68_1', 'B_7', 'B_71_1', 'B_72', 'B_76', 'B_80', 'B_83', 'B_86', 'B_89', 'B_8_0', 'B_8_1', 'B_9', 'B_94', 'B_95', 'B_96', 'B_99', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_102', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_111', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_120', 'cat_n_B_122', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_165', 'cat_n_B_166', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_17', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_181', 'cat_n_B_182', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_35', 'cat_n_B_36', 'cat_n_B_37', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_42', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_49', 'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 
'cat_n_B_56', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_8', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_86', 'cat_n_B_88', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_114', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_127', 'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_216', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_43', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 
'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_75', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_77', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 'sum_B_174', 'sum_B_188', 'B_10', 'B_101', 'B_104', 'B_107', 'B_11', 'B_111', 'B_116', 'B_123_0', 'B_156', 'B_159_1', 'B_164', 'B_170', 'B_171', 'B_174_0', 'B_182', 'B_192', 'B_194', 'B_19_0', 'B_216', 'B_223', 'B_224', 'B_229', 'B_235', 'B_25', 'B_272', 'B_282', 'B_283', 'B_288', 'B_290', 'B_293', 'B_297', 'B_317', 'B_318', 'B_322', 'B_325', 'B_343', 'B_352', 'B_373', 'B_384', 'B_403', 'B_51', 'B_68', 'B_73', 'B_92', 'cat_n_B_12', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_131', 'cat_n_B_132', 'cat_n_B_136', 'cat_n_B_159', 'cat_n_B_167', 'cat_n_B_19', 'cat_n_B_191', 'cat_n_B_194', 'cat_n_B_2', 'cat_n_B_200', 'cat_n_B_202', 'cat_n_B_207', 'cat_n_B_210', 'cat_n_B_217', 'cat_n_B_44', 'cat_n_B_59', 'cat_n_B_67', 'cat_n_B_75', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_91', 'cat_n_B_96', 'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_167', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_198', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_203', 'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_211', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_5', 'B_123_1', 'B_146', 'B_147', 'B_174', 'B_198_1', 'B_218', 'B_222_1', 'B_285', 
'B_296', 'B_339', 'B_414', 'B_85', 'B_91', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_123', 'cat_n_B_151', 'cat_n_B_178', 'cat_n_B_180', 'cat_n_B_183', 'cat_n_B_195', 'cat_n_B_199', 'cat_n_B_29', 'cat_n_B_43', 'cat_n_B_48', 'cat_n_B_74', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_55', 'sum_B_35', 'B_103', 'B_107_1', 'B_123', 'B_155', 'B_178', 'B_183', 'B_2', 'B_233', 'B_268', 'B_270', 'B_295', 'B_319', 'B_321', 'B_328', 'B_33', 'B_360', 'B_382', 'B_383', 'B_387', 'B_388', 'B_46_0', 'B_75', 'cat_n_B_119', 'cat_n_B_128', 'cat_n_B_146', 'cat_n_B_173', 'cat_n_B_40', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_128', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_64', 'sum_B_180', 'B_115', 'B_124', 'B_19', 'B_19_1', 'B_330', 'B_357', 'B_409', 'cat_n_B_103', 'cat_n_B_121', 'cat_n_B_164', 'cat_n_B_186', 'cat_n_B_54', 'cat_n_B_73', 'cat_n_B_80', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_44', 'B_163', 'B_165', 'B_180_0', 'B_236', 'B_277', 'B_292', 'B_329', 'B_34', 'B_46_1', 'cat_n_B_57', 'div_cat_iid_cat_n_B_130', 'div_cat_iid_cat_n_B_57'], 106 | 'C': ['C_1', 'C_10', 'C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_141', 'C_143', 'C_146', 'C_151', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_19', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_3', 'C_31', 'C_32', 'C_44', 'C_45', 'C_47', 'C_54', 'C_55', 'C_59', 'C_6', 'C_63', 'C_65', 'C_67', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_84', 'C_85', 'C_87', 'C_89', 'C_9', 'C_92', 'C_94', 'C_96', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_18', 'cat_n_C_20', 
'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_32', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_13', 'div_cat_iid_cat_n_C_23', 'div_cat_iid_cat_n_C_26', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_32', 'div_cat_iid_cat_n_C_6', 'div_cat_iid_cat_n_C_7', 'iid_cnt', 'C_13', 'C_144', 'C_161', 'C_2', 'C_29', 'C_33', 'C_41', 'C_79', 'C_90', 'C_98', 'cat_n_C_25', 'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_37', 'cat_n_C_6', 'div_cat_iid_cat_n_C_0', 'div_cat_iid_cat_n_C_14', 'div_cat_iid_cat_n_C_20', 'C_145', 'C_60', 'C_69', 'cat_n_C_13', 'cat_n_C_40', 'div_cat_iid_cat_n_C_15', 'div_cat_iid_cat_n_C_5', 'C_142', 'C_50', 'C_62', 'C_103', 'C_121', 'C_24', 'C_30', 'C_39', 'C_40', 'C_112', 'C_123']} 107 | 108 | process_xgb = process.processing(countries=['A', 'B', 'C'], 109 | balances=balances) 110 | process_xgb.set_data_dict(data_dict=data_dict) 111 | process_xgb.set_model_dict(model_dict=model_xgb_dict) 112 | # process_xgb.find_exclude() 113 | process_xgb.set_exclude_dict(exclude_XGB_dict) 114 | result_xgb = process_xgb.predict(model_name='xgboost', path='models/') 115 | 116 | # LightGBM prediction 117 | if p_models['lightgbm']: 118 | params_LGBM_A = { 119 | 'learning_rate': 0.02, 120 | 'max_depth': 6, 121 | 'n_estimators': 942, 122 | 'silent': True, 123 | 'objective': 'binary', 124 | 'subsample': 0.6, 125 | 'reg_alpha': 0.02, 126 | 'is_unbalance': True, 127 | 'boosting_type': 'gbdt', 128 | 'reg_lambda': 0.01, 129 | 'random_state': 1 130 | } 131 | 132 | params_LGBM_B = { 133 | 'learning_rate': 0.03, 134 | 'max_depth': 6, 135 | 'n_estimators': 232, 136 | 'silent': True, 137 | 'objective': 'binary', 138 | 'subsample': 0.8, 139 | 'reg_alpha': 0.05, 140 | 'is_unbalance': True, 141 | 'boosting_type': 'gbdt', 142 | 'reg_lambda': 0.00, 143 | 'random_state': 1 144 | } 145 | 146 | params_LGBM_C = { 147 | 'learning_rate': 0.05, 148 | 'max_depth': 3, 149 | 'n_estimators': 520, 150 | 'silent': True, 151 | 'objective': 'binary', 152 | 'subsample': 
0.7, 153 | 'reg_alpha': 0.05, 154 | 'is_unbalance': True, 155 | 'boosting_type': 'gbdt', 156 | 'reg_lambda': 0.03, 157 | 'random_state': 1 158 | } 159 | 160 | model_lgbm_A = LGBM_model(categ_conv=True) 161 | model_lgbm_A.set_params(params=params_LGBM_A) 162 | model_lgbm_B = LGBM_model(categ_conv=True) 163 | model_lgbm_B.set_params(params=params_LGBM_B) 164 | model_lgbm_C = LGBM_model(categ_conv=True) 165 | model_lgbm_C.set_params(params=params_LGBM_C) 166 | model_lgbm_dict = {'A': model_lgbm_A, 'B': model_lgbm_B, 'C': model_lgbm_C} 167 | 168 | # List of columns to delete obtained via find_exclude function and cross-validation 169 | exclude_LGBM_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_105', 'A_106', 'A_11', 'A_112', 'A_113', 'A_115', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_141', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_18', 'A_181', 'A_185', 'A_191', 'A_195', 'A_197', 'A_202', 'A_203', 'A_206', 'A_215', 'A_216', 'A_218', 'A_219', 'A_223', 'A_225', 'A_232', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_275', 'A_282', 'A_292', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_338', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_46', 'A_47', 'A_49', 'A_57', 'A_59', 'A_60', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93'], 170 | 'B': ['B_0', 'B_1', 'B_106', 'B_106_0', 'B_107', 'B_11', 'B_115', 'B_120', 'B_121', 'B_128', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_151', 'B_152', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_159_1', 'B_15_0', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_164', 'B_165', 'B_167', 'B_17', 'B_172', 'B_174', 'B_174_0', 'B_174_1', 'B_176', 'B_18', 'B_180_0', 'B_187', 'B_188', 
'B_188_1', 'B_191', 'B_194', 'B_196', 'B_196_0', 'B_196_1', 'B_19_0', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_210_0', 'B_210_1', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_229', 'B_23', 'B_230', 'B_234', 'B_236', 'B_238', 'B_24', 'B_241', 'B_242', 'B_243', 'B_244', 'B_247', 'B_25', 'B_250', 'B_254', 'B_256', 'B_264', 'B_266', 'B_269', 'B_271', 'B_272', 'B_275', 'B_279', 'B_283', 'B_284', 'B_288', 'B_29', 'B_293', 'B_296', 'B_3', 'B_302', 'B_303', 'B_307', 'B_314', 'B_317', 'B_318', 'B_325', 'B_329', 'B_330', 'B_334', 'B_337', 'B_340', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_354', 'B_355', 'B_356', 'B_35_0', 'B_35_1', 'B_36', 'B_361', 'B_366', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_373', 'B_385', 'B_386', 'B_389', 'B_390', 'B_394', 'B_397', 'B_399', 'B_400', 'B_402', 'B_405', 'B_406', 'B_407', 'B_408', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_427', 'B_428', 'B_432', 'B_436', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_1', 'B_63', 'B_64', 'B_65', 'B_67', 'B_68_0', 'B_71_0', 'B_72', 'B_73', 'B_75', 'B_80', 'B_83', 'B_89', 'B_8_0', 'B_9', 'B_91', 'B_94', 'B_95', 'B_99', 'cat_n_B_1', 'cat_n_B_102', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_119', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_131', 'cat_n_B_134', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 
'cat_n_B_184', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_213', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_42', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_64', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_221', 
'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_55', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 'sum_B_188'], 171 | 'C': ['C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_121', 'C_123', 'C_125', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_140', 'C_141', 'C_143', 'C_146', 'C_150', 'C_151', 'C_152', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_18', 'C_19', 'C_2', 'C_20', 'C_21', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_29', 'C_3', 'C_32', 'C_33', 'C_39', 'C_40', 'C_41', 'C_54', 'C_55', 'C_59', 'C_62', 'C_63', 'C_64', 'C_65', 'C_67', 'C_69', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_82', 'C_84', 'C_85', 'C_87', 'C_9', 'C_90', 'C_92', 'C_94', 'C_96', 'C_98', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_17', 'cat_n_C_18', 'cat_n_C_2', 'cat_n_C_20', 'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_30', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_2', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_4', 'div_cat_iid_cat_n_C_40', 'div_cat_iid_cat_n_C_7', 'iid_cnt']} 172 | 173 | process_lgbm = process.processing(countries=['A', 'B', 'C'], 174 | balances=balances) 175 | 
process_lgbm.set_data_dict(data_dict=data_dict) 176 | process_lgbm.set_model_dict(model_dict=model_lgbm_dict) 177 | process_lgbm.set_exclude_dict(exclude_LGBM_dict) 178 | # process_lgbm.find_exclude() 179 | result_lgbm = process_lgbm.predict(model_name='lightgbm', path='models/') 180 | 181 | # Catboost prediction 182 | if p_models['catboost']: 183 | params_CB_A = { 184 | 'iterations': 5000, 185 | 'learning_rate': 0.03, 186 | 'depth': 6, 187 | 'l2_leaf_reg': 3, 188 | 'loss_function': 'Logloss', 189 | 'random_seed': 1, 190 | 'logging_level': 'Silent', 191 | } 192 | 193 | params_CB_B = { 194 | 'iterations': 5000, 195 | 'learning_rate': 0.03, 196 | 'depth': 6, 197 | 'l2_leaf_reg': 3, 198 | 'loss_function': 'Logloss', 199 | 'random_seed': 1, 200 | 'logging_level': 'Silent', 201 | } 202 | 203 | params_CB_C = { 204 | 'iterations': 500, 205 | 'learning_rate': 0.03, 206 | 'depth': 6, 207 | 'l2_leaf_reg': 3, 208 | 'loss_function': 'Logloss', 209 | 'random_seed': 1, 210 | 'logging_level': 'Silent', 211 | } 212 | 213 | model_cb_A = CB_model(categ_conv=True) 214 | model_cb_A.set_params(params=params_CB_A) 215 | model_cb_B = CB_model(categ_conv=True) 216 | model_cb_B.set_params(params=params_CB_B) 217 | model_cb_C = CB_model(categ_conv=True) 218 | model_cb_C.set_params(params=params_CB_C) 219 | model_cb_dict = {'A': model_cb_A, 'B': model_cb_B, 'C': model_cb_C} 220 | 221 | # List of columns to delete obtained via find_exclude function and cross-validation 222 | exclude_CB_dict = {'A': ['A_0', 'A_10', 'A_106', 'A_113', 'A_114', 'A_115', 'A_120', 'A_138', 'A_15', 'A_173', 'A_174', 'A_175', 'A_181', 'A_185', 'A_191', 'A_195', 'A_202', 'A_206', 'A_215', 'A_216', 'A_218', 'A_223', 'A_245', 'A_250', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_263', 'A_272', 'A_277', 'A_295', 'A_299', 'A_308', 'A_309', 'A_32', 'A_33', 'A_330', 'A_39', 'A_43', 'A_44', 'A_57', 'A_59', 'A_63', 'A_69', 'A_6_1', 'A_70', 'A_72', 'A_77', 'A_81', 'A_88', 'A_89', 'A_93', 'cat_n_A_10', 'cat_n_A_15', 
'cat_n_A_20', 'cat_n_A_22', 'cat_n_A_25', 'cat_n_A_33', 'cat_n_A_35', 'cat_n_A_39', 'cat_n_A_4', 'cat_n_A_5', 'cat_n_A_6', 'cat_n_A_8', 'cat_n_A_9', 'A_101', 'A_11', 'A_147', 'A_155', 'A_170', 'A_18', 'A_203', 'A_338', 'A_35', 'A_49', 'A_67', 'cat_n_A_11', 'cat_n_A_21', 'cat_n_A_30', 'cat_n_A_37', 'div_cat_iid_cat_n_A_16', 'A_105', 'A_14', 'A_149', 'A_197', 'A_26', 'A_261', 'A_302', 'A_312', 'A_319', 'A_328', 'A_32_1', 'A_341', 'A_9', 'cat_n_A_28', 'div_cat_iid_cat_n_A_32', 'A_121', 'A_125', 'A_131', 'A_161', 'A_17', 'A_192', 'A_229', 'A_259', 'A_60', 'A_80', 'cat_n_A_1', 'cat_n_A_29', 'A_13', 'A_134', 'A_176', 'A_182', 'A_213', 'A_22', 'A_267', 'A_301', 'A_31', 'A_146', 'A_162', 'A_27', 'A_152', 'A_189', 'A_292', 'A_3', 'A_65'], 223 | 'B': ['B_1', 'B_106_0', 'B_106_1', 'B_107_1', 'B_113', 'B_121', 'B_123_1', 'B_139', 'B_144_0', 'B_144_1', 'B_152_0', 'B_157_0', 'B_157_1', 'B_159_0', 'B_159_1', 'B_15_0', 'B_15_1', 'B_161_0', 'B_161_1', 'B_167', 'B_174_0', 'B_174_1', 'B_175_0', 'B_176', 'B_18', 'B_180_0', 'B_183', 'B_188_0', 'B_188_1', 'B_196', 'B_196_1', 'B_198_0', 'B_198_1', 'B_20', 'B_203', 'B_204', 'B_205', 'B_207', 'B_208', 'B_20_0', 'B_20_1', 'B_210_0', 'B_210_1', 'B_218_0', 'B_219', 'B_219_1', 'B_222_1', 'B_227', 'B_238', 'B_243', 'B_244', 'B_256', 'B_258', 'B_264', 'B_265', 'B_272', 'B_274', 'B_29', 'B_3', 'B_303', 'B_307', 'B_316', 'B_320', 'B_329', 'B_349', 'B_34_0', 'B_34_1', 'B_35', 'B_355', 'B_35_0', 'B_35_1', 'B_361', 'B_36_0', 'B_36_1', 'B_370', 'B_371', 'B_385', 'B_389', 'B_405', 'B_407', 'B_412', 'B_422', 'B_46_0', 'B_46_1', 'B_5', 'B_55', 'B_60_0', 'B_60_1', 'B_68_0', 'B_68_1', 'B_6_0', 'B_71_1', 'B_72', 'B_77', 'B_83', 'B_8_1', 'cat_n_B_0', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_100', 'cat_n_B_101', 'cat_n_B_102', 'cat_n_B_103', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 
'cat_n_B_119', 'cat_n_B_12', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_131', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_135', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_14', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_143', 'cat_n_B_145', 'cat_n_B_146', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_15', 'cat_n_B_150', 'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_156', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_162', 'cat_n_B_163', 'cat_n_B_164', 'cat_n_B_165', 'cat_n_B_166', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_173', 'cat_n_B_174', 'cat_n_B_175', 'cat_n_B_176', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_186', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_191', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_2', 'cat_n_B_20', 'cat_n_B_200', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_22', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_222', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_40', 'cat_n_B_41', 'cat_n_B_42', 'cat_n_B_43', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_46', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 
'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_53', 'cat_n_B_54', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_57', 'cat_n_B_58', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_61', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_66', 'cat_n_B_67', 'cat_n_B_68', 'cat_n_B_69', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_73', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_79', 'cat_n_B_8', 'cat_n_B_80', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_84', 'cat_n_B_86', 'cat_n_B_87', 'cat_n_B_88', 'cat_n_B_89', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_91', 'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_96', 'cat_n_B_97', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_10', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_103', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_106', 'div_cat_iid_cat_n_B_107', 'div_cat_iid_cat_n_B_108', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_115', 'div_cat_iid_cat_n_B_117', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_123', 'div_cat_iid_cat_n_B_127', 'div_cat_iid_cat_n_B_129', 'div_cat_iid_cat_n_B_13', 'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_133', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_135', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_140', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_146', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_149', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_153', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_156', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 
'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_162', 'div_cat_iid_cat_n_B_165', 'div_cat_iid_cat_n_B_166', 'div_cat_iid_cat_n_B_168', 'div_cat_iid_cat_n_B_173', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_182', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_194', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_198', 'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_203', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_205', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_212', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_218', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_44', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_46', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_48', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_53', 'div_cat_iid_cat_n_B_57', 'div_cat_iid_cat_n_B_58', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_66', 'div_cat_iid_cat_n_B_68', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_73', 
'div_cat_iid_cat_n_B_74', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_79', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_87', 'div_cat_iid_cat_n_B_88', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_91', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_93', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_106', 'sum_B_123', 'sum_B_144', 'sum_B_157', 'sum_B_159', 'sum_B_161', 'sum_B_174', 'sum_B_180', 'sum_B_188', 'sum_B_19', 'sum_B_198', 'sum_B_20', 'sum_B_36', 'sum_B_6', 'B_11', 'B_127', 'B_173', 'B_180_1', 'B_196_0', 'B_19_0', 'B_206', 'B_219_0', 'B_221', 'B_269', 'B_280', 'B_287', 'B_314', 'B_328', 'B_334', 'B_337', 'B_397', 'B_400', 'B_402', 'B_413', 'B_418', 'B_45', 'B_71', 'B_71_0', 'B_80', 'B_8_0', 'cat_n_B_144', 'cat_n_B_155', 'cat_n_B_17', 'cat_n_B_182', 'cat_n_B_219', 'cat_n_B_37', 'cat_n_B_74', 'cat_n_B_81', 'cat_n_B_85', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_144', 'div_cat_iid_cat_n_B_15', 'div_cat_iid_cat_n_B_150', 'div_cat_iid_cat_n_B_163', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_176', 'div_cat_iid_cat_n_B_19', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_22', 'div_cat_iid_cat_n_B_24', 'div_cat_iid_cat_n_B_25', 'div_cat_iid_cat_n_B_28', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_5', 'div_cat_iid_cat_n_B_54', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_8', 'div_cat_iid_cat_n_B_80', 'div_cat_iid_cat_n_B_96', 'div_cat_iid_cat_n_B_98', 'sum_B_107', 'sum_B_175', 'sum_B_196', 'sum_B_60', 'sum_B_68', 'B_140', 'B_142', 'B_160', 'B_239', 'B_302', 'B_352', 'B_353', 'B_366', 'B_372', 'B_386', 'B_392', 'B_420', 'B_97_1', 'cat_n_B_109', 'cat_n_B_35', 'cat_n_B_6', 'div_cat_iid_cat_n_B_101', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_175', 'div_cat_iid_cat_n_B_183', 
'div_cat_iid_cat_n_B_185', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_4', 'div_cat_iid_cat_n_B_41', 'div_cat_iid_cat_n_B_89', 'sum_B_35', 'sum_B_46', 'B_107', 'B_107_0', 'B_123_0', 'B_147', 'B_161', 'B_175_1', 'B_248', 'B_250', 'B_251', 'B_317', 'B_33', 'B_356', 'B_64', 'B_86', 'div_cat_iid_cat_n_B_17', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_214', 'sum_B_71', 'B_112', 'B_120', 'B_132_1', 'B_19_1', 'B_236', 'B_427', 'B_57', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_97', 'B_115', 'B_12', 'B_141', 'B_180', 'B_222_0', 'B_230', 'B_241', 'B_266', 'B_288', 'B_312', 'B_335', 'B_394', 'B_79', 'B_95', 'B_99', 'cat_n_B_132', 'div_cat_iid_cat_n_B_100', 'div_cat_iid_cat_n_B_164', 'div_cat_iid_cat_n_B_200', 'B_129', 'B_6_1', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_155', 'div_cat_iid_cat_n_B_43', 'sum_B_210', 'B_126', 'B_21', 'B_339', 'B_65', 'div_cat_iid_cat_n_B_125', 'sum_B_132', 'sum_B_219', 'B_128', 'B_8', 'div_cat_iid_cat_n_B_130', 'sum_B_222', 'B_191', 'B_30', 'B_4', 'sum_B_8', 'B_275', 'B_290', 'div_cat_iid_cat_n_B_195', 'B_325', 'B_63', 'B_157', 'B_260', 'B_423', 'B_91', 'div_cat_iid_cat_n_B_37', 'div_cat_iid_cat_n_B_55', 'B_430', 'div_cat_iid_cat_n_B_75', 'B_395', 'B_73', 'B_0', 'div_cat_iid_cat_n_B_86', 'B_23', 'B_268', 'B_27', 'B_306', 'B_348', 'B_6', 'B_92', 'div_cat_iid_cat_n_B_222', 'B_168', 'div_cat_iid_cat_n_B_56', 'B_318', 'B_340', 'B_301', 'B_164', 'B_271', 'B_417', 'B_111', 'B_285', 'B_350', 'B_187', 'B_246', 'B_401', 'B_89'], 224 | 'C': ['C_118', 'C_135', 'C_17_0', 'C_39', 'C_55', 'C_7', 'C_89', 'C_91', 'cat_n_C_0', 'cat_n_C_1', 'cat_n_C_10', 'cat_n_C_18', 'cat_n_C_21', 'cat_n_C_22', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_28', 'cat_n_C_3', 'cat_n_C_32', 'cat_n_C_37', 'cat_n_C_4', 'cat_n_C_40', 'cat_n_C_5', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_39', 'iid_cnt', 'C_14_1', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_2', 'cat_n_C_23', 'cat_n_C_27', 'cat_n_C_30', 
'cat_n_C_38', 'cat_n_C_9', 'div_cat_iid_cat_n_C_26', 'C_129', 'C_57', 'C_76', 'cat_n_C_17', 'cat_n_C_20', 'cat_n_C_19', 'cat_n_C_6', 'div_cat_iid_cat_n_C_33', 'C_10_0', 'C_146', 'C_46', 'cat_n_C_39', 'div_cat_iid_cat_n_C_17']} 225 | 226 | process_cb = process.processing(countries=['A', 'B', 'C'], 227 | balances=balances) 228 | process_cb.set_data_dict(data_dict=data_dict) 229 | process_cb.set_model_dict(model_dict=model_cb_dict) 230 | # process_cb.find_exclude() 231 | process_cb.set_exclude_dict(exclude_CB_dict) 232 | result_cb = process_cb.predict(model_name='catboost', path='models/') 233 | 234 | # Create submission 235 | submission = pd.DataFrame(index=result_cb.index) 236 | submission['country'] = result_cb.country 237 | submission['poor'] = (result_xgb.poor * 0.4 + 238 | result_cb.poor * 0.4 + 239 | result_lgbm.poor * 0.2) 240 | 241 | process_cb.save_csv(submission, clf_model_name='combine', path='models/') 242 | 243 | 244 | if __name__ == '__main__': 245 | predict() 246 | -------------------------------------------------------------------------------- /notebooks/reproduce_final_submission.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2018-03-20T12:48:12.113072", 9 | "start_time": "2018-03-20T12:48:11.495212" 10 | }, 11 | "collapsed": false 12 | }, 13 | "outputs": [], 14 | "source": [ 15 | "import os\n", 16 | "import datetime\n", 17 | "import pandas as pd\n", 18 | "import numpy as np\n", 19 | "import xgboost as xgb\n", 20 | "import lightgbm as lgb\n", 21 | "from sklearn.utils import resample\n", 22 | "from sklearn.utils import class_weight\n", 23 | "from sklearn.model_selection import StratifiedShuffleSplit\n", 24 | "from sklearn.preprocessing import StandardScaler\n", 25 | "from sklearn.metrics.classification import accuracy_score, log_loss\n", 26 | "from collections import 
class Data():
    """Household-level train/test tables for a single country.

    `load` reads ``../data/raw/<country>_<file_name>_{train,test}.csv``, drops
    constant columns, renames the feature columns to ``<country>_<position>``
    and records which shared columns are categorical and which are numeric.
    """

    def __init__(self):
        self.country = None
        self.country_df_train = pd.DataFrame()
        self.country_df_test = pd.DataFrame()
        self.col_common_list = []
        self.categorical_list = []
        self.float_list = []
        self.file_name = 'hhold'

    def split_data(self, size=0.8, n_splits=1, random_state=1, balance=False, df=None):
        """Return a list of ``(train_part, validation_part)`` stratified on ``poor``.

        `size` is the training fraction (the splitter gets ``test_size=1-size``).
        When `df` is not a DataFrame the loaded training table is split. With
        `balance` set, each training part is passed through `resample`.
        """
        train = df if isinstance(df, pd.DataFrame) else self.country_df_train
        sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1-size, random_state=random_state)
        splits = []
        for train_index, validate_index in sss.split(train, train.poor):
            df_train = train.iloc[train_index]
            if balance:
                df_train = self.resample(df_train)
            splits.append((df_train, train.iloc[validate_index]))
        return splits

    def _rename_col(self):
        """Rename training columns to ``<country>_<position>`` and mirror it on test.

        ``poor`` and ``country`` keep their names. The forward and reverse name
        maps are kept in ``col_maping`` / ``col_maping_reverse``.
        """
        train_columns = self.country_df_train.columns
        train_new_columns = [
            x if x == 'poor' or x == 'country'
            else '{0}_{1}'.format(self.country, train_columns.get_loc(x))
            for x in train_columns]
        self.country_df_train.columns = train_new_columns
        self.col_maping = dict(zip(train_columns, train_new_columns))
        self.col_maping_reverse = dict(zip(train_new_columns, train_columns))

        self.country_df_test.rename(columns=self.col_maping, inplace=True)

    def del_nonunique(self, df):
        """Return `df` without the columns that hold a single distinct value."""
        nunique = df.apply(pd.Series.nunique)
        cols_to_drop = nunique[nunique == 1].index
        print('Cols to drop:', cols_to_drop)
        return df.drop(cols_to_drop, axis=1)

    def _category_float_search(self, countries=None, cat_types=None, fi_types=None):
        """Classify the shared columns as categorical or numeric.

        Defaults (applied when an argument is None): ``countries=['B']``,
        ``cat_types=['object']``, ``fi_types=['float64', 'int64']``.

        For a country outside `countries` the split is by dtype only. Otherwise
        every numeric shared column is median-filled in both tables; columns
        with at most 5 distinct values across train and test are moved to the
        categorical list, the rest are standardised (scaler fitted on train,
        applied to test).

        Returns:
            ``(categorical_list, float_list)``.
        """
        countries = ['B'] if countries is None else countries
        cat_types = ['object'] if cat_types is None else cat_types
        fi_types = ['float64', 'int64'] if fi_types is None else fi_types

        train_common = self.country_df_train[self.col_common_list]
        categorical_list = list(train_common.select_dtypes(include=cat_types).columns)
        if self.country not in countries:
            return categorical_list, list(train_common.select_dtypes(include=fi_types).columns)

        float_list = []
        scaler = StandardScaler()
        print('float list length: ', len(list(self.country_df_test.select_dtypes(include=fi_types).columns)))
        numeric_cols = list(
            self.country_df_test[self.col_common_list].select_dtypes(include=fi_types).columns)
        for i in numeric_cols:
            # Assign the filled column back: fillna(inplace=True) on a column
            # selection is chained assignment and is not guaranteed to reach
            # the parent frame.
            self.country_df_train[i] = self.country_df_train[i].fillna(self.country_df_train[i].median())
            self.country_df_test[i] = self.country_df_test[i].fillna(self.country_df_test[i].median())
            value_set = set(self.country_df_test[i].unique()).union(set(self.country_df_train[i].unique()))
            if len(value_set) <= 5:
                categorical_list.append(i)
            else:
                self.country_df_train[i] = scaler.fit_transform(
                    self.country_df_train[i].values.reshape(-1, 1)).ravel()
                self.country_df_test[i] = scaler.transform(
                    self.country_df_test[i].values.reshape(-1, 1)).ravel()
                float_list.append(i)
        print('float list length: ', len(sorted(float_list)))
        return sorted(categorical_list), sorted(float_list)

    def set_file_name(self, file_name):
        """Set the file-name stem used in the CSV paths."""
        self.file_name = file_name

    def load(self, country=None, file_name='hhold'):
        """Load and prepare the raw CSVs for `country`.

        Returns:
            True when `country` is one of 'A', 'B', 'C' and the data was
            loaded, False otherwise.
        """
        self.file_name = file_name
        self.country = country
        if country not in ['A', 'B', 'C']:
            return False

        self.country_df_train = self.del_nonunique(
            pd.read_csv("../data/raw/{0}_{1}_train.csv".format(country, self.file_name),
                        index_col='id'))
        self.country_df_test = self.del_nonunique(
            pd.read_csv("../data/raw/{0}_{1}_test.csv".format(country, self.file_name),
                        index_col='id'))

        self._rename_col()
        self.col_common_list = \
            sorted(list(set(self.country_df_train.columns).intersection(self.country_df_test.columns)))
        # Store the numeric list on the instance so get_float_list() sees it.
        self.categorical_list, self.float_list = self._category_float_search()
        return True

    def save(self, ext='_ext', poor=True):
        """Write the prepared train/test tables under ``../data/processed/``.

        With `poor` set, the ``poor`` column is appended to the training
        features; otherwise only the features are written.
        """
        train = self.get_train()
        if poor:
            train = pd.concat([train[0], train[1]], axis=1)
        else:
            train = train[0]
        file_name = "../data/processed/{0}_{1}{2}_train.csv".format(self.country, self.file_name, ext)
        train.to_csv(file_name, index=True, mode='w')
        test = self.get_test()
        file_name = "../data/processed/{0}_{1}{2}_test.csv".format(self.country, self.file_name, ext)
        test.to_csv(file_name, index=True, mode='w')
        return True

    def resample(self, df):
        """Upsample the ``poor == True`` rows of `df` to the size of the rest.

        The masks are taken from `df` itself so the method also works on a
        subset of the training table (as passed by `split_data`).
        """
        df_majority = df[df['poor'] == False]  # noqa: E712
        df_minority = df[df['poor'] == True]  # noqa: E712

        df_minority_upsampled = resample(df_minority,
                                         replace=True,
                                         n_samples=df_majority.shape[0],
                                         random_state=1)
        return pd.concat([df_majority, df_minority_upsampled])

    def get_train(self, balance=False):
        """Return ``(features, poor)`` for training, optionally class-balanced."""
        if balance:
            train = self.resample(self.country_df_train)
            return train[self.col_common_list], train['poor']
        return self.country_df_train[self.col_common_list], self.country_df_train['poor']

    def get_train_valid(self, n_splits=1, balance=False):
        """Return ``[((x_train, y_train), (x_valid, y_valid)), ...]`` per split."""
        splits = self.split_data(n_splits=n_splits, balance=balance)
        return [((x[self.col_common_list], x.poor), (y[self.col_common_list], y.poor)) for x, y in splits]

    def get_test(self):
        """Return the test features restricted to the shared columns."""
        return self.country_df_test[self.col_common_list]

    def get_cat_list(self):
        """Return the names of the categorical columns."""
        return self.categorical_list

    def get_float_list(self):
        """Return the names of the numeric columns."""
        return self.float_list


class DataInd(Data):
    """Individual-level data aggregated to one row per household ``id``.

    The raw tables are indexed by ``(id, iid)``; `load` replaces them with
    per-``id`` aggregates (member count, negative/non-negative counts, sums,
    distinct-value counts).
    """

    def __init__(self):
        super().__init__()
        self.file_name = 'indiv'

    def get_poor(self, df):
        """Return one ``poor`` value per ``id`` as a frame indexed by ``id``."""
        return df['poor'].reset_index()[['id', 'poor']].drop_duplicates().set_index('id')

    def summarize(self, df):
        """Return per-``id`` column sums, with columns prefixed ``sum_``."""
        count = df.copy().groupby(level=0).sum()
        res_df = pd.concat({'sum': count}, axis=1)
        res_df.columns = ['{0}_{1}'.format(i[0], i[1]) for i in res_df.columns]
        # Restore the order in which ids first appear in `df`.
        res_df = res_df.reindex(index=df.index.get_level_values(0))
        res_df = res_df[~res_df.index.duplicated(keep='first')]
        print('summarized size df: ', res_df.shape)
        return res_df

    def _get_id_list(self, df):
        """Return the distinct first-level index values in order of appearance."""
        return list(OrderedDict.fromkeys(df.index.get_level_values(0)))

    def count_iid(self, df):
        """Return the number of rows per ``id`` as a one-column frame ``iid_cnt``."""
        s = df.index.get_level_values(0).value_counts()
        return s.reindex(index=self._get_id_list(df)).to_frame('iid_cnt')

    def count_neg_poz(self, df):
        """Count, per ``id`` and numeric column, values ``< 0`` and values ``>= 0``."""
        print('count_neg_poz input df shape', df.shape)
        res_df = df.select_dtypes(include=['float64', 'int64', 'int8'])
        print('count_neg_poz res_df shape', res_df.shape)
        res_df = res_df.groupby(level=0).apply(lambda c: c.apply(
            lambda x: pd.Series([(x < 0).sum(), (x >= 0).sum()])).unstack())
        res_df.columns = ['{0}_{1}'.format(i[0], i[1]) for i in res_df.columns]
        print('count_neg_poz size df: ', res_df.shape)
        return res_df.reindex(index=self._get_id_list(df))

    def count_unique_categories(self, df, iid=True):
        """Count distinct values per ``id`` for every column (``cat_n_<col>``).

        With `iid` set, the counts divided by the number of rows of that ``id``
        are appended as ``div_cat_iid_cat_n_<col>``.
        """
        res_df = df.groupby(level=0).apply(lambda c: c.apply(lambda x: pd.Series([len((x).unique())])))
        res_df.index = res_df.index.droplevel(1)
        res_df.columns = ['{0}_{1}'.format('cat_n', i) for i in res_df.columns]
        print('count_unique_categories size df: ', res_df.shape)
        res_df = res_df.reindex(index=self._get_id_list(df))
        if iid:
            div_df = res_df.div(self.count_iid(df)['iid_cnt'], axis=0)
            div_df.columns = ['{0}_{1}'.format('div_cat_iid', i) for i in res_df.columns]
            res_df = pd.concat([res_df, div_df], axis=1)
        return res_df

    def load(self, country=None, obj_enc=False, cat_enc=False):
        """Load the individual-level CSVs for `country` and aggregate them per ``id``.

        Args:
            country: one of 'A', 'B', 'C'; anything else returns False.
            obj_enc: pass the aggregated frames through ``self.object_encode``.
            cat_enc: replace categorical columns by integer codes before
                aggregating.

        Returns:
            True when the data was loaded, False for an unknown country.
        """
        self.country = country
        if country not in ['A', 'B', 'C']:
            return False

        self.country_df_train = self.del_nonunique(
            pd.read_csv("../data/raw/{0}_{1}_train.csv".format(country, self.file_name),
                        index_col=['id', 'iid']))
        self.country_df_test = self.del_nonunique(
            pd.read_csv("../data/raw/{0}_{1}_test.csv".format(country, self.file_name),
                        index_col=['id', 'iid']))

        self._rename_col()
        print(self.country_df_train.head())
        print(self.country_df_test.head())
        self.col_common_list = sorted(list(set(self.country_df_train.columns).intersection(
            self.country_df_test.columns)))

        self.categorical_list, self.float_list = self._category_float_search(countries=['A', 'B', 'C'])

        if cat_enc:
            for header in self.categorical_list:
                self.country_df_train[header] = self.country_df_train[header].astype('category').cat.codes
                self.country_df_test[header] = self.country_df_test[header].astype('category').cat.codes

        self.country_df_train = pd.concat([self.get_poor(self.country_df_train),
                                           self.count_iid(self.country_df_train),
                                           self.count_neg_poz(self.country_df_train),
                                           self.summarize(self.country_df_train),
                                           self.count_unique_categories(self.country_df_train)
                                           ], axis=1)
        self.country_df_test = pd.concat([self.count_iid(self.country_df_test),
                                          self.count_neg_poz(self.country_df_test),
                                          self.summarize(self.country_df_test),
                                          self.count_unique_categories(self.country_df_test)
                                          ], axis=1)
        print(self.country_df_train.head())
        print(self.country_df_test.head())
        # The aggregation replaced every column, so recompute the shared
        # columns and their classification.
        self.col_common_list = sorted(list(set(self.country_df_train.columns).intersection(
            self.country_df_test.columns)))
        self.categorical_list, self.float_list = self._category_float_search(countries=['A', 'B', 'C'])

        if obj_enc:
            # NOTE(review): object_encode is not defined on Data/DataInd in the
            # visible code; obj_enc=True raises AttributeError unless it is
            # provided elsewhere - confirm.
            self.country_df_train = self.object_encode(self.country_df_train, self.categorical_list)
            self.country_df_test = self.object_encode(self.country_df_test, self.categorical_list)
            self.col_common_list = \
                sorted(list(set(self.country_df_train.columns).intersection(self.country_df_test.columns)))

        print('dataind train shape: ', self.country_df_train.shape)
        print('dataind test shape: ', self.country_df_test.shape)
        return True


class DataConcat(Data):
    """Household table joined with the processed per-household individual table."""

    def __init__(self):
        self.data_hh_train = pd.DataFrame()
        self.data_hh_test = pd.DataFrame()
        self.data_indiv_train = pd.DataFrame()
        self.data_indiv_test = pd.DataFrame()
        super().__init__()
        self.file_name = 'combine'

    def load(self, country=None, file_name_hh='hhold', file_name_ind='indiv_ext'):
        """Load household data and join the processed individual aggregates on ``id``.

        The individual tables are read from
        ``../data/processed/<country>_<file_name_ind>_{train,test}.csv``.

        Returns:
            True when both sources were loaded, False otherwise.
        """
        self.country = country
        if country not in ['A', 'B', 'C']:
            return False

        self.data_indiv_train = self.del_nonunique(
            pd.read_csv("../data/processed/{0}_{1}_train.csv".format(country, file_name_ind),
                        index_col='id'))
        self.data_indiv_test = self.del_nonunique(
            pd.read_csv("../data/processed/{0}_{1}_test.csv".format(country, file_name_ind),
                        index_col='id'))

        data_hh = Data()
        if not data_hh.load(country, file_name=file_name_hh):
            # Data.load only fails for an unknown country, which is already
            # excluded above; kept as an explicit failure path.
            return False
        self.country_df_train = data_hh.country_df_train
        self.country_df_test = data_hh.country_df_test
        # Column classification comes from the household table only.
        self.categorical_list = data_hh.categorical_list
        self.float_list = data_hh.float_list

        self.country_df_train = self.country_df_train.join(self.data_indiv_train)
        self.country_df_test = self.country_df_test.join(self.data_indiv_test)
        self.col_common_list = sorted(list(set(self.country_df_train.columns).intersection(
            self.country_df_test.columns)))
        return True


def combine_csv(countries=('A',)):
    """Build the processed CSVs: individual aggregates, then the combined table.

    For each country the individual aggregates are saved with the ``_ext``
    suffix (without ``poor``) and then joined to the household data and saved
    under the ``combine`` stem. `countries` defaults to ``('A',)``.
    """
    data = DataInd()
    data_concat = DataConcat()
    for c in countries:
        data.load(c)
        data.save(ext='_ext', poor=False)
        data_concat.load(c)
        data_concat.save(ext='')


class predict_model:
    """Shared base for the model wrappers.

    The class does not use ``ABCMeta``, so the ``@abstractmethod`` markers are
    not enforced; subclasses inherit the concrete implementations below and
    provide `train` and `get_feature_importances`.
    """

    @abstractmethod
    def __init__(self, name='predict_model', categ_conv=True):
        self.params = {}
        self.exclude_list = []
        self.name = name
        self.random = 1
        self.classifier = None
        self.categ_conv = categ_conv
        self.data_df = {}
        self.data = None
        self.category_cols = []

    @abstractmethod
    def set_params(self, params=None):
        """Store the classifier parameters; a falsy value resets them to ``{}``."""
        if not params:
            self.params = {}
        else:
            self.params = params

    @abstractmethod
    def set_random_seed(self, random=1):
        """Store the random seed."""
        self.random = random

    @abstractmethod
    def load_data(self, data, balance=False):
        """Pull train/target/test from `data` and integer-encode categoricals.

        Returns:
            True.
        """
        self.data = data
        train, self.data_df['y'] = self.data.get_train(balance=balance)
        # Work on private copies so the encoding below never writes into a
        # frame selection owned by `data`.
        self.data_df['train'] = train.copy()
        self.data_df['test'] = self.data.get_test().copy()

        self.category_cols = self.data.get_cat_list()
        # NOTE(review): train and test are encoded independently, so the same
        # category may get different codes in the two frames - confirm this is
        # intended.
        for header in self.category_cols:
            self.data_df['train'][header] = self.data_df['train'][header].astype('category').cat.codes
            self.data_df['test'][header] = self.data_df['test'][header].astype('category').cat.codes
        return True

    @abstractmethod
    def get_train(self):
        """Return the encoded training features."""
        return self.data_df['train']

    @abstractmethod
    def get_y(self):
        """Return the training target."""
        return self.data_df['y']

    @abstractmethod
    def get_test(self):
        """Return the encoded test features."""
        return self.data_df['test']

    @abstractmethod
    def set_exclude_list(self, exclude_list):
        """Store a copy of the columns to drop before fitting/predicting."""
        self.exclude_list = exclude_list.copy()

    @abstractmethod
    def get_feature_importances(self):
        """Return the fitted classifier's feature importances (subclass hook)."""
        pass

    @abstractmethod
    def train(self, x_train=None, y_train=None):
        """Fit the classifier (subclass hook)."""
        pass

    @abstractmethod
    def predict(self, test=None):
        """Return a frame with ``country`` and the positive-class probability ``poor``.

        When `test` is not a DataFrame the stored test set is used. A supplied
        frame is encoded on a copy (if `categ_conv` is set), so the caller's
        frame is left untouched. Returns None when no classifier exists.
        """
        if self.classifier is None:
            print('error: classifier not defined')
            return None

        if not isinstance(test, pd.DataFrame):
            test = self.get_test()
        elif self.categ_conv:
            test = test.copy()
            cols = [x for x in self.category_cols if x in test.columns]
            for header in cols:
                test[header] = test[header].astype('category').cat.codes
        test = test.drop([x for x in self.exclude_list if x in test.columns], axis=1)
        res = pd.DataFrame(index=test.index)
        res['country'] = self.data.country
        res['poor'] = self.classifier.predict_proba(test)[:, 1]
        return res
class CB_model(predict_model):
    """CatBoost wrapper: class-weighted classifier fitted with categorical column indices."""

    def __init__(self, name='catboost', categ_conv=True):
        super().__init__(name=name, categ_conv=categ_conv)
        self.name = name

    def load_data(self, data, balance=False):
        """Load the data and build a ``CatBoostClassifier`` with balanced class weights.

        Returns:
            True when the base loader succeeded, False otherwise.
        """
        if not super().load_data(data, balance):
            return False

        c_w = class_weight.compute_class_weight(class_weight='balanced',
                                                classes=np.unique(self.data_df['y']),
                                                y=self.data_df['y'])
        print('class_weight: ', c_w)

        self.classifier = CatBoostClassifier(**self.params, class_weights=c_w)
        return True

    def train(self, x_train=None, y_train=None):
        """Fit the classifier and return it.

        When `x_train` / `y_train` are not a DataFrame / Series the stored
        training data is used. A supplied frame is encoded on a copy (if
        `categ_conv` is set), so the caller's frame is left untouched. Columns
        in the exclude list are dropped before fitting, and they are also
        removed from ``self.category_cols``.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            x_train = x_train.copy()
            cols = [x for x in self.category_cols if x in x_train.columns]
            for header in cols:
                x_train[header] = x_train[header].astype('category').cat.codes
        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list if x in x_train.columns], axis=1)

        excluded = set(self.exclude_list)
        self.category_cols = [x for x in self.category_cols if x not in excluded]

        # Only columns actually present can be reported as categorical; the
        # encoding step above applies the same guard to a caller-supplied frame.
        cat_dims = [x_train.columns.get_loc(i) for i in self.category_cols if i in x_train.columns]
        print(x_train.shape, y_train.shape, len(self.category_cols))
        self.classifier.fit(x_train, y_train, cat_features=cat_dims)
        return self.classifier

    def get_feature_importances(self):
        """Return the fitted classifier's feature importances.

        Prefers the public ``feature_importances_`` attribute and falls back to
        the private ``_feature_importance`` read by the original code.
        """
        importances = getattr(self.classifier, 'feature_importances_', None)
        if importances is None:
            importances = self.classifier._feature_importance
        return importances
class XGB_model(predict_model):
    """XGBoost wrapper that sets ``scale_pos_weight`` from the class ratio."""

    def __init__(self, name='xgboost', categ_conv=True):
        super().__init__(name=name, categ_conv=categ_conv)
        self.name = name

    def load_data(self, data, balance=False):
        """Load the data and build an ``XGBClassifier``.

        ``scale_pos_weight`` is set to (negatives / positives) of the training
        target. The parameters are stored in a new dict so the dict given to
        `set_params` is not modified.

        Returns:
            True when the base loader succeeded, False otherwise.
        """
        if not super().load_data(data, balance):
            return False

        y = self.data_df['y']
        positives = y.sum()
        negatives = y.shape[0] - positives
        # Without positive samples the ratio is undefined; fall back to the
        # neutral weight instead of dividing by zero.
        scale_pos_weight = negatives / positives if positives else 1
        self.params = dict(self.params, scale_pos_weight=scale_pos_weight)
        self.classifier = xgb.XGBClassifier(**self.params)
        return True

    def train(self, x_train=None, y_train=None):
        """Fit the classifier and return it.

        When `x_train` / `y_train` are not a DataFrame / Series the stored
        training data is used. A supplied frame is encoded on a copy (if
        `categ_conv` is set), so the caller's frame is left untouched. Columns
        in the exclude list are dropped before fitting.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            x_train = x_train.copy()
            cols = [x for x in self.category_cols if x in x_train.columns]
            for header in cols:
                x_train[header] = x_train[header].astype('category').cat.codes

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop([x for x in self.exclude_list if x in x_train.columns], axis=1)
        print('x_train shape: ', x_train.shape)
        self.classifier.fit(x_train, y_train)

        return self.classifier

    def get_feature_importances(self):
        """Return the fitted classifier's ``feature_importances_``."""
        return self.classifier.feature_importances_
class LGBM_model(predict_model):
    """LightGBM classifier wrapper layered on top of ``predict_model``."""

    def __init__(self, name='lightgbm', categ_conv=True):
        """Initialise the wrapper.

        The base class is always initialised under the fixed name
        ``'lightgbm'``; ``name`` only overrides the label stored on the
        instance afterwards.
        """
        super().__init__(name='lightgbm', categ_conv=categ_conv)
        self.name = name

    def load_data(self, data, balance=False):
        """Load ``data`` through the base class and create the classifier.

        Returns ``True`` when the base ``load_data`` succeeded, ``False``
        otherwise.
        """
        if not super().load_data(data, balance):
            return False
        self.classifier = lgb.LGBMClassifier(**self.params)
        return True

    def train(self, x_train=None, y_train=None):
        """Fit the classifier and return it.

        ``x_train`` / ``y_train`` fall back to ``self.get_train()`` /
        ``self.get_y()`` when they are not a ``DataFrame`` / ``Series``.
        Columns named in ``self.exclude_list`` are dropped before fitting.
        """
        if not isinstance(x_train, pd.DataFrame):
            x_train = self.get_train()
        elif self.categ_conv:
            # Encode on a copy so the caller's frame (often a CV split that is
            # reused afterwards) is not modified behind its back.
            x_train = x_train.copy()
            for header in self.category_cols:
                if header in x_train.columns:
                    x_train[header] = x_train[header].astype('category').cat.codes

        if not isinstance(y_train, pd.Series):
            y_train = self.get_y()

        x_train = x_train.drop(
            [col for col in self.exclude_list if col in x_train.columns], axis=1)
        print('x_train shape: ', x_train.shape)
        # Kept as an instance-level update, as in the original code.
        # NOTE(review): other predict_model methods may rely on this - confirm.
        self.category_cols = [col for col in self.category_cols
                              if col not in self.exclude_list]
        self.classifier.fit(x_train, y_train, verbose=False)

        return self.classifier

    def get_feature_importances(self):
        """Return the fitted classifier's ``feature_importances_``."""
        return self.classifier.feature_importances_


class processing:
    """Drive per-country loading, feature exclusion search and prediction.

    ``data_dict`` and ``model_dict`` map a country key to a data object and a
    model wrapper respectively; both must be set (via the setters) before
    ``find_exclude`` or ``predict`` do any work.
    """

    # Country keys used when the caller does not supply any.
    _DEFAULT_COUNTRIES = ('A', 'B', 'C')

    def __init__(self, countries=None, balances=None):
        """Store the configuration.

        ``countries`` defaults to ``['A', 'B', 'C']`` and ``balances`` to
        ``False`` for every known country. ``None`` sentinels are used so that
        instances never share one mutable default object.
        """
        if countries is None:
            countries = list(self._DEFAULT_COUNTRIES)
        # Default keys plus any custom country, so per-country lookups work
        # even when the caller only overrides ``countries``.
        known = [*self._DEFAULT_COUNTRIES, *countries]
        if balances is None:
            balances = {c: False for c in known}
        self.countries = countries
        self.balances = balances
        self.exclude_dict = {c: [] for c in known}
        self.data_dict = None
        self.model_dict = None
        self.vote_waights_dict = None

    def set_data_dict(self, data_dict):
        """Set the country -> data object mapping."""
        self.data_dict = data_dict

    def set_model_dict(self, model_dict):
        """Set the country -> model wrapper mapping."""
        self.model_dict = model_dict

    def set_exclude_dict(self, exclude_dict):
        """Set the country -> excluded column names mapping."""
        self.exclude_dict = exclude_dict

    def save_csv(self, df, clf_model_name='_', path=''):
        """Write ``df`` to a timestamped ``submission_<model>_<time>.csv`` in ``path``."""
        stamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
        submission_file = os.path.join(
            path, 'submission_{0}_{1}.csv'.format(clf_model_name, stamp))
        print('submission file:', submission_file)
        df.to_csv(submission_file, index=True, float_format='%.4f')
        print(df.head())

    def _cv_logloss(self, c, exclude_list, e_count):
        """Return the mean validation log-loss over ``e_count`` splits for country ``c``."""
        model = self.model_dict[c]
        splits = model.data.get_train_valid(n_splits=e_count, balance=self.balances[c])
        losses = []
        for i in range(e_count):
            model.set_random_seed(i)
            train, valid = splits[i]
            model.set_exclude_list(exclude_list)
            model.train(train[0], train[1])
            pred = model.predict(valid[0])
            losses.append(log_loss(valid[1].astype(int), pred['poor']))
        return np.mean(losses)

    def find_exclude(self, e_count=5):
        """Search, per country, for the exclusion list with the lowest log-loss.

        Each round trains on the full data, adds every remaining column whose
        feature importance is exactly zero to the exclusion list, and scores
        that list with ``e_count`` train/validation splits. The search stops
        after the first round that finds no new zero-importance column. The
        best-scoring list is stored in ``self.exclude_dict[c]``.

        Returns the ``{mean log-loss: exclusion list}`` dict of the last
        processed country (empty when there are no countries), or ``None``
        when models or data are missing.
        """
        if not self.model_dict or not self.data_dict:
            print('Stoped: no models or data')
            return None

        # Defined up front so the final return is valid for an empty country list.
        logloss_dict = {}
        for c in self.countries:
            self.data_dict[c].load(c)
            model = self.model_dict[c]
            model.load_data(data=self.data_dict[c], balance=self.balances[c])
            exclude_list = []
            logloss_dict = {}
            finish = False
            while not finish:
                model.set_exclude_list(exclude_list)
                model.train()
                already_excluded = set(exclude_list)
                # Importances are paired positionally with the columns that
                # were not excluded for this fit.
                columns = [col for col in model.get_train().columns
                           if col not in already_excluded]
                zero_cols = [col for col, importance
                             in zip(columns, model.get_feature_importances())
                             if importance == 0]
                finish = not zero_cols
                exclude_list = exclude_list + zero_cols

                logloss = self._cv_logloss(c, exclude_list, e_count)
                logloss_dict[logloss] = exclude_list
                print('loglos: {0} exclude length: {1}'.format(logloss, len(exclude_list)))
            self.exclude_dict[c] = logloss_dict[min(logloss_dict)]
            print('Country: {0} exclude length: {1}'.format(c, len(self.exclude_dict.get(c))))

        return logloss_dict

    def predict(self):
        """Train every country's model, concatenate predictions and save them.

        Returns the concatenated predictions, or ``None`` when models or data
        are missing. The result is also written through ``save_csv`` into
        ``'../models/'``.
        """
        if not self.model_dict or not self.data_dict:
            print('Stoped: no models or data')
            return None

        predictions = []

        for c in self.countries:
            self.data_dict[c].load(c)
            model = self.model_dict[c]
            model.load_data(data=self.data_dict[c], balance=self.balances[c])
            model.set_exclude_list(self.exclude_dict[c])
            if self.vote_waights_dict:
                model.set_weights(self.vote_waights_dict[c])
            print('exclude: \n', self.exclude_dict[c])
            model.train()
            predictions.append(model.predict())
        result = pd.concat(predictions)
        # The file label historically came from country 'A'; fall back to the
        # first configured country when no 'A' model exists.
        label_key = 'A' if 'A' in self.model_dict else self.countries[0]
        self.save_csv(result, clf_model_name=self.model_dict[label_key].name, path='../models/')
        return result
"2018-03-21T19:44:11.601545" 721 | }, 722 | "collapsed": false, 723 | "scrolled": true 724 | }, 725 | "outputs": [], 726 | "source": [ 727 | "#creating data sets\n", 728 | "combine_csv()" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": { 735 | "ExecuteTime": { 736 | "end_time": "2018-03-20T18:35:01.475566", 737 | "start_time": "2018-03-20T18:34:01.697339" 738 | }, 739 | "code_folding": [ 740 | 17, 741 | 27, 742 | 37, 743 | 64, 744 | 75, 745 | 86, 746 | 110, 747 | 125, 748 | 140 749 | ], 750 | "collapsed": false 751 | }, 752 | "outputs": [], 753 | "source": [ 754 | "data_1 = Data()\n", 755 | "data_2 = DataConcat()\n", 756 | "data_dict = {'A': data_1, 'B': data_2,'C': data_2}\n", 757 | "balances={'A':False, 'B':False, 'C':True}\n", 758 | "\n", 759 | "exclude_CB_dict = {'A': ['A_0', 'A_10', 'A_106', 'A_113', 'A_114', 'A_115', 'A_120', 'A_138', 'A_15', 'A_173', 'A_174', 'A_175', 'A_181', 'A_185', 'A_191', 'A_195', 'A_202', 'A_206', 'A_215', 'A_216', 'A_218', 'A_223', 'A_245', 'A_250', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_263', 'A_272', 'A_277', 'A_295', 'A_299', 'A_308', 'A_309', 'A_32', 'A_33', 'A_330', 'A_39', 'A_43', 'A_44', 'A_57', 'A_59', 'A_63', 'A_69', 'A_6_1', 'A_70', 'A_72', 'A_77', 'A_81', 'A_88', 'A_89', 'A_93', 'cat_n_A_10', 'cat_n_A_15', 'cat_n_A_20', 'cat_n_A_22', 'cat_n_A_25', 'cat_n_A_33', 'cat_n_A_35', 'cat_n_A_39', 'cat_n_A_4', 'cat_n_A_5', 'cat_n_A_6', 'cat_n_A_8', 'cat_n_A_9', 'A_101', 'A_11', 'A_147', 'A_155', 'A_170', 'A_18', 'A_203', 'A_338', 'A_35', 'A_49', 'A_67', 'cat_n_A_11', 'cat_n_A_21', 'cat_n_A_30', 'cat_n_A_37', 'div_cat_iid_cat_n_A_16', 'A_105', 'A_14', 'A_149', 'A_197', 'A_26', 'A_261', 'A_302', 'A_312', 'A_319', 'A_328', 'A_32_1', 'A_341', 'A_9', 'cat_n_A_28', 'div_cat_iid_cat_n_A_32', 'A_121', 'A_125', 'A_131', 'A_161', 'A_17', 'A_192', 'A_229', 'A_259', 'A_60', 'A_80', 'cat_n_A_1', 'cat_n_A_29', 'A_13', 'A_134', 'A_176', 'A_182', 'A_213', 'A_22', 'A_267', 
'A_301', 'A_31', 'cat_n_A_18', 'div_cat_iid_cat_n_A_38', 'A_146', 'A_162', 'A_27', 'A_152', 'A_189', 'A_292', 'A_3', 'A_65'],\n", 760 | " 'B': ['B_1', 'B_106_0', 'B_106_1', 'B_107_1', 'B_113', 'B_121', 'B_123_1', 'B_139', 'B_144_0', 'B_144_1', 'B_152_0', 'B_157_0', 'B_157_1', 'B_159_0', 'B_159_1', 'B_15_0', 'B_15_1', 'B_161_0', 'B_161_1', 'B_167', 'B_174_0', 'B_174_1', 'B_175_0', 'B_176', 'B_18', 'B_180_0', 'B_183', 'B_188_0', 'B_188_1', 'B_196', 'B_196_1', 'B_198_0', 'B_198_1', 'B_20', 'B_203', 'B_204', 'B_205', 'B_207', 'B_208', 'B_20_0', 'B_20_1', 'B_210_0', 'B_210_1', 'B_218_0', 'B_219', 'B_219_1', 'B_222_1', 'B_227', 'B_238', 'B_243', 'B_244', 'B_256', 'B_258', 'B_264', 'B_265', 'B_272', 'B_274', 'B_29', 'B_3', 'B_303', 'B_307', 'B_316', 'B_320', 'B_329', 'B_349', 'B_34_0', 'B_34_1', 'B_35', 'B_355', 'B_35_0', 'B_35_1', 'B_361', 'B_36_0', 'B_36_1', 'B_370', 'B_371', 'B_385', 'B_389', 'B_405', 'B_407', 'B_412', 'B_422', 'B_46_0', 'B_46_1', 'B_5', 'B_55', 'B_60_0', 'B_60_1', 'B_68_0', 'B_68_1', 'B_6_0', 'B_71_1', 'B_72', 'B_77', 'B_83', 'B_8_1', 'cat_n_B_0', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_100', 'cat_n_B_101', 'cat_n_B_102', 'cat_n_B_103', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_119', 'cat_n_B_12', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_131', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_135', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_14', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_143', 'cat_n_B_145', 'cat_n_B_146', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_15', 'cat_n_B_150', 'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_156', 'cat_n_B_157', 'cat_n_B_158', 
'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_162', 'cat_n_B_163', 'cat_n_B_164', 'cat_n_B_165', 'cat_n_B_166', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_173', 'cat_n_B_174', 'cat_n_B_175', 'cat_n_B_176', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_186', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_191', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_2', 'cat_n_B_20', 'cat_n_B_200', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_22', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_222', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_40', 'cat_n_B_41', 'cat_n_B_42', 'cat_n_B_43', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_46', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_53', 'cat_n_B_54', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_57', 'cat_n_B_58', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_61', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_66', 'cat_n_B_67', 'cat_n_B_68', 'cat_n_B_69', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_73', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_79', 'cat_n_B_8', 'cat_n_B_80', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_84', 'cat_n_B_86', 'cat_n_B_87', 'cat_n_B_88', 'cat_n_B_89', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_91', 
'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_96', 'cat_n_B_97', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_10', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_103', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_106', 'div_cat_iid_cat_n_B_107', 'div_cat_iid_cat_n_B_108', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_115', 'div_cat_iid_cat_n_B_117', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_123', 'div_cat_iid_cat_n_B_127', 'div_cat_iid_cat_n_B_129', 'div_cat_iid_cat_n_B_13', 'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_133', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_135', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_140', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_146', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_149', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_153', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_156', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_162', 'div_cat_iid_cat_n_B_165', 'div_cat_iid_cat_n_B_166', 'div_cat_iid_cat_n_B_168', 'div_cat_iid_cat_n_B_173', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_182', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_194', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_198', 
'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_203', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_205', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_212', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_218', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_44', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_46', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_48', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_53', 'div_cat_iid_cat_n_B_57', 'div_cat_iid_cat_n_B_58', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_66', 'div_cat_iid_cat_n_B_68', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_73', 'div_cat_iid_cat_n_B_74', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_79', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_87', 'div_cat_iid_cat_n_B_88', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_91', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_93', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_106', 'sum_B_123', 'sum_B_144', 'sum_B_157', 'sum_B_159', 'sum_B_161', 'sum_B_174', 'sum_B_180', 'sum_B_188', 'sum_B_19', 'sum_B_198', 'sum_B_20', 'sum_B_36', 'sum_B_6', 
'B_11', 'B_127', 'B_173', 'B_180_1', 'B_196_0', 'B_19_0', 'B_206', 'B_219_0', 'B_221', 'B_269', 'B_280', 'B_287', 'B_314', 'B_328', 'B_334', 'B_337', 'B_397', 'B_400', 'B_402', 'B_413', 'B_418', 'B_45', 'B_71', 'B_71_0', 'B_80', 'B_8_0', 'cat_n_B_144', 'cat_n_B_155', 'cat_n_B_17', 'cat_n_B_182', 'cat_n_B_219', 'cat_n_B_37', 'cat_n_B_74', 'cat_n_B_81', 'cat_n_B_85', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_144', 'div_cat_iid_cat_n_B_15', 'div_cat_iid_cat_n_B_150', 'div_cat_iid_cat_n_B_163', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_176', 'div_cat_iid_cat_n_B_19', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_22', 'div_cat_iid_cat_n_B_24', 'div_cat_iid_cat_n_B_25', 'div_cat_iid_cat_n_B_28', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_5', 'div_cat_iid_cat_n_B_54', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_8', 'div_cat_iid_cat_n_B_80', 'div_cat_iid_cat_n_B_96', 'div_cat_iid_cat_n_B_98', 'sum_B_107', 'sum_B_175', 'sum_B_196', 'sum_B_60', 'sum_B_68', 'B_140', 'B_142', 'B_160', 'B_239', 'B_302', 'B_352', 'B_353', 'B_366', 'B_372', 'B_386', 'B_392', 'B_420', 'B_97_1', 'cat_n_B_109', 'cat_n_B_35', 'cat_n_B_6', 'div_cat_iid_cat_n_B_101', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_175', 'div_cat_iid_cat_n_B_183', 'div_cat_iid_cat_n_B_185', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_4', 'div_cat_iid_cat_n_B_41', 'div_cat_iid_cat_n_B_89', 'sum_B_35', 'sum_B_46', 'B_107', 'B_107_0', 'B_123_0', 'B_147', 'B_161', 'B_175_1', 'B_248', 'B_250', 'B_251', 'B_317', 'B_33', 'B_356', 'B_64', 'B_86', 'div_cat_iid_cat_n_B_17', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_214', 'sum_B_71', 'B_112', 'B_120', 'B_132_1', 'B_19_1', 'B_236', 'B_427', 'B_57', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_97', 'B_115', 'B_12', 'B_141', 'B_180', 'B_222_0', 'B_230', 'B_241', 
'B_266', 'B_288', 'B_312', 'B_335', 'B_394', 'B_79', 'B_95', 'B_99', 'cat_n_B_132', 'div_cat_iid_cat_n_B_100', 'div_cat_iid_cat_n_B_164', 'div_cat_iid_cat_n_B_200', 'B_129', 'B_6_1', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_155', 'div_cat_iid_cat_n_B_43', 'sum_B_210', 'B_126', 'B_21', 'B_339', 'B_65', 'div_cat_iid_cat_n_B_125', 'sum_B_132', 'sum_B_219', 'B_128', 'B_8', 'div_cat_iid_cat_n_B_130', 'sum_B_222', 'B_191', 'B_30', 'B_4', 'sum_B_8', 'B_275', 'B_290', 'div_cat_iid_cat_n_B_195', 'B_325', 'B_63', 'B_157', 'B_260', 'B_423', 'B_91', 'div_cat_iid_cat_n_B_37', 'div_cat_iid_cat_n_B_55', 'B_430', 'div_cat_iid_cat_n_B_75', 'B_395', 'B_73', 'B_0', 'div_cat_iid_cat_n_B_86', 'B_23', 'B_268', 'B_27', 'B_306', 'B_348', 'B_6', 'B_92', 'div_cat_iid_cat_n_B_222', 'B_168', 'div_cat_iid_cat_n_B_56', 'B_318', 'B_340', 'B_301', 'B_164', 'B_271', 'B_417', 'B_111', 'B_285', 'B_350', 'B_187', 'B_246', 'B_401', 'B_89'],\n", 761 | " 'C': ['C_118', 'C_135', 'C_17_0', 'C_39', 'C_55', 'C_7', 'C_89', 'C_91', 'cat_n_C_0', 'cat_n_C_1', 'cat_n_C_10', 'cat_n_C_18', 'cat_n_C_21', 'cat_n_C_22', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_28', 'cat_n_C_3', 'cat_n_C_32', 'cat_n_C_37', 'cat_n_C_4', 'cat_n_C_40', 'cat_n_C_5', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_39', 'iid_cnt', 'C_14_1', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_2', 'cat_n_C_23', 'cat_n_C_27', 'cat_n_C_30', 'cat_n_C_38', 'cat_n_C_9', 'div_cat_iid_cat_n_C_26', 'C_129', 'C_57', 'C_76', 'cat_n_C_17', 'cat_n_C_20', 'cat_n_C_19', 'cat_n_C_6', 'div_cat_iid_cat_n_C_33', 'C_10_0', 'C_146', 'C_46', 'cat_n_C_39', 'div_cat_iid_cat_n_C_17']}\n", 762 | "exclude_XGB_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_106', 'A_11', 'A_113', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_140', 'A_146', 'A_147', 'A_148', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_17', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_179', 'A_18', 'A_181', 'A_185', 'A_186', 'A_191', 'A_195', 'A_197', 'A_2', 'A_202', 'A_203', 
'A_206', 'A_213', 'A_215', 'A_216', 'A_218', 'A_219', 'A_22', 'A_223', 'A_225', 'A_226', 'A_227', 'A_232', 'A_234', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_277', 'A_282', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_302', 'A_305', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_315', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_45', 'A_46', 'A_49', 'A_57', 'A_59', 'A_60', 'A_61', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_76', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93', 'A_97', 'cat_n_A_25', 'cat_n_A_3', 'cat_n_A_36', 'cat_n_A_4', 'iid_cnt', 'A_105', 'A_114', 'A_229', 'cat_n_A_20', 'div_cat_iid_cat_n_A_25', 'A_14', 'A_6_1', 'cat_n_A_39'],\n", 763 | " 'B': ['B_0', 'B_1', 'B_106', 'B_109', 'B_112', 'B_12', 'B_120', 'B_121', 'B_128', 'B_135', 'B_14', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_145', 'B_148', 'B_149', 'B_151', 'B_152', 'B_152_1', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_15_1', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_167', 'B_17', 'B_172', 'B_173', 'B_174_1', 'B_175_0', 'B_175_1', 'B_176', 'B_18', 'B_180_1', 'B_187', 'B_188', 'B_188_1', 'B_191', 'B_196', 'B_196_0', 'B_196_1', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_20_1', 'B_210', 'B_210_0', 'B_210_1', 'B_211', 'B_212', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_23', 'B_230', 'B_234', 'B_237', 'B_238', 'B_239', 'B_24', 'B_241', 'B_243', 'B_244', 'B_247', 'B_248', 'B_250', 'B_251', 'B_252', 'B_254', 'B_256', 'B_259', 'B_260', 'B_264', 'B_265', 'B_266', 'B_269', 'B_271', 'B_275', 'B_278', 'B_279', 'B_28', 'B_284', 'B_29', 'B_3', 'B_302', 'B_303', 'B_304', 'B_307', 'B_313', 'B_314', 'B_320', 'B_334', 'B_337', 'B_340', 'B_342', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_353', 'B_354', 'B_355', 'B_356', 'B_359', 'B_35_0', 'B_35_1', 'B_36', 
'B_361', 'B_362', 'B_363', 'B_364', 'B_365', 'B_366', 'B_368', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_375', 'B_379', 'B_385', 'B_386', 'B_389', 'B_390', 'B_391', 'B_392', 'B_394', 'B_395', 'B_397', 'B_400', 'B_402', 'B_405', 'B_406', 'B_407', 'B_41', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_423', 'B_427', 'B_428', 'B_44', 'B_47', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_0', 'B_60_1', 'B_61', 'B_62', 'B_63', 'B_64', 'B_65', 'B_66', 'B_67', 'B_68_0', 'B_68_1', 'B_7', 'B_71_1', 'B_72', 'B_76', 'B_80', 'B_83', 'B_86', 'B_89', 'B_8_0', 'B_8_1', 'B_9', 'B_94', 'B_95', 'B_96', 'B_99', 'cat_n_B_1', 'cat_n_B_10', 'cat_n_B_102', 'cat_n_B_104', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_107', 'cat_n_B_108', 'cat_n_B_11', 'cat_n_B_111', 'cat_n_B_115', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_120', 'cat_n_B_122', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_129', 'cat_n_B_13', 'cat_n_B_130', 'cat_n_B_133', 'cat_n_B_134', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 'cat_n_B_147', 'cat_n_B_148', 'cat_n_B_149', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_165', 'cat_n_B_166', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_17', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_181', 'cat_n_B_182', 'cat_n_B_184', 'cat_n_B_185', 'cat_n_B_187', 'cat_n_B_189', 'cat_n_B_192', 'cat_n_B_193', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_205', 'cat_n_B_206', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_211', 'cat_n_B_212', 'cat_n_B_213', 'cat_n_B_214', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_24', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_3', 'cat_n_B_30', 'cat_n_B_31', 'cat_n_B_32', 
'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_35', 'cat_n_B_36', 'cat_n_B_37', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_4', 'cat_n_B_42', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_49', 'cat_n_B_5', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_63', 'cat_n_B_64', 'cat_n_B_65', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_70', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_8', 'cat_n_B_82', 'cat_n_B_83', 'cat_n_B_86', 'cat_n_B_88', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_93', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_105', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_114', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_118', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_127', 'div_cat_iid_cat_n_B_131', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_147', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_16', 'div_cat_iid_cat_n_B_161', 'div_cat_iid_cat_n_B_169', 'div_cat_iid_cat_n_B_170', 'div_cat_iid_cat_n_B_171', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_177', 'div_cat_iid_cat_n_B_178', 'div_cat_iid_cat_n_B_179', 'div_cat_iid_cat_n_B_180', 'div_cat_iid_cat_n_B_181', 'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_189', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_199', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_202', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_206', 'div_cat_iid_cat_n_B_208', 'div_cat_iid_cat_n_B_209', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_216', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 
'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_35', 'div_cat_iid_cat_n_B_36', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_43', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_61', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_69', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_71', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_75', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_77', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_83', 'div_cat_iid_cat_n_B_84', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_92', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 'sum_B_174', 'sum_B_188', 'B_10', 'B_101', 'B_104', 'B_107', 'B_11', 'B_111', 'B_116', 'B_123_0', 'B_156', 'B_159_1', 'B_164', 'B_170', 'B_171', 'B_174_0', 'B_182', 'B_192', 'B_194', 'B_19_0', 'B_216', 'B_223', 'B_224', 'B_229', 'B_235', 'B_25', 'B_272', 'B_282', 'B_283', 'B_288', 'B_290', 'B_293', 'B_297', 'B_317', 'B_318', 'B_322', 'B_325', 'B_343', 'B_352', 'B_373', 'B_384', 'B_403', 'B_51', 'B_68', 'B_73', 'B_92', 'cat_n_B_12', 'cat_n_B_124', 'cat_n_B_125', 'cat_n_B_131', 'cat_n_B_132', 'cat_n_B_136', 'cat_n_B_159', 'cat_n_B_167', 'cat_n_B_19', 'cat_n_B_191', 'cat_n_B_194', 'cat_n_B_2', 'cat_n_B_200', 'cat_n_B_202', 'cat_n_B_207', 'cat_n_B_210', 'cat_n_B_217', 'cat_n_B_44', 'cat_n_B_59', 'cat_n_B_67', 'cat_n_B_75', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_91', 'cat_n_B_96', 'div_cat_iid_cat_n_B_0', 'div_cat_iid_cat_n_B_112', 'div_cat_iid_cat_n_B_12', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_126', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_137', 'div_cat_iid_cat_n_B_138', 'div_cat_iid_cat_n_B_151', 'div_cat_iid_cat_n_B_167', 'div_cat_iid_cat_n_B_186', 'div_cat_iid_cat_n_B_198', 'div_cat_iid_cat_n_B_2', 'div_cat_iid_cat_n_B_203', 
'div_cat_iid_cat_n_B_207', 'div_cat_iid_cat_n_B_211', 'div_cat_iid_cat_n_B_29', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_49', 'div_cat_iid_cat_n_B_5', 'B_123_1', 'B_146', 'B_147', 'B_174', 'B_198_1', 'B_218', 'B_222_1', 'B_285', 'B_296', 'B_339', 'B_414', 'B_85', 'B_91', 'cat_n_B_113', 'cat_n_B_114', 'cat_n_B_123', 'cat_n_B_151', 'cat_n_B_178', 'cat_n_B_180', 'cat_n_B_183', 'cat_n_B_195', 'cat_n_B_199', 'cat_n_B_29', 'cat_n_B_43', 'cat_n_B_48', 'cat_n_B_74', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_192', 'div_cat_iid_cat_n_B_55', 'sum_B_35', 'B_103', 'B_107_1', 'B_123', 'B_155', 'B_178', 'B_183', 'B_2', 'B_233', 'B_268', 'B_270', 'B_295', 'B_319', 'B_321', 'B_328', 'B_33', 'B_360', 'B_382', 'B_383', 'B_387', 'B_388', 'B_46_0', 'B_75', 'cat_n_B_119', 'cat_n_B_128', 'cat_n_B_146', 'cat_n_B_173', 'cat_n_B_40', 'div_cat_iid_cat_n_B_11', 'div_cat_iid_cat_n_B_110', 'div_cat_iid_cat_n_B_120', 'div_cat_iid_cat_n_B_128', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_20', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_219', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_27', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_63', 'div_cat_iid_cat_n_B_64', 'sum_B_180', 'B_115', 'B_124', 'B_19', 'B_19_1', 'B_330', 'B_357', 'B_409', 'cat_n_B_103', 'cat_n_B_121', 'cat_n_B_164', 'cat_n_B_186', 'cat_n_B_54', 'cat_n_B_73', 'cat_n_B_80', 'div_cat_iid_cat_n_B_154', 'div_cat_iid_cat_n_B_187', 'div_cat_iid_cat_n_B_44', 'B_163', 'B_165', 'B_180_0', 'B_236', 'B_277', 'B_292', 'B_329', 'B_34', 'B_46_1', 'cat_n_B_57', 'div_cat_iid_cat_n_B_130', 'div_cat_iid_cat_n_B_57'],\n", 764 | " 'C': ['C_1', 'C_10', 'C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_141', 'C_143', 'C_146', 'C_151', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_19', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_3', 'C_31', 'C_32', 'C_44', 'C_45', 'C_47', 'C_54', 'C_55', 'C_59', 'C_6', 'C_63', 'C_65', 
'C_67', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_84', 'C_85', 'C_87', 'C_89', 'C_9', 'C_92', 'C_94', 'C_96', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_18', 'cat_n_C_20', 'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 'cat_n_C_32', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_13', 'div_cat_iid_cat_n_C_23', 'div_cat_iid_cat_n_C_26', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_32', 'div_cat_iid_cat_n_C_6', 'div_cat_iid_cat_n_C_7', 'iid_cnt', 'C_13', 'C_144', 'C_161', 'C_2', 'C_29', 'C_33', 'C_41', 'C_79', 'C_90', 'C_98', 'cat_n_C_25', 'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_37', 'cat_n_C_6', 'div_cat_iid_cat_n_C_0', 'div_cat_iid_cat_n_C_14', 'div_cat_iid_cat_n_C_20', 'C_145', 'C_60', 'C_69', 'cat_n_C_13', 'cat_n_C_40', 'div_cat_iid_cat_n_C_15', 'div_cat_iid_cat_n_C_5', 'C_142', 'C_50', 'C_62', 'C_103', 'C_121', 'C_24', 'C_30', 'C_39', 'C_40', 'C_112', 'C_123']}\n", 765 | "\n", 766 | "exclude_LGBM_dict = {'A': ['A_0', 'A_10', 'A_101', 'A_105', 'A_106', 'A_11', 'A_112', 'A_113', 'A_115', 'A_120', 'A_121', 'A_13', 'A_131', 'A_134', 'A_138', 'A_141', 'A_15', 'A_152', 'A_155', 'A_161', 'A_162', 'A_167', 'A_168', 'A_170', 'A_173', 'A_174', 'A_175', 'A_176', 'A_18', 'A_181', 'A_185', 'A_191', 'A_195', 'A_197', 'A_202', 'A_203', 'A_206', 'A_215', 'A_216', 'A_218', 'A_219', 'A_223', 'A_225', 'A_232', 'A_237', 'A_242', 'A_245', 'A_251', 'A_252', 'A_253', 'A_254', 'A_255', 'A_256', 'A_258', 'A_259', 'A_26', 'A_261', 'A_262', 'A_263', 'A_267', 'A_27', 'A_272', 'A_275', 'A_282', 'A_292', 'A_295', 'A_299', 'A_3', 'A_30', 'A_301', 'A_307', 'A_308', 'A_309', 'A_31', 'A_312', 'A_319', 'A_32', 'A_322', 'A_33', 'A_330', 'A_332', 'A_335', 'A_338', 'A_341', 'A_35', 'A_39', 'A_43', 'A_44', 'A_46', 'A_47', 'A_49', 'A_57', 'A_59', 'A_60', 'A_63', 'A_66', 'A_67', 'A_69', 'A_70', 'A_72', 'A_77', 'A_80', 'A_81', 'A_88', 'A_89', 'A_9', 'A_91', 'A_93'],\n", 767 | " 'B': ['B_0', 'B_1', 'B_106', 
'B_106_0', 'B_107', 'B_11', 'B_115', 'B_120', 'B_121', 'B_128', 'B_140', 'B_141', 'B_142', 'B_143', 'B_144', 'B_151', 'B_152', 'B_157_0', 'B_157_1', 'B_158', 'B_159_0', 'B_159_1', 'B_15_0', 'B_16', 'B_160', 'B_161_0', 'B_161_1', 'B_162', 'B_164', 'B_165', 'B_167', 'B_17', 'B_172', 'B_174', 'B_174_0', 'B_174_1', 'B_176', 'B_18', 'B_180_0', 'B_187', 'B_188', 'B_188_1', 'B_191', 'B_194', 'B_196', 'B_196_0', 'B_196_1', 'B_19_0', 'B_203', 'B_204', 'B_205', 'B_206', 'B_208', 'B_209', 'B_20_0', 'B_210_0', 'B_210_1', 'B_215', 'B_219', 'B_219_0', 'B_227', 'B_228', 'B_229', 'B_23', 'B_230', 'B_234', 'B_236', 'B_238', 'B_24', 'B_241', 'B_242', 'B_243', 'B_244', 'B_247', 'B_25', 'B_250', 'B_254', 'B_256', 'B_264', 'B_266', 'B_269', 'B_271', 'B_272', 'B_275', 'B_279', 'B_283', 'B_284', 'B_288', 'B_29', 'B_293', 'B_296', 'B_3', 'B_302', 'B_303', 'B_307', 'B_314', 'B_317', 'B_318', 'B_325', 'B_329', 'B_330', 'B_334', 'B_337', 'B_340', 'B_348', 'B_34_0', 'B_34_1', 'B_35', 'B_350', 'B_354', 'B_355', 'B_356', 'B_35_0', 'B_35_1', 'B_36', 'B_361', 'B_366', 'B_36_0', 'B_36_1', 'B_37', 'B_370', 'B_371', 'B_372', 'B_373', 'B_385', 'B_386', 'B_389', 'B_390', 'B_394', 'B_397', 'B_399', 'B_400', 'B_402', 'B_405', 'B_406', 'B_407', 'B_408', 'B_410', 'B_411', 'B_412', 'B_413', 'B_418', 'B_42', 'B_420', 'B_422', 'B_427', 'B_428', 'B_432', 'B_436', 'B_48', 'B_50', 'B_52', 'B_55', 'B_60_1', 'B_63', 'B_64', 'B_65', 'B_67', 'B_68_0', 'B_71_0', 'B_72', 'B_73', 'B_75', 'B_80', 'B_83', 'B_89', 'B_8_0', 'B_9', 'B_91', 'B_94', 'B_95', 'B_99', 'cat_n_B_1', 'cat_n_B_102', 'cat_n_B_105', 'cat_n_B_106', 'cat_n_B_11', 'cat_n_B_110', 'cat_n_B_111', 'cat_n_B_112', 'cat_n_B_116', 'cat_n_B_117', 'cat_n_B_118', 'cat_n_B_119', 'cat_n_B_120', 'cat_n_B_121', 'cat_n_B_122', 'cat_n_B_123', 'cat_n_B_126', 'cat_n_B_127', 'cat_n_B_128', 'cat_n_B_131', 'cat_n_B_134', 'cat_n_B_136', 'cat_n_B_137', 'cat_n_B_138', 'cat_n_B_139', 'cat_n_B_140', 'cat_n_B_141', 'cat_n_B_142', 'cat_n_B_145', 'cat_n_B_147', 'cat_n_B_148', 
'cat_n_B_151', 'cat_n_B_152', 'cat_n_B_153', 'cat_n_B_154', 'cat_n_B_157', 'cat_n_B_158', 'cat_n_B_159', 'cat_n_B_16', 'cat_n_B_160', 'cat_n_B_161', 'cat_n_B_167', 'cat_n_B_168', 'cat_n_B_169', 'cat_n_B_170', 'cat_n_B_171', 'cat_n_B_172', 'cat_n_B_174', 'cat_n_B_177', 'cat_n_B_178', 'cat_n_B_179', 'cat_n_B_18', 'cat_n_B_180', 'cat_n_B_181', 'cat_n_B_183', 'cat_n_B_184', 'cat_n_B_19', 'cat_n_B_190', 'cat_n_B_193', 'cat_n_B_194', 'cat_n_B_195', 'cat_n_B_196', 'cat_n_B_197', 'cat_n_B_198', 'cat_n_B_199', 'cat_n_B_20', 'cat_n_B_201', 'cat_n_B_202', 'cat_n_B_203', 'cat_n_B_204', 'cat_n_B_207', 'cat_n_B_208', 'cat_n_B_209', 'cat_n_B_210', 'cat_n_B_211', 'cat_n_B_213', 'cat_n_B_215', 'cat_n_B_216', 'cat_n_B_217', 'cat_n_B_218', 'cat_n_B_219', 'cat_n_B_220', 'cat_n_B_221', 'cat_n_B_223', 'cat_n_B_23', 'cat_n_B_25', 'cat_n_B_26', 'cat_n_B_27', 'cat_n_B_28', 'cat_n_B_29', 'cat_n_B_3', 'cat_n_B_31', 'cat_n_B_32', 'cat_n_B_33', 'cat_n_B_34', 'cat_n_B_36', 'cat_n_B_38', 'cat_n_B_39', 'cat_n_B_42', 'cat_n_B_44', 'cat_n_B_45', 'cat_n_B_47', 'cat_n_B_48', 'cat_n_B_49', 'cat_n_B_50', 'cat_n_B_51', 'cat_n_B_52', 'cat_n_B_55', 'cat_n_B_56', 'cat_n_B_59', 'cat_n_B_60', 'cat_n_B_62', 'cat_n_B_64', 'cat_n_B_68', 'cat_n_B_7', 'cat_n_B_71', 'cat_n_B_72', 'cat_n_B_75', 'cat_n_B_76', 'cat_n_B_77', 'cat_n_B_78', 'cat_n_B_84', 'cat_n_B_9', 'cat_n_B_90', 'cat_n_B_92', 'cat_n_B_94', 'cat_n_B_95', 'cat_n_B_98', 'cat_n_B_99', 'div_cat_iid_cat_n_B_102', 'div_cat_iid_cat_n_B_111', 'div_cat_iid_cat_n_B_116', 'div_cat_iid_cat_n_B_119', 'div_cat_iid_cat_n_B_121', 'div_cat_iid_cat_n_B_122', 'div_cat_iid_cat_n_B_134', 'div_cat_iid_cat_n_B_136', 'div_cat_iid_cat_n_B_139', 'div_cat_iid_cat_n_B_14', 'div_cat_iid_cat_n_B_141', 'div_cat_iid_cat_n_B_142', 'div_cat_iid_cat_n_B_145', 'div_cat_iid_cat_n_B_148', 'div_cat_iid_cat_n_B_157', 'div_cat_iid_cat_n_B_158', 'div_cat_iid_cat_n_B_159', 'div_cat_iid_cat_n_B_160', 'div_cat_iid_cat_n_B_172', 'div_cat_iid_cat_n_B_174', 'div_cat_iid_cat_n_B_181', 
'div_cat_iid_cat_n_B_184', 'div_cat_iid_cat_n_B_188', 'div_cat_iid_cat_n_B_193', 'div_cat_iid_cat_n_B_196', 'div_cat_iid_cat_n_B_197', 'div_cat_iid_cat_n_B_201', 'div_cat_iid_cat_n_B_204', 'div_cat_iid_cat_n_B_210', 'div_cat_iid_cat_n_B_213', 'div_cat_iid_cat_n_B_215', 'div_cat_iid_cat_n_B_217', 'div_cat_iid_cat_n_B_220', 'div_cat_iid_cat_n_B_221', 'div_cat_iid_cat_n_B_223', 'div_cat_iid_cat_n_B_23', 'div_cat_iid_cat_n_B_26', 'div_cat_iid_cat_n_B_3', 'div_cat_iid_cat_n_B_31', 'div_cat_iid_cat_n_B_32', 'div_cat_iid_cat_n_B_33', 'div_cat_iid_cat_n_B_34', 'div_cat_iid_cat_n_B_38', 'div_cat_iid_cat_n_B_39', 'div_cat_iid_cat_n_B_42', 'div_cat_iid_cat_n_B_45', 'div_cat_iid_cat_n_B_47', 'div_cat_iid_cat_n_B_50', 'div_cat_iid_cat_n_B_51', 'div_cat_iid_cat_n_B_52', 'div_cat_iid_cat_n_B_55', 'div_cat_iid_cat_n_B_59', 'div_cat_iid_cat_n_B_60', 'div_cat_iid_cat_n_B_62', 'div_cat_iid_cat_n_B_7', 'div_cat_iid_cat_n_B_72', 'div_cat_iid_cat_n_B_76', 'div_cat_iid_cat_n_B_78', 'div_cat_iid_cat_n_B_81', 'div_cat_iid_cat_n_B_90', 'div_cat_iid_cat_n_B_94', 'div_cat_iid_cat_n_B_95', 'div_cat_iid_cat_n_B_98', 'div_cat_iid_cat_n_B_99', 'iid_cnt', 'sum_B_157', 'sum_B_161', 'sum_B_188'],\n", 768 | " 'C': ['C_100', 'C_109', 'C_10_0', 'C_111', 'C_116', 'C_121', 'C_123', 'C_125', 'C_126', 'C_127', 'C_129', 'C_133', 'C_135', 'C_139', 'C_14', 'C_140', 'C_141', 'C_143', 'C_146', 'C_150', 'C_151', 'C_152', 'C_154', 'C_155', 'C_157', 'C_159', 'C_17_0', 'C_17_1', 'C_18', 'C_19', 'C_2', 'C_20', 'C_21', 'C_22', 'C_25', 'C_26', 'C_27_0', 'C_27_1', 'C_28', 'C_29', 'C_3', 'C_32', 'C_33', 'C_39', 'C_40', 'C_41', 'C_54', 'C_55', 'C_59', 'C_62', 'C_63', 'C_64', 'C_65', 'C_67', 'C_69', 'C_72', 'C_73', 'C_74', 'C_77', 'C_8', 'C_81', 'C_82', 'C_84', 'C_85', 'C_87', 'C_9', 'C_90', 'C_92', 'C_94', 'C_96', 'C_98', 'C_99', 'cat_n_C_0', 'cat_n_C_10', 'cat_n_C_11', 'cat_n_C_14', 'cat_n_C_15', 'cat_n_C_16', 'cat_n_C_17', 'cat_n_C_18', 'cat_n_C_2', 'cat_n_C_20', 'cat_n_C_21', 'cat_n_C_23', 'cat_n_C_24', 'cat_n_C_26', 
'cat_n_C_27', 'cat_n_C_3', 'cat_n_C_30', 'cat_n_C_38', 'cat_n_C_4', 'cat_n_C_5', 'cat_n_C_9', 'div_cat_iid_cat_n_C_2', 'div_cat_iid_cat_n_C_31', 'div_cat_iid_cat_n_C_4', 'div_cat_iid_cat_n_C_40', 'div_cat_iid_cat_n_C_7', 'iid_cnt']\n", 769 | " }\n", 770 | "\n", 771 | "params_CB_A = {\n", 772 | " 'iterations' : 5000,\n", 773 | " 'learning_rate' : 0.03,\n", 774 | " 'depth' : 6,\n", 775 | " 'l2_leaf_reg' : 3,\n", 776 | " 'loss_function' : 'Logloss',\n", 777 | " 'random_seed' : 1,\n", 778 | " 'logging_level' : 'Silent',\n", 779 | " }\n", 780 | "\n", 781 | "params_CB_B = {\n", 782 | " 'iterations' : 5000,\n", 783 | " 'learning_rate' : 0.03,\n", 784 | " 'depth' : 6,\n", 785 | " 'l2_leaf_reg' : 3,\n", 786 | " 'loss_function' : 'Logloss',\n", 787 | " 'random_seed' : 1,\n", 788 | " 'logging_level' : 'Silent',\n", 789 | " }\n", 790 | "\n", 791 | "params_CB_C = {\n", 792 | " 'iterations' : 500,\n", 793 | " 'learning_rate' : 0.03,\n", 794 | " 'depth' : 6,\n", 795 | " 'l2_leaf_reg' : 3,\n", 796 | " 'loss_function' : 'Logloss',\n", 797 | " 'random_seed' : 1,\n", 798 | " 'logging_level' : 'Silent',\n", 799 | " }\n", 800 | "\n", 801 | "model_cb_A = CB_model(categ_conv=True)\n", 802 | "model_cb_A.set_params(params=params_CB_A)\n", 803 | "model_cb_B = CB_model(categ_conv=True)\n", 804 | "model_cb_B.set_params(params=params_CB_B)\n", 805 | "model_cb_C = CB_model(categ_conv=True)\n", 806 | "model_cb_C.set_params(params=params_CB_C)\n", 807 | "\n", 808 | "model_cb_dict = {'A': model_cb_A, 'B': model_cb_B, 'C': model_cb_C}\n", 809 | "\n", 810 | "process = processing(countries=['A','B','C'], balances=balances)\n", 811 | "process.set_data_dict(data_dict=data_dict)\n", 812 | "\n", 813 | "process.set_model_dict(model_dict=model_cb_dict)\n", 814 | "#process.find_exclude()\n", 815 | "process.set_exclude_dict(exclude_CB_dict)\n", 816 | "result_cb = process.predict() # required: the submission cell blends result_cb.poor\n", 817 | "\n", 818 | "params_XGB_A = {\n", 819 | " 'learning_rate' : 0.03,\n", 820 | " 'max_depth' : 3,\n", 821 | " 
'n_estimators' : 1500,\n", 822 | " 'silent' : True,\n", 823 | " 'objective' : 'binary:logistic', \n", 824 | " 'gamma' : 0.3,\n", 825 | " 'subsample' : 0.7,\n", 826 | " 'reg_alpha' : 0.05\n", 827 | " }\n", 828 | "\n", 829 | "params_XGB_B = {\n", 830 | " 'learning_rate' : 0.03,\n", 831 | " 'max_depth' : 5,\n", 832 | " 'n_estimators' : 400,\n", 833 | " 'silent' : True,\n", 834 | " 'objective' : 'binary:logistic',\n", 835 | " 'gamma' : 0.2,\n", 836 | " 'subsample' : 0.7,\n", 837 | " 'reg_alpha' : 0.05,\n", 838 | " }\n", 839 | "\n", 840 | "params_XGB_C = {\n", 841 | " 'learning_rate' : 0.03,\n", 842 | " 'max_depth' : 3,\n", 843 | " 'n_estimators' : 500,\n", 844 | " 'silent' : True,\n", 845 | " 'objective' : 'binary:logistic',\n", 846 | " 'gamma' : 0.2,\n", 847 | " 'subsample' : 0.6,\n", 848 | " 'reg_alpha' : 0.05,\n", 849 | " }\n", 850 | "\n", 851 | "model_xgb_A = XGB_model(categ_conv=True)\n", 852 | "model_xgb_A.set_params(params=params_XGB_A)\n", 853 | "model_xgb_B = XGB_model(categ_conv=True)\n", 854 | "model_xgb_B.set_params(params=params_XGB_B)\n", 855 | "model_xgb_C = XGB_model(categ_conv=True)\n", 856 | "model_xgb_C.set_params(params=params_XGB_C)\n", 857 | "model_xgb_dict = {'A': model_xgb_A, 'B': model_xgb_B,'C': model_xgb_C}\n", 858 | "\n", 859 | "process.set_model_dict(model_dict=model_xgb_dict)\n", 860 | "#process.find_exclude()\n", 861 | "process.set_exclude_dict(exclude_XGB_dict)\n", 862 | "result_xgb = process.predict() # required: the submission cell blends result_xgb.poor\n", 863 | "\n", 864 | "params_LGBM_A = {\n", 865 | " 'learning_rate' : 0.02,\n", 866 | " 'max_depth' : 6,\n", 867 | " 'n_estimators' : 942,\n", 868 | " 'silent' : True,\n", 869 | " 'objective' : 'binary',\n", 870 | " 'gamma' : 0.3,\n", 871 | " 'subsample' : 0.6,\n", 872 | " 'reg_alpha' : 0.02,\n", 873 | " 'is_unbalance' : True,\n", 874 | " 'boosting_type' : 'gbdt',\n", 875 | " 'reg_lambda' : 0.01,\n", 876 | " 'random_state' : 1\n", 877 | " }\n", 878 | "\n", 879 | "params_LGBM_B = {\n", 880 | " 'learning_rate' : 0.03,\n", 881 | " 
'max_depth' : 6,\n", 882 | " 'n_estimators' : 232,\n", 883 | " 'silent' : True,\n", 884 | " 'objective' : 'binary', \n", 885 | " 'gamma' : 0.3,\n", 886 | " 'subsample' : 0.8,\n", 887 | " 'reg_alpha' : 0.05,\n", 888 | " 'is_unbalance' : True,\n", 889 | " 'boosting_type' : 'gbdt',\n", 890 | " 'reg_lambda' : 0.00,\n", 891 | " 'random_state' : 1\n", 892 | " }\n", 893 | "\n", 894 | "params_LGBM_C = {\n", 895 | " 'learning_rate' : 0.05,\n", 896 | " 'max_depth' : 3,\n", 897 | " 'n_estimators' : 520,\n", 898 | " 'silent' : True,\n", 899 | " 'objective' : 'binary', \n", 900 | " 'gamma' : 0.3,\n", 901 | " 'subsample' : 0.7,\n", 902 | " 'reg_alpha' : 0.05,\n", 903 | " 'is_unbalance' : True,\n", 904 | " 'boosting_type' : 'gbdt',\n", 905 | " 'reg_lambda' : 0.03,\n", 906 | " 'random_state' : 1,\n", 907 | " }\n", 908 | "\n", 909 | "model_lgbm_A = LGBM_model(categ_conv=True)\n", 910 | "model_lgbm_A.set_params(params=params_LGBM_A)\n", 911 | "model_lgbm_B = LGBM_model(categ_conv=True)\n", 912 | "model_lgbm_B.set_params(params=params_LGBM_B)\n", 913 | "model_lgbm_C = LGBM_model(categ_conv=True)\n", 914 | "model_lgbm_C.set_params(params=params_LGBM_C)\n", 915 | "model_lgbm_dict = {'A': model_lgbm_A, 'B': model_lgbm_B,'C': model_lgbm_C}\n", 916 | "\n", 917 | "process.set_model_dict(model_dict=model_lgbm_dict)\n", 918 | "process.set_exclude_dict(exclude_LGBM_dict)\n", 919 | "result_lgbm = process.predict()\n" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": { 926 | "ExecuteTime": { 927 | "end_time": "2018-03-20T18:37:05.252546", 928 | "start_time": "2018-03-20T18:37:05.212396" 929 | }, 930 | "collapsed": false 931 | }, 932 | "outputs": [], 933 | "source": [ 934 | "# Create submission\n", 935 | "submission = pd.DataFrame(index=result_cb.index)\n", 936 | "submission['country'] = result_cb.country\n", 937 | "submission['poor'] = (result_xgb.poor * 0.4 +\n", 938 | " result_cb.poor * 0.4 +\n", 939 | " result_lgbm.poor * 0.2)\n", 940 | 
"process.save_csv(submission, clf_model_name='combine', path='../models/')" 941 | ] 942 | } 943 | ], 944 | "metadata": { 945 | "hide_input": false, 946 | "kernelspec": { 947 | "display_name": "Python 3", 948 | "language": "python", 949 | "name": "python3" 950 | }, 951 | "language_info": { 952 | "codemirror_mode": { 953 | "name": "ipython", 954 | "version": 3 955 | }, 956 | "file_extension": ".py", 957 | "mimetype": "text/x-python", 958 | "name": "python", 959 | "nbconvert_exporter": "python", 960 | "pygments_lexer": "ipython3", 961 | "version": "3.5.4" 962 | } 963 | }, 964 | "nbformat": 4, 965 | "nbformat_minor": 1 966 | } 967 | --------------------------------------------------------------------------------