├── scripts
│   ├── __init__.py
│   ├── settings.py
│   ├── sql_requests.py
│   ├── feature_extraction.py
│   └── model.py
├── requirements.txt
├── .gitattributes
├── tox.ini
├── data
│   ├── data.csv
│   ├── test_data.csv
│   ├── train_data.csv
│   └── Data description.md
├── acquisition_prediction.py
├── README.md
└── develop
    └── Model.ipynb
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | numpy
3 | scikit-learn
4 | sklearn-pandas
5 | mysqlclient
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.sql filter=lfs diff=lfs merge=lfs -text
2 | *.csv filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude =
3 |     data,
4 |     develop,
5 |     README.md,
6 |     scripts/sql_requests.py
7 | max-line-length = 120
--------------------------------------------------------------------------------
/data/data.csv:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:557170580f622c2079c9d318c9e53f23070e9aebb54616568e33e57979a23bdc
3 | size 2575601
--------------------------------------------------------------------------------
/data/test_data.csv:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:517c418485b0d258bec5ee24e9fbe2e6e2c1f8170814e38ea026e7450ffb05c5
3 | size 515123
--------------------------------------------------------------------------------
/data/train_data.csv:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b8644b23c466b8134ae77d508be5ec1e269dee4d0b118cb3e5bd587ab41fb576
3 | size 2060694
--------------------------------------------------------------------------------
/scripts/settings.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def path_to_data_file(path):
5 |     cur_dir = os.path.dirname(os.path.realpath(__file__))
6 |     return os.path.normpath(os.path.join(cur_dir, path))
7 |
8 | data = '../data/data.csv'
9 | train = '../data/train_data.csv'
10 | test = '../data/test_data.csv'
11 |
12 |
13 | DATA_FILE = path_to_data_file(data)
14 | TRAIN_FILE = path_to_data_file(train)
15 | TEST_FILE = path_to_data_file(test)
16 |
--------------------------------------------------------------------------------
/acquisition_prediction.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python3
2 | from scripts.feature_extraction import do_feature_extraction
3 | from scripts.model import do_model_building
4 | import argparse
5 |
6 | extract_help = """Extract features from the database and save them to the file paths defined in settings.py.
7 | You must also specify the database credentials."""
8 | fit_help = 'Build the prediction model and print the best parameters and scores.'
9 | parser = argparse.ArgumentParser()
10 | exclusive_group = parser.add_mutually_exclusive_group()
11 | exclusive_group.add_argument('--extract', action='store_true', help=extract_help)
12 | exclusive_group.add_argument('--fit', action='store_true', help=fit_help)
13 | db_group = parser.add_argument_group('Database credentials')
14 | db_group.add_argument('--user', help='database user')
15 | db_group.add_argument('--password', help='database user password')
16 | db_group.add_argument('--scheme', help='database scheme')
17 | args = parser.parse_args()
18 |
19 | if args.extract:
20 |     user = args.user
21 |     password = args.password
22 |     scheme = args.scheme
23 |     if user and password and scheme:
24 |         do_feature_extraction(user, password, scheme)
25 |     else:
26 |         parser.error("You must specify all database credentials.")
27 | elif args.fit:
28 |     do_model_building()
29 | else:
30 |     parser.error("You must specify either --extract or --fit.")
--------------------------------------------------------------------------------
/data/Data description.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Data description

4 | ## Data dictionary
5 |
6 | | Variable             | Definition                                                       | Key               |
7 | |----------------------|------------------------------------------------------------------|-------------------|
8 | | company_id           | Company ID                                                       |                   |
9 | | category_code        | Company category                                                 |                   |
10 | | country_code         | Country                                                          | USA, NZL, other   |
11 | | state_code           | State                                                            | California, other |
12 | | ipo                  | IPO                                                              | True or False     |
13 | | is_acquired          | Company is acquired or not                                       | True or False     |
14 | | is_closed            | Company is closed or not                                         | True or False     |
15 | | age                  | Company age in days                                              |                   |
16 | | mba_degree           | Number of people with an MBA degree associated with the company  |                   |
17 | | phd_degree           | Number of people with a PhD degree associated with the company   |                   |
18 | | ms_degree            | Number of people with an MS degree associated with the company   |                   |
19 | | other_degree         | Number of other people associated with the company               |                   |
20 | | offices              | Number of the company's offices                                  |                   |
21 | | average_funded       | Funds per round                                                  |                   |
22 | | average_participants | Participants per round                                           |                   |
23 | | total_rounds         | Total number of rounds                                           |                   |
24 | | products_number      | Total number of products                                         |                   |
25 | | acquired_companies   | Total number of acquired companies                               |                   |
26 |
27 |
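For illustration, a minimal sketch (assuming the repository layout above and that Git LFS has pulled the real CSV content) of loading the dataset and inspecting the fields listed in the dictionary:

```python
import pandas as pd

# Load the extracted dataset; the path assumes the script runs from the repo root.
df = pd.read_csv("data/data.csv")

# Booleans per the dictionary above; 'age' is measured in days.
print(df[["company_id", "ipo", "is_acquired", "is_closed", "age"]].head())
print(df["country_code"].value_counts())  # expected keys: USA, NZL, other
```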

## Variable notes
28 | ### Age
29 | Age is calculated by the following rule: if the company was acquired, its age is computed up to the acquisition date; otherwise it is computed up to 01.01.2014.
30 |
--------------------------------------------------------------------------------
/scripts/sql_requests.py:
--------------------------------------------------------------------------------
1 | """This script contains the SQL requests and functions for extracting company information from the database."""
2 |
3 | import pandas as pd
4 |
5 | financial_ipo_offices_products_request = """SELECT company_id, category_code, founded_at, closed_at, country_code, state_code, city, region, average_funded, total_rounds, average_participants, public_at, acquired_at, products_number, offices, acquired_companies
6 | FROM
7 | (SELECT *
8 | FROM
9 | (SELECT *
10 | FROM
11 | (SELECT *
12 | FROM
13 | (SELECT *
14 | FROM
15 | (SELECT *
16 | FROM
17 | (SELECT id as company_id, category_code, founded_at, closed_at, country_code, state_code, city, region
18 | FROM {0}.cb_objects
19 | WHERE entity_type='Company') as companies
20 | RIGHT JOIN
21 | (SELECT object_id as round_company_id, avg(raised_amount_usd) as average_funded, count(*) as total_rounds, avg(participants) as average_participants
22 | FROM {0}.cb_funding_rounds
23 | GROUP BY object_id) as rounds
24 | ON company_id = round_company_id) as company_rounds
25 | LEFT JOIN
26 | (SELECT acquired_object_id as acquired_company_id, acquired_at
27 | FROM {0}.cb_acquisitions) as acquisitions
28 | ON company_id = acquired_company_id) as financial_info
29 | LEFT JOIN
30 | (SELECT object_id as ipo_company_id, public_at
31 | FROM {0}.cb_ipos) as ipo
32 | ON company_id = ipo_company_id) as financial_ipo_info
33 | LEFT JOIN
34 | (SELECT parent_id, count(*) as products_number
35 | FROM {0}.cb_objects
36 | WHERE NOT isnull(parent_id) and entity_type = 'Product'
37 | GROUP BY parent_id) as products
38 | ON parent_id = company_id) as financial_ipo_products_info
39 | LEFT JOIN
40 | (SELECT object_id as office_company_id, count(*) as offices FROM {0}.cb_offices
41 | GROUP BY object_id) as offices_info
42 | ON company_id = office_company_id) as financial_ipo_products_office_info
43 | LEFT JOIN
44 | (SELECT acquiring_object_id, count(*) as acquired_companies
45 | FROM {0}.cb_acquisitions
46 | GROUP BY acquiring_object_id) as acquisition_number
47 | ON company_id = acquiring_object_id;"""
48 |
49 | degrees_request = """SELECT count(*) as count, company_id, degree_type
50 | FROM (SELECT rel.relationship_object_id as company_id,
51 | IF(lower(deg.degree_type) in {1}, lower(deg.degree_type), 'other') as degree_type
52 | FROM {0}.cb_relationships as rel
53 | JOIN {0}.cb_degrees as deg
54 | ON rel.person_object_id=deg.object_id) as rel_deg
55 | GROUP BY company_id, degree_type;"""
56 |
57 | valid_degrees = ('mba', 'phd', 'ms')
58 |
59 |
60 | def get_financial_ipo_offices_products_query(connection, scheme):
61 |     """
62 |     Return a pandas DataFrame with company information about finances, IPO, offices and products.
63 |     """
64 |
65 |     return pd.read_sql(financial_ipo_offices_products_request.format(scheme), con=connection)
66 |
67 |
68 | def get_degrees_query(connection, scheme):
69 |     """
70 |     Return a pandas DataFrame with, for each company, the number of associated people holding each degree
71 |     type listed in valid_degrees.
72 | """ 73 | 74 | return pd.read_sql(degrees_request.format(scheme, valid_degrees), con=connection) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Merge and Acquisitions Prediction 2 | 3 | ## Rational 4 | 5 | As 6 | [Wikipedia says](https://en.wikipedia.org/wiki/Mergers_and_acquisitions): 7 | "Mergers and acquisitions (M&A) are transactions in which the 8 | ownership of companies, other business organizations or their 9 | operating units are transferred or combined. As an aspect of strategic 10 | management, M&A can allow enterprises to grow, shrink, and change the 11 | nature of their business or competitive position." 12 | 13 | Many traders and investors are interested in predicting M&As in order 14 | to adjust their strategy according to an M&A upcoming event. 15 | 16 | Having a rationally big dataset of company and M&A information we can 17 | try to analyze it and build a predictive model using different machine 18 | learning (ML) techniques. The model can be wrapped in a web 19 | application available for traders and investors. 20 | 21 | ## Our Goals 22 | 23 | * To collect a dataset of company and M&A information from Crunchbase 24 | (see the next section). 25 | * Analyze the dataset and build a predictive model of M&A. 26 | * Wrap the model into a web app that will display predictions of 27 | future M&A. 28 | 29 | ## Crunchbase 30 | 31 | [Crunchbase](https://www.crunchbase.com/) is a database of the startup 32 | ecosystem consisting of investors, incubators and start-ups, which 33 | comprises around 500,000 data points profiling companies, people, 34 | funds, fundings and events. 35 | 36 | The good thing about Crunchbase is it provides a full-fledged API to 37 | the start-ups data it has. Please 38 | refer [this page](https://data.crunchbase.com/) to learn more about 39 | the API. 40 | 41 | A paid Crunchbase account can be provided by Crystalnix if it is 42 | necessary. 43 | 44 | ## Project Stages 45 | 46 | The project will consist of three major stages each of which depends 47 | on the previous one: 48 | 49 | 1. Collecting the data. 50 | 2. Building the model. 51 | 3. Implementing a web app. 52 | 53 | ## Collecting the Data 54 | 55 | The data is supposed to be collected by using Crunchbase API. Less 56 | interesting but still viable option is to 57 | use 58 | [the full 2013 dataset](https://data.crunchbase.com/docs/getting-started#basic-access). 59 | 60 | ## Building the Model 61 | 62 | It would be nice to avoid using a lot of computational power (such as 63 | Amazon EMR) to produce the model. However we are not sure how large 64 | the dataset is going to be. In case additional computational power is 65 | necessary it is advised to contact to the project manager responsible 66 | for the project. 67 | 68 | ## Implementing a Web App 69 | 70 | The web app can be described by the following user stories: 71 | 72 | * As a user I can register to the application using my email and 73 | password or by using a social account (Google, Facebook, Twitter, 74 | etc). 75 | * As a user after login I can see the dashboard with company names and 76 | probabilities they will be acquired. 
77 |
78 | Example of the app dashboard:
79 |
80 | | Company Name              | Chance to Be Acquired, % |
81 | |---------------------------|--------------------------|
82 | | Religare Health Insurance | 69.97                    |
83 | | Supremia Grup             | 9.16                     |
84 | | Alpine Jan-San            | 55.58                    |
85 | | Fitzroy Communications    | 9.99                     |
86 | | Connextions, Inc.         | 40.04                    |
87 |
88 |
89 | No design work is planned for the first version of the app.
90 |
91 | ## Technologies
92 |
93 | The preferred technologies are:
94 |
95 | * Python scientific stack for data analysis and ML.
96 | * Django for the web app.
97 | * Deployment on Amazon Web Services infrastructure.
98 |
99 | These requirements are not set in stone and are subject to discussion.
100 |
101 | ## Assumptions
102 |
103 | * It is possible to get data from Crunchbase via the REST API.
104 | * The dataset is reasonably small and informative enough to be processed on a
105 |   single machine.
106 | * Collecting and processing data doesn't violate any Crunchbase
107 |   license terms.
108 |
109 | ## Our Approach
110 |
111 | Any issues should be reported as soon as possible. We
112 | prefer to make and spot mistakes quickly, while it is cheap to fix them
113 | and adjust the direction.
114 |
115 |
116 | ## Application
117 | For the purposes of our work, a CLI application has been developed. To run the application, use `python acquisition_prediction.py`.
118 |
119 | ### Installation
120 | To install the application, clone this repository into a folder of your choice and run `pip install -r requirements.txt`.
121 |
122 | ### Usage
123 |     acquisition_prediction.py [-h] [--extract | --fit] [--user USER]
124 |                               [--password PASSWORD] [--scheme SCHEME]
125 |
126 | | Option    | Description                                                                                                                                 |
127 | |-----------|---------------------------------------------------------------------------------------------------------------------------------------------|
128 | | --extract | Extract features from the database and save them to the file paths defined in settings.py. You must also specify the database credentials.   |
129 | | --fit     | Build the prediction model and print the best parameters and scores.                                                                         |
130 |
131 | Database credentials:
132 |
133 | | Credential          | Description            |
134 | |---------------------|------------------------|
135 | | --user USER         | Database user          |
136 | | --password PASSWORD | Database user password |
137 | | --scheme SCHEME     | Database scheme        |
--------------------------------------------------------------------------------
/scripts/feature_extraction.py:
--------------------------------------------------------------------------------
1 | """This script builds the CSV data file with the characteristics described in "Data description.md"
2 | from the SQL dump provided by Crunchbase and located in the "sql" folder.
3 | The dump should be imported into a local MySQL database.
4 | In addition, this script splits the data into two parts: train_data and test_data.
5 | You will be asked for the user, password and scheme of the database you use."""
6 |
7 | import pandas as pd
8 | import MySQLdb as sql
9 | import numpy as np
10 | from scripts.sql_requests import get_degrees_query, get_financial_ipo_offices_products_query, valid_degrees
11 | from sklearn.model_selection import train_test_split
12 | from scripts.settings import DATA_FILE, TEST_FILE, TRAIN_FILE
13 |
14 |
15 | def fix_column(df, column):
16 |     """
17 |     Set df[column] to NaN if its date falls after the acquisition date.
18 | """ 19 | column_after_acquired = '%s_after_acquired' % column 20 | column_acquired = pd.DataFrame({column: df[column], 'acquired': df.acquired_at}) 21 | column_acquired.dropna(how='any', inplace=True) 22 | column_acquired.loc[:, column] = column_acquired[column].apply(pd.to_datetime) 23 | column_acquired.loc[:, 'acquired'] = column_acquired.acquired.apply(pd.to_datetime) 24 | column_acquired[column_after_acquired] = (column_acquired[column] - column_acquired['acquired']).apply( 25 | lambda x: x.days > 0) 26 | indexes = column_acquired[column_acquired[column_after_acquired]].index 27 | df.loc[indexes, column] = np.nan 28 | 29 | 30 | class FeatureExtractor: 31 | def __init__(self): 32 | self.db = None 33 | self.data = None 34 | self.degrees = None 35 | 36 | def connect_to_db(self, user, password, scheme): 37 | """ 38 | Connect to database. 39 | :param user: database user 40 | :param password: database user's password 41 | :param scheme: database scheme in which sql dump is stored 42 | :return: 43 | """ 44 | self.db = sql.connect(user=user, passwd=password, db=scheme) 45 | self.scheme = scheme 46 | 47 | def perform_queries(self): 48 | """ 49 | Extract data from database. 50 | :return: 51 | """ 52 | self.data = get_financial_ipo_offices_products_query(self.db, self.scheme) 53 | self.degrees = get_degrees_query(self.db, self.scheme) 54 | 55 | def extract_degrees(self): 56 | """ 57 | Calculate how much people with degrees stored in valid_degrees are associated with each company. 58 | :return: 59 | """ 60 | degrees = self.degrees 61 | for degree_name in valid_degrees + ('other',): 62 | deg = degrees[degrees.degree_type == degree_name][['count', 'company_id']] 63 | deg.columns = ['%s_degree' % degree_name, 'company_id'] 64 | self.data = self.data.merge(deg, on='company_id', how='left') 65 | 66 | def fix_public_and_closed(self): 67 | """ 68 | Set 'public_at, 'closed_at' to NaN if they were after acquisition. 69 | :return: 70 | """ 71 | fix_column(self.data, 'public_at') 72 | fix_column(self.data, 'closed_at') 73 | 74 | def set_binary_columns(self): 75 | """ 76 | Set 'ipo', 'is_acquired', 'is_closed' in data. 77 | :return: 78 | """ 79 | self.data['ipo'] = self.data.public_at.notnull() 80 | self.data['is_acquired'] = self.data.acquired_at.notnull() 81 | self.data['is_closed'] = self.data.closed_at.notnull() 82 | 83 | def calculate_age(self): 84 | """ 85 | Add an "age" column to data where age is set up on date of acquisition, if it was, 86 | or on 01.01.2014. 87 | """ 88 | df = self.data 89 | is_acquired = df.is_acquired 90 | is_not_acquired = ~is_acquired 91 | df.loc[is_acquired, 'age'] = ( 92 | df[is_acquired]['acquired_at'].apply(pd.to_datetime) - df[is_acquired]['founded_at'].apply(pd.to_datetime))\ 93 | .apply(lambda x: x.days) 94 | df.loc[is_not_acquired, 'age'] = ( 95 | pd.to_datetime('2014-01-01') - df[is_not_acquired]['founded_at'].apply(pd.to_datetime)) \ 96 | .apply(lambda x: x.days) 97 | df.drop(df[df.age < 0].index, inplace=True) 98 | 99 | def set_geo_info(self): 100 | """ 101 | Set all values, except 'USA' and 'NZL' in country_code to 'other'. 102 | Also set state_code to 'California', if it was 'CA', else 'other'. 
103 |         :return:
104 |         """
105 |         data = self.data
106 |         data.loc[:, 'country_code'] = data['country_code'].apply(lambda x: x if x in ['USA', 'NZL'] else 'other')
107 |         data.loc[:, 'state_code'] = data['state_code'].apply(lambda x: 'California' if x == 'CA' else 'other')
108 |
109 |     def drop_excess_columns(self):
110 |         """
111 |         Drop 'founded_at', 'closed_at', 'public_at', 'acquired_at', 'city', 'region' from the data.
112 |         :return:
113 |         """
114 |         columns_to_drop = ['founded_at', 'closed_at', 'public_at', 'acquired_at', 'city', 'region']
115 |         self.data.drop(columns_to_drop, inplace=True, axis=1)
116 |
117 |     def save_to_files(self):
118 |         """
119 |         Save the resulting data to DATA_FILE, and also split it into TRAIN_FILE and TEST_FILE using a stratified strategy.
120 |         :return:
121 |         """
122 |         data = self.data
123 |         data.to_csv(DATA_FILE, index=False)
124 |         x_train, x_test, y_train, y_test = train_test_split(data.drop(['is_acquired'], axis=1), data['is_acquired'],
125 |                                                             stratify=data['is_acquired'], test_size=0.2)
126 |         x_train['is_acquired'] = y_train
127 |         x_test['is_acquired'] = y_test
128 |         x_train.to_csv(TRAIN_FILE, index=False)
129 |         x_test.to_csv(TEST_FILE, index=False)
130 |
131 |
132 | def do_feature_extraction(user, password, scheme):
133 |     """
134 |     The usual workflow for this script.
135 |     :param user: database user
136 |     :param password: database user's password
137 |     :param scheme: database scheme in which the sql dump is stored
138 |     :return:
139 |     """
140 |     try:
141 |         feature_extractor = FeatureExtractor()
142 |         feature_extractor.connect_to_db(user, password, scheme)
143 |         feature_extractor.perform_queries()
144 |         feature_extractor.extract_degrees()
145 |         feature_extractor.fix_public_and_closed()
146 |         feature_extractor.set_binary_columns()
147 |         feature_extractor.calculate_age()
148 |         feature_extractor.set_geo_info()
149 |         feature_extractor.drop_excess_columns()
150 |         feature_extractor.save_to_files()
151 |     except sql.OperationalError:
152 |         print("Wrong credentials for database access.")
153 |
154 | if __name__ == '__main__':
155 |     user = input("user: ")
156 |     password = input("password: ")
157 |     scheme = input("scheme: ")
158 |     do_feature_extraction(user, password, scheme)
--------------------------------------------------------------------------------
/scripts/model.py:
--------------------------------------------------------------------------------
1 | """This script builds the prediction model: it reads samples from the data files,
2 | applies transformations to them and searches for the best prediction parameters."""
3 |
4 | import pandas as pd
5 | import numpy as np
6 | from sklearn.pipeline import Pipeline
7 | from sklearn.ensemble import GradientBoostingClassifier
8 | from sklearn.linear_model import RANSACRegressor
9 | from sklearn.model_selection import GridSearchCV, StratifiedKFold
10 | from sklearn.preprocessing import LabelBinarizer, StandardScaler, Imputer
11 | from sklearn_pandas import DataFrameMapper, CategoricalImputer
12 | from sklearn.base import BaseEstimator, TransformerMixin
13 | from sklearn.svm import SVC
14 | from sklearn.neural_network import MLPClassifier
15 | from sklearn.metrics import f1_score, recall_score, precision_score
16 | from scripts.settings import DATA_FILE, TRAIN_FILE, TEST_FILE
17 |
18 | import warnings
19 |
20 | warnings.filterwarnings("ignore")
21 |
22 |
23 | class ValueImputer(BaseEstimator, TransformerMixin):
24 |     """
25 |     Impute missing values with a particular value.
26 | """ 27 | def __init__(self, value, missing_values='NaN', copy=True): 28 | self.value = value 29 | self.missing_values = missing_values 30 | self.copy = copy 31 | 32 | def fit(self, X, y=None): 33 | return self 34 | 35 | def transform(self, X): 36 | mask = self._get_mask(X, self.missing_values) 37 | if self.copy: 38 | X = X.copy() 39 | X[mask] = self.value 40 | return X 41 | 42 | @staticmethod 43 | def _get_mask(X, value): 44 | """ 45 | Compute the boolean mask X == missing_values. 46 | """ 47 | if value == "NaN" or value is None or (isinstance(value, float) and np.isnan(value)): 48 | return pd.isnull(X) 49 | else: 50 | return X == value 51 | 52 | 53 | class OnceFittedLabelBinarizer(LabelBinarizer): 54 | """ 55 | Usual LabelBinarizer, but it can be fitted only once. 56 | """ 57 | def __init__(self, neg_label=0, pos_label=1, sparse_output=False): 58 | super().__init__(neg_label, pos_label, sparse_output) 59 | self.once_fitted = False 60 | 61 | def fit(self, y): 62 | if self.once_fitted: 63 | return self 64 | self.once_fitted = True 65 | return super().fit(y) 66 | 67 | 68 | class FundImputer(BaseEstimator, TransformerMixin): 69 | """ 70 | Impute average funds based on total rounds using RANSACRegressor. 71 | """ 72 | 73 | def __init__(self): 74 | self.clf = RANSACRegressor() 75 | 76 | def fit(self, X, y=None): 77 | frame = pd.DataFrame({'total_rounds': X[:, 0], 'average_funded': X[:, 1]}) 78 | grouped = frame.groupby('total_rounds').average_funded.mean() 79 | rounds_funds = pd.DataFrame({'rounds': grouped.index, 'funded': grouped}) 80 | shape = (len(rounds_funds), 1) 81 | self.clf.fit(rounds_funds.rounds.as_matrix().reshape(shape), rounds_funds.funded.as_matrix().reshape(shape)) 82 | return self 83 | 84 | def transform(self, X): 85 | frame = pd.DataFrame({'total_rounds': X[:, 0], 'average_funded': X[:, 1]}) 86 | null_funded = frame.average_funded.isnull() 87 | total_shape = (len(frame), 1) 88 | null_funded_shape = (len(frame[null_funded]), 1) 89 | prediction = self.clf.predict(frame[null_funded].total_rounds.as_matrix().reshape(null_funded_shape)) 90 | frame.loc[null_funded, 'average_funded'] = prediction.ravel() 91 | transformed = frame.average_funded.as_matrix().reshape(total_shape) 92 | return transformed 93 | 94 | 95 | class ParticipantsImputer(BaseEstimator, TransformerMixin): 96 | """ 97 | Impute participants number based on average funds using RANSACRegressor. 
98 | """ 99 | 100 | def __init__(self): 101 | self.clf = RANSACRegressor() 102 | 103 | def fit(self, X, y=None): 104 | frame = pd.DataFrame({'average_funded': X[:, 0], 'average_participants': X[:, 1]}) 105 | funds_participants = frame[(frame.average_participants != 0.0) & frame.average_funded.notnull()] 106 | shape = (len(funds_participants), 1) 107 | features = funds_participants.average_funded.as_matrix().reshape(shape) 108 | ground_truth = funds_participants.average_participants.as_matrix().reshape(shape) 109 | self.clf.fit(features, ground_truth) 110 | return self 111 | 112 | def transform(self, X): 113 | frame = pd.DataFrame({'average_funded': X[:, 0], 'average_participants': X[:, 1]}) 114 | null_participants = (frame.average_participants == 0.0) & frame.average_funded.notnull() 115 | total_shape = (len(frame), 1) 116 | null_funded_shape = (len(frame[null_participants]), 1) 117 | prediction = self.clf.predict(frame[null_participants].average_funded.as_matrix().reshape(null_funded_shape)) 118 | frame.loc[null_participants, 'average_participants'] = prediction.ravel() 119 | transformed = frame.average_participants.as_matrix().reshape(total_shape) 120 | return transformed 121 | 122 | 123 | class ModelBuilder: 124 | def __init__(self): 125 | self.data = None 126 | self.X_train = None 127 | self.Y_train = None 128 | self.X_test = None 129 | self.Y_test = None 130 | self.clf = None 131 | category_binarizer = OnceFittedLabelBinarizer() 132 | country_binarizer = OnceFittedLabelBinarizer() 133 | state_binarizer = OnceFittedLabelBinarizer() 134 | self.category_mapper = DataFrameMapper([ 135 | (['category_code'], [CategoricalImputer(), category_binarizer]), 136 | (['country_code'], [CategoricalImputer(), country_binarizer]), 137 | (['state_code'], [CategoricalImputer(), state_binarizer]), 138 | ]) 139 | self.mapper = DataFrameMapper([ 140 | (['category_code'], [CategoricalImputer(), category_binarizer], {'alias': 'category'}), 141 | (['country_code'], [CategoricalImputer(), country_binarizer], {'alias': 'country'}), 142 | (['state_code'], [CategoricalImputer(), state_binarizer], {'alias': 'state'}), 143 | (['mba_degree'], [ValueImputer(0), StandardScaler()]), 144 | (['phd_degree'], [ValueImputer(0), StandardScaler()]), 145 | (['ms_degree'], [ValueImputer(0), StandardScaler()]), 146 | (['other_degree'], [ValueImputer(0)]), 147 | (['age'], [Imputer(), StandardScaler()]), 148 | (['offices'], [ValueImputer(1.0), StandardScaler()]), 149 | (['products_number'], [ValueImputer(1.0), StandardScaler()]), 150 | (['average_funded', 'average_participants'], [ParticipantsImputer(), StandardScaler()], 151 | {'alias': 'average_participants'}), 152 | (['total_rounds'], None), 153 | (['ipo'], None), 154 | (['is_closed'], None), 155 | (['total_rounds', 'average_funded'], [FundImputer(), StandardScaler()], {'alias': 'average_funded'}), 156 | (['acquired_companies'], [ValueImputer(0)]), 157 | ]) 158 | SVC_C_grid = [10 ** i for i in range(-3, 4)] 159 | SVC_gamma_grid = [10 ** i for i in range(-3, 1)] + ['auto'] 160 | MLP_hidden_layer_sizes = [[25], [50], [75], [100], [50, 25], [75, 50], [100, 75], [75, 50, 25], [100, 75, 50]] 161 | MLP_activation = ['logistic', 'tanh', 'relu'] 162 | self.grid = [{'clf': [GradientBoostingClassifier()], 'clf__n_estimators': [20 * i for i in range(5, 8)], 163 | 'clf__max_depth': [i + 3 for i in range(2, 6)]}, 164 | {'clf': [SVC(kernel='rbf', class_weight='balanced')], 'clf__C': SVC_C_grid, 165 | 'clf__gamma': SVC_gamma_grid}, 166 | {'clf': [SVC(kernel='poly', class_weight='balanced')], 
'clf__C': SVC_C_grid,
167 |                      'clf__gamma': SVC_gamma_grid,
168 |                      'clf__degree': list(range(3, 6))},
169 |                     {'clf': [MLPClassifier()], 'clf__hidden_layer_sizes': MLP_hidden_layer_sizes,
170 |                      'clf__activation': MLP_activation,
171 |                      'clf__alpha': [10 ** i for i in range(-1, 3)]}]
172 |
173 |     def read_data(self, data_path, train_path, test_path):
174 |         """
175 |         Read data.
176 |         :param data_path: path to full data
177 |         :param train_path: path to train data
178 |         :param test_path: path to test data
179 |         :return:
180 |         """
181 |         self.data = pd.read_csv(data_path)
182 |         train_data = pd.read_csv(train_path)
183 |         test_data = pd.read_csv(test_path)
184 |         self.X_train = train_data.drop(['company_id', 'is_acquired'], axis=1)
185 |         self.Y_train = train_data.is_acquired.as_matrix()
186 |         self.X_test = test_data.drop(['company_id', 'is_acquired'], axis=1)
187 |         self.Y_test = test_data.is_acquired.as_matrix()
188 |
189 |     def fit(self):
190 |         """
191 |         Find the best parameters for classification.
192 |         :return:
193 |         """
194 |         self.category_mapper.fit(self.data)
195 |         estimators = [('fill_nan', self.mapper), ('clf', GradientBoostingClassifier())]
196 |         pipe = Pipeline(estimators)
197 |         self.clf = GridSearchCV(pipe, self.grid, scoring='f1', cv=StratifiedKFold(n_splits=3, shuffle=True), verbose=5)
198 |         self.clf.fit(self.X_train, self.Y_train)
199 |
200 |     def print_results(self):
201 |         """
202 |         Print the best score, the best params and the prediction metrics for the test data.
203 |         :return:
204 |         """
205 |         print("Best score: ", self.clf.best_score_)
206 |         print("Best params: ", self.clf.best_params_)
207 |         prediction = self.clf.predict(self.X_test)
208 |         print("F1-score for test data: ", f1_score(self.Y_test, prediction))
209 |         print("Recall for test data: ", recall_score(self.Y_test, prediction))
210 |         print("Precision for test data: ", precision_score(self.Y_test, prediction))
211 |
212 |
213 | def do_model_building():
214 |     model_builder = ModelBuilder()
215 |     model_builder.read_data(DATA_FILE, TRAIN_FILE, TEST_FILE)
216 |     model_builder.fit()
217 |     model_builder.print_results()
218 |
219 | if __name__ == '__main__':
220 |     do_model_building()
221 |
--------------------------------------------------------------------------------
/develop/Model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "To begin, we make the required imports:"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {
14 |     "collapsed": true
15 |    },
16 |    "outputs": [],
17 |    "source": [
18 |     "import os\n",
19 |     "import matplotlib.pyplot as plt\n",
20 |     "import seaborn as sns\n",
21 |     "import pandas as pd\n",
22 |     "import numpy as np\n",
23 |     "from sklearn.pipeline import Pipeline\n",
24 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
25 |     "from sklearn.linear_model import RANSACRegressor\n",
26 |     "from sklearn.model_selection import GridSearchCV, StratifiedKFold\n",
27 |     "from sklearn.preprocessing import LabelBinarizer, StandardScaler, Imputer\n",
28 |     "from sklearn_pandas import DataFrameMapper, CategoricalImputer\n",
29 |     "from sklearn.base import BaseEstimator, TransformerMixin\n",
30 |     "from sklearn.svm import SVC\n",
31 |     "from sklearn.neural_network import MLPClassifier\n",
32 |     "from sklearn.metrics import f1_score\n",
33 |     "sns.set(style=\"whitegrid\", color_codes=True)\n",
34 |     "%matplotlib inline\n",
35 |     "import warnings\n",
36 |
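    "# Silence warnings (mostly sklearn deprecation notices) so the notebook output stays readable\n",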
"warnings.filterwarnings(\"ignore\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Then we define pathes to our data files:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "def path_to_data_file(path):\n", 55 | " return os.path.normpath(os.path.join(os.path.abspath(os.path.curdir), path))\n", 56 | "\n", 57 | "train_file = \"../data/train_data.csv\"\n", 58 | "test_file = \"../data/test_data.csv\"\n", 59 | "data_file = \"../data/data.csv\"\n", 60 | "train_path = path_to_data_file(train_file)\n", 61 | "test_path = path_to_data_file(test_file)\n", 62 | "data_path = path_to_data_file(data_file)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Class ValueImputer is used for imputing missed values with particular value:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "class ValueImputer(BaseEstimator, TransformerMixin):\n", 81 | " def __init__(self, value, missing_values='NaN', copy=True):\n", 82 | " self.value = value\n", 83 | " self.missing_values = missing_values\n", 84 | " self.copy = copy\n", 85 | "\n", 86 | " def fit(self, X, y=None):\n", 87 | " return self\n", 88 | "\n", 89 | " def transform(self, X):\n", 90 | " mask = self._get_mask(X, self.missing_values)\n", 91 | " if self.copy:\n", 92 | " X = X.copy()\n", 93 | " X[mask] = self.value\n", 94 | " return X\n", 95 | "\n", 96 | " @staticmethod\n", 97 | " def _get_mask(X, value):\n", 98 | " \"\"\"\n", 99 | " Compute the boolean mask X == missing_values.\n", 100 | " \"\"\"\n", 101 | " if value == \"NaN\" or value is None or (isinstance(value, float) and np.isnan(value)):\n", 102 | " return pd.isnull(X)\n", 103 | " else:\n", 104 | " return X == value" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "The next class OnceFittedLabelBinarizer is modified usual LabelBinarizer. We make this modification because of behavior of DataFrameMapper used further. We need to train LabelBinarizer on full data to perform transformation for train set, but DataFrameMapper will fit binarizer on train data again, and some categories may be not learned." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "class OnceFittedLabelBinarizer(LabelBinarizer):\n", 123 | " def __init__(self, neg_label=0, pos_label=1, sparse_output=False):\n", 124 | " super().__init__(neg_label, pos_label, sparse_output)\n", 125 | " self.once_fitted = False\n", 126 | "\n", 127 | " def fit(self, y):\n", 128 | " if self.once_fitted:\n", 129 | " return self\n", 130 | " self.once_fitted = True\n", 131 | " return super().fit(y)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "Class FundImputer is used for imputing missing values in column \"average_funded\". We supposed that average funding depends on number of funding rounds, so for imputing missing values for that feature RANSACRegressor was trained. 
" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "class FundImputer(BaseEstimator, TransformerMixin):\n", 150 | " \"\"\"\n", 151 | " Impute average funds based on total rounds using RANSACRegressor.\n", 152 | " \"\"\"\n", 153 | "\n", 154 | " def __init__(self):\n", 155 | " self.clf = RANSACRegressor()\n", 156 | "\n", 157 | " def fit(self, X, y=None):\n", 158 | " frame = pd.DataFrame({'total_rounds': X[:, 0], 'average_funded': X[:, 1]})\n", 159 | " grouped = frame.groupby('total_rounds').average_funded.mean()\n", 160 | " rounds_funds = pd.DataFrame({'rounds': grouped.index, 'funded': grouped})\n", 161 | " shape = (len(rounds_funds), 1)\n", 162 | " self.clf.fit(rounds_funds.rounds.as_matrix().reshape(shape), rounds_funds.funded.as_matrix().reshape(shape))\n", 163 | " return self\n", 164 | "\n", 165 | " def transform(self, X):\n", 166 | " frame = pd.DataFrame({'total_rounds': X[:, 0], 'average_funded': X[:, 1]})\n", 167 | " null_funded = frame.average_funded.isnull()\n", 168 | " total_shape = (len(frame), 1)\n", 169 | " null_funded_shape = (len(frame[null_funded]), 1)\n", 170 | " prediction = self.clf.predict(frame[null_funded].total_rounds.as_matrix().reshape(null_funded_shape))\n", 171 | " frame.loc[null_funded, 'average_funded'] = prediction.ravel()\n", 172 | " transformed = frame.average_funded.as_matrix().reshape(total_shape)\n", 173 | " return transformed" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "The same technique was used for fixing average number of participants when company raised money, but had number of funding round participants equal to zero, with difference that number of participants was recovering by average funding." 
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {
187 |     "collapsed": true
188 |    },
189 |    "outputs": [],
190 |    "source": [
191 |     "class ParticipantsImputer(BaseEstimator, TransformerMixin):\n",
192 |     "    \"\"\"\n",
193 |     "    Impute the number of participants based on average funding using RANSACRegressor.\n",
194 |     "    \"\"\"\n",
195 |     "\n",
196 |     "    def __init__(self):\n",
197 |     "        self.clf = RANSACRegressor()\n",
198 |     "\n",
199 |     "    def fit(self, X, y=None):\n",
200 |     "        frame = pd.DataFrame({'average_funded': X[:, 0], 'average_participants': X[:, 1]})\n",
201 |     "        funds_participants = frame[(frame.average_participants != 0.0) & frame.average_funded.notnull()]\n",
202 |     "        shape = (len(funds_participants), 1)\n",
203 |     "        features = funds_participants.average_funded.as_matrix().reshape(shape)\n",
204 |     "        ground_truth = funds_participants.average_participants.as_matrix().reshape(shape)\n",
205 |     "        self.clf.fit(features, ground_truth)\n",
206 |     "        return self\n",
207 |     "\n",
208 |     "    def transform(self, X):\n",
209 |     "        frame = pd.DataFrame({'average_funded': X[:, 0], 'average_participants': X[:, 1]})\n",
210 |     "        null_participants = (frame.average_participants == 0.0) & frame.average_funded.notnull()\n",
211 |     "        total_shape = (len(frame), 1)\n",
212 |     "        null_funded_shape = (len(frame[null_participants]), 1)\n",
213 |     "        prediction = self.clf.predict(frame[null_participants].average_funded.as_matrix().reshape(null_funded_shape))\n",
214 |     "        frame.loc[null_participants, 'average_participants'] = prediction.ravel()\n",
215 |     "        transformed = frame.average_participants.as_matrix().reshape(total_shape)\n",
216 |     "        return transformed"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "Now we are ready to start learning. First, we train our label binarizers on the whole dataset so as not to miss any categories."
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {
230 |     "collapsed": true
231 |    },
232 |    "outputs": [],
233 |    "source": [
234 |     "data = pd.read_csv(data_path)\n",
235 |     "category_binarizer = OnceFittedLabelBinarizer()\n",
236 |     "country_binarizer = OnceFittedLabelBinarizer()\n",
237 |     "state_binarizer = OnceFittedLabelBinarizer()\n",
238 |     "category_mapper = DataFrameMapper([\n",
239 |     "    (['category_code'], [CategoricalImputer(), category_binarizer]),\n",
240 |     "    (['country_code'], [CategoricalImputer(), country_binarizer]),\n",
241 |     "    (['state_code'], [CategoricalImputer(), state_binarizer]),\n",
242 |     "])\n",
243 |     "category_mapper.fit(data)"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "markdown",
248 |    "metadata": {},
249 |    "source": [
250 |     "Then we map our columns to their corresponding transformations."
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": null,
256 |    "metadata": {
257 |     "collapsed": true
258 |    },
259 |    "outputs": [],
260 |    "source": [
261 |     "mapper = DataFrameMapper([\n",
262 |     "    (['category_code'], [CategoricalImputer(), category_binarizer], {'alias': 'category'}),\n",
263 |     "    (['country_code'], [CategoricalImputer(), country_binarizer], {'alias': 'country'}),\n",
264 |     "    (['state_code'], [CategoricalImputer(), state_binarizer], {'alias': 'state'}),\n",
265 |     "    (['mba_degree'], [ValueImputer(0), StandardScaler()]),\n",
266 |     "    (['phd_degree'], [ValueImputer(0), StandardScaler()]),\n",
267 |     "    (['ms_degree'], [ValueImputer(0), StandardScaler()]),\n",
268 |     "    (['other_degree'], [ValueImputer(0)]),\n",
269 |     "    (['age'], [Imputer(), StandardScaler()]),\n",
270 |     "    (['offices'], [ValueImputer(1.0), StandardScaler()]),\n",
271 |     "    (['products_number'], [ValueImputer(1.0), StandardScaler()]),\n",
272 |     "    (['average_funded', 'average_participants'], [ParticipantsImputer(), StandardScaler()],\n",
273 |     "     {'alias': 'average_participants'}),\n",
274 |     "    (['total_rounds'], None),\n",
275 |     "    (['ipo'], None),\n",
276 |     "    (['is_closed'], None),\n",
277 |     "    (['total_rounds', 'average_funded'], [FundImputer(), StandardScaler()], {'alias': 'average_funded'}),\n",
278 |     "    (['acquired_companies'], [ValueImputer(0)]),\n",
279 |     "])"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "As the next step, we define the parameter grid that will be used later in the grid search."
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": null,
292 |    "metadata": {
293 |     "collapsed": true
294 |    },
295 |    "outputs": [],
296 |    "source": [
297 |     "SVC_C_grid = [10 ** i for i in range(-3, 4)]\n",
298 |     "SVC_gamma_grid = [10 ** i for i in range(-3, 1)] + ['auto']\n",
299 |     "MLP_hidden_layer_sizes = [[25], [50], [75], [100], [50, 25], [75, 50], [100, 75], [75, 50, 25], [100, 75, 50]]\n",
300 |     "MLP_activation = ['logistic', 'tanh', 'relu']\n",
301 |     "grid = [{'clf': [GradientBoostingClassifier()], 'clf__n_estimators': [20 * i for i in range(5, 8)],\n",
302 |     "         'clf__max_depth': [i + 3 for i in range(2, 6)]},\n",
303 |     "        {'clf': [SVC(kernel='rbf', class_weight='balanced')], 'clf__C': SVC_C_grid, 'clf__gamma':SVC_gamma_grid},\n",
304 |     "        {'clf': [SVC(kernel='poly', class_weight='balanced')], 'clf__C': SVC_C_grid, 'clf__gamma':SVC_gamma_grid,\n",
305 |     "         'clf__degree': list(range(3, 6))},\n",
306 |     "        {'clf': [MLPClassifier()], 'clf__hidden_layer_sizes': MLP_hidden_layer_sizes, 'clf__activation': MLP_activation,\n",
307 |     "         'clf__alpha': [10 ** i for i in range(-1, 3)]}]"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "metadata": {},
313 |    "source": [
314 |     "Now we load the train and test data:"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "metadata": {
321 |     "collapsed": true
322 |    },
323 |    "outputs": [],
324 |    "source": [
325 |     "train_data = pd.read_csv(train_path)\n",
326 |     "test_data = pd.read_csv(test_path)\n",
327 |     "X_train = train_data.drop(['company_id', 'is_acquired'], axis=1)\n",
328 |     "Y_train = train_data.is_acquired.as_matrix()\n",
329 |     "X_test = test_data.drop(['company_id', 'is_acquired'], axis=1)\n",
330 |     "Y_test = test_data.is_acquired.as_matrix()"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "markdown",
335 |    "metadata": {},
336 |    "source": [
337 |     "All preparations are completed, and we are ready to fit."
338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": true 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "estimators = [('fill_nan', mapper), ('clf', GradientBoostingClassifier())]\n", 349 | "pipe = Pipeline(estimators)\n", 350 | "clf = GridSearchCV(pipe, grid, scoring='f1', cv=StratifiedKFold(n_splits=3, shuffle=True))\n", 351 | "clf.fit(X_train, Y_train)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "Let's look at the results:" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "print(\"Best score: \", clf.best_score_)\n", 370 | "print(\"Best params: \", clf.best_params_)\n", 371 | "\n", 372 | "prediction = clf.predict(X_test)\n", 373 | "print(f1_score(Y_test, prediction))\n", 374 | "\n", 375 | "recall = (Y_test & prediction).sum() / Y_test.sum()\n", 376 | "precision = (Y_test & prediction).sum() / prediction.sum()\n", 377 | "print(\"recall: \", recall)\n", 378 | "print(\"precision: \", precision)" 379 | ] 380 | } 381 | ], 382 | "metadata": { 383 | "kernelspec": { 384 | "display_name": "Python 3", 385 | "language": "python", 386 | "name": "python3" 387 | }, 388 | "language_info": { 389 | "codemirror_mode": { 390 | "name": "ipython", 391 | "version": 3 392 | }, 393 | "file_extension": ".py", 394 | "mimetype": "text/x-python", 395 | "name": "python", 396 | "nbconvert_exporter": "python", 397 | "pygments_lexer": "ipython3", 398 | "version": "3.5.2" 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 2 403 | } 404 | --------------------------------------------------------------------------------