├── CONTRIBUTING.md
├── LICENSE.txt
├── requirements.txt
├── README.md
├── k_anonymity.py
├── uci_config.py
├── bow_lib.py
├── factbook.yaml
├── synthesis_wrapper.py
├── uci_demo.py
├── dimanalysis_lib.py
├── utilities_lib.py
├── visualization_lib.py
├── preprocessor_lib.py
├── uci-heart-disease
│   ├── heart-disease.names.txt
│   ├── .ipynb_checkpoints
│   │   ├── heart-disease.names-checkpoint.txt
│   │   └── processed.cleveland-checkpoint.csv
│   └── processed.cleveland.csv
├── analytics_wrapper.py
├── analytics_lib.py
└── synthesis_lib.py

/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | ## Requirements
4 | 
5 | Python 3.8 or later
6 | 
7 | All required Python modules are listed in requirements.txt.
8 | 
9 | 
10 | 
11 | ## Installation
12 | 
13 | To install the required Python modules, run the following command:
14 | 
15 | pip install -r requirements.txt
16 | 
17 | 
18 | ## Usage
19 | 
20 | 1. Modify uci_config.py, or use it as is to run the sample UCI heart disease dataset
21 | 
22 | 2. Run python uci_demo.py
23 | 
24 | 3. The outputs, including the synthesized data and the cross-validation results, will be written to the output directory set in uci_config.py (./output_demo/ for the sample config)
25 | 
26 | 
27 | ## Testing
28 | 
29 | Check the output directory for the synthesized data, the log file, the cross-validation
30 | related files, and the PDF report.
31 | 
32 | ## Branches & pull requests
33 | We use the git-flow branching strategy. Features should be based off the `develop` branch and merged using GitHub pull requests.
34 | 
35 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2022 Medidata Solutions, Inc.
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.9.2
2 | aiohttp-cors==0.7.0
3 | aioredis==1.3.1
4 | alignment==1.0.10
5 | astor==0.8.1
6 | async-timeout==3.0.1
7 | attrs==21.2.0
8 | autograd==1.3
9 | autograd-gamma==0.5.0
10 | blessings==1.7
11 | cachetools==4.2.2
12 | certifi==2023.7.22
13 | chardet==4.0.0
14 | charset-normalizer==2.0.6
15 | click==8.0.1
16 | colorful==0.5.4
17 | cycler==0.10.0
18 | editdistance==0.5.3
19 | filelock==3.0.12
20 | formulaic==0.2.4
21 | future==0.18.3
22 | google-api-core==1.31.3
23 | google-auth==1.35.0
24 | googleapis-common-protos==1.53.0
25 | gower==0.0.5
26 | gpustat==0.6.0
27 | grpcio==1.53.0
28 | hiredis==2.0.0
29 | idna==3.2
30 | interface-meta==1.2.4
31 | joblib==1.2.0
32 | jsonschema==3.2.0
33 | kiwisolver==1.3.2
34 | lifelines==0.26.3
35 | llvmlite==0.34.0
36 | matplotlib==3.4.3
37 | msgpack==1.0.2
38 | multidict==5.1.0
39 | numba==0.51.2
40 | numpy==1.22
41 | nvidia-ml-py3==7.352.0
42 | opencensus==0.7.13
43 | opencensus-context==0.1.2
44 | packaging==21.0
45 | pandas==1.2.4
46 | Pillow==10.2.0
47 | prometheus-client==0.11.0
48 | protobuf==3.18.3
49 | psutil==5.8.0
50 | py-spy==0.3.9
51 | pyasn1==0.4.8
52 | pyasn1-modules==0.2.8
53 | pynndescent==0.5.4
54 | pyparsing==2.4.7
55 | pyrsistent==0.18.0
56 | python-dateutil==2.8.2
57 | pytz==2021.1
58 | PyYAML==5.4.1
59 | ray==2.9.3
60 | redis==3.5.3
61 | requests==2.31.0
62 | rsa==4.7.2
63 | sas7bdat==2.2.3
64 | scikit-learn==1.0.1
65 | scipy==1.10.0
66 | seaborn==0.11.2
67 | setproctitle==1.2.2
68 | six==1.16.0
69 | sklearn==0.0
70 | threadpoolctl==2.2.0
71 | typing-extensions==3.10.0.2
72 | umap-learn==0.5.1
73 | urllib3==1.26.18
74 | wrapt==1.12.1
75 | yarl==1.6.3
76 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simulants
2 | 
3 | To address the privacy concerns around patient data and to make it possible to disclose clinical trial data to
4 | other organizations, we have built a system that synthesizes patient data and cross-validates the synthetic data
5 | against the real data by running standard statistical techniques and machine learning algorithms.
6 | The code consists of a set of libraries for loading sample data from the UCI repository, preprocessing it,
7 | and using it to synthesize a new set of patients.
8 | 
9 | A sample dataset is downloaded from the UCI Machine Learning Repository at:
10 | https://archive.ics.uci.edu/ml/datasets/Heart+Disease
11 | 
12 | 
13 | ## Prerequisites
14 | Use Python 3.8 or later.
15 | 
16 | All the required packages are specified in requirements.txt.
17 | 
18 | pip install -r requirements.txt
19 | 
20 | 
21 | 
22 | ## Usage
23 | 1. Modify uci_config.py, or use it as is to run the sample UCI heart disease dataset
24 | 
25 | 2. Run python uci_demo.py
26 | 
27 | 3. The outputs, including the synthesized data and the cross-validation results, will be written to the output directory set in uci_config.py (./output_demo/ for the sample config)
28 | 
29 | 
30 | ## Contributing
31 | See [CONTRIBUTING](CONTRIBUTING.md).
32 | 
33 | ## Contributors
34 | Jacob Aptekar (Medidata Solutions)
35 | 
36 | Mandis Beigi (Medidata Solutions)
37 | 
38 | Pierre-Louis Bourlon (Medidata Solutions)
39 | 
40 | Jason Mezey (Cornell University)
41 | 
42 | Afrah Shafquat (Medidata Solutions)
43 | 
44 | 
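## Calling the synthesizer from Python

If you prefer to call the synthesizer from your own script instead of running uci_demo.py, a
minimal sketch looks like this. It assumes you run it from the repository root (so the relative
paths in uci_config.py resolve) and it creates the output directory that the wrapper writes to.

    import os
    import pandas as pd

    import uci_config as config
    import synthesis_wrapper

    # the wrapper writes <output_dir><proj_name>_syn.csv, so the directory must exist
    os.makedirs(config.output_dir, exist_ok=True)

    df = pd.read_csv(config.data_path + config.data_file)
    syn_df = synthesis_wrapper.synthesize(df, config)
    print(syn_df.shape)

Note that the cross-validation plots and the PDF report are only produced by uci_demo.py /
analytics_wrapper.py; the snippet above only synthesizes the data.
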
47 | ## Contact
48 | Mandis Beigi at AcornAI (Medidata Solutions Inc., a Dassault Systemes Company)
49 | 
50 | mandis.beigi@3ds.com
51 | 
52 | See the [factbook](factbook.yaml) for team and channel details.
--------------------------------------------------------------------------------
/k_anonymity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # Author: Mandis Beigi
5 | # Copyright (c) 2022 Medidata Solutions, Inc.
6 | #
7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
8 | # of this software and associated documentation files (the "Software"), to deal
9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in
15 | # all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | # THE SOFTWARE.
24 | 
25 | 
26 | import pandas as pd
27 | import logging
28 | 
29 | 
30 | # Drop all rows whose value in any categorical column occurs fewer than anonymity_k times
31 | def perform_k_anonymity(df, anonymity_k, ignore_columns):
32 | 
33 |     num_cols = df._get_numeric_data().columns
34 |     cat_cols = list(set(df.columns) - set(num_cols))
35 |     for ignore_column in ignore_columns:
36 |         if ignore_column in cat_cols:
37 |             cat_cols.remove(ignore_column)
38 | 
39 |     for column in cat_cols:
40 |         # keep only the rows whose value in this column occurs at least anonymity_k times
41 |         value_counts = df[column].value_counts().reset_index()
42 |         keep_values = value_counts[value_counts[column] >= anonymity_k]['index'].values
43 |         df = df[df[column].isin(keep_values)]
44 | 
45 |     return(df)
46 | 
47 | 
--------------------------------------------------------------------------------
/uci_config.py:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # General config parameters
3 | ###############################################################################
4 | proj_name = 'demo' # project name
5 | data_path='./uci-heart-disease/' # directory containing the source data
6 | data_file='processed.cleveland.csv' # file name for the source data in csv format
7 | output_dir='./output_'+proj_name+'/' # output directory where the synthesized data will be placed
8 | log_file=proj_name+'.log' # name of the log file
9 | report_file=proj_name+'_report.pdf' # name of the report file in pdf containing the cross-validations
10 | num_cpus=1 # number of CPUs to use
11 | 
12 | 
13 | ###############################################################################
14 | # Core Simulants synthesizer config parameters
15 | ###############################################################################
16 | anonymity_k = 1 # k-anonymity for the categorical attributes
17 | embedding_method = 'tsne' # method for embedding; options: cca, ica, tsne, pca
18 | embedding_metric = 'gower' # metric to use for tsne; options: gower, euclidean
19 | min_cluster_size = 5 # minimum cluster size for knn
20 | max_cluster_size = 5 # maximum cluster size for knn
21 | corr_threshold = 0.7 # correlation coefficient threshold for co-segregation of attributes
22 | batch_size = 1 # ratio of the number of synthesized rows to the number of source rows
23 | include_outliers = True # whether to include outliers in the synthesized data
24 | col_pairings = [] # columns that need to be forced to be co-segregated.
25 |                   # example: [['age', 'weight'], ['ethnicity', 'race']]
26 | holdout_cols = [] # names of columns to hold out before embedding is done
27 | imputing_method = 'simple' # imputation method to use before embedding; options: simple, iterative
28 | add_noise = True # whether to add Gaussian noise to the numerical attributes
29 | 
30 | 
31 | ###############################################################################
32 | # Fidelity (cross-validation) config parameters
33 | ###############################################################################
34 | cv_flag = True # flag to perform cross-validation
35 | cv_bow_num_of_bins = 40 # number of bins to use for the bag-of-words cross-validation
36 | 
--------------------------------------------------------------------------------
/bow_lib.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # Author: Mandis Beigi
5 | # Copyright (c) 2022 Medidata Solutions, Inc.
6 | #
7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
8 | # of this software and associated documentation files (the "Software"), to deal
9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in
15 | # all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | # THE SOFTWARE.
24 | 
25 | 
26 | 
27 | import numpy as np
28 | from sklearn.cluster import KMeans
29 | from sklearn.neighbors import NearestNeighbors
30 | 
31 | 
32 | def generate_code_book(data_df, num_bins):
33 |     """
34 |     Generates a BOW codebook for the data.
35 |     Parameters:
36 |         data_df (DataFrame): The input data in DataFrame format
37 |         num_bins (int): The number of BOW bins.
38 |     returns:
39 |         kmeans object: returns the BOW codebook
40 |     """
41 |     x = data_df.values.tolist()
42 | 
43 |     kmeans = KMeans(n_clusters=num_bins, random_state=0).fit(x)
44 |     return kmeans
45 | 
46 | 
47 | def get_histogram(kmeans, data_df):
48 |     """
49 |     Returns the BOW pdf.
50 |     Parameters:
51 |         kmeans: The kmeans clusters from the codebook generation.
52 |         data_df (DataFrame): The input data in DataFrame format
53 |     returns:
54 |         numpy: returns the pdf
55 |     """
56 |     num_bins = kmeans.cluster_centers_.shape[0]
57 |     centroids = kmeans.cluster_centers_
58 |     data_list = data_df.values.tolist()
59 | 
60 |     nn = NearestNeighbors(n_neighbors=1).fit(centroids)
61 | 
62 |     histogram = np.zeros(shape=(1, num_bins))
63 |     for j in data_list:
64 |         neighs = nn.kneighbors([j])
65 |         closest_bin_index = neighs[1][0][0]
66 |         histogram[0][closest_bin_index] = histogram[0][closest_bin_index] + 1
67 | 
68 |     hist = histogram.tolist()[0]
69 |     pdf = np.divide(hist, sum(hist))
70 | 
71 |     return pdf
72 | 
73 | 
--------------------------------------------------------------------------------
/factbook.yaml:
--------------------------------------------------------------------------------
1 | # This is your base factbook, please update all values in <>
2 | # Remove all the unnecessary extra comments from the file.
3 | 
4 | # This YAML file starts with '---', which is the separator for YAML documents
5 | # within the same file. Learn more: https://yaml.org/spec/1.2/spec.html#id2800401
6 | # If your repository contains information about more than one component, you can use multiple
7 | # YAML documents in this same file to document each of them.
8 | # For instance you may have one API component and a UI component or multiple libraries as different components.
9 | ---
10 | apiVersion: backstage.io/v1alpha1 # This is a constant, just leave as is.
11 | kind: Component # values: Uninitialized or Component. Use `Component` once this repository is past the initial commit.
12 | spec:
13 |   type: library # web_backend == only API, web_frontend == only UI, web_fullstack == both, mobile == mobile app, library, other
14 |   lifecycle: production # values: experimental, production or deprecated. `production` means development towards production even if not in production yet.
15 |   owner: mbeigi@mdsol.com
16 | metadata:
17 |   # Please look at the schema below for more information about how to write a factbook
18 |   json_schema: "https://github.com/mdsol/platform-standards/tree/master/schemas/v1alpha1.schema.json" # This is a constant, just leave as is.
19 |   name: simulants-public # Name of this entity. For instance: Plinth, RaveEDC, Astinus4J, etc.
20 |   description: Synthetic data generation for clinical trial data
21 |   # aliases: ["", ""] # Add a list of extra names if this entity has other names
22 |   # Uncomment and fill in the security section if this repository holds the codebase of a service.
23 |   # security:
24 |   #   authentication: [mauth] # List all the authentication methods used in our service. See the json schema for options.
25 |   #   network_accessiblity: [private] # `private` == internal vpn only, `public` == directly accessible outside the vpn
26 |   #   data_types: [clinical] # List the type of data used in your service.
27 |   teams:
28 |     - name: Simulants
29 |       number: 262
30 |       email: mbeigi@mdsol.com
31 |       people:
32 |         # An architect MUST be set among the people in this project.
33 |         # You can optionally add more people, for available roles see the "Person" schema in the JSON Schema linked above.
34 |         - role: technical owner
35 |           email: mbeigi@mdsol.com
36 |         - role: product owner
37 |           email: jiachen@mdsol.com
38 | 
39 |   channels: # List all the places where we can find you. Mainly slack channels.
40 |     - url: https://mdsol.slack.com/messages/simulants-development
41 |       automated_messaging: true
42 |       role: slack
43 | 
44 |   annotations: # Anything extra, for instance links.
45 | arc42: https://learn.mdsol.com/ 46 | docs: https://github.com/mdsol/Simulants/wiki 47 | medistrano: https://medistrano.imedidata.net/projects/ 48 | jira: https://jira.mdsol.com/secure/RapidBoard.jspa?rapidView=2943 49 | #area: # valid values: platform, data_platform, data_science, product, other 50 | # product: uncomment this if this component is part of a bigger one, for instance it is a piece of Rave 51 | 52 | -------------------------------------------------------------------------------- /synthesis_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Author: Mandis Beigi 5 | # Copyright (c) 2022 Medidata Solutions, Inc. 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 
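# Usage note (editorial addition): synthesize() reads all of its parameters from the
# `config` argument, so `config` can be the uci_config module itself or any object
# exposing the same attribute names (anonymity_k, corr_threshold, embedding_method,
# embedding_metric, min_cluster_size, max_cluster_size, batch_size, include_outliers,
# holdout_cols, col_pairings, imputing_method, add_noise, output_dir, proj_name).
# A hypothetical override of a single parameter, sketched as an interactive session:
#
#     >>> from types import SimpleNamespace
#     >>> import pandas as pd
#     >>> import uci_config, synthesis_wrapper
#     >>> cfg = SimpleNamespace(**vars(uci_config))   # copy the sample config's attributes
#     >>> cfg.anonymity_k = 2                         # e.g. tighten k-anonymity
#     >>> df = pd.read_csv(cfg.data_path + cfg.data_file)
#     >>> syn_df = synthesis_wrapper.synthesize(df, cfg)   # cfg.output_dir must already exist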
24 | 25 | 26 | import logging 27 | 28 | import k_anonymity 29 | import synthesis_lib 30 | import preprocessor_lib 31 | import utilities_lib 32 | 33 | 34 | def synthesize(df, config): 35 | 36 | logging.info('Performing k-anonymity to the data......................') 37 | logging.info('The data size before k-anonymity: {}'.format(df.shape)) 38 | ignore_columns = utilities_lib.get_date_columns(df) 39 | df = k_anonymity.perform_k_anonymity(df, config.anonymity_k, ignore_columns) 40 | logging.info('The data size after k-anonymity: {}'.format(df.shape)) 41 | 42 | ignore_columns = utilities_lib.get_date_columns(df) 43 | tmp_df = df.loc[:, ~df.columns.isin(ignore_columns)] 44 | label_encoded_df, encoding_dict = preprocessor_lib.label_encoding_encode(tmp_df) 45 | label_encoded_df = preprocessor_lib.impute_label_encoded_df(label_encoded_df) 46 | 47 | corr_cols_groups = synthesis_lib.generate_corr_cols_groups(label_encoded_df, config.corr_threshold) 48 | col_pairings = utilities_lib.merge_2d_lists(corr_cols_groups, config.col_pairings) 49 | 50 | one_hot_encoded_df = preprocessor_lib.one_hot_encoding_encode(tmp_df) 51 | logging.info("encoded_df: {}".format(one_hot_encoded_df.shape)) 52 | 53 | encoded_df = one_hot_encoded_df 54 | 55 | logging.info('Synthesizing the data data.............................') 56 | 57 | syn_encoded_df = synthesis_lib.synthesize(encoded_df, 58 | method=config.embedding_method, metric=config.embedding_metric, 59 | min_cluster_size=config.min_cluster_size, max_cluster_size=config.max_cluster_size, 60 | batch_size=config.batch_size, corr_thresh=config.corr_threshold, include_outliers=config.include_outliers, 61 | holdout_cols=config.holdout_cols, derived_cols_dict={}, col_pairings=col_pairings, 62 | imputing_method=config.imputing_method, add_noise=config.add_noise) 63 | logging.info("syn_encoded_df: {}".format(syn_encoded_df.shape)) 64 | 65 | logging.info('Decoding the synthesized data...............................') 66 | syn_encoded_df_no_index = syn_encoded_df.reset_index(drop=False) 67 | syn_df = preprocessor_lib.one_hot_encoding_decode(syn_encoded_df_no_index) 68 | 69 | logging.info('Saving the synthesized data.....................................') 70 | logging.info('syn_df: {}'.format(syn_df.shape)) 71 | 72 | df = df.reset_index(drop=False) 73 | 74 | df_columns = utilities_lib.intersection(df.columns, syn_df.columns) 75 | syn_df = syn_df.reindex(columns=df_columns) 76 | syn_df.to_csv(config.output_dir+config.proj_name+'_syn.csv', index=False) 77 | 78 | return(syn_df) 79 | 80 | 81 | -------------------------------------------------------------------------------- /uci_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Author: Mandis Beigi 5 | # Copyright (c) 2022 Medidata Solutions, Inc. 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 
16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 24 | 25 | 26 | import pandas as pd 27 | import numpy as np 28 | import copy 29 | import os 30 | import sys 31 | import time 32 | import logging 33 | from matplotlib.backends.backend_pdf import PdfPages 34 | import warnings 35 | import datetime 36 | 37 | import uci_config as config 38 | import preprocessor_lib 39 | import utilities_lib 40 | import analytics_wrapper 41 | import synthesis_wrapper 42 | 43 | 44 | if __name__ == '__main__': 45 | print('Loaded the code....................................................') 46 | start_time = time.time() 47 | print('Time is %s'%datetime.datetime.now()) 48 | 49 | print('Creating directories...............................................') 50 | if not os.path.exists(config.output_dir): 51 | os.makedirs(config.output_dir) 52 | if not os.path.exists(config.output_dir+'figs/'): 53 | os.makedirs(config.output_dir+'figs/') 54 | print('Finished creating directories......................................') 55 | 56 | print('Starting the logger................................................') 57 | logging.basicConfig(level=logging.INFO, format='%(asctime)-15s [%(levelname)s] %(funcName)s: %(message)s', 58 | filename=config.output_dir+config.log_file, filemode='w') 59 | warnings.filterwarnings("ignore") 60 | print('Finished starting the logger.......................................') 61 | 62 | file_name = config.data_path+config.data_file 63 | logging.info(file_name) 64 | 65 | logging.info('Loading the data............................................') 66 | logging.info('Loading the data............................................') 67 | df = pd.read_csv(file_name) 68 | print('Finished loading the data..........................................') 69 | 70 | pdf_page = PdfPages(config.output_dir+config.report_file) 71 | 72 | print('Synthesizing the data..............................................') 73 | print('Time is %s'%datetime.datetime.now()) 74 | syn_df = synthesis_wrapper.synthesize(df, config) 75 | print('Finished synthesizing the data.....................................') 76 | print('Time is %s'%datetime.datetime.now()) 77 | 78 | if config.cv_flag: 79 | logging.info('Running analytics on the real and synthetic data............') 80 | print('Running analytics on the real and synthetic data...................') 81 | print('Time is %s'%datetime.datetime.now()) 82 | analytics_wrapper.analyze(df.copy(), syn_df.copy(), config, pdf_page) 83 | 84 | pdf_page.close() 85 | logging.info('It took {} seconds'.format(time.time() - start_time)) 86 | print('It took {} seconds'.format(time.time() - start_time)) 87 | 88 | logging.info('...................Done!....................................') 89 | print('..........................Done!....................................') 90 | 91 | -------------------------------------------------------------------------------- /dimanalysis_lib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Author: Mandis Beigi 5 | # Copyright (c) 
2022 Medidata Solutions, Inc.
6 | #
7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
8 | # of this software and associated documentation files (the "Software"), to deal
9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in
15 | # all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | # THE SOFTWARE.
24 | 
25 | 
26 | 
27 | import os
28 | import copy
29 | from random import shuffle
30 | from datetime import datetime
31 | import pandas as pd
32 | import numpy as np
33 | 
34 | from sklearn.manifold import TSNE
35 | from sklearn.decomposition import PCA
36 | from sklearn.decomposition import FastICA
37 | from sklearn.preprocessing import StandardScaler
38 | from sklearn.cross_decomposition import CCA
39 | import gower
40 | 
41 | 
42 | def reduce_cca(data_df, n_components=2):
43 |     """
44 |     Uses CCA to reduce dimension.
45 |     Parameters:
46 |         data_df (DataFrame): The input data in DataFrame format
47 |         n_components (int): The number of components to reduce to. Default is 2.
48 |     returns:
49 |         DataFrame: returns the data in the reduced dimension
50 |     """
51 |     new_df = data_df.reset_index(drop=True)
52 |     embedded = CCA(n_components=n_components).fit_transform(new_df.to_numpy())  # note: scikit-learn's CCA also requires a target matrix Y in fit_transform
53 |     return(pd.DataFrame(embedded, index=data_df.index))
54 | 
55 | 
56 | def reduce_ica(data_df, n_components=None):
57 |     """
58 |     Uses ICA to reduce dimension.
59 |     Parameters:
60 |         data_df (DataFrame): The input data in DataFrame format
61 |         n_components (int): The number of components to reduce to. Default is all components.
62 |     returns:
63 |         DataFrame: returns the data in the reduced dimension
64 |     """
65 |     new_df = data_df.reset_index(drop=True)
66 |     embedded = FastICA(n_components=n_components).fit_transform(new_df)
67 |     return(pd.DataFrame(embedded, index=data_df.index))
68 | 
69 | 
70 | def reduce_tsne(data_df, n_components=2, init='pca', metric='euclidean'):
71 |     """
72 |     Uses tSNE to reduce dimension.
73 |     Parameters:
74 |         data_df (DataFrame): The input data in DataFrame format
75 |         n_components (int): The number of components to reduce to. Default is 2.
76 |     returns:
77 |         DataFrame: returns the data in the reduced dimension
78 |     """
79 |     new_df = data_df.reset_index(drop=True)
80 |     if metric == 'gower':
81 |         #tsne = TSNE(n_components=n_components, metric='precomputed', square_distances=True)
82 |         tsne = TSNE(n_components=n_components)
83 |         df_gower = gower.gower_matrix(new_df)
84 |         embedded = tsne.fit_transform(df_gower)
85 |     else:
86 |         #tsne = TSNE(n_components, square_distances=True)
87 |         tsne = TSNE(n_components)
88 |         embedded = tsne.fit_transform(new_df)
89 | 
90 |     return(pd.DataFrame(embedded, index=data_df.index))
91 | 
92 | 
93 | def reduce_pca(data_df, n_components=None):
94 |     """
95 |     Uses PCA to reduce dimension.
96 |     Parameters:
97 |         data_df (DataFrame): The input data in DataFrame format
98 |         n_components (int or float): The number of components to reduce to. If the value is between 0 and 1,
99 |             it is treated as the fraction of variance to retain. Default is all components.
100 |     returns:
101 |         DataFrame: returns the data in the reduced dimension
102 |     """
103 |     new_df = data_df.reset_index(drop=True)
104 |     data_np = new_df.to_numpy()
105 | 
106 |     #Standardize the data by removing the mean and scaling to unit variance
107 |     pca_np = StandardScaler().fit_transform(data_np)
108 |     pca = PCA(n_components)
109 |     embedded = pca.fit_transform(pca_np)
110 | 
111 |     return(pd.DataFrame(embedded, index=data_df.index))
112 | 
113 | 
--------------------------------------------------------------------------------
/utilities_lib.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # Author: Mandis Beigi
5 | # Copyright (c) 2022 Medidata Solutions, Inc.
6 | #
7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
8 | # of this software and associated documentation files (the "Software"), to deal
9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in
15 | # all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | # THE SOFTWARE.
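# Usage sketch (editorial addition; the input values below are illustrative, not
# taken from the project):
#
#     >>> import utilities_lib as ul
#     >>> ul.merge_2d_lists([['age', 'weight']], [['weight', 'age'], ['race']])
#     [['weight', 'age'], ['race']]
#     >>> ul.quantize_list([1.2, 3.7, 55.1, 98.6])
#     array([ 1,  4, 55, 98])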
24 | 25 | 26 | import pandas as pd 27 | import numpy as np 28 | from matplotlib import pyplot as plt 29 | import copy 30 | import math 31 | from functools import reduce 32 | from dateutil.parser import parse 33 | from pandas.api.types import is_datetime64_any_dtype as is_datetime 34 | import logging 35 | 36 | 37 | def intersection(lst1, lst2): 38 | lst3 = [value for value in lst1 if value in lst2] 39 | return lst3 40 | 41 | def is_identical(list_a, list_b): 42 | if len(list_a) != len(list_b): 43 | return False 44 | for i in list_a: 45 | if i not in list_b: 46 | return False 47 | return True 48 | 49 | def merge_2d_lists(list_a, list_b): 50 | list_c = list_b.copy() 51 | for item_a in list_a: 52 | found = False 53 | for item_c in list_c: 54 | if is_identical(item_a, item_c): 55 | found = True 56 | break 57 | if not found: 58 | list_c.append(item_a) 59 | return(list_c) 60 | 61 | #check if list1 and list2 have any common elements 62 | def if_common_element(list1, list2): 63 | one = set(list1) 64 | two = set(list2) 65 | if (one & two): 66 | return (True) 67 | 68 | return (False) 69 | 70 | 71 | #example: dict1={'a':[1,2], 'b':[4]} and dict2={'a':[6], 'd':[8]} merged={'a':[1,2,6], 'b':[4], 'd':[8]} 72 | def merge_dictionaries(dict1, dict2): 73 | merged= { key:dict1.get(key,[])+dict2.get(key,[]) for key in set(list(dict1.keys())+list(dict2.keys())) } 74 | return(merged) 75 | 76 | 77 | #converts {"a":["c", "d"],"b":["e","f"]} to [["a","c","d"],["b","e","f"]] 78 | def convert_dict_to_groups(cols_dict): 79 | groups = [] 80 | for key in cols_dict.keys(): 81 | this_list=[] 82 | this_list.append(key) 83 | for value in cols_dict[key]: 84 | if value not in this_list: 85 | this_list.append(value) 86 | groups.append(this_list) 87 | 88 | return(groups) 89 | 90 | 91 | def bitwise_or_pair(x1, x2): 92 | return(np.bitwise_or(x1, x2)) 93 | 94 | def bitwise_or_list(x): 95 | return(reduce(bitwise_or_pair, x)) 96 | 97 | 98 | #quantize the columns with values that have a wide range 99 | def quantize_df(df): 100 | col_names = list(df) 101 | for col_name in col_names: 102 | col = list(df[col_name]) 103 | new_col = quantize_list(col) 104 | df[col_name] = new_col 105 | return(df) 106 | 107 | #quantize a given list of values using a list of quantization levels with step. 
108 | def quantize_list(values):
109 |     min_val = math.floor(min(values))
110 |     max_val = math.ceil(max(values))
111 |     step = max(1, int((max_val-min_val)/100.0))
112 |     quantizations = list(range(min_val, max_val, step))
113 |     values = np.array(values)
114 |     quantizations = np.array(quantizations)
115 |     new_values = quantizations[np.argmin(np.abs(np.repeat(values[:, np.newaxis], len(quantizations), axis=1) - quantizations), axis=1)]
116 |     return(new_values)
117 | 
118 | 
119 | def get_common_rows(df1, df2):
120 |     common_rows = pd.merge(df1, df2, how='inner')
121 |     return common_rows
122 | 
123 | 
124 | # Drop the duplicates between df1 and df2 from df1 and return the modified df1
125 | def drop_duplicates(df1, df2):
126 |     df1 = pd.concat([df1, df2, df2]).drop_duplicates(keep=False)
127 |     return(df1)
128 | 
129 | 
130 | #Drop columns starting with the given names, to accommodate the encoding format of column names that contain |
131 | def drop_columns_containing(df, columns):
132 |     for drop_col_name in columns:
133 |         df = df.loc[:, ~df.columns.str.startswith(drop_col_name+'|')]
134 |     for drop_col_name in columns:
135 |         df = df.loc[:, ~df.columns.str.match(drop_col_name)]
136 |     return(df)
137 | 
138 | def is_date(string, fuzzy=False):
139 |     try:
140 |         parse(string, fuzzy=fuzzy)
141 |         return(True)
142 | 
143 |     except ValueError:
144 |         return(False)
145 | 
146 | 
147 | def get_date_columns(df):
148 |     tmp_df = df.copy()
149 |     tmp_df = tmp_df.select_dtypes(exclude=['int','float'])
150 |     date_cols = []
151 |     for col in tmp_df.columns:
152 |         try:
153 |             tmp_df[col].astype('float')
154 |         except:
155 |             tmp_col = pd.to_datetime(tmp_df[col], errors='coerce')
156 |             tmp_col_na = tmp_col.isna().sum()*100/tmp_col.shape[0]
157 |             if not tmp_col_na>90:
158 |                 date_cols +=[col]
159 |     return(date_cols)
160 | 
161 | 
162 | def drop_date_columns(df):
163 |     '''Dropping any columns that have dates in them from dataset'''
164 |     date_cols = get_date_columns(df)
165 |     logging.info("dropping all the date columns: {}".format(date_cols))
166 |     df = drop_columns(df, date_cols)
167 |     return(df)
168 | 
169 | 
170 | def drop_columns(df, columns):
171 |     '''Drop specified columns from dataset'''
172 | 
173 |     logging.info('Columns deleted: %s'%columns)
174 | 
175 |     for drop_col_name in columns:
176 |         df = df.loc[:, ~df.columns.str.match(drop_col_name)]
177 |     return(df)
178 | 
179 | 
180 | def keep_columns_containing(data_df, feature_names):
181 |     original_columns = data_df.columns.to_list()
182 |     all_column_names = []
183 |     for column_name in feature_names:
184 |         all_column_names = all_column_names + [col for col in data_df.columns if column_name in col]
185 | 
186 |     cols_to_drop = remove_list_from_list(original_columns, all_column_names)
187 |     new_data_df = data_df.drop(cols_to_drop, axis = 1)
188 |     return(new_data_df)
189 | 
190 | 
191 | def remove_item_from_list(ls, val):
192 |     return list(filter(lambda x: x != val, ls))
193 | 
194 | 
195 | #removes ls2 from ls1 and returns the result
196 | def remove_list_from_list(ls1, ls2):
197 |     for ls in ls2:
198 |         ls1 = remove_item_from_list(ls1, ls)
199 |     return ls1
200 | 
--------------------------------------------------------------------------------
/visualization_lib.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # Author: Mandis Beigi
5 | # Copyright (c) 2022 Medidata Solutions, Inc.
6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 24 | 25 | 26 | import logging 27 | import pandas as pd 28 | from matplotlib import pyplot as plt 29 | import seaborn as sns 30 | import numpy as np 31 | from matplotlib.backends.backend_pdf import PdfPages 32 | 33 | 34 | def scatter_by_column_names(df, x_name, y_name, filename, hold=False): 35 | """ 36 | Create a scatter plot and save to a file. 37 | Parameters: 38 | df (DataFrame): The input data in DataFrame format 39 | x_name (list): Column names for X. 40 | y_name (list): Column names for Y. 41 | filename (str): The filename to save the plot to. 42 | """ 43 | x = (df.loc[:,x_name].values.tolist()) 44 | y = (df.loc[:,y_name].values.tolist()) 45 | plt.scatter(x, y) 46 | #plt.show() 47 | plt.savefig(filename) 48 | if (not hold): 49 | plt.clf() 50 | return 51 | 52 | def scatter(df, filename, hold=False, pdf_page=None, label=None,c='black',alpha=0.5): 53 | """ 54 | Create a scatter plot of the first two columns and save to a file. 55 | Parameters: 56 | df (DataFrame): The input data in DataFrame format 57 | filename (str): The filename to save the plot to. 58 | """ 59 | x = (df.iloc[:,0].values.tolist()) 60 | y = (df.iloc[:,1].values.tolist()) 61 | plt.scatter(x, y, label=label,c=c, alpha=alpha, edgecolor='white') 62 | #plt.show() 63 | plt.legend(bbox_to_anchor=(1.0, 0.6)) 64 | plt.title(filename) 65 | plt.tight_layout() 66 | fig = plt.savefig(filename) 67 | if not hold and pdf_page != None: 68 | pdf_page.savefig(fig, bbox_inches='tight') 69 | if (not hold): 70 | plt.clf() 71 | return 72 | 73 | 74 | def scatter_bw(df, filename, hold=False, pdf_page=None): 75 | """ 76 | Create a scatter plot of the first two columns and save to a file. 77 | Parameters: 78 | df (DataFrame): The input data in DataFrame format 79 | filename (str): The filename to save the plot to. 
80 | """ 81 | x = (df.iloc[:,0].values.tolist()) 82 | y = (df.iloc[:,1].values.tolist()) 83 | 84 | if hold: 85 | color = ['0' for item in y] 86 | else: 87 | color = ['0.8' for item in y] 88 | 89 | plt.scatter(x, y, c=color, alpha=0.5, edgecolor='white') 90 | plt.title(filename) 91 | fig = plt.savefig(filename) 92 | if not hold and pdf_page != None: 93 | pdf_page.savefig(fig, bbox_inches='tight') 94 | if (not hold): 95 | plt.clf() 96 | return 97 | 98 | 99 | def histogram(df, bins, y, filename): 100 | fig = df.plot(bins, y, kind='hist', alpha=0.5) 101 | #df.show() 102 | fig.savefig(filename).get_figure() 103 | return 104 | 105 | def bar(data_np, num_of_bins, filename, pdf_page=None, hold=False): 106 | bins = list(range(0, num_of_bins)) 107 | plt.bar(bins, data_np, alpha=0.5, edgecolor='white') 108 | plt.title(filename) 109 | #plt.show() 110 | fig = plt.savefig(filename) 111 | if pdf_page != None: 112 | pdf_page.savefig(fig, bbox_inches='tight') 113 | if not hold: 114 | plt.clf() 115 | 116 | def correlation_heatmap(df, filename, corr='pearson', pdf_page=None): 117 | #df_corr = np.abs(df.corr(method=corr)) 118 | df_corr = (df.corr(method=corr)) 119 | #order the column and row names alphabetically 120 | df_corr = df_corr.reindex(sorted(df_corr.columns), axis=0) 121 | df_corr = df_corr.reindex(sorted(df_corr.columns), axis=1) 122 | #logging.info(df_corr) 123 | # Generate a mask for the upper triangle 124 | mask = np.triu(np.ones_like(df_corr, dtype=bool)) 125 | # Set up the matplotlib figure 126 | f, ax = plt.subplots(figsize=(11, 9)) 127 | ax.set_title(filename) 128 | # Generate a custom diverging colormap 129 | cmap = sns.diverging_palette(230, 20, as_cmap=True) 130 | # Draw the heatmap with the mask and correct aspect ratio 131 | #sns_plot = sns.heatmap(df_corr, mask=mask, cmap=cmap, vmax=0.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}) 132 | sns_plot = sns.heatmap(df_corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}) 133 | fig = sns_plot.get_figure() 134 | fig.savefig(filename) 135 | if pdf_page != None: 136 | pdf_page.savefig(fig, bbox_inches='tight') 137 | fig.clf() 138 | 139 | # Plots the heatmap of diffrence of correlation matrices of two dataframes 140 | def diff_correlation_heatmap(df1, df2, filename, corr='pearson', pdf_page=None): 141 | #df1_corr = np.abs(df1.corr(method=corr)) 142 | df1_corr = (df1.corr(method=corr)) 143 | #order the column and row names alphabetically 144 | df1_corr = df1_corr.reindex(sorted(df1_corr.columns), axis=0) 145 | df1_corr = df1_corr.reindex(sorted(df1_corr.columns), axis=1) 146 | #logging.info(df1_corr) 147 | 148 | #df2_corr = np.abs(df2.corr(method=corr)) 149 | df2_corr = (df2.corr(method=corr)) 150 | #order the column and row names alphabetically 151 | df2_corr = df2_corr.reindex(sorted(df2_corr.columns), axis=0) 152 | df2_corr = df2_corr.reindex(sorted(df2_corr.columns), axis=1) 153 | #logging.info(df2_corr) 154 | 155 | df_corr = (abs(df1_corr.fillna(0)-df2_corr.fillna(0))).fillna(0) 156 | print("The sum of diff_corr: {}".format(df_corr.values.sum())) 157 | logging.info("The sum of diff_corr: {}".format(df_corr.values.sum())) 158 | 159 | # Generate a mask for the upper triangle 160 | mask = np.triu(np.ones_like(df_corr, dtype=bool)) 161 | # Set up the matplotlib figure 162 | f, ax = plt.subplots(figsize=(11, 9)) 163 | # Generate a custom diverging colormap 164 | cmap = sns.diverging_palette(230, 20, as_cmap=True) 165 | # Draw the heatmap with the mask and correct aspect ratio 
166 | #sns_plot = sns.heatmap(df_corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}) 167 | sns_plot = sns.heatmap(df_corr, mask=mask, cmap=cmap, vmin=-2, vmax=2, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}) 168 | ax.set_title(filename) 169 | fig = sns_plot.get_figure() 170 | fig.savefig(filename) 171 | if pdf_page != None: 172 | pdf_page.savefig(fig, bbox_inches='tight') 173 | fig.clf() 174 | -------------------------------------------------------------------------------- /preprocessor_lib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Author: Mandis Beigi 5 | # Copyright (c) 2022 Medidata Solutions, Inc. 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 
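# Usage sketch (editorial addition; the tiny DataFrame below is an illustrative
# example, not project data):
#
#     >>> import pandas as pd
#     >>> import preprocessor_lib as pp
#     >>> df = pd.DataFrame({'age': [63, 41, 57], 'sex': ['m', 'f', 'f']})
#     >>> enc = pp.one_hot_encoding_encode(df)            # columns: age, sex|f, sex|m
#     >>> pp.one_hot_encoding_decode(enc).columns.tolist()
#     ['sex', 'age']
#     >>> lab, mapping = pp.label_encoding_encode(df)     # object columns -> integer codes
#     >>> mapping
#     {'sex': {0: 'f', 1: 'm'}}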
24 | 
25 | 
26 | import logging
27 | from sas7bdat import SAS7BDAT
28 | import pandas as pd
29 | import numpy as np
30 | from collections import defaultdict
31 | from sklearn.impute import SimpleImputer
32 | from sklearn.experimental import enable_iterative_imputer
33 | from sklearn.impute import IterativeImputer
34 | import math
35 | 
36 | 
37 | #checks whether a given float contains decimals
38 | def contains_decimal(x):
39 |     frac, whole = math.modf(x)
40 |     if frac>0:
41 |         return(True)
42 |     return(False)
43 | 
44 | 
45 | #checks whether the given vector of numbers contains floats, otherwise all are assumed to be ints
46 | def contains_floats(x_vec):
47 |     highest_prec = False
48 |     for x in x_vec:
49 |         prec = contains_decimal(x)
50 |         if prec:
51 |             highest_prec = prec
52 |     return(highest_prec)
53 | 
54 | 
55 | def get_boolean_and_nonboolean_columns(df):
56 |     boolean_columns = []
57 |     for c in df.columns:
58 |         if set(df.loc[:,c]).issubset(set([0, 1])) or '|' in c:
59 |             boolean_columns.append(c)
60 |     nonboolean_columns = list(set(df.columns) - set(boolean_columns))
61 |     return(boolean_columns, nonboolean_columns)
62 | 
63 | 
64 | #LabelEncoding: converts a dataframe containing categorical values to numbers and returns the new dataframe as well as the dictionary of the mappings
65 | def label_encoding_encode(df):
66 |     #df = pd.DataFrame({'value': ["a", "b", "c", "a"], 'num': [1,2,3,4], 'name':["yy","bb", "yy","zz"]})
67 |     df_encoded = df.copy()
68 |     df_type = df.dtypes
69 |     object_idx = np.where(df_type == 'object')
70 |     dicts = {}
71 |     for i in range(0, len(object_idx[0])):
72 |         c = df_encoded[df.columns[object_idx[0][i]]].astype('category')
73 |         d = dict(enumerate(c.cat.categories))
74 |         col_name = df.columns[object_idx[0][i]]
75 |         df_encoded[col_name] = c.cat.codes
76 |         dicts[col_name] = d
77 | 
78 |     return (df_encoded, dicts)
79 | 
80 | 
81 | #LabelEncoding: converts a dataframe containing numbers to categorical values using a mapping dictionary and returns a new dataframe
82 | def label_encoding_decode(df_encoded, dicts):
83 |     df = df_encoded.copy()
84 |     keys=dicts.keys()
85 |     for key in keys:
86 |         df[key] = df_encoded[key].map(dicts[key])
87 | 
88 |     return (df)
89 | 
90 | 
91 | def one_hot_encoding_encode(df):
92 |     encoded_df = pd.get_dummies(df, prefix_sep='|')
93 |     return(encoded_df)
94 | 
95 | 
96 | def one_hot_encoding_decode(df_dummies):
97 |     prefix_sep = '|'
98 |     pos = defaultdict(list)
99 |     vals = defaultdict(list)
100 | 
101 |     for i, c in enumerate(df_dummies.columns):
102 |         if prefix_sep in c:
103 |             k, v = c.split(prefix_sep, 1)
104 |             pos[k].append(i)
105 |             vals[k].append(v)
106 |         else:
107 |             pos[prefix_sep].append(i)
108 | 
109 |     df = pd.DataFrame({k: pd.Categorical.from_codes(
110 |                               np.argmax(df_dummies.iloc[:, pos[k]].values, axis=1),
111 |                               vals[k])
112 |                        for k in vals})
113 | 
114 |     df[df_dummies.columns[pos[prefix_sep]]] = df_dummies.iloc[:, pos[prefix_sep]]
115 |     return df
116 | 
117 | 
118 | # This function takes a dataframe with categorical and ordinal columns and converts all fields to floats
119 | # It performs one-hot-encoding for the categorical variables and label encoding for the ordinal variables
120 | def encode_df(df, categorical_columns, ordinal_columns):
121 |     all_columns = df.columns.tolist()
122 |     all_columns_set = set(all_columns)
123 |     other_columns_set = all_columns_set.difference(set(categorical_columns))
124 |     other_columns = list(other_columns_set.difference(set(ordinal_columns)))
125 | 
126 |     cat_df = df[categorical_columns]
127 |     ord_df = df[ordinal_columns]
128 |     other_df = 
df[other_columns] 129 | 130 | cat_df_encoded = one_hot_encoding_encode(cat_df) 131 | ord_df_encoded, ord_dict = label_encoding_encode(ord_df) 132 | 133 | encoded_df = other_df.join(cat_df_encoded) 134 | encoded_df = encoded_df.join(ord_df_encoded) 135 | 136 | return(encoded_df, ord_dict) 137 | 138 | 139 | # This function converts back an encoded dataframe to the original categorical and ordinal columns 140 | def decode_df(df, ord_dict): 141 | decoded_df = label_encoding_decode(df, ord_dict) 142 | decoded_df = one_hot_encoding_decode(decoded_df) 143 | return(decoded_df) 144 | 145 | 146 | #fill the missing data with new values not existing in the column 147 | #this is used to determine the column correlations 148 | def impute_label_encoded_df(df): 149 | for column in df.columns: 150 | tmp_col_values = sorted(df[column].unique()) 151 | col_values = [x for x in tmp_col_values if math.isnan(x) == False] 152 | if len(col_values) >= 2: 153 | fill_val = col_values[len(col_values)-1]+(col_values[len(col_values)-1]-col_values[len(col_values)-2]) 154 | elif len(col_values) == 1: 155 | if col_values[0] != 0: 156 | fill_val = 2*col_values[0] 157 | else: 158 | fill_val = 1 159 | else: 160 | fill_val = 0 161 | df[[column]] = df[[column]].fillna(value=fill_val) 162 | 163 | return(df) 164 | 165 | 166 | #impute the missing values of boolean columns with the most frequent value and 167 | #impute the missing values of the non-boolean columns with the median 168 | def impute_one_hot_encoded_df(df): 169 | boolean_columns, nonboolean_columns = get_boolean_and_nonboolean_columns(df) 170 | boolean_df = df[boolean_columns] 171 | nonboolean_df = df[nonboolean_columns] 172 | 173 | if len(nonboolean_columns) != 0: 174 | imputed_nonboolean_df = nonboolean_df.fillna(nonboolean_df.median()) 175 | imputed_nonboolean_df = imputed_nonboolean_df.reset_index(drop=True) 176 | 177 | if len(boolean_columns) != 0: 178 | imp_most_freq = SimpleImputer(strategy='most_frequent') 179 | imp_most_freq.fit(boolean_df) 180 | imputed_boolean_df = pd.DataFrame(imp_most_freq.transform(boolean_df)) 181 | imputed_boolean_df.columns = boolean_df.columns 182 | imputed_boolean_df = imputed_boolean_df.reset_index(drop=True) 183 | 184 | if len(nonboolean_columns) != 0 and len(boolean_columns) != 0: 185 | imputed_df = pd.concat([imputed_nonboolean_df, imputed_boolean_df], axis=1) 186 | elif len(nonboolean_columns) == 0: 187 | imputed_df = imputed_boolean_df 188 | elif len(boolean_columns) == 0: 189 | imputed_df = imputed_nonboolean_df 190 | 191 | imputed_df = imputed_df.reindex(columns=df.columns) 192 | imputed_df = imputed_df.set_index(df.index) 193 | return(imputed_df) 194 | 195 | 196 | def iterative_impute(df): 197 | logging.info("Iterative imputing the data") 198 | imputer = IterativeImputer() 199 | imputer.fit(df) 200 | imputed_np = imputer.transform(df) 201 | imputed_df = pd.DataFrame(imputed_np, columns=df.columns) 202 | imputed_df = imputed_df.reindex(columns=df.columns) 203 | imputed_df = imputed_df.set_index(df.index) 204 | return(imputed_df) 205 | -------------------------------------------------------------------------------- /uci-heart-disease/heart-disease.names.txt: -------------------------------------------------------------------------------- 1 | Publication Request: 2 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3 | This file describes the contents of the heart-disease directory. 4 | 5 | This directory contains 4 databases concerning heart disease diagnosis. 6 | All attributes are numeric-valued. 
The data was collected from the 7 | four following locations: 8 | 9 | 1. Cleveland Clinic Foundation (cleveland.data) 10 | 2. Hungarian Institute of Cardiology, Budapest (hungarian.data) 11 | 3. V.A. Medical Center, Long Beach, CA (long-beach-va.data) 12 | 4. University Hospital, Zurich, Switzerland (switzerland.data) 13 | 14 | Each database has the same instance format. While the databases have 76 15 | raw attributes, only 14 of them are actually used. Thus I've taken the 16 | liberty of making 2 copies of each database: one with all the attributes 17 | and 1 with the 14 attributes actually used in past experiments. 18 | 19 | The authors of the databases have requested: 20 | 21 | ...that any publications resulting from the use of the data include the 22 | names of the principal investigator responsible for the data collection 23 | at each institution. They would be: 24 | 25 | 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 26 | 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 27 | 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 28 | 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: 29 | Robert Detrano, M.D., Ph.D. 30 | 31 | Thanks in advance for abiding by this request. 32 | 33 | David Aha 34 | July 22, 1988 35 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 36 | 37 | 1. Title: Heart Disease Databases 38 | 39 | 2. Source Information: 40 | (a) Creators: 41 | -- 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 42 | -- 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 43 | -- 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 44 | -- 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: 45 | Robert Detrano, M.D., Ph.D. 46 | (b) Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779 47 | (c) Date: July, 1988 48 | 49 | 3. Past Usage: 50 | 1. Detrano,~R., Janosi,~A., Steinbrunn,~W., Pfisterer,~M., Schmid,~J., 51 | Sandhu,~S., Guppy,~K., Lee,~S., \& Froelicher,~V. (1989). {\it 52 | International application of a new probability algorithm for the 53 | diagnosis of coronary artery disease.} {\it American Journal of 54 | Cardiology}, {\it 64},304--310. 55 | -- International Probability Analysis 56 | -- Address: Robert Detrano, M.D. 57 | Cardiology 111-C 58 | V.A. Medical Center 59 | 5901 E. 7th Street 60 | Long Beach, CA 90028 61 | -- Results in percent accuracy: (for 0.5 probability threshold) 62 | Data Name: CDF CADENZA 63 | -- Hungarian 77 74 64 | Long beach 79 77 65 | Swiss 81 81 66 | -- Approximately a 77% correct classification accuracy with a 67 | logistic-regression-derived discriminant function 68 | 2. David W. Aha & Dennis Kibler 69 | -- 70 | 71 | 72 | -- Instance-based prediction of heart-disease presence with the 73 | Cleveland database 74 | -- NTgrowth: 77.0% accuracy 75 | -- C4: 74.8% accuracy 76 | 3. John Gennari 77 | -- Gennari, J.~H., Langley, P, \& Fisher, D. (1989). Models of 78 | incremental concept formation. {\it Artificial Intelligence, 40}, 79 | 11--61. 80 | -- Results: 81 | -- The CLASSIT conceptual clustering system achieved a 78.9% accuracy 82 | on the Cleveland database. 83 | 84 | 4. Relevant Information: 85 | This database contains 76 attributes, but all published experiments 86 | refer to using a subset of 14 of them. In particular, the Cleveland 87 | database is the only one that has been used by ML researchers to 88 | this date. 
The "goal" field refers to the presence of heart disease 89 | in the patient. It is integer valued from 0 (no presence) to 4. 90 | Experiments with the Cleveland database have concentrated on simply 91 | attempting to distinguish presence (values 1,2,3,4) from absence (value 92 | 0). 93 | 94 | The names and social security numbers of the patients were recently 95 | removed from the database, replaced with dummy values. 96 | 97 | One file has been "processed", that one containing the Cleveland 98 | database. All four unprocessed files also exist in this directory. 99 | 100 | 5. Number of Instances: 101 | Database: # of instances: 102 | Cleveland: 303 103 | Hungarian: 294 104 | Switzerland: 123 105 | Long Beach VA: 200 106 | 107 | 6. Number of Attributes: 76 (including the predicted attribute) 108 | 109 | 7. Attribute Information: 110 | -- Only 14 used 111 | -- 1. #3 (age) 112 | -- 2. #4 (sex) 113 | -- 3. #9 (cp) 114 | -- 4. #10 (trestbps) 115 | -- 5. #12 (chol) 116 | -- 6. #16 (fbs) 117 | -- 7. #19 (restecg) 118 | -- 8. #32 (thalach) 119 | -- 9. #38 (exang) 120 | -- 10. #40 (oldpeak) 121 | -- 11. #41 (slope) 122 | -- 12. #44 (ca) 123 | -- 13. #51 (thal) 124 | -- 14. #58 (num) (the predicted attribute) 125 | 126 | -- Complete attribute documentation: 127 | 1 id: patient identification number 128 | 2 ccf: social security number (I replaced this with a dummy value of 0) 129 | 3 age: age in years 130 | 4 sex: sex (1 = male; 0 = female) 131 | 5 painloc: chest pain location (1 = substernal; 0 = otherwise) 132 | 6 painexer (1 = provoked by exertion; 0 = otherwise) 133 | 7 relrest (1 = relieved after rest; 0 = otherwise) 134 | 8 pncaden (sum of 5, 6, and 7) 135 | 9 cp: chest pain type 136 | -- Value 1: typical angina 137 | -- Value 2: atypical angina 138 | -- Value 3: non-anginal pain 139 | -- Value 4: asymptomatic 140 | 10 trestbps: resting blood pressure (in mm Hg on admission to the 141 | hospital) 142 | 11 htn 143 | 12 chol: serum cholestoral in mg/dl 144 | 13 smoke: I believe this is 1 = yes; 0 = no (is or is not a smoker) 145 | 14 cigs (cigarettes per day) 146 | 15 years (number of years as a smoker) 147 | 16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 148 | 17 dm (1 = history of diabetes; 0 = no such history) 149 | 18 famhist: family history of coronary artery disease (1 = yes; 0 = no) 150 | 19 restecg: resting electrocardiographic results 151 | -- Value 0: normal 152 | -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST 153 | elevation or depression of > 0.05 mV) 154 | -- Value 2: showing probable or definite left ventricular hypertrophy 155 | by Estes' criteria 156 | 20 ekgmo (month of exercise ECG reading) 157 | 21 ekgday(day of exercise ECG reading) 158 | 22 ekgyr (year of exercise ECG reading) 159 | 23 dig (digitalis used furing exercise ECG: 1 = yes; 0 = no) 160 | 24 prop (Beta blocker used during exercise ECG: 1 = yes; 0 = no) 161 | 25 nitr (nitrates used during exercise ECG: 1 = yes; 0 = no) 162 | 26 pro (calcium channel blocker used during exercise ECG: 1 = yes; 0 = no) 163 | 27 diuretic (diuretic used used during exercise ECG: 1 = yes; 0 = no) 164 | 28 proto: exercise protocol 165 | 1 = Bruce 166 | 2 = Kottus 167 | 3 = McHenry 168 | 4 = fast Balke 169 | 5 = Balke 170 | 6 = Noughton 171 | 7 = bike 150 kpa min/min (Not sure if "kpa min/min" is what was 172 | written!) 
173 | 8 = bike 125 kpa min/min 174 | 9 = bike 100 kpa min/min 175 | 10 = bike 75 kpa min/min 176 | 11 = bike 50 kpa min/min 177 | 12 = arm ergometer 178 | 29 thaldur: duration of exercise test in minutes 179 | 30 thaltime: time when ST measure depression was noted 180 | 31 met: mets achieved 181 | 32 thalach: maximum heart rate achieved 182 | 33 thalrest: resting heart rate 183 | 34 tpeakbps: peak exercise blood pressure (first of 2 parts) 184 | 35 tpeakbpd: peak exercise blood pressure (second of 2 parts) 185 | 36 dummy 186 | 37 trestbpd: resting blood pressure 187 | 38 exang: exercise induced angina (1 = yes; 0 = no) 188 | 39 xhypo: (1 = yes; 0 = no) 189 | 40 oldpeak = ST depression induced by exercise relative to rest 190 | 41 slope: the slope of the peak exercise ST segment 191 | -- Value 1: upsloping 192 | -- Value 2: flat 193 | -- Value 3: downsloping 194 | 42 rldv5: height at rest 195 | 43 rldv5e: height at peak exercise 196 | 44 ca: number of major vessels (0-3) colored by flourosopy 197 | 45 restckm: irrelevant 198 | 46 exerckm: irrelevant 199 | 47 restef: rest raidonuclid (sp?) ejection fraction 200 | 48 restwm: rest wall (sp?) motion abnormality 201 | 0 = none 202 | 1 = mild or moderate 203 | 2 = moderate or severe 204 | 3 = akinesis or dyskmem (sp?) 205 | 49 exeref: exercise radinalid (sp?) ejection fraction 206 | 50 exerwm: exercise wall (sp?) motion 207 | 51 thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 208 | 52 thalsev: not used 209 | 53 thalpul: not used 210 | 54 earlobe: not used 211 | 55 cmo: month of cardiac cath (sp?) (perhaps "call") 212 | 56 cday: day of cardiac cath (sp?) 213 | 57 cyr: year of cardiac cath (sp?) 214 | 58 num: diagnosis of heart disease (angiographic disease status) 215 | -- Value 0: < 50% diameter narrowing 216 | -- Value 1: > 50% diameter narrowing 217 | (in any major vessel: attributes 59 through 68 are vessels) 218 | 59 lmt 219 | 60 ladprox 220 | 61 laddist 221 | 62 diag 222 | 63 cxmain 223 | 64 ramus 224 | 65 om1 225 | 66 om2 226 | 67 rcaprox 227 | 68 rcadist 228 | 69 lvx1: not used 229 | 70 lvx2: not used 230 | 71 lvx3: not used 231 | 72 lvx4: not used 232 | 73 lvf: not used 233 | 74 cathef: not used 234 | 75 junk: not used 235 | 76 name: last name of patient 236 | (I replaced this with the dummy string "name") 237 | 238 | 9. Missing Attribute Values: Several. Distinguished with value -9.0. 239 | 240 | 10. Class Distribution: 241 | Database: 0 1 2 3 4 Total 242 | Cleveland: 164 55 36 35 13 303 243 | Hungarian: 188 37 26 28 15 294 244 | Switzerland: 8 48 32 30 5 123 245 | Long Beach VA: 51 56 41 42 10 200 246 | -------------------------------------------------------------------------------- /uci-heart-disease/.ipynb_checkpoints/heart-disease.names-checkpoint.txt: -------------------------------------------------------------------------------- 1 | Publication Request: 2 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3 | This file describes the contents of the heart-disease directory. 4 | 5 | This directory contains 4 databases concerning heart disease diagnosis. 6 | All attributes are numeric-valued. The data was collected from the 7 | four following locations: 8 | 9 | 1. Cleveland Clinic Foundation (cleveland.data) 10 | 2. Hungarian Institute of Cardiology, Budapest (hungarian.data) 11 | 3. V.A. Medical Center, Long Beach, CA (long-beach-va.data) 12 | 4. University Hospital, Zurich, Switzerland (switzerland.data) 13 | 14 | Each database has the same instance format. 
While the databases have 76 15 | raw attributes, only 14 of them are actually used. Thus I've taken the 16 | liberty of making 2 copies of each database: one with all the attributes 17 | and 1 with the 14 attributes actually used in past experiments. 18 | 19 | The authors of the databases have requested: 20 | 21 | ...that any publications resulting from the use of the data include the 22 | names of the principal investigator responsible for the data collection 23 | at each institution. They would be: 24 | 25 | 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 26 | 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 27 | 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 28 | 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: 29 | Robert Detrano, M.D., Ph.D. 30 | 31 | Thanks in advance for abiding by this request. 32 | 33 | David Aha 34 | July 22, 1988 35 | >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 36 | 37 | 1. Title: Heart Disease Databases 38 | 39 | 2. Source Information: 40 | (a) Creators: 41 | -- 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 42 | -- 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 43 | -- 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 44 | -- 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: 45 | Robert Detrano, M.D., Ph.D. 46 | (b) Donor: David W. Aha (aha@ics.uci.edu) (714) 856-8779 47 | (c) Date: July, 1988 48 | 49 | 3. Past Usage: 50 | 1. Detrano,~R., Janosi,~A., Steinbrunn,~W., Pfisterer,~M., Schmid,~J., 51 | Sandhu,~S., Guppy,~K., Lee,~S., \& Froelicher,~V. (1989). {\it 52 | International application of a new probability algorithm for the 53 | diagnosis of coronary artery disease.} {\it American Journal of 54 | Cardiology}, {\it 64},304--310. 55 | -- International Probability Analysis 56 | -- Address: Robert Detrano, M.D. 57 | Cardiology 111-C 58 | V.A. Medical Center 59 | 5901 E. 7th Street 60 | Long Beach, CA 90028 61 | -- Results in percent accuracy: (for 0.5 probability threshold) 62 | Data Name: CDF CADENZA 63 | -- Hungarian 77 74 64 | Long beach 79 77 65 | Swiss 81 81 66 | -- Approximately a 77% correct classification accuracy with a 67 | logistic-regression-derived discriminant function 68 | 2. David W. Aha & Dennis Kibler 69 | -- 70 | 71 | 72 | -- Instance-based prediction of heart-disease presence with the 73 | Cleveland database 74 | -- NTgrowth: 77.0% accuracy 75 | -- C4: 74.8% accuracy 76 | 3. John Gennari 77 | -- Gennari, J.~H., Langley, P, \& Fisher, D. (1989). Models of 78 | incremental concept formation. {\it Artificial Intelligence, 40}, 79 | 11--61. 80 | -- Results: 81 | -- The CLASSIT conceptual clustering system achieved a 78.9% accuracy 82 | on the Cleveland database. 83 | 84 | 4. Relevant Information: 85 | This database contains 76 attributes, but all published experiments 86 | refer to using a subset of 14 of them. In particular, the Cleveland 87 | database is the only one that has been used by ML researchers to 88 | this date. The "goal" field refers to the presence of heart disease 89 | in the patient. It is integer valued from 0 (no presence) to 4. 90 | Experiments with the Cleveland database have concentrated on simply 91 | attempting to distinguish presence (values 1,2,3,4) from absence (value 92 | 0). 93 | 94 | The names and social security numbers of the patients were recently 95 | removed from the database, replaced with dummy values. 
96 | 97 | One file has been "processed", that one containing the Cleveland 98 | database. All four unprocessed files also exist in this directory. 99 | 100 | 5. Number of Instances: 101 | Database: # of instances: 102 | Cleveland: 303 103 | Hungarian: 294 104 | Switzerland: 123 105 | Long Beach VA: 200 106 | 107 | 6. Number of Attributes: 76 (including the predicted attribute) 108 | 109 | 7. Attribute Information: 110 | -- Only 14 used 111 | -- 1. #3 (age) 112 | -- 2. #4 (sex) 113 | -- 3. #9 (cp) 114 | -- 4. #10 (trestbps) 115 | -- 5. #12 (chol) 116 | -- 6. #16 (fbs) 117 | -- 7. #19 (restecg) 118 | -- 8. #32 (thalach) 119 | -- 9. #38 (exang) 120 | -- 10. #40 (oldpeak) 121 | -- 11. #41 (slope) 122 | -- 12. #44 (ca) 123 | -- 13. #51 (thal) 124 | -- 14. #58 (num) (the predicted attribute) 125 | 126 | -- Complete attribute documentation: 127 | 1 id: patient identification number 128 | 2 ccf: social security number (I replaced this with a dummy value of 0) 129 | 3 age: age in years 130 | 4 sex: sex (1 = male; 0 = female) 131 | 5 painloc: chest pain location (1 = substernal; 0 = otherwise) 132 | 6 painexer (1 = provoked by exertion; 0 = otherwise) 133 | 7 relrest (1 = relieved after rest; 0 = otherwise) 134 | 8 pncaden (sum of 5, 6, and 7) 135 | 9 cp: chest pain type 136 | -- Value 1: typical angina 137 | -- Value 2: atypical angina 138 | -- Value 3: non-anginal pain 139 | -- Value 4: asymptomatic 140 | 10 trestbps: resting blood pressure (in mm Hg on admission to the 141 | hospital) 142 | 11 htn 143 | 12 chol: serum cholestoral in mg/dl 144 | 13 smoke: I believe this is 1 = yes; 0 = no (is or is not a smoker) 145 | 14 cigs (cigarettes per day) 146 | 15 years (number of years as a smoker) 147 | 16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 148 | 17 dm (1 = history of diabetes; 0 = no such history) 149 | 18 famhist: family history of coronary artery disease (1 = yes; 0 = no) 150 | 19 restecg: resting electrocardiographic results 151 | -- Value 0: normal 152 | -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST 153 | elevation or depression of > 0.05 mV) 154 | -- Value 2: showing probable or definite left ventricular hypertrophy 155 | by Estes' criteria 156 | 20 ekgmo (month of exercise ECG reading) 157 | 21 ekgday(day of exercise ECG reading) 158 | 22 ekgyr (year of exercise ECG reading) 159 | 23 dig (digitalis used furing exercise ECG: 1 = yes; 0 = no) 160 | 24 prop (Beta blocker used during exercise ECG: 1 = yes; 0 = no) 161 | 25 nitr (nitrates used during exercise ECG: 1 = yes; 0 = no) 162 | 26 pro (calcium channel blocker used during exercise ECG: 1 = yes; 0 = no) 163 | 27 diuretic (diuretic used used during exercise ECG: 1 = yes; 0 = no) 164 | 28 proto: exercise protocol 165 | 1 = Bruce 166 | 2 = Kottus 167 | 3 = McHenry 168 | 4 = fast Balke 169 | 5 = Balke 170 | 6 = Noughton 171 | 7 = bike 150 kpa min/min (Not sure if "kpa min/min" is what was 172 | written!) 
173 | 8 = bike 125 kpa min/min 174 | 9 = bike 100 kpa min/min 175 | 10 = bike 75 kpa min/min 176 | 11 = bike 50 kpa min/min 177 | 12 = arm ergometer 178 | 29 thaldur: duration of exercise test in minutes 179 | 30 thaltime: time when ST measure depression was noted 180 | 31 met: mets achieved 181 | 32 thalach: maximum heart rate achieved 182 | 33 thalrest: resting heart rate 183 | 34 tpeakbps: peak exercise blood pressure (first of 2 parts) 184 | 35 tpeakbpd: peak exercise blood pressure (second of 2 parts) 185 | 36 dummy 186 | 37 trestbpd: resting blood pressure 187 | 38 exang: exercise induced angina (1 = yes; 0 = no) 188 | 39 xhypo: (1 = yes; 0 = no) 189 | 40 oldpeak = ST depression induced by exercise relative to rest 190 | 41 slope: the slope of the peak exercise ST segment 191 | -- Value 1: upsloping 192 | -- Value 2: flat 193 | -- Value 3: downsloping 194 | 42 rldv5: height at rest 195 | 43 rldv5e: height at peak exercise 196 | 44 ca: number of major vessels (0-3) colored by flourosopy 197 | 45 restckm: irrelevant 198 | 46 exerckm: irrelevant 199 | 47 restef: rest raidonuclid (sp?) ejection fraction 200 | 48 restwm: rest wall (sp?) motion abnormality 201 | 0 = none 202 | 1 = mild or moderate 203 | 2 = moderate or severe 204 | 3 = akinesis or dyskmem (sp?) 205 | 49 exeref: exercise radinalid (sp?) ejection fraction 206 | 50 exerwm: exercise wall (sp?) motion 207 | 51 thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 208 | 52 thalsev: not used 209 | 53 thalpul: not used 210 | 54 earlobe: not used 211 | 55 cmo: month of cardiac cath (sp?) (perhaps "call") 212 | 56 cday: day of cardiac cath (sp?) 213 | 57 cyr: year of cardiac cath (sp?) 214 | 58 num: diagnosis of heart disease (angiographic disease status) 215 | -- Value 0: < 50% diameter narrowing 216 | -- Value 1: > 50% diameter narrowing 217 | (in any major vessel: attributes 59 through 68 are vessels) 218 | 59 lmt 219 | 60 ladprox 220 | 61 laddist 221 | 62 diag 222 | 63 cxmain 223 | 64 ramus 224 | 65 om1 225 | 66 om2 226 | 67 rcaprox 227 | 68 rcadist 228 | 69 lvx1: not used 229 | 70 lvx2: not used 230 | 71 lvx3: not used 231 | 72 lvx4: not used 232 | 73 lvf: not used 233 | 74 cathef: not used 234 | 75 junk: not used 235 | 76 name: last name of patient 236 | (I replaced this with the dummy string "name") 237 | 238 | 9. Missing Attribute Values: Several. Distinguished with value -9.0. 239 | 240 | 10. Class Distribution: 241 | Database: 0 1 2 3 4 Total 242 | Cleveland: 164 55 36 35 13 303 243 | Hungarian: 188 37 26 28 15 294 244 | Switzerland: 8 48 32 30 5 123 245 | Long Beach VA: 51 56 41 42 10 200 246 | -------------------------------------------------------------------------------- /analytics_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Author: Mandis Beigi 5 | # Copyright (c) 2022 Medidata Solutions, Inc. 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 
16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 24 | 25 | 26 | import pandas as pd 27 | import numpy as np 28 | import logging 29 | from numpy import mean 30 | from numpy import std 31 | from scipy.spatial import distance 32 | from collections import Counter 33 | import matplotlib.pyplot as plt 34 | import math 35 | #from pandas_profiling import ProfileReport 36 | 37 | import analytics_lib 38 | import bow_lib 39 | import dimanalysis_lib 40 | import preprocessor_lib 41 | import utilities_lib 42 | import visualization_lib 43 | 44 | 45 | def analyze_missingness(df): 46 | missingness_df = pd.DataFrame(columns=['column name', 'percent missing']) 47 | index = 0 48 | for column in df.columns: 49 | missingness = df[column].isna().sum()/len(df[column]) 50 | logging.info('Column %s missingness: %.2f'%(column, missingness)) 51 | if missingness >= 0.3: 52 | missingness_df.loc[index] = [column, missingness] 53 | index = index + 1 54 | 55 | return(missingness_df) 56 | 57 | 58 | def analyze(source_df, syn_df, config, pdf_page): 59 | source_df = utilities_lib.drop_date_columns(source_df) 60 | syn_df = utilities_lib.drop_date_columns(syn_df) 61 | 62 | logging.info('Analyzing duplicates in the source data.....................') 63 | source_duplicates = source_df[source_df.duplicated()] 64 | dup_len = len(source_duplicates) 65 | if dup_len > 0: 66 | logging.warning("Number of duplicates in the source data: {}".format(dup_len)) 67 | 68 | logging.info('Analyzing missingness of all the source data................') 69 | missingness_df = analyze_missingness(source_df) 70 | missingness_df.to_csv(config.output_dir+config.proj_name+'_missingness_src.csv') 71 | 72 | logging.info('Analyzing missingness of all the synthesized data................') 73 | analyze_missingness(syn_df) 74 | missingness_df = analyze_missingness(syn_df) 75 | missingness_df.to_csv(config.output_dir+config.proj_name+'_missingness_syn.csv') 76 | 77 | source_df = preprocessor_lib.one_hot_encoding_encode(source_df) 78 | syn_df = preprocessor_lib.one_hot_encoding_encode(syn_df) 79 | 80 | if source_df.shape[1] != syn_df.shape[1]: 81 | logging.warning("Real and synthesized data do not have the same number of columns") 82 | 83 | logging.info("source_data: {}".format(source_df.shape)) 84 | logging.info("syn_data: {}".format(syn_df.shape)) 85 | 86 | source_df.dropna(axis=1, how='all', inplace=True) 87 | syn_df.dropna(axis=1, how='all', inplace=True) 88 | #cols_keep = source_df.columns & syn_df.columns 89 | cols_keep = source_df.columns.intersection(syn_df.columns) 90 | 91 | source_df = source_df[cols_keep] 92 | syn_df = syn_df[cols_keep] 93 | 94 | source_df = source_df.replace([np.inf,-np.inf,np.nan],0) 95 | syn_df = syn_df.replace([np.inf,-np.inf,np.nan],0) 96 | source_df = preprocessor_lib.impute_one_hot_encoded_df(source_df) 97 | syn_df = preprocessor_lib.impute_one_hot_encoded_df(syn_df) 98 | 99 | pd.set_option('max_columns', None) 100 | pd.set_option('max_rows', None) 101 | logging.info("-------data types---------") 102 | logging.info(source_df.dtypes) 103 | 104 | num_common_rows = 
len(utilities_lib.get_common_rows(source_df, syn_df)) 105 | if num_common_rows > 0: 106 | logging.warning("The number of common rows between the real data and the synthesized data is: {}".format(num_common_rows)) 107 | else: 108 | logging.info("The number of common rows between the real data and the synthesized data is: {}".format(num_common_rows)) 109 | 110 | logging.info("------Comparing the univariate statistics --------") 111 | logging.info("------Comparing the mean--------------") 112 | boolean_columns, nonboolean_columns = preprocessor_lib.get_boolean_and_nonboolean_columns(source_df) 113 | means = pd.concat([source_df.loc[:, nonboolean_columns].mean(), syn_df.loc[:, nonboolean_columns].mean()], axis=1) 114 | means.columns = ["Real (mean)", "Synthesized (mean)"] 115 | logging.info(means) 116 | means.to_csv(config.output_dir+config.proj_name+'_fid_mean_comparison.csv') 117 | source_df.mean().to_csv(config.output_dir+config.proj_name+'_fid_mean_real.csv') 118 | syn_df.mean().to_csv(config.output_dir+config.proj_name+'_fid_mean_syn.csv') 119 | 120 | num_pages = math.ceil(len(means)/20) 121 | new_means = means.copy().reset_index() 122 | for i in range(num_pages): 123 | plt.clf() 124 | fig, ax = plt.subplots() 125 | ax.axis('off') 126 | index_start = i*20 127 | index_end = index_start + 20 128 | if len(new_means) != 0: 129 | the_table = ax.table(cellText=new_means[index_start:index_end].values,colLabels=new_means[index_start:index_end].columns,loc='center') 130 | pdf_page.savefig(fig) 131 | plt.clf() 132 | 133 | logging.info("------Comparing the median--------------") 134 | medians = pd.concat([source_df.loc[:, nonboolean_columns].median(), syn_df.loc[:, nonboolean_columns].median()], axis=1) 135 | medians.columns = ["Real (median)", "Synthesized (median)"] 136 | logging.info(medians) 137 | medians.to_csv(config.output_dir+config.proj_name+'_fid_median_comparison.csv') 138 | source_df.median().to_csv(config.output_dir+config.proj_name+'_fid_median_real.csv') 139 | syn_df.median().to_csv(config.output_dir+config.proj_name+'_fid_median_syn.csv') 140 | 141 | logging.info("------Covariance of real data--------------") 142 | #logging.info(source_df.cov()) 143 | logging.info("------Covariance of synthesized data--------------") 144 | #logging.info(syn_df.cov()) 145 | source_df.loc[:, nonboolean_columns].cov().to_csv(config.output_dir+config.proj_name+'_fid_cov_real.csv') 146 | syn_df.loc[:, nonboolean_columns].cov().to_csv(config.output_dir+config.proj_name+'_fid_cov_syn.csv') 147 | 148 | p_thresh = 0.05 149 | logging.info("Running Fisher Exact and Kolmogorov-Smirnov tests with p_threshold of {} ............".format(p_thresh)) 150 | dissimilar_cols = analytics_lib.extract_dissimilar_features(source_df, syn_df, p_threshold=p_thresh) 151 | if len(dissimilar_cols) > 0: 152 | logging.warning("Columns with different distributions between the original and the synthetic data: {}".format(dissimilar_cols)) 153 | else: 154 | logging.info("Columns with different distributions between the original and the synthetic data: {}".format(dissimilar_cols)) 155 | 156 | 157 | dissimilar_cols_df = pd.DataFrame(dissimilar_cols, columns=['column name']) 158 | dissimilar_cols_df.to_csv(config.output_dir+config.proj_name+'_dissimilar_cols.csv', index=False) 159 | 160 | num_pages = math.ceil(len(dissimilar_cols_df)/20) 161 | for i in range(num_pages): 162 | plt.clf() 163 | fig, ax = plt.subplots() 164 | ax.axis('off') 165 | index_start = i*20 166 | index_end = index_start + 20 167 | if len(dissimilar_cols_df)==0: 168 | 
the_table = ax.table(cellText=pd.DataFrame(['']).values, 169 | colLabels=['Variables with different distributions from real to synthetic data'],loc='center') 170 | else: 171 | the_table = ax.table(cellText=dissimilar_cols_df[index_start:index_end].values, 172 | colLabels=['Variables with different distributions from real to synthetic data'],loc='center') 173 | plt.title("Variables with different distributions from real to synthetic data") 174 | pdf_page.savefig(fig) 175 | plt.clf() 176 | 177 | logging.info("-----------The p-values-----------------------") 178 | stats = analytics_lib.compare_columns(source_df, syn_df) 179 | logging.info(stats.loc[boolean_columns][['p-value', 'odds ratio']]) 180 | logging.info(stats.loc[nonboolean_columns][['p-value', 'ks statistic']]) 181 | 182 | num_of_rows_source_df = source_df.shape[0] 183 | logging.info("Reduce dimension of data to 2.........") 184 | both_sets = pd.concat([source_df, syn_df], axis=0) 185 | 186 | #both_sets_low_dim = dimanalysis_lib.reduce_tsne(both_sets, n_components=2) 187 | both_sets_low_dim = dimanalysis_lib.reduce_pca(both_sets, n_components=2) 188 | 189 | logging.info("Plotting original data.........") 190 | both_sets_low_dim = both_sets_low_dim.reset_index(drop=True) 191 | visualization_lib.scatter(both_sets_low_dim[0:num_of_rows_source_df], config.output_dir+'figs/'+config.proj_name+'_real_data.svg', hold=True, pdf_page=pdf_page, c='black', alpha=1) 192 | 193 | logging.info("Plotting synthesized data.........") 194 | visualization_lib.scatter(both_sets_low_dim[num_of_rows_source_df:-1], config.output_dir+'figs/'+config.proj_name+'_syn_data.svg', pdf_page=pdf_page, c='red', alpha=0.5) 195 | 196 | logging.info("Calculating the silhouette coefficient between the real and the synthetic data.........") 197 | s = analytics_lib.calculate_silhouette_coef(source_df, syn_df) 198 | message = 'The silhouette score between real and synthesized data: %.3f'%(s) 199 | logging.info(message) 200 | 201 | plt.clf() 202 | fig, ax = plt.subplots() 203 | ax.axis('off') 204 | plt.text(0.05,0.95, message, transform=fig.transFigure, size=10) 205 | pdf_page.savefig(fig) 206 | plt.clf() 207 | 208 | logging.info("Generating the BOW representation..................") 209 | codebook = bow_lib.generate_code_book(both_sets_low_dim, config.cv_bow_num_of_bins) 210 | real_hist = bow_lib.get_histogram(codebook, both_sets_low_dim[0:len(source_df)]) 211 | syn_hist = bow_lib.get_histogram(codebook, both_sets_low_dim[len(source_df):len(both_sets)]) 212 | 213 | message = 'Distance between real and synthesized data is: %.3f'%(distance.euclidean(real_hist, syn_hist)) 214 | logging.info(message) 215 | visualization_lib.bar(real_hist, config.cv_bow_num_of_bins, 216 | config.output_dir+'figs/'+config.proj_name+'_pca_bow_hist_real.svg', pdf_page=pdf_page, hold=True) 217 | visualization_lib.bar(syn_hist, config.cv_bow_num_of_bins, 218 | config.output_dir+'figs/'+config.proj_name+'_pca_bow_hist_syn.svg', pdf_page=pdf_page) 219 | 220 | plt.clf() 221 | fig, ax = plt.subplots() 222 | ax.axis('off') 223 | plt.text(0.05,0.95, message, transform=fig.transFigure, size=10) 224 | pdf_page.savefig(fig) 225 | plt.clf() 226 | 227 | logging.info("Plotting the correlation heatmap for the original data...................") 228 | boolean_columns, nonboolean_columns = preprocessor_lib.get_boolean_and_nonboolean_columns(source_df) 229 | nonboolean_source_df = source_df.loc[:, nonboolean_columns] 230 | visualization_lib.correlation_heatmap(nonboolean_source_df, 
config.output_dir+'figs/'+config.proj_name+'_real_corr.svg', corr='pearson', pdf_page=pdf_page) 231 | 232 | logging.info("Plotting the correlation heatmap for the synthesized data...................") 233 | boolean_columns, nonboolean_columns = preprocessor_lib.get_boolean_and_nonboolean_columns(syn_df) 234 | nonboolean_syn_df = syn_df.loc[:, nonboolean_columns] 235 | visualization_lib.correlation_heatmap(nonboolean_syn_df, config.output_dir+'figs/'+config.proj_name+'_syn_corr.svg', corr='pearson', pdf_page=pdf_page) 236 | 237 | visualization_lib.diff_correlation_heatmap(nonboolean_source_df, nonboolean_syn_df, config.output_dir+'figs/'+config.proj_name+'_diff_corr.svg', corr='pearson', pdf_page=pdf_page) 238 | 239 | 240 | return 241 | 242 | -------------------------------------------------------------------------------- /analytics_lib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Author: Mandis Beigi 5 | # Copyright (c) 2022 Medidata Solutions, Inc. 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 
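#
# analytics_lib.py provides the statistical helpers used by analytics_wrapper.py to
# cross-validate synthesized data against the real data: column-wise comparisons
# (Fisher exact test for boolean columns, Kolmogorov-Smirnov test for continuous ones),
# silhouette scoring, feature selection with an ExtraTreesClassifier, classifier-based
# cross-validation, and Kaplan-Meier survival utilities (preprocess_km, km_wrapper,
# plot_km, km_log_rank).
# Note: train_two_class_classifier_single_fold and train_multiclass_classifier_single_fold
# pass max_iter=1000 and multi_class='auto' to the classifier they construct, so they
# expect a LogisticRegression-style estimator; the RandomForestClassifier default only
# works with the cross-validated train_two_class_classifier / train_multiclass_classifier.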
24 | 25 | 26 | 27 | import os 28 | import sys 29 | import copy 30 | from random import shuffle 31 | import pandas as pd 32 | import numpy as np 33 | import scipy.stats as stats 34 | import logging 35 | 36 | from sklearn.model_selection import train_test_split 37 | from sklearn.model_selection import KFold 38 | from sklearn.model_selection import RepeatedStratifiedKFold 39 | from sklearn.model_selection import cross_val_score 40 | from sklearn.metrics import roc_auc_score, pairwise_distances, accuracy_score 41 | from sklearn.metrics import silhouette_samples, silhouette_score 42 | from sklearn.neighbors import NearestNeighbors 43 | from sklearn.ensemble import RandomForestClassifier 44 | from sklearn.ensemble import ExtraTreesClassifier 45 | from lifelines import KaplanMeierFitter 46 | from lifelines.statistics import logrank_test 47 | import matplotlib.pyplot as plt 48 | 49 | import preprocessor_lib 50 | import utilities_lib 51 | 52 | 53 | def calculate_silhouette_coef(x1, x2): 54 | x_all = pd.concat([x1, x2]) 55 | 56 | y1 = pd.DataFrame(len(x1)*[1]) 57 | y2 = pd.DataFrame(len(x2)*[2]) 58 | y_all = pd.concat([y1, y2]) 59 | 60 | s_1_2 = silhouette_score(x_all, np.ravel(y_all), metric='euclidean') 61 | 62 | return(s_1_2) 63 | 64 | 65 | def remove_correlated_variables(data_df, threshold): 66 | col_corr = set() # Set of all the names of deleted columns 67 | corr_matrix = abs(data_df.corr()) 68 | for i in range(len(corr_matrix.columns)): 69 | for j in range(i): 70 | if (corr_matrix.iloc[i, j] >= threshold) and (corr_matrix.columns[j] not in col_corr): 71 | colname = corr_matrix.columns[i] # getting the name of column 72 | col_corr.add(colname) 73 | if colname in data_df.columns: 74 | del data_df[colname] # deleting the column from the dataset data_df 75 | return data_df 76 | 77 | def train_two_class_classifier(x_df, y_df, classifier_function=RandomForestClassifier, n_splits=10, n_repeats=3): 78 | cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1) 79 | model = classifier_function() 80 | n_aucs = cross_val_score(model, x_df.to_numpy(), y_df.to_numpy().ravel(), scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise') 81 | return(np.mean(n_aucs), np.std(n_aucs)) 82 | 83 | 84 | def train_two_class_classifier_single_fold(x_df, y_df, classifier_function=RandomForestClassifier, test_size=0.2): 85 | x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=test_size) 86 | clf = classifier_function(max_iter=1000, multi_class='auto').fit(x_train, np.ravel(y_train)) 87 | y_pred = clf.predict_proba(x_test) 88 | auc = roc_auc_score(y_test, y_pred[:,1]) 89 | return(auc) 90 | 91 | 92 | def train_multiclass_classifier(x_df, y_df, classifier_function=RandomForestClassifier, n_splits=10, n_repeats=3): 93 | cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1) 94 | model = classifier_function() 95 | n_scores = cross_val_score(model, x_df.to_numpy(), y_df.to_numpy().ravel(), scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise') 96 | return(np.mean(n_scores), np.std(n_scores)) 97 | 98 | 99 | def train_multiclass_classifier_single_fold(x_df, y_df, classifier_function=RandomForestClassifier, test_size=0.2): 100 | x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=test_size) 101 | clf = classifier_function(max_iter=1000, multi_class='auto').fit(x_train, np.ravel(y_train)) 102 | y_pred = clf.predict(x_test) 103 | acc = accuracy_score(y_test, y_pred) 104 | return(acc) 105 | 106 | 107 | def compare_columns(df1, 
df2):
108 | 
109 |     if len(df1) != len(df2):
110 |         logging.warning("The lengths of the columns are not the same")
111 | 
112 |     boolean_variables, nonboolean_variables = preprocessor_lib.get_boolean_and_nonboolean_columns(df1)
113 | 
114 |     stats_df = pd.DataFrame(index = set(boolean_variables) | set(nonboolean_variables),  # union of both column groups
115 |                             columns=['p-value','odds ratio','ks statistic','method'])
116 | 
117 |     for c in boolean_variables:
118 |         contingency_table = [[(df1.loc[:,c]==0).sum(),(df2.loc[:,c]==0).sum()], [(df1.loc[:,c]==1).sum(),(df2.loc[:,c]==1).sum()]]
119 | 
120 |         #high p-value means we don't reject the null hypothesis
121 |         stats_df.loc[c,'odds ratio'], stats_df.loc[c,'p-value'] = stats.fisher_exact(contingency_table)
122 |         stats_df.loc[c,'method'] = 'Fisher Exact Test'
123 | 
124 |     for c in nonboolean_variables:
125 |         #high p-value means they come from the same distribution
126 |         stats_df.loc[c,'ks statistic'], stats_df.loc[c,'p-value'] = stats.ks_2samp(df1.loc[:,c], df2.loc[:,c])
127 |         stats_df.loc[c,'method'] = 'Kolmogorov-Smirnov test'
128 | 
129 |     pd.set_option("display.max_rows", None, "display.max_columns", None)
130 | 
131 |     return(stats_df)
132 | 
133 | # Kolmogorov-Smirnov Test: A P-value of less than 0.05 is considered significant.
134 | # The P-value less than significance level rejects the null hypothesis as we expect
135 | # to see the observed outcome only 5% of the time if the null hypothesis was true.
136 | def extract_dissimilar_features(x1, x2, p_threshold=0.05):
137 | 
138 |     #logging.info("Comparing dataset 1 and dataset 2")
139 |     num_rows = min(len(x1), len(x2))
140 |     stats_df = compare_columns(x1[0:num_rows], x2[0:num_rows])
141 | 
142 |     d = stats_df.loc[(stats_df['p-value'] < p_threshold) & (stats_df['method'] == 'Fisher Exact Test')]
143 |     #cols1 = [utilities_lib.get_feature_name_v2(col) for col in d.index.tolist()]
144 |     cols1 = [col for col in d.index.tolist()]
145 |     d = stats_df.loc[(stats_df['p-value'] < p_threshold) & (stats_df['method'] == 'Kolmogorov-Smirnov test')]
146 |     #cols2 = [utilities_lib.get_feature_name_v2(col) for col in d.index.tolist()]
147 |     cols2 = [col for col in d.index.tolist()]
148 |     cols_1_2 = set(cols1+cols2)
149 |     #logging.info(cols_1_2)
150 | 
151 |     return(list(cols_1_2))
152 | 
153 | 
154 | def select_features(x1, x2, threshold=0.008):
155 |     y1 = pd.DataFrame(len(x1)*[1])
156 |     y2 = pd.DataFrame(len(x2)*[2])
157 |     x_all = pd.concat([x1, x2])
158 |     y_all = pd.concat([y1, y2])
159 | 
160 |     # Build a forest and compute the impurity-based feature importances
161 |     #forest = RandomForestClassifier()
162 |     forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
163 | 
164 |     forest.fit(x_all, np.ravel(y_all))
165 |     importances = forest.feature_importances_
166 |     std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
167 |     indices = np.argsort(importances)[::-1]
168 | 
169 |     # Print the feature ranking
170 |     logging.info("Feature ranking:")
171 | 
172 |     significant_features = []
173 |     nonsignificant_features = []
174 |     for f in range(x_all.shape[1]):
175 |         index = indices[f]
176 |         column_name = x_all.columns[index]
177 |         feature_name = utilities_lib.get_feature_name_v2(column_name)
178 | 
179 |         if importances[indices[f]] > threshold:
180 |             logging.info("%d. 
feature %s (%f)" % (f + 1, column_name, importances[indices[f]])) 181 | if feature_name not in significant_features: 182 | significant_features.append(feature_name) 183 | else: 184 | if feature_name not in nonsignificant_features: 185 | nonsignificant_features.append(feature_name) 186 | 187 | return(significant_features, nonsignificant_features) 188 | 189 | 190 | def preprocess_km(df, event_col='DEATH_FLAG', event_val='True', time_to_event_col='TTE_DEATH', time_to_censor_col='TTE_DEATH'): 191 | encoded_event_col = event_col + '|' + event_val 192 | if time_to_censor_col is not None: 193 | df['survival_drv'] = (utilities_lib.ifelse(df[encoded_event_col].isin([1.0]), 194 | df[time_to_event_col], 195 | df[time_to_censor_col])).values 196 | else: 197 | df['survival_drv'] = df[time_to_event_col].values 198 | 199 | df = df.loc[df['survival_drv'] >= 0, :].reset_index(drop=True) 200 | 201 | df['survival_event'] = pd.Series([0 for x in range(len(df.index))], index=df.index) 202 | df.loc[df[encoded_event_col].isin([1.0]), 'survival_event'] = 1 203 | df.loc[~(df[encoded_event_col].isin([1.0])), 'survival_event'] = 0 204 | return(df) 205 | 206 | 207 | def km_wrapper(df, event_col='survival_event', time_to_event_col='survival_drv', cohort_col = None, tick_interval = 200): 208 | # check for duplicates in subjects 209 | km_df = df.copy() 210 | 211 | # fit to each cohort 212 | if cohort_col is None: 213 | km_df['cohort'] = np.zeros(km_df.shape[0]) 214 | cohort_col = 'cohort' 215 | 216 | out_tables = {} 217 | out_models = {} 218 | grps = km_df[cohort_col].unique() 219 | at_risk_summary = pd.DataFrame() 220 | for i in range(len(grps)): 221 | kmf = KaplanMeierFitter() 222 | T = km_df.loc[km_df[cohort_col] == grps[i], time_to_event_col] 223 | C = km_df.loc[km_df[cohort_col] == grps[i], event_col] 224 | kmf.fit(T, C, label = grps[i]) 225 | 226 | tmp_tbl = kmf.event_table 227 | tmp_tbl = pd.merge(tmp_tbl, 228 | kmf.survival_function_.rename(columns = {str(grps[i]) : 'survival_prob'}), 229 | left_index = True, right_index = True) 230 | tmp_tbl = pd.merge(tmp_tbl, kmf.confidence_interval_.rename(columns = {str(grps[i]) + '_lower_0.95' : 'ci_lower', 231 | str(grps[i]) + '_upper_0.95' : 'ci_upper'}), 232 | left_index = True, right_index = True) 233 | 234 | # get at risk count at tick intervals 235 | missing_intervals = {i for i in range(0, int(max(tmp_tbl.index)+1), tick_interval)}.difference(tmp_tbl.index) 236 | try: 237 | max_val = max(missing_intervals) 238 | except: 239 | max_val = 0 240 | missing_intervals.add(max_val + tick_interval) 241 | at_risk_tbl = pd.merge(tmp_tbl['at_risk'], 242 | pd.DataFrame(index = missing_intervals), 243 | how = 'outer', left_index = True, right_index = True) 244 | at_risk_tbl = at_risk_tbl.sort_index(axis=0).fillna(method='bfill').rename(columns = {'at_risk' : grps[i]}) 245 | at_risk_summary = pd.concat([at_risk_summary, at_risk_tbl.loc[{i for i in range(0, int(max(at_risk_tbl.index)+1), 246 | tick_interval)},:].sort_index(axis=0).transpose()], axis = 0).fillna(0) 247 | 248 | out_tables[grps[i]] = tmp_tbl 249 | 250 | out_models[grps[i]] = {'fitted_km' : kmf, 'label' : grps[i]} 251 | 252 | return(T, C, out_tables, out_models, at_risk_summary) 253 | 254 | 255 | def plot_km(df, event_flag, event_value, time_to_event_col, time_to_censor_col, label_text, c='black'): 256 | df_km = preprocess_km(df, event_col=event_flag, event_val=event_value, 257 | time_to_event_col=time_to_event_col, time_to_censor_col=time_to_censor_col) 258 | T, C, km_tables, km_fit, at_risk_table = km_wrapper(df_km) 
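    # cohort_col was left as None above, so km_wrapper() put every row into a single
    # default cohort labelled 0.0; the fitted KaplanMeierFitter is keyed by that label.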
259 | plot = km_fit[0.0]['fitted_km'].plot(label=label_text, color=c) 260 | #plot.set_xlim(0,5000) 261 | plot.set_xlim(0,1000) 262 | plot.set_ylim(0,1.0) 263 | fig = plot.get_figure() 264 | return(fig) 265 | 266 | def km_log_rank(src_df, syn_df, event_flag, event_value, time_to_event_col, time_to_censor_col): 267 | src_df_km = preprocess_km(src_df, event_col=event_flag, event_val=event_value, 268 | time_to_event_col=time_to_event_col, time_to_censor_col=time_to_censor_col) 269 | src_T, src_C, src_km_tables, src_km_fit, src_at_risk_table = km_wrapper(src_df_km) 270 | 271 | syn_df_km = preprocess_km(syn_df, event_col=event_flag, event_val=event_value, 272 | time_to_event_col=time_to_event_col, time_to_censor_col=time_to_censor_col) 273 | syn_T, syn_C, syn_km_tables, syn_km_fit, syn_at_risk_table = km_wrapper(syn_df_km) 274 | 275 | lr_summary = logrank_test(src_T, syn_T, src_C, syn_C)#, alpha=99) 276 | lr_summary.print_summary() 277 | #print("p_value={} test_statistic={}".format(lr_summary.p_value, lr_summary.test_statistic)) 278 | logging.info("p_value={} test_statistic={}".format(lr_summary.p_value, lr_summary.test_statistic)) 279 | 280 | return 281 | 282 | -------------------------------------------------------------------------------- /uci-heart-disease/.ipynb_checkpoints/processed.cleveland-checkpoint.csv: -------------------------------------------------------------------------------- 1 | 63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0 2 | 67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2 3 | 67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1 4 | 37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0 5 | 41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0 6 | 56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0 7 | 62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3 8 | 57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0 9 | 63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2 10 | 53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1 11 | 57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0 12 | 56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0 13 | 56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2 14 | 44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0 15 | 52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0 16 | 57.0,1.0,3.0,150.0,168.0,0.0,0.0,174.0,0.0,1.6,1.0,0.0,3.0,0 17 | 48.0,1.0,2.0,110.0,229.0,0.0,0.0,168.0,0.0,1.0,3.0,0.0,7.0,1 18 | 54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0 19 | 48.0,0.0,3.0,130.0,275.0,0.0,0.0,139.0,0.0,0.2,1.0,0.0,3.0,0 20 | 49.0,1.0,2.0,130.0,266.0,0.0,0.0,171.0,0.0,0.6,1.0,0.0,3.0,0 21 | 64.0,1.0,1.0,110.0,211.0,0.0,2.0,144.0,1.0,1.8,2.0,0.0,3.0,0 22 | 58.0,0.0,1.0,150.0,283.0,1.0,2.0,162.0,0.0,1.0,1.0,0.0,3.0,0 23 | 58.0,1.0,2.0,120.0,284.0,0.0,2.0,160.0,0.0,1.8,2.0,0.0,3.0,1 24 | 58.0,1.0,3.0,132.0,224.0,0.0,2.0,173.0,0.0,3.2,1.0,2.0,7.0,3 25 | 60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,4 26 | 50.0,0.0,3.0,120.0,219.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,0 27 | 58.0,0.0,3.0,120.0,340.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0 28 | 66.0,0.0,1.0,150.0,226.0,0.0,0.0,114.0,0.0,2.6,3.0,0.0,3.0,0 29 | 43.0,1.0,4.0,150.0,247.0,0.0,0.0,171.0,0.0,1.5,1.0,0.0,3.0,0 30 | 40.0,1.0,4.0,110.0,167.0,0.0,2.0,114.0,1.0,2.0,2.0,0.0,7.0,3 31 | 69.0,0.0,1.0,140.0,239.0,0.0,0.0,151.0,0.0,1.8,1.0,2.0,3.0,0 32 | 60.0,1.0,4.0,117.0,230.0,1.0,0.0,160.0,1.0,1.4,1.0,2.0,7.0,2 33 | 
64.0,1.0,3.0,140.0,335.0,0.0,0.0,158.0,0.0,0.0,1.0,0.0,3.0,1 34 | 59.0,1.0,4.0,135.0,234.0,0.0,0.0,161.0,0.0,0.5,2.0,0.0,7.0,0 35 | 44.0,1.0,3.0,130.0,233.0,0.0,0.0,179.0,1.0,0.4,1.0,0.0,3.0,0 36 | 42.0,1.0,4.0,140.0,226.0,0.0,0.0,178.0,0.0,0.0,1.0,0.0,3.0,0 37 | 43.0,1.0,4.0,120.0,177.0,0.0,2.0,120.0,1.0,2.5,2.0,0.0,7.0,3 38 | 57.0,1.0,4.0,150.0,276.0,0.0,2.0,112.0,1.0,0.6,2.0,1.0,6.0,1 39 | 55.0,1.0,4.0,132.0,353.0,0.0,0.0,132.0,1.0,1.2,2.0,1.0,7.0,3 40 | 61.0,1.0,3.0,150.0,243.0,1.0,0.0,137.0,1.0,1.0,2.0,0.0,3.0,0 41 | 65.0,0.0,4.0,150.0,225.0,0.0,2.0,114.0,0.0,1.0,2.0,3.0,7.0,4 42 | 40.0,1.0,1.0,140.0,199.0,0.0,0.0,178.0,1.0,1.4,1.0,0.0,7.0,0 43 | 71.0,0.0,2.0,160.0,302.0,0.0,0.0,162.0,0.0,0.4,1.0,2.0,3.0,0 44 | 59.0,1.0,3.0,150.0,212.0,1.0,0.0,157.0,0.0,1.6,1.0,0.0,3.0,0 45 | 61.0,0.0,4.0,130.0,330.0,0.0,2.0,169.0,0.0,0.0,1.0,0.0,3.0,1 46 | 58.0,1.0,3.0,112.0,230.0,0.0,2.0,165.0,0.0,2.5,2.0,1.0,7.0,4 47 | 51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0,0 48 | 50.0,1.0,4.0,150.0,243.0,0.0,2.0,128.0,0.0,2.6,2.0,0.0,7.0,4 49 | 65.0,0.0,3.0,140.0,417.0,1.0,2.0,157.0,0.0,0.8,1.0,1.0,3.0,0 50 | 53.0,1.0,3.0,130.0,197.0,1.0,2.0,152.0,0.0,1.2,3.0,0.0,3.0,0 51 | 41.0,0.0,2.0,105.0,198.0,0.0,0.0,168.0,0.0,0.0,1.0,1.0,3.0,0 52 | 65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,0 53 | 44.0,1.0,4.0,112.0,290.0,0.0,2.0,153.0,0.0,0.0,1.0,1.0,3.0,2 54 | 44.0,1.0,2.0,130.0,219.0,0.0,2.0,188.0,0.0,0.0,1.0,0.0,3.0,0 55 | 60.0,1.0,4.0,130.0,253.0,0.0,0.0,144.0,1.0,1.4,1.0,1.0,7.0,1 56 | 54.0,1.0,4.0,124.0,266.0,0.0,2.0,109.0,1.0,2.2,2.0,1.0,7.0,1 57 | 50.0,1.0,3.0,140.0,233.0,0.0,0.0,163.0,0.0,0.6,2.0,1.0,7.0,1 58 | 41.0,1.0,4.0,110.0,172.0,0.0,2.0,158.0,0.0,0.0,1.0,0.0,7.0,1 59 | 54.0,1.0,3.0,125.0,273.0,0.0,2.0,152.0,0.0,0.5,3.0,1.0,3.0,0 60 | 51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0 61 | 51.0,0.0,4.0,130.0,305.0,0.0,0.0,142.0,1.0,1.2,2.0,0.0,7.0,2 62 | 46.0,0.0,3.0,142.0,177.0,0.0,2.0,160.0,1.0,1.4,3.0,0.0,3.0,0 63 | 58.0,1.0,4.0,128.0,216.0,0.0,2.0,131.0,1.0,2.2,2.0,3.0,7.0,1 64 | 54.0,0.0,3.0,135.0,304.0,1.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 65 | 54.0,1.0,4.0,120.0,188.0,0.0,0.0,113.0,0.0,1.4,2.0,1.0,7.0,2 66 | 60.0,1.0,4.0,145.0,282.0,0.0,2.0,142.0,1.0,2.8,2.0,2.0,7.0,2 67 | 60.0,1.0,3.0,140.0,185.0,0.0,2.0,155.0,0.0,3.0,2.0,0.0,3.0,1 68 | 54.0,1.0,3.0,150.0,232.0,0.0,2.0,165.0,0.0,1.6,1.0,0.0,7.0,0 69 | 59.0,1.0,4.0,170.0,326.0,0.0,2.0,140.0,1.0,3.4,3.0,0.0,7.0,2 70 | 46.0,1.0,3.0,150.0,231.0,0.0,0.0,147.0,0.0,3.6,2.0,0.0,3.0,1 71 | 65.0,0.0,3.0,155.0,269.0,0.0,0.0,148.0,0.0,0.8,1.0,0.0,3.0,0 72 | 67.0,1.0,4.0,125.0,254.0,1.0,0.0,163.0,0.0,0.2,2.0,2.0,7.0,3 73 | 62.0,1.0,4.0,120.0,267.0,0.0,0.0,99.0,1.0,1.8,2.0,2.0,7.0,1 74 | 65.0,1.0,4.0,110.0,248.0,0.0,2.0,158.0,0.0,0.6,1.0,2.0,6.0,1 75 | 44.0,1.0,4.0,110.0,197.0,0.0,2.0,177.0,0.0,0.0,1.0,1.0,3.0,1 76 | 65.0,0.0,3.0,160.0,360.0,0.0,2.0,151.0,0.0,0.8,1.0,0.0,3.0,0 77 | 60.0,1.0,4.0,125.0,258.0,0.0,2.0,141.0,1.0,2.8,2.0,1.0,7.0,1 78 | 51.0,0.0,3.0,140.0,308.0,0.0,2.0,142.0,0.0,1.5,1.0,1.0,3.0,0 79 | 48.0,1.0,2.0,130.0,245.0,0.0,2.0,180.0,0.0,0.2,2.0,0.0,3.0,0 80 | 58.0,1.0,4.0,150.0,270.0,0.0,2.0,111.0,1.0,0.8,1.0,0.0,7.0,3 81 | 45.0,1.0,4.0,104.0,208.0,0.0,2.0,148.0,1.0,3.0,2.0,0.0,3.0,0 82 | 53.0,0.0,4.0,130.0,264.0,0.0,2.0,143.0,0.0,0.4,2.0,0.0,3.0,0 83 | 39.0,1.0,3.0,140.0,321.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0 84 | 68.0,1.0,3.0,180.0,274.0,1.0,2.0,150.0,1.0,1.6,2.0,0.0,7.0,3 85 | 52.0,1.0,2.0,120.0,325.0,0.0,0.0,172.0,0.0,0.2,1.0,0.0,3.0,0 86 | 
44.0,1.0,3.0,140.0,235.0,0.0,2.0,180.0,0.0,0.0,1.0,0.0,3.0,0 87 | 47.0,1.0,3.0,138.0,257.0,0.0,2.0,156.0,0.0,0.0,1.0,0.0,3.0,0 88 | 53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,?,0 89 | 53.0,0.0,4.0,138.0,234.0,0.0,2.0,160.0,0.0,0.0,1.0,0.0,3.0,0 90 | 51.0,0.0,3.0,130.0,256.0,0.0,2.0,149.0,0.0,0.5,1.0,0.0,3.0,0 91 | 66.0,1.0,4.0,120.0,302.0,0.0,2.0,151.0,0.0,0.4,2.0,0.0,3.0,0 92 | 62.0,0.0,4.0,160.0,164.0,0.0,2.0,145.0,0.0,6.2,3.0,3.0,7.0,3 93 | 62.0,1.0,3.0,130.0,231.0,0.0,0.0,146.0,0.0,1.8,2.0,3.0,7.0,0 94 | 44.0,0.0,3.0,108.0,141.0,0.0,0.0,175.0,0.0,0.6,2.0,0.0,3.0,0 95 | 63.0,0.0,3.0,135.0,252.0,0.0,2.0,172.0,0.0,0.0,1.0,0.0,3.0,0 96 | 52.0,1.0,4.0,128.0,255.0,0.0,0.0,161.0,1.0,0.0,1.0,1.0,7.0,1 97 | 59.0,1.0,4.0,110.0,239.0,0.0,2.0,142.0,1.0,1.2,2.0,1.0,7.0,2 98 | 60.0,0.0,4.0,150.0,258.0,0.0,2.0,157.0,0.0,2.6,2.0,2.0,7.0,3 99 | 52.0,1.0,2.0,134.0,201.0,0.0,0.0,158.0,0.0,0.8,1.0,1.0,3.0,0 100 | 48.0,1.0,4.0,122.0,222.0,0.0,2.0,186.0,0.0,0.0,1.0,0.0,3.0,0 101 | 45.0,1.0,4.0,115.0,260.0,0.0,2.0,185.0,0.0,0.0,1.0,0.0,3.0,0 102 | 34.0,1.0,1.0,118.0,182.0,0.0,2.0,174.0,0.0,0.0,1.0,0.0,3.0,0 103 | 57.0,0.0,4.0,128.0,303.0,0.0,2.0,159.0,0.0,0.0,1.0,1.0,3.0,0 104 | 71.0,0.0,3.0,110.0,265.0,1.0,2.0,130.0,0.0,0.0,1.0,1.0,3.0,0 105 | 49.0,1.0,3.0,120.0,188.0,0.0,0.0,139.0,0.0,2.0,2.0,3.0,7.0,3 106 | 54.0,1.0,2.0,108.0,309.0,0.0,0.0,156.0,0.0,0.0,1.0,0.0,7.0,0 107 | 59.0,1.0,4.0,140.0,177.0,0.0,0.0,162.0,1.0,0.0,1.0,1.0,7.0,2 108 | 57.0,1.0,3.0,128.0,229.0,0.0,2.0,150.0,0.0,0.4,2.0,1.0,7.0,1 109 | 61.0,1.0,4.0,120.0,260.0,0.0,0.0,140.0,1.0,3.6,2.0,1.0,7.0,2 110 | 39.0,1.0,4.0,118.0,219.0,0.0,0.0,140.0,0.0,1.2,2.0,0.0,7.0,3 111 | 61.0,0.0,4.0,145.0,307.0,0.0,2.0,146.0,1.0,1.0,2.0,0.0,7.0,1 112 | 56.0,1.0,4.0,125.0,249.0,1.0,2.0,144.0,1.0,1.2,2.0,1.0,3.0,1 113 | 52.0,1.0,1.0,118.0,186.0,0.0,2.0,190.0,0.0,0.0,2.0,0.0,6.0,0 114 | 43.0,0.0,4.0,132.0,341.0,1.0,2.0,136.0,1.0,3.0,2.0,0.0,7.0,2 115 | 62.0,0.0,3.0,130.0,263.0,0.0,0.0,97.0,0.0,1.2,2.0,1.0,7.0,2 116 | 41.0,1.0,2.0,135.0,203.0,0.0,0.0,132.0,0.0,0.0,2.0,0.0,6.0,0 117 | 58.0,1.0,3.0,140.0,211.0,1.0,2.0,165.0,0.0,0.0,1.0,0.0,3.0,0 118 | 35.0,0.0,4.0,138.0,183.0,0.0,0.0,182.0,0.0,1.4,1.0,0.0,3.0,0 119 | 63.0,1.0,4.0,130.0,330.0,1.0,2.0,132.0,1.0,1.8,1.0,3.0,7.0,3 120 | 65.0,1.0,4.0,135.0,254.0,0.0,2.0,127.0,0.0,2.8,2.0,1.0,7.0,2 121 | 48.0,1.0,4.0,130.0,256.0,1.0,2.0,150.0,1.0,0.0,1.0,2.0,7.0,3 122 | 63.0,0.0,4.0,150.0,407.0,0.0,2.0,154.0,0.0,4.0,2.0,3.0,7.0,4 123 | 51.0,1.0,3.0,100.0,222.0,0.0,0.0,143.0,1.0,1.2,2.0,0.0,3.0,0 124 | 55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3 125 | 65.0,1.0,1.0,138.0,282.0,1.0,2.0,174.0,0.0,1.4,2.0,1.0,3.0,1 126 | 45.0,0.0,2.0,130.0,234.0,0.0,2.0,175.0,0.0,0.6,2.0,0.0,3.0,0 127 | 56.0,0.0,4.0,200.0,288.0,1.0,2.0,133.0,1.0,4.0,3.0,2.0,7.0,3 128 | 54.0,1.0,4.0,110.0,239.0,0.0,0.0,126.0,1.0,2.8,2.0,1.0,7.0,3 129 | 44.0,1.0,2.0,120.0,220.0,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 130 | 62.0,0.0,4.0,124.0,209.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 131 | 54.0,1.0,3.0,120.0,258.0,0.0,2.0,147.0,0.0,0.4,2.0,0.0,7.0,0 132 | 51.0,1.0,3.0,94.0,227.0,0.0,0.0,154.0,1.0,0.0,1.0,1.0,7.0,0 133 | 29.0,1.0,2.0,130.0,204.0,0.0,2.0,202.0,0.0,0.0,1.0,0.0,3.0,0 134 | 51.0,1.0,4.0,140.0,261.0,0.0,2.0,186.0,1.0,0.0,1.0,0.0,3.0,0 135 | 43.0,0.0,3.0,122.0,213.0,0.0,0.0,165.0,0.0,0.2,2.0,0.0,3.0,0 136 | 55.0,0.0,2.0,135.0,250.0,0.0,2.0,161.0,0.0,1.4,2.0,0.0,3.0,0 137 | 70.0,1.0,4.0,145.0,174.0,0.0,0.0,125.0,1.0,2.6,3.0,0.0,7.0,4 138 | 62.0,1.0,2.0,120.0,281.0,0.0,2.0,103.0,0.0,1.4,2.0,1.0,7.0,3 139 | 
35.0,1.0,4.0,120.0,198.0,0.0,0.0,130.0,1.0,1.6,2.0,0.0,7.0,1 140 | 51.0,1.0,3.0,125.0,245.0,1.0,2.0,166.0,0.0,2.4,2.0,0.0,3.0,0 141 | 59.0,1.0,2.0,140.0,221.0,0.0,0.0,164.0,1.0,0.0,1.0,0.0,3.0,0 142 | 59.0,1.0,1.0,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0,1 143 | 52.0,1.0,2.0,128.0,205.0,1.0,0.0,184.0,0.0,0.0,1.0,0.0,3.0,0 144 | 64.0,1.0,3.0,125.0,309.0,0.0,0.0,131.0,1.0,1.8,2.0,0.0,7.0,1 145 | 58.0,1.0,3.0,105.0,240.0,0.0,2.0,154.0,1.0,0.6,2.0,0.0,7.0,0 146 | 47.0,1.0,3.0,108.0,243.0,0.0,0.0,152.0,0.0,0.0,1.0,0.0,3.0,1 147 | 57.0,1.0,4.0,165.0,289.0,1.0,2.0,124.0,0.0,1.0,2.0,3.0,7.0,4 148 | 41.0,1.0,3.0,112.0,250.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 149 | 45.0,1.0,2.0,128.0,308.0,0.0,2.0,170.0,0.0,0.0,1.0,0.0,3.0,0 150 | 60.0,0.0,3.0,102.0,318.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,0 151 | 52.0,1.0,1.0,152.0,298.0,1.0,0.0,178.0,0.0,1.2,2.0,0.0,7.0,0 152 | 42.0,0.0,4.0,102.0,265.0,0.0,2.0,122.0,0.0,0.6,2.0,0.0,3.0,0 153 | 67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,0 154 | 55.0,1.0,4.0,160.0,289.0,0.0,2.0,145.0,1.0,0.8,2.0,1.0,7.0,4 155 | 64.0,1.0,4.0,120.0,246.0,0.0,2.0,96.0,1.0,2.2,3.0,1.0,3.0,3 156 | 70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,1 157 | 51.0,1.0,4.0,140.0,299.0,0.0,0.0,173.0,1.0,1.6,1.0,0.0,7.0,1 158 | 58.0,1.0,4.0,125.0,300.0,0.0,2.0,171.0,0.0,0.0,1.0,2.0,7.0,1 159 | 60.0,1.0,4.0,140.0,293.0,0.0,2.0,170.0,0.0,1.2,2.0,2.0,7.0,2 160 | 68.0,1.0,3.0,118.0,277.0,0.0,0.0,151.0,0.0,1.0,1.0,1.0,7.0,0 161 | 46.0,1.0,2.0,101.0,197.0,1.0,0.0,156.0,0.0,0.0,1.0,0.0,7.0,0 162 | 77.0,1.0,4.0,125.0,304.0,0.0,2.0,162.0,1.0,0.0,1.0,3.0,3.0,4 163 | 54.0,0.0,3.0,110.0,214.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,0 164 | 58.0,0.0,4.0,100.0,248.0,0.0,2.0,122.0,0.0,1.0,2.0,0.0,3.0,0 165 | 48.0,1.0,3.0,124.0,255.0,1.0,0.0,175.0,0.0,0.0,1.0,2.0,3.0,0 166 | 57.0,1.0,4.0,132.0,207.0,0.0,0.0,168.0,1.0,0.0,1.0,0.0,7.0,0 167 | 52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3.0,0 168 | 54.0,0.0,2.0,132.0,288.0,1.0,2.0,159.0,1.0,0.0,1.0,1.0,3.0,0 169 | 35.0,1.0,4.0,126.0,282.0,0.0,2.0,156.0,1.0,0.0,1.0,0.0,7.0,1 170 | 45.0,0.0,2.0,112.0,160.0,0.0,0.0,138.0,0.0,0.0,2.0,0.0,3.0,0 171 | 70.0,1.0,3.0,160.0,269.0,0.0,0.0,112.0,1.0,2.9,2.0,1.0,7.0,3 172 | 53.0,1.0,4.0,142.0,226.0,0.0,2.0,111.0,1.0,0.0,1.0,0.0,7.0,0 173 | 59.0,0.0,4.0,174.0,249.0,0.0,0.0,143.0,1.0,0.0,2.0,0.0,3.0,1 174 | 62.0,0.0,4.0,140.0,394.0,0.0,2.0,157.0,0.0,1.2,2.0,0.0,3.0,0 175 | 64.0,1.0,4.0,145.0,212.0,0.0,2.0,132.0,0.0,2.0,2.0,2.0,6.0,4 176 | 57.0,1.0,4.0,152.0,274.0,0.0,0.0,88.0,1.0,1.2,2.0,1.0,7.0,1 177 | 52.0,1.0,4.0,108.0,233.0,1.0,0.0,147.0,0.0,0.1,1.0,3.0,7.0,0 178 | 56.0,1.0,4.0,132.0,184.0,0.0,2.0,105.0,1.0,2.1,2.0,1.0,6.0,1 179 | 43.0,1.0,3.0,130.0,315.0,0.0,0.0,162.0,0.0,1.9,1.0,1.0,3.0,0 180 | 53.0,1.0,3.0,130.0,246.0,1.0,2.0,173.0,0.0,0.0,1.0,3.0,3.0,0 181 | 48.0,1.0,4.0,124.0,274.0,0.0,2.0,166.0,0.0,0.5,2.0,0.0,7.0,3 182 | 56.0,0.0,4.0,134.0,409.0,0.0,2.0,150.0,1.0,1.9,2.0,2.0,7.0,2 183 | 42.0,1.0,1.0,148.0,244.0,0.0,2.0,178.0,0.0,0.8,1.0,2.0,3.0,0 184 | 59.0,1.0,1.0,178.0,270.0,0.0,2.0,145.0,0.0,4.2,3.0,0.0,7.0,0 185 | 60.0,0.0,4.0,158.0,305.0,0.0,2.0,161.0,0.0,0.0,1.0,0.0,3.0,1 186 | 63.0,0.0,2.0,140.0,195.0,0.0,0.0,179.0,0.0,0.0,1.0,2.0,3.0,0 187 | 42.0,1.0,3.0,120.0,240.0,1.0,0.0,194.0,0.0,0.8,3.0,0.0,7.0,0 188 | 66.0,1.0,2.0,160.0,246.0,0.0,0.0,120.0,1.0,0.0,2.0,3.0,6.0,2 189 | 54.0,1.0,2.0,192.0,283.0,0.0,2.0,195.0,0.0,0.0,1.0,1.0,7.0,1 190 | 69.0,1.0,3.0,140.0,254.0,0.0,2.0,146.0,0.0,2.0,2.0,3.0,7.0,2 191 | 50.0,1.0,3.0,129.0,196.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 192 | 
51.0,1.0,4.0,140.0,298.0,0.0,0.0,122.0,1.0,4.2,2.0,3.0,7.0,3 193 | 43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,?,7.0,1 194 | 62.0,0.0,4.0,138.0,294.0,1.0,0.0,106.0,0.0,1.9,2.0,3.0,3.0,2 195 | 68.0,0.0,3.0,120.0,211.0,0.0,2.0,115.0,0.0,1.5,2.0,0.0,3.0,0 196 | 67.0,1.0,4.0,100.0,299.0,0.0,2.0,125.0,1.0,0.9,2.0,2.0,3.0,3 197 | 69.0,1.0,1.0,160.0,234.0,1.0,2.0,131.0,0.0,0.1,2.0,1.0,3.0,0 198 | 45.0,0.0,4.0,138.0,236.0,0.0,2.0,152.0,1.0,0.2,2.0,0.0,3.0,0 199 | 50.0,0.0,2.0,120.0,244.0,0.0,0.0,162.0,0.0,1.1,1.0,0.0,3.0,0 200 | 59.0,1.0,1.0,160.0,273.0,0.0,2.0,125.0,0.0,0.0,1.0,0.0,3.0,1 201 | 50.0,0.0,4.0,110.0,254.0,0.0,2.0,159.0,0.0,0.0,1.0,0.0,3.0,0 202 | 64.0,0.0,4.0,180.0,325.0,0.0,0.0,154.0,1.0,0.0,1.0,0.0,3.0,0 203 | 57.0,1.0,3.0,150.0,126.0,1.0,0.0,173.0,0.0,0.2,1.0,1.0,7.0,0 204 | 64.0,0.0,3.0,140.0,313.0,0.0,0.0,133.0,0.0,0.2,1.0,0.0,7.0,0 205 | 43.0,1.0,4.0,110.0,211.0,0.0,0.0,161.0,0.0,0.0,1.0,0.0,7.0,0 206 | 45.0,1.0,4.0,142.0,309.0,0.0,2.0,147.0,1.0,0.0,2.0,3.0,7.0,3 207 | 58.0,1.0,4.0,128.0,259.0,0.0,2.0,130.0,1.0,3.0,2.0,2.0,7.0,3 208 | 50.0,1.0,4.0,144.0,200.0,0.0,2.0,126.0,1.0,0.9,2.0,0.0,7.0,3 209 | 55.0,1.0,2.0,130.0,262.0,0.0,0.0,155.0,0.0,0.0,1.0,0.0,3.0,0 210 | 62.0,0.0,4.0,150.0,244.0,0.0,0.0,154.0,1.0,1.4,2.0,0.0,3.0,1 211 | 37.0,0.0,3.0,120.0,215.0,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 212 | 38.0,1.0,1.0,120.0,231.0,0.0,0.0,182.0,1.0,3.8,2.0,0.0,7.0,4 213 | 41.0,1.0,3.0,130.0,214.0,0.0,2.0,168.0,0.0,2.0,2.0,0.0,3.0,0 214 | 66.0,0.0,4.0,178.0,228.0,1.0,0.0,165.0,1.0,1.0,2.0,2.0,7.0,3 215 | 52.0,1.0,4.0,112.0,230.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,1 216 | 56.0,1.0,1.0,120.0,193.0,0.0,2.0,162.0,0.0,1.9,2.0,0.0,7.0,0 217 | 46.0,0.0,2.0,105.0,204.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0 218 | 46.0,0.0,4.0,138.0,243.0,0.0,2.0,152.0,1.0,0.0,2.0,0.0,3.0,0 219 | 64.0,0.0,4.0,130.0,303.0,0.0,0.0,122.0,0.0,2.0,2.0,2.0,3.0,0 220 | 59.0,1.0,4.0,138.0,271.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0 221 | 41.0,0.0,3.0,112.0,268.0,0.0,2.0,172.0,1.0,0.0,1.0,0.0,3.0,0 222 | 54.0,0.0,3.0,108.0,267.0,0.0,2.0,167.0,0.0,0.0,1.0,0.0,3.0,0 223 | 39.0,0.0,3.0,94.0,199.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 224 | 53.0,1.0,4.0,123.0,282.0,0.0,0.0,95.0,1.0,2.0,2.0,2.0,7.0,3 225 | 63.0,0.0,4.0,108.0,269.0,0.0,0.0,169.0,1.0,1.8,2.0,2.0,3.0,1 226 | 34.0,0.0,2.0,118.0,210.0,0.0,0.0,192.0,0.0,0.7,1.0,0.0,3.0,0 227 | 47.0,1.0,4.0,112.0,204.0,0.0,0.0,143.0,0.0,0.1,1.0,0.0,3.0,0 228 | 67.0,0.0,3.0,152.0,277.0,0.0,0.0,172.0,0.0,0.0,1.0,1.0,3.0,0 229 | 54.0,1.0,4.0,110.0,206.0,0.0,2.0,108.0,1.0,0.0,2.0,1.0,3.0,3 230 | 66.0,1.0,4.0,112.0,212.0,0.0,2.0,132.0,1.0,0.1,1.0,1.0,3.0,2 231 | 52.0,0.0,3.0,136.0,196.0,0.0,2.0,169.0,0.0,0.1,2.0,0.0,3.0,0 232 | 55.0,0.0,4.0,180.0,327.0,0.0,1.0,117.0,1.0,3.4,2.0,0.0,3.0,2 233 | 49.0,1.0,3.0,118.0,149.0,0.0,2.0,126.0,0.0,0.8,1.0,3.0,3.0,1 234 | 74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,0 235 | 54.0,0.0,3.0,160.0,201.0,0.0,0.0,163.0,0.0,0.0,1.0,1.0,3.0,0 236 | 54.0,1.0,4.0,122.0,286.0,0.0,2.0,116.0,1.0,3.2,2.0,2.0,3.0,3 237 | 56.0,1.0,4.0,130.0,283.0,1.0,2.0,103.0,1.0,1.6,3.0,0.0,7.0,2 238 | 46.0,1.0,4.0,120.0,249.0,0.0,2.0,144.0,0.0,0.8,1.0,0.0,7.0,1 239 | 49.0,0.0,2.0,134.0,271.0,0.0,0.0,162.0,0.0,0.0,2.0,0.0,3.0,0 240 | 42.0,1.0,2.0,120.0,295.0,0.0,0.0,162.0,0.0,0.0,1.0,0.0,3.0,0 241 | 41.0,1.0,2.0,110.0,235.0,0.0,0.0,153.0,0.0,0.0,1.0,0.0,3.0,0 242 | 41.0,0.0,2.0,126.0,306.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 243 | 49.0,0.0,4.0,130.0,269.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 244 | 61.0,1.0,1.0,134.0,234.0,0.0,0.0,145.0,0.0,2.6,2.0,2.0,3.0,2 245 | 
60.0,0.0,3.0,120.0,178.0,1.0,0.0,96.0,0.0,0.0,1.0,0.0,3.0,0 246 | 67.0,1.0,4.0,120.0,237.0,0.0,0.0,71.0,0.0,1.0,2.0,0.0,3.0,2 247 | 58.0,1.0,4.0,100.0,234.0,0.0,0.0,156.0,0.0,0.1,1.0,1.0,7.0,2 248 | 47.0,1.0,4.0,110.0,275.0,0.0,2.0,118.0,1.0,1.0,2.0,1.0,3.0,1 249 | 52.0,1.0,4.0,125.0,212.0,0.0,0.0,168.0,0.0,1.0,1.0,2.0,7.0,3 250 | 62.0,1.0,2.0,128.0,208.0,1.0,2.0,140.0,0.0,0.0,1.0,0.0,3.0,0 251 | 57.0,1.0,4.0,110.0,201.0,0.0,0.0,126.0,1.0,1.5,2.0,0.0,6.0,0 252 | 58.0,1.0,4.0,146.0,218.0,0.0,0.0,105.0,0.0,2.0,2.0,1.0,7.0,1 253 | 64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,0 254 | 51.0,0.0,3.0,120.0,295.0,0.0,2.0,157.0,0.0,0.6,1.0,0.0,3.0,0 255 | 43.0,1.0,4.0,115.0,303.0,0.0,0.0,181.0,0.0,1.2,2.0,0.0,3.0,0 256 | 42.0,0.0,3.0,120.0,209.0,0.0,0.0,173.0,0.0,0.0,2.0,0.0,3.0,0 257 | 67.0,0.0,4.0,106.0,223.0,0.0,0.0,142.0,0.0,0.3,1.0,2.0,3.0,0 258 | 76.0,0.0,3.0,140.0,197.0,0.0,1.0,116.0,0.0,1.1,2.0,0.0,3.0,0 259 | 70.0,1.0,2.0,156.0,245.0,0.0,2.0,143.0,0.0,0.0,1.0,0.0,3.0,0 260 | 57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,1 261 | 44.0,0.0,3.0,118.0,242.0,0.0,0.0,149.0,0.0,0.3,2.0,1.0,3.0,0 262 | 58.0,0.0,2.0,136.0,319.0,1.0,2.0,152.0,0.0,0.0,1.0,2.0,3.0,3 263 | 60.0,0.0,1.0,150.0,240.0,0.0,0.0,171.0,0.0,0.9,1.0,0.0,3.0,0 264 | 44.0,1.0,3.0,120.0,226.0,0.0,0.0,169.0,0.0,0.0,1.0,0.0,3.0,0 265 | 61.0,1.0,4.0,138.0,166.0,0.0,2.0,125.0,1.0,3.6,2.0,1.0,3.0,4 266 | 42.0,1.0,4.0,136.0,315.0,0.0,0.0,125.0,1.0,1.8,2.0,0.0,6.0,2 267 | 52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,?,2 268 | 59.0,1.0,3.0,126.0,218.0,1.0,0.0,134.0,0.0,2.2,2.0,1.0,6.0,2 269 | 40.0,1.0,4.0,152.0,223.0,0.0,0.0,181.0,0.0,0.0,1.0,0.0,7.0,1 270 | 42.0,1.0,3.0,130.0,180.0,0.0,0.0,150.0,0.0,0.0,1.0,0.0,3.0,0 271 | 61.0,1.0,4.0,140.0,207.0,0.0,2.0,138.0,1.0,1.9,1.0,1.0,7.0,1 272 | 66.0,1.0,4.0,160.0,228.0,0.0,2.0,138.0,0.0,2.3,1.0,0.0,6.0,0 273 | 46.0,1.0,4.0,140.0,311.0,0.0,0.0,120.0,1.0,1.8,2.0,2.0,7.0,2 274 | 71.0,0.0,4.0,112.0,149.0,0.0,0.0,125.0,0.0,1.6,2.0,0.0,3.0,0 275 | 59.0,1.0,1.0,134.0,204.0,0.0,0.0,162.0,0.0,0.8,1.0,2.0,3.0,1 276 | 64.0,1.0,1.0,170.0,227.0,0.0,2.0,155.0,0.0,0.6,2.0,0.0,7.0,0 277 | 66.0,0.0,3.0,146.0,278.0,0.0,2.0,152.0,0.0,0.0,2.0,1.0,3.0,0 278 | 39.0,0.0,3.0,138.0,220.0,0.0,0.0,152.0,0.0,0.0,2.0,0.0,3.0,0 279 | 57.0,1.0,2.0,154.0,232.0,0.0,2.0,164.0,0.0,0.0,1.0,1.0,3.0,1 280 | 58.0,0.0,4.0,130.0,197.0,0.0,0.0,131.0,0.0,0.6,2.0,0.0,3.0,0 281 | 57.0,1.0,4.0,110.0,335.0,0.0,0.0,143.0,1.0,3.0,2.0,1.0,7.0,2 282 | 47.0,1.0,3.0,130.0,253.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 283 | 55.0,0.0,4.0,128.0,205.0,0.0,1.0,130.0,1.0,2.0,2.0,1.0,7.0,3 284 | 35.0,1.0,2.0,122.0,192.0,0.0,0.0,174.0,0.0,0.0,1.0,0.0,3.0,0 285 | 61.0,1.0,4.0,148.0,203.0,0.0,0.0,161.0,0.0,0.0,1.0,1.0,7.0,2 286 | 58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,4 287 | 58.0,0.0,4.0,170.0,225.0,1.0,2.0,146.0,1.0,2.8,2.0,2.0,6.0,2 288 | 58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0 289 | 56.0,1.0,2.0,130.0,221.0,0.0,2.0,163.0,0.0,0.0,1.0,0.0,7.0,0 290 | 56.0,1.0,2.0,120.0,240.0,0.0,0.0,169.0,0.0,0.0,3.0,0.0,3.0,0 291 | 67.0,1.0,3.0,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0,1 292 | 55.0,0.0,2.0,132.0,342.0,0.0,0.0,166.0,0.0,1.2,1.0,0.0,3.0,0 293 | 44.0,1.0,4.0,120.0,169.0,0.0,0.0,144.0,1.0,2.8,3.0,0.0,6.0,2 294 | 63.0,1.0,4.0,140.0,187.0,0.0,2.0,144.0,1.0,4.0,1.0,2.0,7.0,2 295 | 63.0,0.0,4.0,124.0,197.0,0.0,0.0,136.0,1.0,0.0,2.0,0.0,3.0,1 296 | 41.0,1.0,2.0,120.0,157.0,0.0,0.0,182.0,0.0,0.0,1.0,0.0,3.0,0 297 | 59.0,1.0,4.0,164.0,176.0,1.0,2.0,90.0,0.0,1.0,2.0,2.0,6.0,3 298 | 
57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1 299 | 45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1 300 | 68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2 301 | 57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3 302 | 57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1 303 | 38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0 304 | -------------------------------------------------------------------------------- /uci-heart-disease/processed.cleveland.csv: -------------------------------------------------------------------------------- 1 | age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num 2 | 63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0 3 | 67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2 4 | 67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1 5 | 37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0 6 | 41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0 7 | 56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0 8 | 62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3 9 | 57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0 10 | 63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2 11 | 53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1 12 | 57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0 13 | 56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0 14 | 56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2 15 | 44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0 16 | 52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0 17 | 57.0,1.0,3.0,150.0,168.0,0.0,0.0,174.0,0.0,1.6,1.0,0.0,3.0,0 18 | 48.0,1.0,2.0,110.0,229.0,0.0,0.0,168.0,0.0,1.0,3.0,0.0,7.0,1 19 | 54.0,1.0,4.0,140.0,239.0,0.0,0.0,160.0,0.0,1.2,1.0,0.0,3.0,0 20 | 48.0,0.0,3.0,130.0,275.0,0.0,0.0,139.0,0.0,0.2,1.0,0.0,3.0,0 21 | 49.0,1.0,2.0,130.0,266.0,0.0,0.0,171.0,0.0,0.6,1.0,0.0,3.0,0 22 | 64.0,1.0,1.0,110.0,211.0,0.0,2.0,144.0,1.0,1.8,2.0,0.0,3.0,0 23 | 58.0,0.0,1.0,150.0,283.0,1.0,2.0,162.0,0.0,1.0,1.0,0.0,3.0,0 24 | 58.0,1.0,2.0,120.0,284.0,0.0,2.0,160.0,0.0,1.8,2.0,0.0,3.0,1 25 | 58.0,1.0,3.0,132.0,224.0,0.0,2.0,173.0,0.0,3.2,1.0,2.0,7.0,3 26 | 60.0,1.0,4.0,130.0,206.0,0.0,2.0,132.0,1.0,2.4,2.0,2.0,7.0,4 27 | 50.0,0.0,3.0,120.0,219.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,0 28 | 58.0,0.0,3.0,120.0,340.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0 29 | 66.0,0.0,1.0,150.0,226.0,0.0,0.0,114.0,0.0,2.6,3.0,0.0,3.0,0 30 | 43.0,1.0,4.0,150.0,247.0,0.0,0.0,171.0,0.0,1.5,1.0,0.0,3.0,0 31 | 40.0,1.0,4.0,110.0,167.0,0.0,2.0,114.0,1.0,2.0,2.0,0.0,7.0,3 32 | 69.0,0.0,1.0,140.0,239.0,0.0,0.0,151.0,0.0,1.8,1.0,2.0,3.0,0 33 | 60.0,1.0,4.0,117.0,230.0,1.0,0.0,160.0,1.0,1.4,1.0,2.0,7.0,2 34 | 64.0,1.0,3.0,140.0,335.0,0.0,0.0,158.0,0.0,0.0,1.0,0.0,3.0,1 35 | 59.0,1.0,4.0,135.0,234.0,0.0,0.0,161.0,0.0,0.5,2.0,0.0,7.0,0 36 | 44.0,1.0,3.0,130.0,233.0,0.0,0.0,179.0,1.0,0.4,1.0,0.0,3.0,0 37 | 42.0,1.0,4.0,140.0,226.0,0.0,0.0,178.0,0.0,0.0,1.0,0.0,3.0,0 38 | 43.0,1.0,4.0,120.0,177.0,0.0,2.0,120.0,1.0,2.5,2.0,0.0,7.0,3 39 | 57.0,1.0,4.0,150.0,276.0,0.0,2.0,112.0,1.0,0.6,2.0,1.0,6.0,1 40 | 55.0,1.0,4.0,132.0,353.0,0.0,0.0,132.0,1.0,1.2,2.0,1.0,7.0,3 41 | 61.0,1.0,3.0,150.0,243.0,1.0,0.0,137.0,1.0,1.0,2.0,0.0,3.0,0 42 | 65.0,0.0,4.0,150.0,225.0,0.0,2.0,114.0,0.0,1.0,2.0,3.0,7.0,4 43 | 40.0,1.0,1.0,140.0,199.0,0.0,0.0,178.0,1.0,1.4,1.0,0.0,7.0,0 44 | 71.0,0.0,2.0,160.0,302.0,0.0,0.0,162.0,0.0,0.4,1.0,2.0,3.0,0 45 | 
59.0,1.0,3.0,150.0,212.0,1.0,0.0,157.0,0.0,1.6,1.0,0.0,3.0,0 46 | 61.0,0.0,4.0,130.0,330.0,0.0,2.0,169.0,0.0,0.0,1.0,0.0,3.0,1 47 | 58.0,1.0,3.0,112.0,230.0,0.0,2.0,165.0,0.0,2.5,2.0,1.0,7.0,4 48 | 51.0,1.0,3.0,110.0,175.0,0.0,0.0,123.0,0.0,0.6,1.0,0.0,3.0,0 49 | 50.0,1.0,4.0,150.0,243.0,0.0,2.0,128.0,0.0,2.6,2.0,0.0,7.0,4 50 | 65.0,0.0,3.0,140.0,417.0,1.0,2.0,157.0,0.0,0.8,1.0,1.0,3.0,0 51 | 53.0,1.0,3.0,130.0,197.0,1.0,2.0,152.0,0.0,1.2,3.0,0.0,3.0,0 52 | 41.0,0.0,2.0,105.0,198.0,0.0,0.0,168.0,0.0,0.0,1.0,1.0,3.0,0 53 | 65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,0 54 | 44.0,1.0,4.0,112.0,290.0,0.0,2.0,153.0,0.0,0.0,1.0,1.0,3.0,2 55 | 44.0,1.0,2.0,130.0,219.0,0.0,2.0,188.0,0.0,0.0,1.0,0.0,3.0,0 56 | 60.0,1.0,4.0,130.0,253.0,0.0,0.0,144.0,1.0,1.4,1.0,1.0,7.0,1 57 | 54.0,1.0,4.0,124.0,266.0,0.0,2.0,109.0,1.0,2.2,2.0,1.0,7.0,1 58 | 50.0,1.0,3.0,140.0,233.0,0.0,0.0,163.0,0.0,0.6,2.0,1.0,7.0,1 59 | 41.0,1.0,4.0,110.0,172.0,0.0,2.0,158.0,0.0,0.0,1.0,0.0,7.0,1 60 | 54.0,1.0,3.0,125.0,273.0,0.0,2.0,152.0,0.0,0.5,3.0,1.0,3.0,0 61 | 51.0,1.0,1.0,125.0,213.0,0.0,2.0,125.0,1.0,1.4,1.0,1.0,3.0,0 62 | 51.0,0.0,4.0,130.0,305.0,0.0,0.0,142.0,1.0,1.2,2.0,0.0,7.0,2 63 | 46.0,0.0,3.0,142.0,177.0,0.0,2.0,160.0,1.0,1.4,3.0,0.0,3.0,0 64 | 58.0,1.0,4.0,128.0,216.0,0.0,2.0,131.0,1.0,2.2,2.0,3.0,7.0,1 65 | 54.0,0.0,3.0,135.0,304.0,1.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 66 | 54.0,1.0,4.0,120.0,188.0,0.0,0.0,113.0,0.0,1.4,2.0,1.0,7.0,2 67 | 60.0,1.0,4.0,145.0,282.0,0.0,2.0,142.0,1.0,2.8,2.0,2.0,7.0,2 68 | 60.0,1.0,3.0,140.0,185.0,0.0,2.0,155.0,0.0,3.0,2.0,0.0,3.0,1 69 | 54.0,1.0,3.0,150.0,232.0,0.0,2.0,165.0,0.0,1.6,1.0,0.0,7.0,0 70 | 59.0,1.0,4.0,170.0,326.0,0.0,2.0,140.0,1.0,3.4,3.0,0.0,7.0,2 71 | 46.0,1.0,3.0,150.0,231.0,0.0,0.0,147.0,0.0,3.6,2.0,0.0,3.0,1 72 | 65.0,0.0,3.0,155.0,269.0,0.0,0.0,148.0,0.0,0.8,1.0,0.0,3.0,0 73 | 67.0,1.0,4.0,125.0,254.0,1.0,0.0,163.0,0.0,0.2,2.0,2.0,7.0,3 74 | 62.0,1.0,4.0,120.0,267.0,0.0,0.0,99.0,1.0,1.8,2.0,2.0,7.0,1 75 | 65.0,1.0,4.0,110.0,248.0,0.0,2.0,158.0,0.0,0.6,1.0,2.0,6.0,1 76 | 44.0,1.0,4.0,110.0,197.0,0.0,2.0,177.0,0.0,0.0,1.0,1.0,3.0,1 77 | 65.0,0.0,3.0,160.0,360.0,0.0,2.0,151.0,0.0,0.8,1.0,0.0,3.0,0 78 | 60.0,1.0,4.0,125.0,258.0,0.0,2.0,141.0,1.0,2.8,2.0,1.0,7.0,1 79 | 51.0,0.0,3.0,140.0,308.0,0.0,2.0,142.0,0.0,1.5,1.0,1.0,3.0,0 80 | 48.0,1.0,2.0,130.0,245.0,0.0,2.0,180.0,0.0,0.2,2.0,0.0,3.0,0 81 | 58.0,1.0,4.0,150.0,270.0,0.0,2.0,111.0,1.0,0.8,1.0,0.0,7.0,3 82 | 45.0,1.0,4.0,104.0,208.0,0.0,2.0,148.0,1.0,3.0,2.0,0.0,3.0,0 83 | 53.0,0.0,4.0,130.0,264.0,0.0,2.0,143.0,0.0,0.4,2.0,0.0,3.0,0 84 | 39.0,1.0,3.0,140.0,321.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0 85 | 68.0,1.0,3.0,180.0,274.0,1.0,2.0,150.0,1.0,1.6,2.0,0.0,7.0,3 86 | 52.0,1.0,2.0,120.0,325.0,0.0,0.0,172.0,0.0,0.2,1.0,0.0,3.0,0 87 | 44.0,1.0,3.0,140.0,235.0,0.0,2.0,180.0,0.0,0.0,1.0,0.0,3.0,0 88 | 47.0,1.0,3.0,138.0,257.0,0.0,2.0,156.0,0.0,0.0,1.0,0.0,3.0,0 89 | 53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,,0 90 | 53.0,0.0,4.0,138.0,234.0,0.0,2.0,160.0,0.0,0.0,1.0,0.0,3.0,0 91 | 51.0,0.0,3.0,130.0,256.0,0.0,2.0,149.0,0.0,0.5,1.0,0.0,3.0,0 92 | 66.0,1.0,4.0,120.0,302.0,0.0,2.0,151.0,0.0,0.4,2.0,0.0,3.0,0 93 | 62.0,0.0,4.0,160.0,164.0,0.0,2.0,145.0,0.0,6.2,3.0,3.0,7.0,3 94 | 62.0,1.0,3.0,130.0,231.0,0.0,0.0,146.0,0.0,1.8,2.0,3.0,7.0,0 95 | 44.0,0.0,3.0,108.0,141.0,0.0,0.0,175.0,0.0,0.6,2.0,0.0,3.0,0 96 | 63.0,0.0,3.0,135.0,252.0,0.0,2.0,172.0,0.0,0.0,1.0,0.0,3.0,0 97 | 52.0,1.0,4.0,128.0,255.0,0.0,0.0,161.0,1.0,0.0,1.0,1.0,7.0,1 98 | 59.0,1.0,4.0,110.0,239.0,0.0,2.0,142.0,1.0,1.2,2.0,1.0,7.0,2 
99 | 60.0,0.0,4.0,150.0,258.0,0.0,2.0,157.0,0.0,2.6,2.0,2.0,7.0,3 100 | 52.0,1.0,2.0,134.0,201.0,0.0,0.0,158.0,0.0,0.8,1.0,1.0,3.0,0 101 | 48.0,1.0,4.0,122.0,222.0,0.0,2.0,186.0,0.0,0.0,1.0,0.0,3.0,0 102 | 45.0,1.0,4.0,115.0,260.0,0.0,2.0,185.0,0.0,0.0,1.0,0.0,3.0,0 103 | 34.0,1.0,1.0,118.0,182.0,0.0,2.0,174.0,0.0,0.0,1.0,0.0,3.0,0 104 | 57.0,0.0,4.0,128.0,303.0,0.0,2.0,159.0,0.0,0.0,1.0,1.0,3.0,0 105 | 71.0,0.0,3.0,110.0,265.0,1.0,2.0,130.0,0.0,0.0,1.0,1.0,3.0,0 106 | 49.0,1.0,3.0,120.0,188.0,0.0,0.0,139.0,0.0,2.0,2.0,3.0,7.0,3 107 | 54.0,1.0,2.0,108.0,309.0,0.0,0.0,156.0,0.0,0.0,1.0,0.0,7.0,0 108 | 59.0,1.0,4.0,140.0,177.0,0.0,0.0,162.0,1.0,0.0,1.0,1.0,7.0,2 109 | 57.0,1.0,3.0,128.0,229.0,0.0,2.0,150.0,0.0,0.4,2.0,1.0,7.0,1 110 | 61.0,1.0,4.0,120.0,260.0,0.0,0.0,140.0,1.0,3.6,2.0,1.0,7.0,2 111 | 39.0,1.0,4.0,118.0,219.0,0.0,0.0,140.0,0.0,1.2,2.0,0.0,7.0,3 112 | 61.0,0.0,4.0,145.0,307.0,0.0,2.0,146.0,1.0,1.0,2.0,0.0,7.0,1 113 | 56.0,1.0,4.0,125.0,249.0,1.0,2.0,144.0,1.0,1.2,2.0,1.0,3.0,1 114 | 52.0,1.0,1.0,118.0,186.0,0.0,2.0,190.0,0.0,0.0,2.0,0.0,6.0,0 115 | 43.0,0.0,4.0,132.0,341.0,1.0,2.0,136.0,1.0,3.0,2.0,0.0,7.0,2 116 | 62.0,0.0,3.0,130.0,263.0,0.0,0.0,97.0,0.0,1.2,2.0,1.0,7.0,2 117 | 41.0,1.0,2.0,135.0,203.0,0.0,0.0,132.0,0.0,0.0,2.0,0.0,6.0,0 118 | 58.0,1.0,3.0,140.0,211.0,1.0,2.0,165.0,0.0,0.0,1.0,0.0,3.0,0 119 | 35.0,0.0,4.0,138.0,183.0,0.0,0.0,182.0,0.0,1.4,1.0,0.0,3.0,0 120 | 63.0,1.0,4.0,130.0,330.0,1.0,2.0,132.0,1.0,1.8,1.0,3.0,7.0,3 121 | 65.0,1.0,4.0,135.0,254.0,0.0,2.0,127.0,0.0,2.8,2.0,1.0,7.0,2 122 | 48.0,1.0,4.0,130.0,256.0,1.0,2.0,150.0,1.0,0.0,1.0,2.0,7.0,3 123 | 63.0,0.0,4.0,150.0,407.0,0.0,2.0,154.0,0.0,4.0,2.0,3.0,7.0,4 124 | 51.0,1.0,3.0,100.0,222.0,0.0,0.0,143.0,1.0,1.2,2.0,0.0,3.0,0 125 | 55.0,1.0,4.0,140.0,217.0,0.0,0.0,111.0,1.0,5.6,3.0,0.0,7.0,3 126 | 65.0,1.0,1.0,138.0,282.0,1.0,2.0,174.0,0.0,1.4,2.0,1.0,3.0,1 127 | 45.0,0.0,2.0,130.0,234.0,0.0,2.0,175.0,0.0,0.6,2.0,0.0,3.0,0 128 | 56.0,0.0,4.0,200.0,288.0,1.0,2.0,133.0,1.0,4.0,3.0,2.0,7.0,3 129 | 54.0,1.0,4.0,110.0,239.0,0.0,0.0,126.0,1.0,2.8,2.0,1.0,7.0,3 130 | 44.0,1.0,2.0,120.0,220.0,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 131 | 62.0,0.0,4.0,124.0,209.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 132 | 54.0,1.0,3.0,120.0,258.0,0.0,2.0,147.0,0.0,0.4,2.0,0.0,7.0,0 133 | 51.0,1.0,3.0,94.0,227.0,0.0,0.0,154.0,1.0,0.0,1.0,1.0,7.0,0 134 | 29.0,1.0,2.0,130.0,204.0,0.0,2.0,202.0,0.0,0.0,1.0,0.0,3.0,0 135 | 51.0,1.0,4.0,140.0,261.0,0.0,2.0,186.0,1.0,0.0,1.0,0.0,3.0,0 136 | 43.0,0.0,3.0,122.0,213.0,0.0,0.0,165.0,0.0,0.2,2.0,0.0,3.0,0 137 | 55.0,0.0,2.0,135.0,250.0,0.0,2.0,161.0,0.0,1.4,2.0,0.0,3.0,0 138 | 70.0,1.0,4.0,145.0,174.0,0.0,0.0,125.0,1.0,2.6,3.0,0.0,7.0,4 139 | 62.0,1.0,2.0,120.0,281.0,0.0,2.0,103.0,0.0,1.4,2.0,1.0,7.0,3 140 | 35.0,1.0,4.0,120.0,198.0,0.0,0.0,130.0,1.0,1.6,2.0,0.0,7.0,1 141 | 51.0,1.0,3.0,125.0,245.0,1.0,2.0,166.0,0.0,2.4,2.0,0.0,3.0,0 142 | 59.0,1.0,2.0,140.0,221.0,0.0,0.0,164.0,1.0,0.0,1.0,0.0,3.0,0 143 | 59.0,1.0,1.0,170.0,288.0,0.0,2.0,159.0,0.0,0.2,2.0,0.0,7.0,1 144 | 52.0,1.0,2.0,128.0,205.0,1.0,0.0,184.0,0.0,0.0,1.0,0.0,3.0,0 145 | 64.0,1.0,3.0,125.0,309.0,0.0,0.0,131.0,1.0,1.8,2.0,0.0,7.0,1 146 | 58.0,1.0,3.0,105.0,240.0,0.0,2.0,154.0,1.0,0.6,2.0,0.0,7.0,0 147 | 47.0,1.0,3.0,108.0,243.0,0.0,0.0,152.0,0.0,0.0,1.0,0.0,3.0,1 148 | 57.0,1.0,4.0,165.0,289.0,1.0,2.0,124.0,0.0,1.0,2.0,3.0,7.0,4 149 | 41.0,1.0,3.0,112.0,250.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 150 | 45.0,1.0,2.0,128.0,308.0,0.0,2.0,170.0,0.0,0.0,1.0,0.0,3.0,0 151 | 60.0,0.0,3.0,102.0,318.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,0 152 | 
52.0,1.0,1.0,152.0,298.0,1.0,0.0,178.0,0.0,1.2,2.0,0.0,7.0,0 153 | 42.0,0.0,4.0,102.0,265.0,0.0,2.0,122.0,0.0,0.6,2.0,0.0,3.0,0 154 | 67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,0 155 | 55.0,1.0,4.0,160.0,289.0,0.0,2.0,145.0,1.0,0.8,2.0,1.0,7.0,4 156 | 64.0,1.0,4.0,120.0,246.0,0.0,2.0,96.0,1.0,2.2,3.0,1.0,3.0,3 157 | 70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,1 158 | 51.0,1.0,4.0,140.0,299.0,0.0,0.0,173.0,1.0,1.6,1.0,0.0,7.0,1 159 | 58.0,1.0,4.0,125.0,300.0,0.0,2.0,171.0,0.0,0.0,1.0,2.0,7.0,1 160 | 60.0,1.0,4.0,140.0,293.0,0.0,2.0,170.0,0.0,1.2,2.0,2.0,7.0,2 161 | 68.0,1.0,3.0,118.0,277.0,0.0,0.0,151.0,0.0,1.0,1.0,1.0,7.0,0 162 | 46.0,1.0,2.0,101.0,197.0,1.0,0.0,156.0,0.0,0.0,1.0,0.0,7.0,0 163 | 77.0,1.0,4.0,125.0,304.0,0.0,2.0,162.0,1.0,0.0,1.0,3.0,3.0,4 164 | 54.0,0.0,3.0,110.0,214.0,0.0,0.0,158.0,0.0,1.6,2.0,0.0,3.0,0 165 | 58.0,0.0,4.0,100.0,248.0,0.0,2.0,122.0,0.0,1.0,2.0,0.0,3.0,0 166 | 48.0,1.0,3.0,124.0,255.0,1.0,0.0,175.0,0.0,0.0,1.0,2.0,3.0,0 167 | 57.0,1.0,4.0,132.0,207.0,0.0,0.0,168.0,1.0,0.0,1.0,0.0,7.0,0 168 | 52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,,3.0,0 169 | 54.0,0.0,2.0,132.0,288.0,1.0,2.0,159.0,1.0,0.0,1.0,1.0,3.0,0 170 | 35.0,1.0,4.0,126.0,282.0,0.0,2.0,156.0,1.0,0.0,1.0,0.0,7.0,1 171 | 45.0,0.0,2.0,112.0,160.0,0.0,0.0,138.0,0.0,0.0,2.0,0.0,3.0,0 172 | 70.0,1.0,3.0,160.0,269.0,0.0,0.0,112.0,1.0,2.9,2.0,1.0,7.0,3 173 | 53.0,1.0,4.0,142.0,226.0,0.0,2.0,111.0,1.0,0.0,1.0,0.0,7.0,0 174 | 59.0,0.0,4.0,174.0,249.0,0.0,0.0,143.0,1.0,0.0,2.0,0.0,3.0,1 175 | 62.0,0.0,4.0,140.0,394.0,0.0,2.0,157.0,0.0,1.2,2.0,0.0,3.0,0 176 | 64.0,1.0,4.0,145.0,212.0,0.0,2.0,132.0,0.0,2.0,2.0,2.0,6.0,4 177 | 57.0,1.0,4.0,152.0,274.0,0.0,0.0,88.0,1.0,1.2,2.0,1.0,7.0,1 178 | 52.0,1.0,4.0,108.0,233.0,1.0,0.0,147.0,0.0,0.1,1.0,3.0,7.0,0 179 | 56.0,1.0,4.0,132.0,184.0,0.0,2.0,105.0,1.0,2.1,2.0,1.0,6.0,1 180 | 43.0,1.0,3.0,130.0,315.0,0.0,0.0,162.0,0.0,1.9,1.0,1.0,3.0,0 181 | 53.0,1.0,3.0,130.0,246.0,1.0,2.0,173.0,0.0,0.0,1.0,3.0,3.0,0 182 | 48.0,1.0,4.0,124.0,274.0,0.0,2.0,166.0,0.0,0.5,2.0,0.0,7.0,3 183 | 56.0,0.0,4.0,134.0,409.0,0.0,2.0,150.0,1.0,1.9,2.0,2.0,7.0,2 184 | 42.0,1.0,1.0,148.0,244.0,0.0,2.0,178.0,0.0,0.8,1.0,2.0,3.0,0 185 | 59.0,1.0,1.0,178.0,270.0,0.0,2.0,145.0,0.0,4.2,3.0,0.0,7.0,0 186 | 60.0,0.0,4.0,158.0,305.0,0.0,2.0,161.0,0.0,0.0,1.0,0.0,3.0,1 187 | 63.0,0.0,2.0,140.0,195.0,0.0,0.0,179.0,0.0,0.0,1.0,2.0,3.0,0 188 | 42.0,1.0,3.0,120.0,240.0,1.0,0.0,194.0,0.0,0.8,3.0,0.0,7.0,0 189 | 66.0,1.0,2.0,160.0,246.0,0.0,0.0,120.0,1.0,0.0,2.0,3.0,6.0,2 190 | 54.0,1.0,2.0,192.0,283.0,0.0,2.0,195.0,0.0,0.0,1.0,1.0,7.0,1 191 | 69.0,1.0,3.0,140.0,254.0,0.0,2.0,146.0,0.0,2.0,2.0,3.0,7.0,2 192 | 50.0,1.0,3.0,129.0,196.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 193 | 51.0,1.0,4.0,140.0,298.0,0.0,0.0,122.0,1.0,4.2,2.0,3.0,7.0,3 194 | 43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,,7.0,1 195 | 62.0,0.0,4.0,138.0,294.0,1.0,0.0,106.0,0.0,1.9,2.0,3.0,3.0,2 196 | 68.0,0.0,3.0,120.0,211.0,0.0,2.0,115.0,0.0,1.5,2.0,0.0,3.0,0 197 | 67.0,1.0,4.0,100.0,299.0,0.0,2.0,125.0,1.0,0.9,2.0,2.0,3.0,3 198 | 69.0,1.0,1.0,160.0,234.0,1.0,2.0,131.0,0.0,0.1,2.0,1.0,3.0,0 199 | 45.0,0.0,4.0,138.0,236.0,0.0,2.0,152.0,1.0,0.2,2.0,0.0,3.0,0 200 | 50.0,0.0,2.0,120.0,244.0,0.0,0.0,162.0,0.0,1.1,1.0,0.0,3.0,0 201 | 59.0,1.0,1.0,160.0,273.0,0.0,2.0,125.0,0.0,0.0,1.0,0.0,3.0,1 202 | 50.0,0.0,4.0,110.0,254.0,0.0,2.0,159.0,0.0,0.0,1.0,0.0,3.0,0 203 | 64.0,0.0,4.0,180.0,325.0,0.0,0.0,154.0,1.0,0.0,1.0,0.0,3.0,0 204 | 57.0,1.0,3.0,150.0,126.0,1.0,0.0,173.0,0.0,0.2,1.0,1.0,7.0,0 205 | 
64.0,0.0,3.0,140.0,313.0,0.0,0.0,133.0,0.0,0.2,1.0,0.0,7.0,0 206 | 43.0,1.0,4.0,110.0,211.0,0.0,0.0,161.0,0.0,0.0,1.0,0.0,7.0,0 207 | 45.0,1.0,4.0,142.0,309.0,0.0,2.0,147.0,1.0,0.0,2.0,3.0,7.0,3 208 | 58.0,1.0,4.0,128.0,259.0,0.0,2.0,130.0,1.0,3.0,2.0,2.0,7.0,3 209 | 50.0,1.0,4.0,144.0,200.0,0.0,2.0,126.0,1.0,0.9,2.0,0.0,7.0,3 210 | 55.0,1.0,2.0,130.0,262.0,0.0,0.0,155.0,0.0,0.0,1.0,0.0,3.0,0 211 | 62.0,0.0,4.0,150.0,244.0,0.0,0.0,154.0,1.0,1.4,2.0,0.0,3.0,1 212 | 37.0,0.0,3.0,120.0,215.0,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0 213 | 38.0,1.0,1.0,120.0,231.0,0.0,0.0,182.0,1.0,3.8,2.0,0.0,7.0,4 214 | 41.0,1.0,3.0,130.0,214.0,0.0,2.0,168.0,0.0,2.0,2.0,0.0,3.0,0 215 | 66.0,0.0,4.0,178.0,228.0,1.0,0.0,165.0,1.0,1.0,2.0,2.0,7.0,3 216 | 52.0,1.0,4.0,112.0,230.0,0.0,0.0,160.0,0.0,0.0,1.0,1.0,3.0,1 217 | 56.0,1.0,1.0,120.0,193.0,0.0,2.0,162.0,0.0,1.9,2.0,0.0,7.0,0 218 | 46.0,0.0,2.0,105.0,204.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0,3.0,0 219 | 46.0,0.0,4.0,138.0,243.0,0.0,2.0,152.0,1.0,0.0,2.0,0.0,3.0,0 220 | 64.0,0.0,4.0,130.0,303.0,0.0,0.0,122.0,0.0,2.0,2.0,2.0,3.0,0 221 | 59.0,1.0,4.0,138.0,271.0,0.0,2.0,182.0,0.0,0.0,1.0,0.0,3.0,0 222 | 41.0,0.0,3.0,112.0,268.0,0.0,2.0,172.0,1.0,0.0,1.0,0.0,3.0,0 223 | 54.0,0.0,3.0,108.0,267.0,0.0,2.0,167.0,0.0,0.0,1.0,0.0,3.0,0 224 | 39.0,0.0,3.0,94.0,199.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 225 | 53.0,1.0,4.0,123.0,282.0,0.0,0.0,95.0,1.0,2.0,2.0,2.0,7.0,3 226 | 63.0,0.0,4.0,108.0,269.0,0.0,0.0,169.0,1.0,1.8,2.0,2.0,3.0,1 227 | 34.0,0.0,2.0,118.0,210.0,0.0,0.0,192.0,0.0,0.7,1.0,0.0,3.0,0 228 | 47.0,1.0,4.0,112.0,204.0,0.0,0.0,143.0,0.0,0.1,1.0,0.0,3.0,0 229 | 67.0,0.0,3.0,152.0,277.0,0.0,0.0,172.0,0.0,0.0,1.0,1.0,3.0,0 230 | 54.0,1.0,4.0,110.0,206.0,0.0,2.0,108.0,1.0,0.0,2.0,1.0,3.0,3 231 | 66.0,1.0,4.0,112.0,212.0,0.0,2.0,132.0,1.0,0.1,1.0,1.0,3.0,2 232 | 52.0,0.0,3.0,136.0,196.0,0.0,2.0,169.0,0.0,0.1,2.0,0.0,3.0,0 233 | 55.0,0.0,4.0,180.0,327.0,0.0,1.0,117.0,1.0,3.4,2.0,0.0,3.0,2 234 | 49.0,1.0,3.0,118.0,149.0,0.0,2.0,126.0,0.0,0.8,1.0,3.0,3.0,1 235 | 74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,0 236 | 54.0,0.0,3.0,160.0,201.0,0.0,0.0,163.0,0.0,0.0,1.0,1.0,3.0,0 237 | 54.0,1.0,4.0,122.0,286.0,0.0,2.0,116.0,1.0,3.2,2.0,2.0,3.0,3 238 | 56.0,1.0,4.0,130.0,283.0,1.0,2.0,103.0,1.0,1.6,3.0,0.0,7.0,2 239 | 46.0,1.0,4.0,120.0,249.0,0.0,2.0,144.0,0.0,0.8,1.0,0.0,7.0,1 240 | 49.0,0.0,2.0,134.0,271.0,0.0,0.0,162.0,0.0,0.0,2.0,0.0,3.0,0 241 | 42.0,1.0,2.0,120.0,295.0,0.0,0.0,162.0,0.0,0.0,1.0,0.0,3.0,0 242 | 41.0,1.0,2.0,110.0,235.0,0.0,0.0,153.0,0.0,0.0,1.0,0.0,3.0,0 243 | 41.0,0.0,2.0,126.0,306.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 244 | 49.0,0.0,4.0,130.0,269.0,0.0,0.0,163.0,0.0,0.0,1.0,0.0,3.0,0 245 | 61.0,1.0,1.0,134.0,234.0,0.0,0.0,145.0,0.0,2.6,2.0,2.0,3.0,2 246 | 60.0,0.0,3.0,120.0,178.0,1.0,0.0,96.0,0.0,0.0,1.0,0.0,3.0,0 247 | 67.0,1.0,4.0,120.0,237.0,0.0,0.0,71.0,0.0,1.0,2.0,0.0,3.0,2 248 | 58.0,1.0,4.0,100.0,234.0,0.0,0.0,156.0,0.0,0.1,1.0,1.0,7.0,2 249 | 47.0,1.0,4.0,110.0,275.0,0.0,2.0,118.0,1.0,1.0,2.0,1.0,3.0,1 250 | 52.0,1.0,4.0,125.0,212.0,0.0,0.0,168.0,0.0,1.0,1.0,2.0,7.0,3 251 | 62.0,1.0,2.0,128.0,208.0,1.0,2.0,140.0,0.0,0.0,1.0,0.0,3.0,0 252 | 57.0,1.0,4.0,110.0,201.0,0.0,0.0,126.0,1.0,1.5,2.0,0.0,6.0,0 253 | 58.0,1.0,4.0,146.0,218.0,0.0,0.0,105.0,0.0,2.0,2.0,1.0,7.0,1 254 | 64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,0 255 | 51.0,0.0,3.0,120.0,295.0,0.0,2.0,157.0,0.0,0.6,1.0,0.0,3.0,0 256 | 43.0,1.0,4.0,115.0,303.0,0.0,0.0,181.0,0.0,1.2,2.0,0.0,3.0,0 257 | 42.0,0.0,3.0,120.0,209.0,0.0,0.0,173.0,0.0,0.0,2.0,0.0,3.0,0 258 | 
67.0,0.0,4.0,106.0,223.0,0.0,0.0,142.0,0.0,0.3,1.0,2.0,3.0,0 259 | 76.0,0.0,3.0,140.0,197.0,0.0,1.0,116.0,0.0,1.1,2.0,0.0,3.0,0 260 | 70.0,1.0,2.0,156.0,245.0,0.0,2.0,143.0,0.0,0.0,1.0,0.0,3.0,0 261 | 57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,1 262 | 44.0,0.0,3.0,118.0,242.0,0.0,0.0,149.0,0.0,0.3,2.0,1.0,3.0,0 263 | 58.0,0.0,2.0,136.0,319.0,1.0,2.0,152.0,0.0,0.0,1.0,2.0,3.0,3 264 | 60.0,0.0,1.0,150.0,240.0,0.0,0.0,171.0,0.0,0.9,1.0,0.0,3.0,0 265 | 44.0,1.0,3.0,120.0,226.0,0.0,0.0,169.0,0.0,0.0,1.0,0.0,3.0,0 266 | 61.0,1.0,4.0,138.0,166.0,0.0,2.0,125.0,1.0,3.6,2.0,1.0,3.0,4 267 | 42.0,1.0,4.0,136.0,315.0,0.0,0.0,125.0,1.0,1.8,2.0,0.0,6.0,2 268 | 52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,,2 269 | 59.0,1.0,3.0,126.0,218.0,1.0,0.0,134.0,0.0,2.2,2.0,1.0,6.0,2 270 | 40.0,1.0,4.0,152.0,223.0,0.0,0.0,181.0,0.0,0.0,1.0,0.0,7.0,1 271 | 42.0,1.0,3.0,130.0,180.0,0.0,0.0,150.0,0.0,0.0,1.0,0.0,3.0,0 272 | 61.0,1.0,4.0,140.0,207.0,0.0,2.0,138.0,1.0,1.9,1.0,1.0,7.0,1 273 | 66.0,1.0,4.0,160.0,228.0,0.0,2.0,138.0,0.0,2.3,1.0,0.0,6.0,0 274 | 46.0,1.0,4.0,140.0,311.0,0.0,0.0,120.0,1.0,1.8,2.0,2.0,7.0,2 275 | 71.0,0.0,4.0,112.0,149.0,0.0,0.0,125.0,0.0,1.6,2.0,0.0,3.0,0 276 | 59.0,1.0,1.0,134.0,204.0,0.0,0.0,162.0,0.0,0.8,1.0,2.0,3.0,1 277 | 64.0,1.0,1.0,170.0,227.0,0.0,2.0,155.0,0.0,0.6,2.0,0.0,7.0,0 278 | 66.0,0.0,3.0,146.0,278.0,0.0,2.0,152.0,0.0,0.0,2.0,1.0,3.0,0 279 | 39.0,0.0,3.0,138.0,220.0,0.0,0.0,152.0,0.0,0.0,2.0,0.0,3.0,0 280 | 57.0,1.0,2.0,154.0,232.0,0.0,2.0,164.0,0.0,0.0,1.0,1.0,3.0,1 281 | 58.0,0.0,4.0,130.0,197.0,0.0,0.0,131.0,0.0,0.6,2.0,0.0,3.0,0 282 | 57.0,1.0,4.0,110.0,335.0,0.0,0.0,143.0,1.0,3.0,2.0,1.0,7.0,2 283 | 47.0,1.0,3.0,130.0,253.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0 284 | 55.0,0.0,4.0,128.0,205.0,0.0,1.0,130.0,1.0,2.0,2.0,1.0,7.0,3 285 | 35.0,1.0,2.0,122.0,192.0,0.0,0.0,174.0,0.0,0.0,1.0,0.0,3.0,0 286 | 61.0,1.0,4.0,148.0,203.0,0.0,0.0,161.0,0.0,0.0,1.0,1.0,7.0,2 287 | 58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,4 288 | 58.0,0.0,4.0,170.0,225.0,1.0,2.0,146.0,1.0,2.8,2.0,2.0,6.0,2 289 | 58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,,7.0,0 290 | 56.0,1.0,2.0,130.0,221.0,0.0,2.0,163.0,0.0,0.0,1.0,0.0,7.0,0 291 | 56.0,1.0,2.0,120.0,240.0,0.0,0.0,169.0,0.0,0.0,3.0,0.0,3.0,0 292 | 67.0,1.0,3.0,152.0,212.0,0.0,2.0,150.0,0.0,0.8,2.0,0.0,7.0,1 293 | 55.0,0.0,2.0,132.0,342.0,0.0,0.0,166.0,0.0,1.2,1.0,0.0,3.0,0 294 | 44.0,1.0,4.0,120.0,169.0,0.0,0.0,144.0,1.0,2.8,3.0,0.0,6.0,2 295 | 63.0,1.0,4.0,140.0,187.0,0.0,2.0,144.0,1.0,4.0,1.0,2.0,7.0,2 296 | 63.0,0.0,4.0,124.0,197.0,0.0,0.0,136.0,1.0,0.0,2.0,0.0,3.0,1 297 | 41.0,1.0,2.0,120.0,157.0,0.0,0.0,182.0,0.0,0.0,1.0,0.0,3.0,0 298 | 59.0,1.0,4.0,164.0,176.0,1.0,2.0,90.0,0.0,1.0,2.0,2.0,6.0,3 299 | 57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1 300 | 45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1 301 | 68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2 302 | 57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3 303 | 57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1 304 | 38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0 305 | -------------------------------------------------------------------------------- /synthesis_lib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # Author: Mandis Beigi 5 | # Copyright (c) 2022 Medidata Solutions, Inc. 
6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 24 | 25 | 26 | import numpy as np 27 | import pandas as pd 28 | import gower 29 | from sklearn.neighbors import NearestNeighbors 30 | from scipy.stats import chi2_contingency 31 | from scipy import stats 32 | import logging 33 | import math 34 | 35 | import dimanalysis_lib 36 | import preprocessor_lib 37 | import utilities_lib 38 | 39 | 40 | def is_in_group(col_name, col_groups): 41 | for col_group in col_groups: 42 | if col_name in col_group: 43 | return(True) 44 | return(False) 45 | 46 | 47 | #only add noise to columns not in column pairings 48 | #adds gaussian distribution noise to ints and floats. It maintains the min and max 49 | def add_multiplicative_noise_to_df(df, column_pairing_groups=[]): 50 | logging.info("Adding noise to numeric columns.....................") 51 | 52 | #int_numerics = ['int16', 'int32', 'int64'] 53 | #float_numerics = ['float16', 'float32', 'float64'] 54 | int_numerics = ['int'] 55 | float_numerics = ['float'] 56 | 57 | int_numeric_columns = df.select_dtypes(include=int_numerics).columns.tolist() 58 | float_numeric_columns = df.select_dtypes(include=float_numerics).columns.tolist() 59 | 60 | #among the float columns if the values do not have decimal parts, move them into int columns 61 | for float_numeric_column in float_numeric_columns: 62 | num_vec = df[float_numeric_column].tolist() 63 | if not preprocessor_lib.contains_floats(num_vec): 64 | int_numeric_columns.append(float_numeric_column) 65 | 66 | #remove all items in the updated int columns from the float columns 67 | for int_numeric_column in int_numeric_columns: 68 | if int_numeric_column in float_numeric_columns: 69 | float_numeric_columns.remove(int_numeric_column) 70 | 71 | numeric_columns = df.select_dtypes(include=int_numerics+float_numerics).columns.tolist() 72 | for numeric_column in numeric_columns: 73 | if is_in_group(numeric_column, column_pairing_groups): 74 | continue 75 | min_val = df[numeric_column].min() 76 | max_val = df[numeric_column].max() 77 | noise = np.random.normal(1, 0.05, [len(df),]) 78 | if numeric_column in int_numeric_columns: 79 | df[numeric_column] = (df[numeric_column] * noise).round().astype('Int64') 80 | else: 81 | df[numeric_column] = df[numeric_column] * noise 82 | df.loc[df[numeric_column] > max_val, numeric_column] = max_val 83 | df.loc[df[numeric_column] < min_val, numeric_column] = min_val 84 | 85 | return 86 | 87 | 88 | def 
find_max_distance_for_outliers(embedded_df, nn_nbrs, percentile=0.95): 89 | distances = [] 90 | for i in range(0, len(embedded_df)): 91 | neighs = nn_nbrs.kneighbors(embedded_df.iloc[[i]], return_distance=True) 92 | closest_dist = neighs[0][0][1] 93 | distances.append(closest_dist) 94 | dist = np.quantile(distances, percentile) 95 | #logging.info("{}th percentile of distances: {}".format(percentile, dist)) 96 | return dist 97 | 98 | 99 | # Compute correlations between continuous variables (non-boolean) 100 | def get_column_groups_for_continuous_variables(df, nonboolean_columns, threshold): 101 | groups = [] 102 | corr = np.abs(df[nonboolean_columns].corr(method='pearson')) 103 | #corr.to_csv("continuous_corr.csv") 104 | for i in range(len(nonboolean_columns)): 105 | for j in range(i+1, len(nonboolean_columns)): 106 | col1 = nonboolean_columns[i] 107 | col2 = nonboolean_columns[j] 108 | if abs(corr.loc[col1, col2]) > threshold: 109 | if [col1, col2] not in groups and [col2, col1] not in groups: 110 | groups.append([col1, col2]) 111 | logging.info("Highly correlated continuous variables: {}".format(groups)) 112 | return(groups) 113 | 114 | 115 | # Compute correlations between continuous variable and categorical variables 116 | def get_column_groups_between_continuous_and_categorical_variables(df, nonboolean_columns, boolean_columns, threshold): 117 | groups = [] 118 | for i in range(len(nonboolean_columns)): 119 | for j in range(len(boolean_columns)): 120 | col1 = nonboolean_columns[i] 121 | col2 = boolean_columns[j] 122 | pbsr_r, pbsr_p = stats.pointbiserialr(df[col1],df[col2]) 123 | #logging.info("continuous variable: {} categorical variable: {} pbsr_r:{}".format(col1, col2, pbsr_r)) 124 | if abs(pbsr_r) > threshold: 125 | logging.info("Highly correlated continuous variable: {} and categorical variable: {}".format(col1, col2)) 126 | if [col1, col2] not in groups and [col2, col1] not in groups: 127 | groups.append([col1, col2]) 128 | 129 | logging.info("Highly correlated continuous and categorical variables: {}".format(groups)) 130 | return(groups) 131 | 132 | 133 | #group same categories together 134 | def get_column_groups_for_same_categories(boolean_columns): 135 | groups=[] 136 | 137 | for col in boolean_columns: 138 | if "|" not in col: 139 | continue 140 | 141 | cat = col.split('|')[0] 142 | found = False 143 | while(True): 144 | for group in groups: 145 | if col in group: 146 | found = True 147 | break 148 | cat_group = [i.split('|')[0] for i in group] 149 | if cat in cat_group: 150 | group.append(col) 151 | found = True 152 | if found: 153 | break 154 | else: 155 | groups.append([col]) 156 | break 157 | #logging.info("Groups of same name categories") 158 | #logging.info(groups) 159 | return(groups) 160 | 161 | 162 | # bias corrected version of Cramer’s V for association between categorical variables 163 | #(regardless of number of factor levels) 164 | def cramer_v_bias_correct(contingency_tbl): 165 | chi2,_,_,_ = chi2_contingency(contingency_tbl) 166 | n = contingency_tbl.values.sum() 167 | phi2 = chi2/n 168 | r,k = contingency_tbl.shape 169 | phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1)) 170 | rcorr = r - ((r-1)**2)/(n-1) 171 | kcorr = k - ((k-1)**2)/(n-1) 172 | if phi2corr == min((kcorr-1), (rcorr-1)) == 0: 173 | test_stat = 0 174 | else: 175 | test_stat = (phi2corr / min((kcorr-1), (rcorr-1))) ** (1/2) 176 | dof = min((kcorr-1), (rcorr-1)) 177 | return(test_stat, dof) 178 | 179 | 180 | def get_column_groups_for_categorical_variables(df, boolean_columns, threshold): 181 | groups = 
[] 182 | for i in range(len(boolean_columns)): 183 | for j in range(i+1, len(boolean_columns)): 184 | col1 = boolean_columns[i] 185 | col2 = boolean_columns[j] 186 | category_name1 = col1.split('|') 187 | category_name2 = col2.split('|') 188 | #if the category names are the same skip them because this has been taken care of 189 | if col1.startswith(category_name2[0]+'|') and col2.startswith(category_name1[0]+'|'): 190 | continue 191 | 192 | contingency = pd.crosstab(df[col1], df[col2]) 193 | 194 | cram_stat, dof = cramer_v_bias_correct(contingency) 195 | p = 1 - stats.chi2.cdf(cram_stat, dof) 196 | 197 | #c, p, dof, expected = chi2_contingency(contingency) 198 | #logging.info("p-value of categorical variables: {} and {} p-value: {}".format(col1, col2, p)) 199 | if p < 0.05: 200 | if [col1, col2] not in groups and [col2, col1] not in groups: 201 | groups.append([col1, col2]) 202 | 203 | logging.info("Highly correlated categorical variables: {}".format(groups)) 204 | return(groups) 205 | 206 | 207 | # Compute correlations between categorical variables (boolean) 208 | def get_column_groups_for_categorical_variables_v2(df, boolean_columns, threshold): 209 | groups = [] 210 | for i in range(len(boolean_columns)): 211 | for j in range(i+1, len(boolean_columns)): 212 | col1 = boolean_columns[i] 213 | col2 = boolean_columns[j] 214 | category_name1 = col1.split('|') 215 | category_name2 = col2.split('|') 216 | #if the category names are the same skip them because this has been taken care of 217 | if col1.startswith(category_name2[0]+'|') and col2.startswith(category_name1[0]+'|'): 218 | continue 219 | #perform chi-square test 220 | contingency= pd.crosstab(df[col1], df[col2]) 221 | c, p, dof, expected = chi2_contingency(contingency) 222 | #logging.info("p-value of categorical variables: {} and {} p-value: {}".format(col1, col2, p)) 223 | if p < 0.05: 224 | logging.info("Highly correlated categorical variables: {} and {} p-value: {}".format(col1, col2, p)) 225 | if [col1, col2] not in groups and [col2, col1] not in groups: 226 | groups.append([col1, col2]) 227 | return(groups) 228 | 229 | 230 | #threshold is the correlation threshold for the continuous variables 231 | def generate_corr_cols_groups(df, threshold): 232 | logging.info("Generating correlations between columns...") 233 | boolean_columns, nonboolean_columns = preprocessor_lib.get_boolean_and_nonboolean_columns(df) 234 | 235 | # Compute correlations between categorical variables (i.e. 
the columns with boolean values) 236 | logging.info("Computing correlations between categorical variables (boolean)") 237 | cat_groups = get_column_groups_for_categorical_variables(df, boolean_columns, threshold) 238 | 239 | # Compute correlations between continuous variables (non-boolean) 240 | logging.info("Computing correlations between continuous variables (non-boolean)") 241 | cont_groups = get_column_groups_for_continuous_variables(df, nonboolean_columns, threshold) 242 | 243 | # Compute correlations between continuous variable and categorical variables 244 | logging.info("Computing correlations between continuous variable and categorical variables") 245 | cont_cat_groups = get_column_groups_between_continuous_and_categorical_variables(df, nonboolean_columns, 246 | boolean_columns, threshold) 247 | 248 | #all_groups = cat_name_groups + cat_groups + cont_groups + cont_cat_groups 249 | all_groups = cat_groups + cont_groups + cont_cat_groups 250 | return(all_groups) 251 | 252 | 253 | #takes input in the form of [["a","c","d"],["b","e","f"]] and generates pairwise column indices 254 | #such as {3:2, 2:1} 255 | def generate_col_pairing_indices(columns, column_pairing_groups): 256 | logging.info("Generating column pairing indices...") 257 | 258 | col_pairing_indices = {} 259 | for index in range(len(column_pairing_groups)): 260 | for col1 in column_pairing_groups[index]: 261 | bool1_list = (columns.str.startswith(col1+'|') | columns.str.match(col1)) 262 | idx1_list = [i for i, val in enumerate(bool1_list) if val] 263 | for col1_idx in idx1_list: 264 | for col2 in column_pairing_groups[index]: 265 | bool2_list = (columns.str.startswith(col2+'|') | columns.str.match(col2)) 266 | idx2_list = [i for i, val in enumerate(bool2_list) if val] 267 | for col2_idx in idx2_list: 268 | if col2_idx not in col_pairing_indices: 269 | if col2_idx>col1_idx: 270 | col_pairing_indices[col2_idx] = col1_idx 271 | else: 272 | if col1_idx < col_pairing_indices[col2_idx]: 273 | col_pairing_indices[col2_idx] = col1_idx 274 | 275 | #logging.info("column indices:") 276 | #for i in range(len(columns)): 277 | # logging.info("{} {}".format(i, columns[i])) 278 | #logging.info('column pairings:') 279 | #logging.info(col_pairing_indices) 280 | return(col_pairing_indices) 281 | 282 | 283 | 284 | def synthesize(df, method='tsne', metric='euclidean', min_cluster_size=5, max_cluster_size=5, batch_size=1, 285 | corr_thresh=0.70, include_outliers=False, 286 | holdout_cols=[], derived_cols_dict={}, col_pairings=[], imputing_method='simple', index_col='', 287 | add_noise=False): 288 | 289 | ##### inputs 290 | # df: dataframe where all categorical columns are already converted to numerical values 291 | # method: method used dimension reduction (options are: 'tsne', 'pca') 292 | # metric: metric used for 'tsne' only dimension reduction (options are: 'euclidean', 'gower') 293 | # min_cluster_size: minimum number of parents to use in synthetic point generation 294 | # max_cluster_size: maximum number of parents to use in synthetic point generation 295 | # batch_size=1: ratio of the number of synthetic records to the real records. Needs to be an integer 296 | # corr_thresh=0.70: correlation threshold 297 | # include_outliers=False whether to use the outliers to generate simulated data 298 | # holdout_cols: the vector of fixed columns names to omit in dimensionality reduction and to use together 299 | # derived_cols_dict: the derived keys do not get used when embedding the data (e.g. 
{'bmi':['height','weight']}) 300 | # col_pairings: a list of groupings to co-segregate (e.g. [['prior_chemo_reg','prior_chemo_time']] 301 | # imputing_method: the imputing method (options are: 'simple', 'iterative') 302 | # index_col: index can be a column. It is used to map subjects across different tables 303 | 304 | df.dropna(axis=1, how='all', inplace=True) 305 | 306 | my_df = df.copy() 307 | n_cols = my_df.shape[1] 308 | 309 | if imputing_method == 'iterative': 310 | my_df = preprocessor_lib.iterative_impute(my_df) 311 | else: 312 | my_df = preprocessor_lib.impute_one_hot_encoded_df(my_df) 313 | 314 | if index_col == '': 315 | df_to_embed = utilities_lib.drop_columns(my_df, holdout_cols+list(derived_cols_dict.keys())) 316 | else: 317 | df_to_embed = utilities_lib.drop_columns(my_df, [index_col]+holdout_cols+list(derived_cols_dict.keys())) 318 | 319 | # Compute correlations 320 | #corr_cols_groups = generate_corr_cols_groups(df_to_embed, corr_thresh) 321 | derived_groups = utilities_lib.convert_dict_to_groups(derived_cols_dict) 322 | 323 | #group same categories together 324 | logging.info("Getting column groups for same categories") 325 | #boolean_columns, nonboolean_columns = preprocessor_lib.get_boolean_and_nonboolean_columns(df) 326 | boolean_columns, nonboolean_columns = preprocessor_lib.get_boolean_and_nonboolean_columns(df_to_embed) 327 | cat_name_groups = get_column_groups_for_same_categories(boolean_columns) 328 | 329 | #column_pairing_groups = col_pairings + corr_cols_groups + derived_groups 330 | column_pairing_groups = col_pairings + cat_name_groups + derived_groups 331 | 332 | logging.info("column_pairing_groups: {}".format(column_pairing_groups)) 333 | #col_pairing_indices = generate_col_pairing_indices(my_df.columns, column_pairing_groups) 334 | col_pairing_indices = generate_col_pairing_indices(df_to_embed.columns, column_pairing_groups) 335 | 336 | # Perform tSNE with either gower metric or euclidean metric 337 | logging.info("Embedding the data......................") 338 | if method == 'tsne': 339 | embedded_df = dimanalysis_lib.reduce_tsne(df_to_embed, n_components=2, metric=metric) 340 | elif method == 'pca': 341 | embedded_df = dimanalysis_lib.reduce_pca(df_to_embed, n_components=2) 342 | #elif method == 'umap': 343 | # embedded_df = dimanalysis_lib.reduce_umap(df_to_embed, n_components=2) 344 | elif method == 'ica': 345 | embedded_df = dimanalysis_lib.reduce_ica(df_to_embed, n_components=2) 346 | logging.info("Finished embedding the data.............") 347 | 348 | data_size = len(embedded_df) 349 | max_num_n = max_cluster_size 350 | if max_cluster_size >= data_size: 351 | max_num_n = data_size 352 | min_num_n = min(min_cluster_size, data_size) 353 | nn_nbrs = NearestNeighbors(n_neighbors=max_num_n).fit(embedded_df) 354 | 355 | max_dist = find_max_distance_for_outliers(embedded_df, nn_nbrs) 356 | 357 | # remove the outliers in the source data 358 | outlier_indices = [] 359 | if not include_outliers: 360 | for i in range(0, len(embedded_df)): 361 | this_index = embedded_df.index[i] 362 | neighs = nn_nbrs.kneighbors(embedded_df.iloc[[i]], return_distance=True) 363 | distances = neighs[0][0] 364 | if distances[1] > max_dist: #if this is an outlier, skip, do not replicate it 365 | outlier_indices.append(this_index) 366 | logging.info("Number of outliers found in original source data: {}".format(len(outlier_indices))) 367 | #embedded_df_no_outliers = embedded_df.drop(outlier_indices) 368 | #df_no_outliers = df.drop(outlier_indices) 369 | 370 | sampled = [] 371 | 
#num_outliers = 0 372 | step_size = 1 373 | if batch_size < 1: 374 | step_size = math.floor(1/batch_size) 375 | batch_size = 1 376 | 377 | for i in range(0, len(embedded_df)): 378 | if i in outlier_indices: 379 | continue 380 | if i%step_size != 0: 381 | continue 382 | #this_index = embedded_df.index[i] 383 | neighs = nn_nbrs.kneighbors(embedded_df.iloc[[i]], return_distance=True) 384 | breeding = neighs[1][0] #[1][0] gives the array of indices of closest neighbors 385 | #distances = neighs[0][0] 386 | #if distances[1] > max_dist: #if this is an outlier, skip, do not replicate it 387 | # outlier_indices.append(i) 388 | # num_outliers += 1 389 | # if not include_outliers: 390 | # continue 391 | 392 | #TODO: add support for batch_size that are > 1 and not int 393 | for _ in range(batch_size): 394 | rows = [] 395 | for col in range(0, n_cols): 396 | if min_num_n == max_num_n: 397 | cluster_size = min_num_n 398 | else: 399 | cluster_size = np.random.randint(min_num_n, max_num_n) 400 | rand_num = np.random.randint(0, cluster_size) 401 | #if not include_outliers and distances[rand_num] > max_dist: 402 | # rand_num = 0 #if the neighbor index selected is an outlier, pick self 403 | sample_row_index = breeding[rand_num] 404 | rows.append(rows[col_pairing_indices[col]] if col in col_pairing_indices else sample_row_index) 405 | this_row = [int(embedded_df.index[i])] 406 | for c in range(0, n_cols): 407 | this_row.append(df.iloc[rows[c]][c]) 408 | sampled.append(this_row) 409 | 410 | #logging.info("Number of outliers found: {}".format(num_outliers)) 411 | 412 | syn_df = pd.DataFrame(sampled, columns=[index_col]+df.columns.to_list()) 413 | syn_df = syn_df.set_index(index_col) 414 | 415 | #add noise to the numeric columns 416 | if add_noise: 417 | add_multiplicative_noise_to_df(syn_df, column_pairing_groups=column_pairing_groups) 418 | 419 | #drop any duplicates from the synthesized set that's also in the original set to secure privacy 420 | len_before = syn_df.shape[0] 421 | #duplicated_df = df.merge(syn_df, how = 'inner', indicator=False) 422 | syn_df = pd.concat([syn_df, df, df]).drop_duplicates(keep=False) 423 | len_after = syn_df.shape[0] 424 | logging.info("Found {} duplicates and removed them from the synthetic set.".format(len_before-len_after)) 425 | 426 | return(syn_df) 427 | 428 | --------------------------------------------------------------------------------