├── CDEC
│   ├── archs
│   │   ├── cancer.json
│   │   └── genome.json
│   ├── customlayers.py
│   ├── genome
│   │   └── last_numeric.csv
│   ├── keras_unpooling.py
│   ├── main.py
│   ├── misc.py
│   ├── network.py
│   ├── plots
│   │   └── genome
│   │       ├── autoencoder.png
│   │       ├── clustered_kld.png
│   │       ├── clustered_km.png
│   │       └── raw.png
│   └── self.trainAutoencoder.png
├── DEC_GenotypeClustering_Keras
│   ├── DEC_Genotype_Clustering.py
│   ├── LSTM_EthnicityPrediction.py
│   └── genome.csv
├── PopulationClustering_v2
│   ├── output_1.txt
│   ├── pom.xml
│   ├── results
│   │   ├── train.csv
│   │   ├── DEC_Genotype_Clustering.py
│   │   ├── LSTM_EthnicityPrediction.py
│   │   ├── genome.csv
│   │   └── part-00000-2c4830b2-4c39-48fc-909d-4868a1164190-c000.csv
│   ├── src
│   │   └── main
│   │       └── scala
│   │           └── org
│   │               └── fit
│   │                   └── genomics
│   │                       ├── PopGenomicsClassificationSpark.scala
│   │                       ├── PopStratClassification.scala
│   │                       ├── PopStratClustering.scala
│   │                       └── featureExtractor.scala
│   └── target
│       ├── classes
│       │   └── META-INF
│       │       ├── MANIFEST.MF
│       │       └── maven
│       │           └── com.deri.sels
│       │               └── PopulationClustering_v2
│       │                   ├── pom.properties
│       │                   └── pom.xml
│       └── maven-archiver
│           └── pom.properties
└── README.md
/CDEC/archs/cancer.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "c-3-32_p_c-3-64_p_fc-32",
4 | "batch_size": 50,
5 | "layers": [
6 | {
7 | "type": "Input",
8 | "output_shape": [
9 | 1,
10 | 127,
11 | 127
12 | ]
13 | },
14 | {
15 | "type": "Conv2D",
16 | "num_filters": 32,
17 | "filter_size": [
18 | 3,
19 | 3
20 | ],
21 | "non_linearity": "rectify",
22 | "conv_mode": "same"
23 | },
24 | {
25 | "type": "MaxPool2D",
26 | "filter_size": [
27 | 2,
28 | 2
29 | ]
30 | },
31 | {
32 | "type": "Conv2D",
33 | "num_filters": 64,
34 | "filter_size": [
35 | 3,
36 | 3
37 | ],
38 | "non_linearity": "rectify",
39 | "conv_mode": "same"
40 | },
41 | {
42 | "type": "MaxPool2D",
43 | "filter_size": [
44 | 2,
45 | 2
46 | ]
47 | },
48 | {
49 | "type": "Dense",
50 | "num_units": 3136,
51 | "non_linearity": "rectify"
52 | },
53 | {
54 | "type": "Dense",
55 | "num_units": 32,
56 | "non_linearity": "rectify"
57 | }
58 | ]
59 | },
60 | {
61 | "name": "c-5-6_p_c-5-16_p_c-4-120",
62 | "use_batch_norm": 1,
63 | "batch_size": 100,
64 | "layers": [
65 | {
66 | "type": "Input",
67 | "output_shape": [
68 | 1,
69 | 127,
70 | 127
71 | ]
72 | },
73 | {
74 | "type": "Conv2D",
75 | "num_filters": 50,
76 | "filter_size": [
77 | 4,
78 | 4
79 | ],
80 | "non_linearity": "rectify"
81 | },
82 | {
83 | "type": "MaxPool2D*",
84 | "filter_size": [
85 | 2,
86 | 2
87 | ]
88 | },
89 | {
90 | "type": "Conv2D",
91 | "num_filters": 50,
92 | "filter_size": [
93 | 3,
94 | 3
95 | ],
96 | "non_linearity": "rectify"
97 | },
98 | {
99 | "type": "MaxPool2D*",
100 | "filter_size": [
101 | 2,
102 | 2
103 | ]
104 | },
105 | {
106 | "type": "Conv2D",
107 | "num_filters": 120,
108 | "filter_size": [
109 | 2,
110 | 2
111 | ],
112 | "non_linearity": "linear"
113 | }
114 | ]
115 | }
116 |
117 | ]
118 |
--------------------------------------------------------------------------------
/CDEC/archs/genome.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "c-3-32_p_c-3-64_p_fc-32",
4 | "batch_size": 32,
5 | "layers": [
6 | {
7 | "type": "Input",
8 | "output_shape": [
9 | 1,
10 | 67,
11 | 67
12 | ]
13 | },
14 | {
15 | "type": "Conv2D",
16 | "num_filters": 32,
17 | "filter_size": [
18 | 3,
19 | 3
20 | ],
21 | "non_linearity": "rectify",
22 | "conv_mode": "same"
23 | },
24 | {
25 | "type": "MaxPool2D",
26 | "filter_size": [
27 | 2,
28 | 2
29 | ]
30 | },
31 | {
32 | "type": "Conv2D",
33 | "num_filters": 64,
34 | "filter_size": [
35 | 3,
36 | 3
37 | ],
38 | "non_linearity": "rectify",
39 | "conv_mode": "same"
40 | },
41 | {
42 | "type": "MaxPool2D",
43 | "filter_size": [
44 | 2,
45 | 2
46 | ]
47 | },
48 | {
49 | "type": "Dense",
50 | "num_units": 3136,
51 | "non_linearity": "rectify"
52 | },
53 | {
54 | "type": "Dense",
55 | "num_units": 32,
56 | "non_linearity": "rectify"
57 | }
58 | ]
59 | },
60 | {
61 | "name": "c-5-6_p_c-5-16_p_c-4-120",
62 | "use_batch_norm": 1,
63 | "batch_size": 32,
64 | "layers": [
65 | {
66 | "type": "Input",
67 | "output_shape": [
68 | 1,
69 | 67,
70 | 67
71 | ]
72 | },
73 | {
74 | "type": "Conv2D",
75 | "num_filters": 50,
76 | "filter_size": [
77 | 5,
78 | 5
79 | ],
80 | "non_linearity": "rectify"
81 | },
82 | {
83 | "type": "MaxPool2D*",
84 | "filter_size": [
85 | 2,
86 | 2
87 | ]
88 | },
89 | {
90 | "type": "Conv2D",
91 | "num_filters": 50,
92 | "filter_size": [
93 | 5,
94 | 5
95 | ],
96 | "non_linearity": "rectify"
97 | },
98 | {
99 | "type": "MaxPool2D*",
100 | "filter_size": [
101 | 2,
102 | 2
103 | ]
104 | },
105 | {
106 | "type": "Conv2D",
107 | "num_filters": 32,
108 | "filter_size": [
109 | 2,
110 | 2
111 | ],
112 | "non_linearity": "linear"
113 | }
114 | ]
115 | }
116 | ]
117 |
--------------------------------------------------------------------------------
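Note: a minimal sketch (mine, not part of the repo) of how an architecture file like the two above is consumed. CDEC/main.py reads the JSON with json.load and picks one entry by the -a/--architecture index:

```python
# Load an architecture spec and summarize its layers, mirroring the
# json.load / archs[arch_index] pattern used in CDEC/main.py.
import json

with open("archs/genome.json") as f:
    archs = json.load(f)

arch = archs[0]  # corresponds to -a 0 on the CLI
print(arch["name"], "- batch_size:", arch["batch_size"])
for layer in arch["layers"]:
    params = {k: v for k, v in layer.items() if k != "type"}
    print("  %-10s %s" % (layer["type"], params))
```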
/CDEC/customlayers.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 25, 2017
3 | '''
4 |
5 | from keras import backend as K
6 | from keras import layers
7 |
8 |
9 | # Custom layers used by the CDEC networks
10 | class Unpool2DLayer(layers.Layer):
11 | """
12 |     This layer performs unpooling over the two spatial dimensions
13 |     of a 4D tensor (axes 1 and 2 in channels-last ordering).
14 | Layer borrowed from: https://swarbrickjones.wordpress.com/2015/04/29/convolutional-autoencoders-in-pythontheanolasagne/
15 | """
16 |
17 |     def __init__(self, incoming, ds, **kwargs):  # `incoming` is a Lasagne leftover; only `ds` is used
18 |         self.ds = ds  # upscale factor as (row_factor, col_factor)
19 |         super(Unpool2DLayer, self).__init__(**kwargs)
20 |
21 | def compute_output_shape(self, input_shape):
22 | output_shape = list(input_shape)
23 | output_shape[1] = input_shape[1] * self.ds[0]
24 | output_shape[2] = input_shape[2] * self.ds[1]
25 | return tuple(output_shape)
26 |
27 |     def call(self, incoming, **kwargs):
28 |         '''
29 |         Repeats each input element along the two spatial axes to produce the upscaled image
30 |         '''
31 | repaxis2 = K.repeat_elements(incoming,self.ds[0], axis=1)
32 | Unpool_layer = K.repeat_elements(repaxis2,self.ds[1], axis=2)
33 | return Unpool_layer
34 |
35 |
36 | class ClusteringLayer(layers.Layer):
37 | '''
38 | This layer gives soft assignments for the clusters based on distance from k-means based
39 | cluster centers. The weights of the layers are the cluster centers so that they can be learnt
40 | while optimizing for loss
41 | '''
42 | def __init__(self,num_of_clusters, num_samples,latent_space_dim,**kwargs):
43 | self.num_of_clusters = num_of_clusters
44 | #self.alpha = alpha
45 | #self.cluster_centers = cluster_centers
46 | self.num_samples = num_samples
47 | self.latent_space_dim = latent_space_dim
48 | #self.intial_clusters = intial_clusters
49 | super(ClusteringLayer, self).__init__(**kwargs)
50 | def build(self,intial_clusters_shape):
51 | # Create a trainable weight variable for this layer.
52 | self.W = self.add_weight(name='W',
53 | shape=intial_clusters_shape,
54 | initializer='glorot_uniform',
55 | trainable=True)
56 | super(ClusteringLayer, self).build(intial_clusters_shape) # Be sure to call this at the end
57 |
58 |
59 | def call(self,incoming,**kwargs):
60 |
61 | return getSoftAssignments(incoming,self.W,self.num_of_clusters,self.num_samples, self.latent_space_dim)
62 | def compute_output_shape(self, input_shape):
63 | return (input_shape[0], self.num_of_clusters)
64 |
65 | def get_config(self):
66 |         config = {'num_of_clusters': self.num_of_clusters, 'num_samples': self.num_samples, 'latent_space_dim': self.latent_space_dim}  # W is a weight, restored via set_weights, so it does not belong in the config
67 | base_config = super(ClusteringLayer, self).get_config()
68 | return dict(list(base_config.items()) + list(config.items()))
69 |
70 | def getSoftAssignments(latent_space, cluster_centers, num_clusters,num_samples,latent_space_dim):
71 | '''
72 | Returns cluster membership distribution for each sample
73 | :param latent_space: latent space representation of inputs
74 | :param cluster_centers: the coordinates of cluster centers in latent space
75 | :param num_clusters: total number of clusters
76 | :param latent_space_dim: dimensionality of latent space
77 | :param num_samples: total number of input samples
78 |     :return: soft assignment based on the equation q_ij = (1+|z_i - u_j|^2)^(-1) / sum_j'((1+|z_i - u_j'|^2)^(-1))
79 | '''
80 | z_expanded = K.reshape(latent_space,shape=(num_samples,1,latent_space_dim,))
81 | z_expanded = K.tile(z_expanded, (1,num_clusters,1))
82 | u_expanded = K.tile(K.expand_dims(cluster_centers,0), [num_samples, 1, 1])#[1, 10,120] after expand_dims #[100,10,120] after tile
83 | distances_from_cluster_centers = K.sqrt(K.sum((z_expanded - u_expanded)**2,axis=2))#K.norm((z_expanded - u_expanded),2,axis=2)
84 | qij_numerator = 1 + distances_from_cluster_centers**2
85 | qij_numerator = 1 / qij_numerator
86 | normalizer_q = K.sum(qij_numerator, axis=1)
87 | normalizer_q = K.reshape(normalizer_q,(num_samples, 1))
88 | #print((qij_numerator/normalizer_q).shape)
89 | return qij_numerator/normalizer_q
90 |
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
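As a sanity check on the soft-assignment formula implemented by getSoftAssignments above, here is a small self-contained NumPy sketch (toy z and u values of my own) that computes the same q_ij and verifies each row is a probability distribution:

```python
# Toy NumPy version of the soft assignment:
# q_ij = (1 + |z_i - u_j|^2)^(-1) / sum_j' (1 + |z_i - u_j'|^2)^(-1)
import numpy as np

num_samples, num_clusters, latent_dim = 4, 3, 2
rng = np.random.RandomState(0)
z = rng.randn(num_samples, latent_dim)   # latent representations
u = rng.randn(num_clusters, latent_dim)  # cluster centers

dist_sq = ((z[:, None, :] - u[None, :, :]) ** 2).sum(axis=2)  # (samples, clusters)
q = 1.0 / (1.0 + dist_sq)
q /= q.sum(axis=1, keepdims=True)  # normalize per sample

print(np.round(q, 3))
assert np.allclose(q.sum(axis=1), 1.0)  # each row sums to 1
```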
/CDEC/keras_unpooling.py:
--------------------------------------------------------------------------------
1 | from keras import backend as K
2 | from keras.layers.convolutional import UpSampling2D
3 | from keras.layers.convolutional import MaxPooling2D
4 | class MaxPoolingMask2D(MaxPooling2D):
5 | def __init__(self, pool_size=(2, 2), strides=None, border_mode='valid',
6 | dim_ordering='default', **kwargs):
7 | super(MaxPoolingMask2D, self).__init__(pool_size, strides, border_mode,
8 | dim_ordering, **kwargs)
9 |
10 | def _pooling_function(self, inputs, pool_size, strides,
11 | border_mode, dim_ordering):
12 | pooled = K.pool2d(inputs, pool_size, strides, border_mode,
13 | dim_ordering, pool_mode='max')
14 | upsampled = UpSampling2D(size=pool_size)(pooled)
15 | indexMask = K.tf.equal(inputs, upsampled)
16 | assert indexMask.get_shape().as_list() == inputs.get_shape().as_list()
17 | return indexMask
18 |
19 | def get_output_shape_for(self, input_shape):
20 | return input_shape
21 |
22 |
23 | def unpooling(inputs):
24 | '''
25 | do unpooling with indices, move this to separate layer if it works
26 | 1. do naive upsampling (repeat elements)
27 | 2. keep only values in mask (stored indices) and set the rest to zeros
28 | '''
29 | x = inputs[0]
30 | mask = inputs[1]
31 | mask_shape = mask.get_shape().as_list()
32 | x_shape = x.get_shape().as_list()
33 |     pool_size = (mask_shape[1] // x_shape[1], mask_shape[2] // x_shape[2])  # integer division: UpSampling2D expects int factors
34 | on_success = UpSampling2D(size=pool_size)(x)
35 | on_fail = K.zeros_like(on_success)
36 | return K.tf.where(mask, on_success, on_fail)
37 |
38 |
39 | def unpooling_output_shape(input_shape):
40 |     return input_shape[1]  # the mask's shape, i.e. the unpooled (upsampled) size
41 |
--------------------------------------------------------------------------------
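The mask-based unpooling above can be illustrated without Keras. A toy NumPy sketch (my own simplification: a single 2x2 map pooled down to one value) of the two steps described in the docstring, naive upsampling followed by masking:

```python
# Mask-based unpooling in miniature: repeat the pooled value everywhere,
# then keep it only where the original input attained the maximum.
import numpy as np

x = np.array([[1., 3.],
              [4., 2.]])                 # 2x2 input, max-pooled to a single value
pooled = np.array([[x.max()]])           # 1x1 pooled map

upsampled = np.repeat(np.repeat(pooled, 2, axis=0), 2, axis=1)  # naive upsampling
mask = (x == upsampled)                  # True only at the max position

unpooled = np.where(mask, upsampled, 0.0)
print(unpooled)
# [[0. 0.]
#  [4. 0.]]
```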
/CDEC/main.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 9, 2017
3 | '''
4 | import numpy
5 | import json
6 | from misc import DatasetHelper, evaluateKMeans, visualizeData
7 | from network import DCJC, rootLogger
8 | from copy import deepcopy
9 | import argparse
10 | import os
11 | import tensorflow as tf
12 | import keras.backend as K
13 | K.set_image_dim_ordering('tf')
14 |
15 | def testOnlyClusterInitialization(dataset_name, arch, epochs):
16 | '''
17 |     Builds an autoencoder defined by architecture arch and trains it on the given dataset
18 |     :param dataset_name: Name of the dataset with which the network will be trained [MNIST, COIL20, cancer]
19 | :param arch: Architecture of the network as a dictionary. Specification for architecture can be found in readme.md
20 | :param epochs: Number of train epochs
21 | :return: None - (side effect) saves the latent space and params of trained network in an appropriate location in saved_params folder
22 | '''
23 | arch_copy = deepcopy(arch)
24 | rootLogger.info("Loading dataset")
25 | dataset = DatasetHelper(dataset_name)
26 | dataset.loadDataset()
27 | rootLogger.info("Done loading dataset")
28 | rootLogger.info("Creating network")
29 | dcjc = DCJC(arch_copy)
30 | rootLogger.info("Done creating network")
31 | rootLogger.info("Starting training")
32 |     dcjc.pretrainWithData(dataset, epochs, False)
33 |
34 |
35 | def testOnlyClusterImprovement(dataset_name, arch, epochs, method):
36 | '''
37 | Use an initialized autoencoder and train it along with clustering loss. Assumed that pretrained autoencoder params
38 | are available, i.e. testOnlyClusterInitialization has been run already with the given params
39 | :param dataset_name: Name of the dataset with which the network will be trained [MNIST, COIL20]
40 | :param arch: Architecture of the network as a dictionary. Specification for architecture can be found in readme.md
41 | :param epochs: Number of train epochs
42 |     :param method: 'KM' or 'KLD' - whether the clustering loss is the KL-divergence between the current soft-assignment distribution (Q) and a sharpened target derived from it (Q^2), or just the k-means loss
43 | :return: None - (side effect) saves latent space and params of the trained network
44 | '''
45 | arch_copy = deepcopy(arch)
46 | rootLogger.info("Loading dataset")
47 | dataset = DatasetHelper(dataset_name)
48 | dataset.loadDataset()
49 | rootLogger.info("Done loading dataset")
50 | rootLogger.info("Creating network")
51 | dcjc = DCJC(arch_copy)
52 | rootLogger.info("Starting cluster improvement")
53 | if method == 'KM':
54 | dcjc.doClusteringWithKMeansLoss(dataset, epochs)
55 | elif method == 'KLD':
56 | dcjc.doClusteringWithKLdivLoss(dataset, True, epochs)
57 |
58 |
59 | def testKMeans(dataset_name, archs):
60 | '''
61 |     Performs k-means clustering and reports metrics on the latent spaces produced by the networks defined in archs
62 |     for the given dataset. Assumes that testOnlyClusterInitialization and testOnlyClusterImprovement have been run before
63 | this for the specified archs/datasets, as the results saved by them are used for clustering
64 | :param dataset_name: Name of dataset [MNIST, COIL20]
65 | :param archs: Architectures as a dictionary
66 | :return: None - reports the accuracy and nmi clustering metrics
67 | '''
68 | rootLogger.info('Initial Cluster Quality Comparison')
69 | rootLogger.info(80 * '_')
70 | rootLogger.info('%-50s %8s %8s' % ('method', 'ACC', 'NMI'))
71 | rootLogger.info(80 * '_')
72 | dataset = DatasetHelper(dataset_name)
73 | dataset.loadDataset()
74 | rootLogger.info(evaluateKMeans(dataset.input_flat, dataset.labels, dataset.getClusterCount(), 'image')[0])
75 | for arch in archs:
76 | Z = numpy.load('saved_params/' + dataset.name + '/z_' + arch['name'] + '.npy')
77 | rootLogger.info(evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), arch['name'])[0])
78 | Z = numpy.load('saved_params/' + dataset.name + '/pc_z_' + arch['name'] + '.npy')
79 | rootLogger.info(evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), arch['name'])[0])
80 | Z = numpy.load('saved_params/' + dataset.name + '/pc_km_z_' + arch['name'] + '.npy')
81 | rootLogger.info(evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), arch['name'])[0])
82 | rootLogger.info(80 * '_')
83 |
84 |
85 | def visualizeLatentSpace(dataset_name, arch):
86 | '''
87 | Plots and saves graphs for visualized images space, autoencoder latent space, and the final clustering latent space
88 | :param dataset_name: Name of dataset [MNIST, COIL20]
89 | :param arch: Architectures as a dictionary
90 | :return: None - (side effect) saved graphs in plots/ folder
91 | '''
92 | rootLogger.info("Loading dataset")
93 | dataset = DatasetHelper(dataset_name)
94 | dataset.loadDataset()
95 | rootLogger.info("Done loading dataset")
96 |     # We consider only the first 5000 points (or fewer) for better visualization
97 | max_points = min(dataset.input_flat.shape[0], 5000)
98 | # Image space
99 | visualizeData(dataset.input_flat[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/raw.png" % dataset.name)
100 | # Latent space - autoencoder
101 | Z = numpy.load('saved_params/' + dataset.name + '/z_' + arch['name'] + '.npy')
102 | visualizeData(Z[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/autoencoder.png" % dataset.name)
103 | # Latent space - kl div clustering network
104 | Z = numpy.load('saved_params/' + dataset.name + '/pc_z_' + arch['name'] + '.npy')
105 | visualizeData(Z[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/clustered_kld.png" % dataset.name)
106 | # Latent space - kmeans clustering network
107 | Z = numpy.load('saved_params/' + dataset.name + '/pc_km_z_' + arch['name'] + '.npy')
108 | visualizeData(Z[0:max_points], dataset.labels[0:max_points], dataset.getClusterCount(), "plots/%s/clustered_km.png" % dataset.name)
109 |
110 |
111 | if __name__ == '__main__':
112 | '''
113 | usage: main.py [-h] -d DATASET -a ARCHITECTURE [--pretrain PRETRAIN]
114 |                    [-m METHOD] [--cluster CLUSTER] [--metrics] [--visualize]
115 |
116 | required arguments:
117 | -d DATASET, --dataset DATASET
118 |                         Dataset on which autoencoder is trained [MNIST, COIL20, cancer]
119 | -a ARCHITECTURE, --architecture ARCHITECTURE
120 | Index of architecture of autoencoder in the json file
121 |                         (archs/); use -m/--method to choose the loss (KLD or KM)
122 |
123 | optional arguments:
124 | -h, --help show this help message and exit
125 | --pretrain PRETRAIN Pretrain the autoencoder for specified #epochs
126 | specified by architecture on specified dataset
127 | --cluster CLUSTER Refine the autoencoder for specified #epochs with
128 | clustering loss, assumes that pretraining results are
129 | available
130 |   --metrics             Report k-means clustering metrics on the clustered
131 |                         latent space, assumes pretrain and cluster based
132 |                         training have been performed
133 |   --visualize           Visualize the image space and latent space, assumes
134 |                         pretraining and cluster based training have been
135 |                         performed
136 |
137 | '''
138 | # Load architectures from the json files
139 | mnist_archs = []
140 | coil_archs = []
141 | cancer_archs = []
142 | with open("archs/coil.json") as archs_file:
143 | coil_archs = json.load(archs_file)
144 | with open("archs/mnist.json") as archs_file:
145 | mnist_archs = json.load(archs_file)
146 | with open("archs/cancer.json") as archs_file:
147 | cancer_archs = json.load(archs_file)
148 |
149 | # Argument parsing
150 | parser = argparse.ArgumentParser()
151 | requiredArgs = parser.add_argument_group('required arguments')
152 | requiredArgs.add_argument("-d", "--dataset", help="Dataset on which autoencoder is trained [MNIST, COIL20, cancer]", required=True)
153 | requiredArgs.add_argument("-a", "--architecture", type=int, help="Index of architecture of autoencoder in the json file (archs/)", required=True)
154 | requiredArgs.add_argument("-m", "--method", help="type of loss KLD or KM")
155 | parser.add_argument("--pretrain", type=int, help="Pretrain the autoencoder for specified #epochs specified by architecture on specified dataset")
156 | parser.add_argument("--cluster", type=int, help="Refine the autoencoder for specified #epochs with clustering loss, assumes that pretraining results are available")
157 | parser.add_argument("--metrics", action='store_true', help="Report k-means clustering metrics on the clustered latent space, assumes pretrain and cluster based training have been performed")
158 | parser.add_argument("--visualize", action='store_true', help="Visualize the image space and latent space, assumes pretraining and cluster based training have been performed")
159 | args = parser.parse_args()
160 |
161 | # Train/Visualize as per the arguments
162 | dataset_name = args.dataset
163 | loss = args.method
164 | arch_index = args.architecture
165 |
166 | if dataset_name == 'MNIST':
167 | archs = mnist_archs
168 | elif dataset_name == 'COIL20':
169 | archs = coil_archs
170 | elif dataset_name == 'cancer':
171 | archs = cancer_archs
172 |
173 | if args.pretrain:
174 | testOnlyClusterInitialization(dataset_name, archs[arch_index], args.pretrain)
175 |     if args.cluster and loss == 'KLD':
176 |         testOnlyClusterImprovement(dataset_name, archs[arch_index], args.cluster, loss)
177 |     elif args.cluster and loss == 'KM':
178 |         testOnlyClusterImprovement(dataset_name, archs[arch_index], args.cluster, loss)
179 |     elif args.cluster:
180 |         print("Please specify the loss type with -m: either KLD or KM")
181 | if args.metrics:
182 | testKMeans(dataset_name, [archs[arch_index]])
183 | if args.visualize:
184 | visualizeLatentSpace(dataset_name, archs[arch_index])
185 |
--------------------------------------------------------------------------------
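For completeness, a minimal sketch (assumptions: the CDEC directory is on sys.path and the arch/dataset files referenced above exist locally) of driving the same pipeline programmatically instead of through the CLI:

```python
# Programmatic equivalent of:
#   python main.py -d cancer -a 0 --pretrain 100
#   python main.py -d cancer -a 0 -m KLD --cluster 50
#   python main.py -d cancer -a 0 --metrics --visualize
import json
from main import (testOnlyClusterInitialization, testOnlyClusterImprovement,
                  testKMeans, visualizeLatentSpace)

with open("archs/cancer.json") as f:
    archs = json.load(f)

arch = archs[0]
testOnlyClusterInitialization("cancer", arch, 100)     # pretrain the autoencoder
testOnlyClusterImprovement("cancer", arch, 50, "KLD")  # refine with clustering loss
testKMeans("cancer", [arch])                           # report ACC/NMI
visualizeLatentSpace("cancer", arch)                   # save t-SNE plots
```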
/CDEC/misc.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 11, 2017
3 | '''
4 |
5 | import _pickle as cPickle
6 | import _pickle
7 | import gzip
8 |
9 | import numpy as np
10 | from PIL import Image
11 | import matplotlib
12 |
13 | # For plotting graphs via ssh with no display
14 | # Ref: https://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined
15 | matplotlib.use('Agg')
16 |
17 | from matplotlib import pyplot as plt
18 | from numpy import float32
19 | from sklearn import metrics
20 | from sklearn.cluster.k_means_ import KMeans
21 | from sklearn import manifold
22 | from sklearn.utils.linear_assignment_ import linear_assignment
23 | from sklearn import preprocessing
24 | import os
25 | from keras.preprocessing.image import load_img
26 |
27 | from skimage import transform
46 |
47 | import tensorflow as tf
48 | import keras.backend as K
49 | K.set_image_dim_ordering('tf')
50 |
51 |
52 | class DatasetHelper(object):
53 | '''
54 | Utility class for handling different datasets
55 | '''
56 |
57 | def __init__(self, name):
58 | '''
59 | A dataset instance keeps dataset name, the input set, the flat version of input set
60 | and the cluster labels
61 | '''
62 | self.name = name
63 | if name == 'MNIST':
64 | self.dataset = MNISTDataset()
65 | elif name == 'STL':
66 | self.dataset = STLDataset()
67 | elif name == 'COIL20':
68 | self.dataset = COIL20Dataset()
69 | elif name == 'cancer': # added by Sher
70 | self.dataset = CANCERDataset()
71 |
72 | def loadDataset(self):
73 | '''
74 | Load the appropriate dataset based on the dataset name
75 | '''
76 | self.input, self.labels, self.input_flat = self.dataset.loadDataset()
77 |
78 | def getClusterCount(self):
79 | '''
80 |         Number of clusters in the dataset - e.g. 10 for MNIST, 20 for COIL20
81 | '''
82 | return self.dataset.cluster_count
83 |
84 | def iterate_minibatches(self, set_type, batch_size, targets=None, shuffle=False):
85 | '''
86 | Utility method for getting batches out of a dataset
87 | :param set_type: IMAGE - suitable input for CNNs or FLAT - suitable for DNN
88 | :param batch_size: Size of minibatches
89 |         :param targets: None if the output should equal the input (autoencoders); otherwise a target array aligned with the dataset, i.e. the nth sample's output is the target's nth element
90 | :param shuffle: If the dataset needs to be shuffled or not
91 | :return: generates a batches of size batch_size from the dataset, each batch is the pair (input, output)
92 | '''
93 | inputs = None
94 | if set_type == 'IMAGE':
95 | inputs = self.input
96 | if targets is None:
97 | targets = self.input
98 | elif set_type == 'FLAT':
99 | inputs = self.input_flat
100 | if targets is None:
101 | targets = self.input_flat
102 | assert len(inputs) == len(targets)
103 | if shuffle:
104 | indices = np.arange(len(inputs))
105 | np.random.shuffle(indices)
106 | for start_idx in range(0, len(inputs) - batch_size + 1, batch_size):
107 | if shuffle:
108 | excerpt = indices[start_idx:start_idx + batch_size]
109 | else:
110 | excerpt = slice(start_idx, start_idx + batch_size)
111 | yield inputs[excerpt], targets[excerpt]
112 |
113 |
114 | class MNISTDataset(object):
115 | '''
116 | Class for reading and preparing MNIST dataset
117 | '''
118 |
119 | def __init__(self):
120 | self.cluster_count = 10
121 |
122 | def loadDataset(self):
123 | f = gzip.open('mnist/mnist.pkl.gz', 'rb')
124 | train_set, _, test_set = cPickle.load(f,encoding='latin1')
125 | train_input, train_input_flat, train_labels = self.prepareDatasetForAutoencoder(train_set[0], train_set[1])
126 | test_input, test_input_flat, test_labels = self.prepareDatasetForAutoencoder(test_set[0], test_set[1])
127 | f.close()
128 | # combine test and train samples
129 | return [np.concatenate((train_input, test_input)), np.concatenate((train_labels, test_labels)),
130 | np.concatenate((train_input_flat, test_input_flat))]
131 |
132 | def prepareDatasetForAutoencoder(self, inputs, targets):
133 | '''
134 | Returns the image, flat and labels as a tuple
135 | '''
136 | X = inputs
137 | X = X.reshape((-1,28, 28,1))
138 | return (X, X.reshape((-1, 28 * 28)), targets)
139 |
140 |
141 | class CANCERDataset1(object):
142 | '''
143 |     Class for reading and preparing the CANCER dataset (TCGA CSV variant)
144 | '''
145 |
146 | def __init__(self):
147 | self.cluster_count = 5
148 |
149 | def loadDataset(self):
150 |         import pandas as pd
151 |
152 |
153 | trainDF = pd.read_csv('cancer/TCGA_train.csv')
154 | train_labels = trainDF[trainDF.columns[-1]]
155 | train_labels = np.asarray(train_labels)
156 |
157 | train_features = trainDF.drop(trainDF.columns[-1],axis=1)
158 |         train_features = train_features.values.astype(np.float32)
159 | train_features = np.asarray([[train_features[row][col] for col in range(1,16130)] for row in range(599)])
160 | train_features = np.asarray(train_features)
161 |
162 | testDF = pd.read_csv('cancer/TCGA_test.csv')
163 | test_labels = testDF[testDF.columns[-1]]
164 | test_labels = np.asarray(test_labels)
165 |
166 | test_features = testDF.drop(testDF.columns[-1],axis=1)
167 |         test_features = test_features.values.astype(np.float32)
168 | test_features = np.asarray([[test_features[row][col] for col in range(1,16130)] for row in range(200)])
169 | test_features = np.asarray(test_features)
170 |
171 | train_input, train_input_flat, train_labels = self.prepareDatasetForAutoencoder(train_features, train_labels)
172 | test_input, test_input_flat, test_labels = self.prepareDatasetForAutoencoder(test_features, test_labels)
173 |
174 | # combine test and train samples
175 | return [np.concatenate((train_input, test_input)), np.concatenate((train_labels, test_labels)),
176 | np.concatenate((train_input_flat, test_input_flat))]
177 |
178 | def prepareDatasetForAutoencoder(self, inputs, targets):
179 | '''
180 | Returns the image, flat and labels as a tuple
181 | '''
182 | X = inputs
183 | X = X.reshape((-1, 127, 127, 1))
184 | return (X, X.reshape((-1, 127 * 127)), targets)
185 |
186 | class CANCERDataset(object):
187 | '''
188 | Class for reading and preparing CANCER dataset
189 | '''
190 | def __init__(self):
191 | self.cluster_count = 4
192 |
193 | def loadDataset(self):
194 | root ='/home/rkarim/Training_data/'
195 | features = []
196 | features_flat = []
197 | for rootName,dirName,fileNames in os.walk(root):
198 | if(not rootName == root):
199 | for fileName in fileNames:
200 | imgGray = load_img(rootName+'/'+fileName,color_mode='grayscale')
201 | transformed=transform.resize(np.array(imgGray),(512,512))
202 | features += [transformed.reshape((transformed.shape[0],transformed.shape[1]))]
203 | features_flat+=[transformed.reshape((transformed.shape[0]*transformed.shape[1]*1))]
204 | features=np.stack(features)
205 | features_flat = np.stack(features_flat)
206 |         labels = features  # unlabeled images: reuse the inputs as autoencoder targets
207 | return [np.concatenate((features, features),axis=0), np.concatenate((labels, labels),axis=0),
208 | np.concatenate((features_flat,features_flat),axis=0)]
209 |
210 | def loadDataset1(self):
211 |         import pandas as pd
212 |
213 |
214 | df = pd.read_csv('cancer/TCGA_train.csv')
215 | print(len(df.columns))
216 |
217 | labels = df[df.columns[-1]]
218 | features = df.drop(df.columns[-1],axis=1)
219 |         features = features.values.astype(np.float32)
220 | features = np.asarray([[features[row][col] for col in range(1,16130)] for row in range(599)])
221 | print("Is there any NaN value?")
222 | print(np.count_nonzero(np.isnan(features)))
223 |
224 | min_max_scaler = preprocessing.MinMaxScaler()
225 | train_input = min_max_scaler.fit_transform(features)
226 |         print(np.all(np.isfinite(train_input)))  # True if every value is finite
227 |
228 | train_input_flat = train_input
229 | train_input = train_input.reshape((-1, 127, 127, 1))
230 | train_input_flat = np.reshape(train_input, (-1, 127 * 127))
231 | train_labels = np.asarray(labels)
232 |
233 | df2 = pd.read_csv('cancer/TCGA_test.csv')
234 |         labels2 = df2[df2.columns[-1]]
235 | features2 = df2.drop(df2.columns[-1],axis=1)
236 |
237 |         features2 = features2.values.astype(np.float32)
238 | features2 = np.asarray([[features2[row][col] for col in range(1,16130)] for row in range(200)])
239 |
240 | test_input = np.asarray(features2)
241 |         print(np.all(np.isfinite(test_input)))  # True if every value is finite
242 |
243 | test_input = min_max_scaler.fit_transform(test_input)
244 | test_input_flat = test_input
245 | test_input = test_input.reshape((-1, 127, 127, 1))
246 | test_input_flat = np.reshape(test_input, (-1, 127 * 127))
247 | test_labels = np.asarray(labels2)
248 |
249 | # combine test and train samples
250 | return [np.concatenate((train_input, test_input)), np.concatenate((train_labels, test_labels)),
251 | np.concatenate((train_input_flat, test_input_flat))]
252 |
253 |
254 | class STLDataset(object):
255 | '''
256 | Class for preparing and reading the STL dataset
257 | '''
258 |
259 | def __init__(self):
260 | self.cluster_count = 10
261 |
262 |     def loadDataset(self):
263 |         train_x = np.fromfile('stl/train_X.bin', dtype=np.uint8)
264 |         train_y = np.fromfile('stl/train_y.bin', dtype=np.uint8)
265 |         test_x = np.fromfile('stl/test_X.bin', dtype=np.uint8)
266 |         test_y = np.fromfile('stl/test_y.bin', dtype=np.uint8)
267 |         train_input = np.reshape(train_x, (-1, 3, 96, 96))
268 |         train_labels = train_y
269 |         train_input_flat = np.reshape(train_x, (-1, 1, 3 * 96 * 96))
270 |         test_input = np.reshape(test_x, (-1, 3, 96, 96))
271 |         test_labels = test_y
272 |         test_input_flat = np.reshape(test_x, (-1, 1, 3 * 96 * 96))
273 |         return [np.concatenate((train_input, test_input)), np.concatenate((train_labels, test_labels)),
274 |                 np.concatenate((train_input_flat, test_input_flat))]
275 |
276 |
277 | class COIL20Dataset(object):
278 | '''
279 | Class for reading and preparing the COIL20Dataset
280 | '''
281 |
282 | def __init__(self):
283 | self.cluster_count = 20
284 |
285 | def loadDataset(self):
286 | train_x = np.load('coil/coil_X.npy').astype(np.float32) / 256.0
287 | train_y = np.load('coil/coil_y.npy')
288 | train_x_flat = np.reshape(train_x, (-1, 128 * 128))
289 | return [train_x, train_y, train_x_flat]
290 |
291 |
292 | def rescaleReshapeAndSaveImage(image_sample, out_filename):
293 | '''
294 | For saving the reconstructed output as an image
295 | :param image_sample: output of the autoencoder
296 | :param out_filename: filename for the saved image
297 | :return: None (side effect) Image saved
298 | '''
299 |     image_sample = ((image_sample - np.amin(image_sample)) / (np.amax(image_sample) - np.amin(image_sample))) * 255
300 | image_sample = np.rint(image_sample).astype(int)
301 | image_sample = np.clip(image_sample, a_min=0, a_max=255).astype('uint8')
302 | img = Image.fromarray(image_sample, 'L')
303 | img.save(out_filename)
304 |
305 |
306 | def cluster_acc(y_true, y_pred):
307 | '''
308 |     Uses the Hungarian algorithm to find the best permutation mapping between predicted and true labels,
309 |     and then calculates the accuracy w.r.t. this mapping.
310 |     Implementation inspired by https://github.com/piiswrong/dec, since scikit-learn does not implement this metric
311 | :param y_true: True cluster labels
312 | :param y_pred: Predicted cluster labels
313 | :return: accuracy score for the clustering
314 | '''
315 | D = int(max(y_pred.max(), y_true.max()) + 1)
316 | w = np.zeros((D, D), dtype=np.int32)
317 | for i in range(y_pred.size):
318 | idx1 = int(y_pred[i])
319 | idx2 = int(y_true[i])
320 | w[idx1, idx2] += 1
321 | ind = linear_assignment(w.max() - w)
322 | return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size
323 |
324 |
325 | def getClusterMetricString(method_name, labels_true, labels_pred):
326 | '''
327 | Creates a formatted string containing the method name and acc, nmi metrics - can be used for printing
328 | :param method_name: Name of the clustering method (just for printing)
329 | :param labels_true: True label for each sample
330 | :param labels_pred: Predicted label for each sample
331 | :return: Formatted string containing metrics and method name
332 | '''
333 | acc = cluster_acc(labels_true, labels_pred)
334 | nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
335 | return '%-50s %8.3f %8.3f' % (method_name, acc, nmi)
336 |
337 |
338 | def evaluateKMeans(data, labels, nclusters, method_name):
339 | '''
340 | Clusters data with kmeans algorithm and then returns the string containing method name and metrics, and also the evaluated cluster centers
341 | :param data: Points that need to be clustered as a numpy array
342 | :param labels: True labels for the given points
343 | :param nclusters: Total number of clusters
344 | :param method_name: Name of the method from which the clustering space originates (only used for printing)
345 | :return: Formatted string containing metrics and method name, cluster centers
346 | '''
347 | kmeans = KMeans(n_clusters=nclusters, n_init=5)
348 | kmeans.fit(data)
349 | return getClusterMetricString(method_name, labels, kmeans.labels_), kmeans.cluster_centers_
350 |
351 |
352 | def visualizeData(Z, labels, num_clusters, title):
353 | '''
354 | TSNE visualization of the points in latent space Z
355 | :param Z: Numpy array containing points in latent space in which clustering was performed
356 | :param labels: True labels - used for coloring points
357 | :param num_clusters: Total number of clusters
358 | :param title: filename where the plot should be saved
359 | :return: None - (side effect) saves clustering visualization plot in specified location
360 | '''
361 | labels = labels.astype(int)
362 | tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
363 | Z_tsne = tsne.fit_transform(Z)
364 | fig = plt.figure()
365 | plt.scatter(Z_tsne[:, 0], Z_tsne[:, 1], s=2, c=labels, cmap=plt.cm.get_cmap("jet", num_clusters))
366 | plt.colorbar(ticks=range(num_clusters))
367 | fig.savefig(title, dpi=fig.dpi)
368 |
--------------------------------------------------------------------------------
/CDEC/plots/genome/autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rezacsedu/Convolutional-embedded-networks/f52c2a3816acbf05be28a52fe93140fe31495eb0/CDEC/plots/genome/autoencoder.png
--------------------------------------------------------------------------------
/CDEC/plots/genome/clustered_kld.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rezacsedu/Convolutional-embedded-networks/f52c2a3816acbf05be28a52fe93140fe31495eb0/CDEC/plots/genome/clustered_kld.png
--------------------------------------------------------------------------------
/CDEC/plots/genome/clustered_km.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rezacsedu/Convolutional-embedded-networks/f52c2a3816acbf05be28a52fe93140fe31495eb0/CDEC/plots/genome/clustered_km.png
--------------------------------------------------------------------------------
/CDEC/plots/genome/raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rezacsedu/Convolutional-embedded-networks/f52c2a3816acbf05be28a52fe93140fe31495eb0/CDEC/plots/genome/raw.png
--------------------------------------------------------------------------------
/CDEC/self.trainAutoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rezacsedu/Convolutional-embedded-networks/f52c2a3816acbf05be28a52fe93140fe31495eb0/CDEC/self.trainAutoencoder.png
--------------------------------------------------------------------------------
/DEC_GenotypeClustering_Keras/DEC_Genotype_Clustering.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | np.random.seed(10)
4 |
5 | from time import time
6 |
7 | # Keras / scikit-learn imports follow
8 | import keras.backend as K
9 | from keras.engine.topology import Layer, InputSpec
10 | from keras.layers import Dense, Input
11 | from keras.models import Model
12 | from keras.optimizers import RMSprop
13 | from keras import callbacks
14 | from keras.initializers import VarianceScaling
15 | from sklearn.cluster import KMeans
16 | from sklearn import metrics
17 | from sklearn.metrics.cluster import normalized_mutual_info_score
18 | from sklearn.metrics.cluster import adjusted_rand_score
19 | from sklearn.metrics import accuracy_score
20 | from sklearn import manifold
21 | import keras.layers.normalization as bn
22 |
23 | df1 = pd.read_csv('/home/asif/genome.csv', header=None)
24 | print(df1.head())
25 |
26 | label = df1[0]
27 | print(label.head())
28 |
29 | from sklearn import preprocessing
30 | le = preprocessing.LabelEncoder()
31 | lbl = le.fit(label)
32 | labelss = lbl.transform(label)
33 | labelDF = pd.DataFrame(labelss)
34 |
35 |
36 | print(labelDF.head())
37 |
38 | feature = df1.drop(0, axis=1)
39 | print(feature.head())
40 |
41 | from sklearn.preprocessing import MinMaxScaler
42 | scaler = MinMaxScaler()
43 | x1 = feature.iloc[:,1:]
44 | df_scaled = pd.DataFrame(scaler.fit_transform(x1), columns=x1.columns)
45 | print(df_scaled.head())
46 |
47 | y = labelss
48 | x = df_scaled.values
49 |
50 | print(y.shape)
51 | print(x.shape)
52 |
53 | print(np.isnan(np.min(x)))  # quick check for NaNs in the features
54 |
55 | #y.shape
56 | #x.shape
57 | #print(x)
58 | #print(y)
59 |
60 | n_clusters = len(np.unique(y))
61 | print(n_clusters)
62 |
63 | kmeans = KMeans(n_clusters=n_clusters, n_init=5)
64 | y_pred_kmeans = kmeans.fit_predict(x)
65 |
66 | print(accuracy_score(y, y_pred_kmeans))  # raw accuracy; cluster ids are only meaningful up to permutation
67 |
68 | dims = [x.shape[-1], 16, 16, 32, 5]
69 | init = VarianceScaling(scale=1. / 3., mode='fan_in', distribution='uniform')
70 | pretrain_optimizer = RMSprop(lr=0.001, rho=0.01, epsilon=None, decay=0.0)
71 | pretrain_epochs = 100
72 | batch_size = 32
73 | save_dir = 'result/'
74 |
75 | def autoencoder(dims, act='relu', init='glorot_uniform'):
76 | """
77 | Fully connected auto-encoder model, symmetric.
78 | Arguments:
79 | dims: list of number of units in each layer of encoder. dims[0] is input dim, dims[-1] is units in hidden layer.
80 | The decoder is symmetric with encoder. So number of layers of the auto-encoder is 2*len(dims)-1
81 | act: activation, not applied to Input, Hidden and Output layers
82 | return:
83 | (ae_model, encoder_model), Model of autoencoder and model of encoder
84 | """
85 | n_stacks = len(dims) - 1
86 | # input
87 | input_img = Input(shape=(dims[0],), name='input')
88 | x = input_img
89 | # internal layers in encoder
90 | for i in range(n_stacks-1):
91 | x = Dense(dims[i + 1], activation=act, kernel_initializer=init, name='encoder_%d' % i)(x)
92 | #bn.BatchNormalization(momentum=0.9, epsilon=1e-06, weights=None)
93 |
94 | # hidden layer
95 | encoded = Dense(dims[-1], kernel_initializer=init, name='encoder_%d' % (n_stacks - 1))(x) # hidden layer, features are extracted from here
96 |     bn.BatchNormalization(momentum=0.9, epsilon=1e-06, weights=None)  # NOTE: has no effect unless applied to a tensor, e.g. x = BatchNormalization(...)(x)
97 |
98 | x = encoded
99 | # internal layers in decoder
100 | for i in range(n_stacks-1, 0, -1):
101 | x = Dense(dims[i], activation=act, kernel_initializer=init, name='decoder_%d' % i)(x)
102 |
103 | # output
104 | x = Dense(dims[0], kernel_initializer=init, name='decoder_0')(x)
105 |     bn.BatchNormalization(momentum=0.9, epsilon=1e-06, weights=None)  # NOTE: has no effect unless applied to a tensor
106 |
107 | decoded = x
108 | return Model(inputs=input_img, outputs=decoded, name='AE'), Model(inputs=input_img, outputs=encoded, name='encoder')
109 |
110 | autoencoder, encoder = autoencoder(dims, init=init)
111 | autoencoder.compile(optimizer=pretrain_optimizer, loss='mse')
112 | autoencoder.fit(x, x, batch_size=batch_size, epochs=pretrain_epochs) #, callbacks=cb)
113 | autoencoder.save_weights(save_dir + '/ThesisDEC_weights.h5')
114 |
115 | # reload the pretrained weights
116 | autoencoder.load_weights(save_dir + '/ThesisDEC_weights.h5')
117 |
118 |
119 | class ClusteringLayer(Layer):
120 | """
121 | Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the
122 | sample belonging to each cluster. The probability is calculated with student's t-distribution.
123 |
124 | # Example
125 | ```
126 | model.add(ClusteringLayer(n_clusters=10))
127 | ```
128 | # Arguments
129 | n_clusters: number of clusters.
130 |         weights: list of Numpy arrays with shape `(n_clusters, n_features)` which represent the initial cluster centers.
131 |         alpha: degrees of freedom parameter in Student's t-distribution. Defaults to 1.0.
132 | # Input shape
133 | 2D tensor with shape: `(n_samples, n_features)`.
134 | # Output shape
135 | 2D tensor with shape: `(n_samples, n_clusters)`.
136 | """
137 |
138 | def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
139 | if 'input_shape' not in kwargs and 'input_dim' in kwargs:
140 | kwargs['input_shape'] = (kwargs.pop('input_dim'),)
141 | super(ClusteringLayer, self).__init__(**kwargs)
142 | self.n_clusters = n_clusters
143 | self.alpha = alpha
144 | self.initial_weights = weights
145 | self.input_spec = InputSpec(ndim=2)
146 |
147 | def build(self, input_shape):
148 | assert len(input_shape) == 2
149 | input_dim = input_shape[1]
150 | self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
151 | self.clusters = self.add_weight((self.n_clusters, input_dim), initializer='glorot_uniform', name='clusters')
152 | if self.initial_weights is not None:
153 | self.set_weights(self.initial_weights)
154 | del self.initial_weights
155 | self.built = True
156 |
157 | def call(self, inputs, **kwargs):
158 | """ student t-distribution, as same as used in t-SNE algorithm.
159 | Measure the similarity between embedded point z_i and centroid µ_j.
160 | q_ij = 1/(1+dist(x_i, µ_j)^2), then normalize it.
161 | q_ij can be interpreted as the probability of assigning sample i to cluster j.
162 | (i.e., a soft assignment)
163 | Arguments:
164 | inputs: the variable containing data, shape=(n_samples, n_features)
165 | Return:
166 | q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
167 | """
168 | q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
169 | q **= (self.alpha + 1.0) / 2.0
170 |         q = K.transpose(K.transpose(q) / K.sum(q, axis=1))  # Make sure each sample's soft assignments add up to 1.
171 | return q
172 |
173 | def compute_output_shape(self, input_shape):
174 | assert input_shape and len(input_shape) == 2
175 | return input_shape[0], self.n_clusters
176 |
177 | def get_config(self):
178 | config = {'n_clusters': self.n_clusters}
179 | base_config = super(ClusteringLayer, self).get_config()
180 | return dict(list(base_config.items()) + list(config.items()))
181 |
182 | clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
183 | model = Model(inputs=encoder.input, outputs=clustering_layer)
184 | model.compile(optimizer=RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0), loss='kld')
185 |
186 | kmeans = KMeans(n_clusters=n_clusters, n_init=n_clusters)
187 | y_pred = kmeans.fit_predict(encoder.predict(x))
188 |
189 | y_pred_last = np.copy(y_pred)
190 |
191 | model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])
192 |
193 | # computing an auxiliary target distribution
194 | def target_distribution(q):
195 | weight = q ** 2 / q.sum(0)
196 | return (weight.T / weight.sum(1)).T
197 |
198 | loss = 0
199 | index = 0
200 | maxiter = 20000
201 | update_interval = 500
202 | index_array = np.arange(x.shape[0])
203 |
204 | tol = 0.001 # tolerance threshold to stop training
205 |
206 | for ite in range(int(maxiter)):
207 | if ite % update_interval == 0:
208 | q = model.predict(x, verbose=0)
209 | p = target_distribution(q) # update the auxiliary target distribution p
210 |
211 | # evaluate the clustering performance
212 | y_pred = q.argmax(1)
213 | if y is not None:
214 | acc = np.round(accuracy_score(y, y_pred), 5)
215 | nmi = np.round(normalized_mutual_info_score(y, y_pred), 5)
216 | ari = np.round(adjusted_rand_score(y, y_pred), 5)
217 | loss = np.round(loss, 5)
218 | print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' % (ite, acc, nmi, ari), ' ; loss=', loss)
219 |
220 | # check stop criterion - model convergence
221 | delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
222 | y_pred_last = np.copy(y_pred)
223 | if ite > 0 and delta_label < tol:
224 | print('delta_label ', delta_label, '< tol ', tol)
225 | print('Reached tolerance threshold. Stopping training.')
226 | break
227 | idx = index_array[index * batch_size: min((index+1) * batch_size, x.shape[0])]
228 | model.train_on_batch(x=x[idx], y=p[idx])
229 | index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0
230 |
231 | model.save_weights(save_dir + '/Thesis_DEC_model_final.h5')
232 | model.load_weights(save_dir + '/Thesis_DEC_model_final.h5')
233 |
234 | # Eval.
235 | q = model.predict(x, verbose=0)
236 | p = target_distribution(q) # update the auxiliary target distribution p
237 |
238 | # evaluate the clustering performance
239 | y_pred = q.argmax(1)
240 | if y is not None:
241 | acc = np.round(accuracy_score(y, y_pred), 5)
242 | nmi = np.round(normalized_mutual_info_score(y, y_pred), 5)
243 | ari = np.round(adjusted_rand_score(y, y_pred), 5)
244 | loss = np.round(loss, 5)
245 | print('Acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari), ' ; loss=', loss)
246 |
247 | import seaborn as sns
248 | import sklearn.metrics
249 | import matplotlib.pyplot as plt
250 | sns.set(font_scale=1.5)
251 | confusion_matrix = sklearn.metrics.confusion_matrix(y, y_pred)
252 |
253 | plt.figure(figsize=(12, 11))
254 | sns.heatmap(confusion_matrix, annot=True, fmt="d", annot_kws={"size": 15})
255 | plt.title("Confusion matrix", fontsize=25)
256 | plt.ylabel('True label', fontsize=25)
257 | plt.xlabel('Clustering label', fontsize=25)
258 | plt.show()
259 |
260 | def visualizeData(Z, labels, num_clusters, title):
261 | '''
262 | TSNE visualization of the points in latent space Z
263 | :param Z: Numpy array containing points in latent space in which clustering was performed
264 | :param labels: True labels - used for coloring points
265 | :param num_clusters: Total number of clusters
266 | :param title: filename where the plot should be saved
267 | :return: None - (side effect) saves clustering visualization plot in specified location
268 | '''
269 | labels = labels.astype(int)
270 | tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
271 | Z_tsne = tsne.fit_transform(Z)
272 | fig = plt.figure()
273 | plt.scatter(Z_tsne[:, 0], Z_tsne[:, 1], s=2, c=labels, cmap=plt.cm.get_cmap("jet", num_clusters))
274 | plt.colorbar(ticks=range(num_clusters))
275 | fig.savefig(title, dpi=fig.dpi)
276 |
277 | from sklearn.manifold import TSNE
278 | import seaborn as sn
279 | import matplotlib.pyplot as plt
280 |
281 | data_1000 = x[0:1000,:]   # optional subsample (unused; t-SNE below runs on the full x)
282 | labels_1000 = y[0:1000]
283 |
284 | model = TSNE(n_components = 2, random_state = 0)
285 |
286 | tsne_data = model.fit_transform(x)
287 | #y_pred
288 |
289 | tsne_data = np.vstack((tsne_data.T, y)).T
290 | tsne_df = pd.DataFrame(data= tsne_data, columns= ("Dim_1","Dim_2","label"))
291 |
292 | sn.FacetGrid(tsne_df, hue= "label", size = 6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
293 | plt.show()
294 |
295 | visualizeData(x, y, n_clusters, "t_SNE_graph_original.png")
296 | visualizeData(x, y_pred, n_clusters, "t_SNE_graph_predicted.png")
297 |
--------------------------------------------------------------------------------
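The auxiliary target distribution p used in the training loop above is what the KLD loss pulls q toward. A toy NumPy sketch (illustrative q values of my own) showing that it sharpens confident assignments while each row remains a distribution:

```python
# DEC's auxiliary target: p_ij = (q_ij^2 / f_j) / sum_j'(q_ij'^2 / f_j'),
# where f_j = sum_i q_ij is the soft cluster frequency.
import numpy as np

def target_distribution(q):
    weight = q ** 2 / q.sum(0)
    return (weight.T / weight.sum(1)).T

q = np.array([[0.7, 0.2, 0.1],
              [0.6, 0.3, 0.1],
              [0.2, 0.2, 0.6]])
p = target_distribution(q)
print(np.round(p, 3))
assert np.allclose(p.sum(axis=1), 1.0)  # rows are still distributions
assert p[0, 0] > q[0, 0]                # confident assignments get sharper
```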
/DEC_GenotypeClustering_Keras/LSTM_EthnicityPrediction.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import glob
4 | import numpy as np
5 | import sys
6 | from time import time
7 |
8 | from sklearn.preprocessing import LabelEncoder
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import precision_recall_fscore_support
11 | from sklearn import metrics
12 |
13 | from keras.models import Sequential
14 | from keras.layers import LSTM, Dense, Dropout, Activation, Flatten
15 | from keras.callbacks import TensorBoard
16 | from keras.optimizers import RMSprop
17 | from keras.regularizers import l2
18 | from keras.callbacks import EarlyStopping
19 | from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
20 | from keras.utils import np_utils
21 |
22 | from keras import backend as K
23 | K.set_image_dim_ordering('tf')
24 | import matplotlib.pyplot as plt
25 | import itertools
26 |
27 | np.random.seed(10)
47 |
48 | from sklearn.metrics import confusion_matrix
49 |
50 | df1 = pd.read_csv('/home/asif/genome.csv', header=None)
51 | print(df1.head())
52 |
53 | label = df1[0]
54 | print(label.head())
55 |
56 | from sklearn import preprocessing
57 | le = preprocessing.LabelEncoder()
58 | lbl = le.fit(label)
59 | labelss = lbl.transform(label)
60 | labelDF = pd.DataFrame(labelss)
61 |
62 |
63 | print(labelDF.head())
64 |
65 | feature = df1.drop(0, axis=1)
66 | print(feature.head())
67 |
68 | from sklearn.preprocessing import MinMaxScaler
69 | scaler = MinMaxScaler()
70 | x1 = feature.iloc[:,1:]
71 | df_scaled = pd.DataFrame(scaler.fit_transform(x1), columns=x1.columns)
72 | print(df_scaled.head())
73 |
74 | y = labelss
75 | x = df_scaled.values
76 |
77 | features = x
78 | labels = y
79 |
80 | def prepare_test_train_valid():
81 |     # Hold out 25% of the data, then split that held-out portion into test and validation halves
82 |     train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size=0.25, random_state=100)
83 |     test_x, valid_x, test_y, valid_y = train_test_split(test_x, test_y, test_size=0.50, random_state=100)
84 |
85 | return train_x, test_x, train_y, test_y, valid_x, valid_y
86 |
87 | def one_hot_encode(labels):
88 | n_labels = len(labels)
89 | n_unique_labels = len(np.unique(labels))
90 | one_hot_encode = np.zeros((n_labels,n_unique_labels))
91 | one_hot_encode[np.arange(n_labels), labels] = 1
92 | return one_hot_encode
93 |
94 | labels = one_hot_encode(labels)
95 |
96 | # Extract feature
97 | train_x, test_x, train_y, test_y, valid_x, valid_y = prepare_test_train_valid()
98 |
99 | print('X_train shape:', train_x.shape)
100 | print('Y_train shape:', train_y.shape)
101 |
102 | num_classes = 5
103 | data_dim = 52
104 | timesteps = 1
105 |
106 | train_x = np.reshape(train_x,(train_x.shape[0], 1, train_x.shape[1]))
107 | test_x = np.reshape(test_x,(test_x.shape[0], 1, test_x.shape[1]))
108 | valid_x = np.reshape(valid_x,(valid_x.shape[0], 1, valid_x.shape[1]))
109 |
110 | def plot_confusion_matrix(cm, classes,
111 | normalize=False,
112 | title='Confusion matrix',
113 | cmap=plt.cm.Blues):
114 | """
115 | This function prints and plots the confusion matrix.
116 | Normalization can be applied by setting `normalize=True`.
117 | """
118 | if normalize:
119 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
120 | print("Normalized confusion matrix")
121 | else:
122 | print('Confusion matrix, without normalization')
123 |
124 | print(cm)
125 |
126 | plt.imshow(cm, interpolation='nearest', cmap=cmap)
127 | plt.title(title)
128 | plt.colorbar()
129 | tick_marks = np.arange(len(classes))
130 | plt.xticks(tick_marks, classes, rotation=45)
131 | plt.yticks(tick_marks, classes)
132 |
133 | fmt = '.2f' if normalize else 'd'
134 | thresh = cm.max() / 2.
135 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
136 | plt.text(j, i, format(cm[i, j], fmt),
137 | horizontalalignment="center",
138 | color="white" if cm[i, j] > thresh else "black")
139 |
140 | plt.tight_layout()
141 | plt.ylabel('True label')
142 | plt.xlabel('Predicted label')
143 |
144 | def build_LSTM():
145 | # expected input data shape: (batch_size, timesteps, data_dim)
146 | model = Sequential()
147 | model.add(LSTM(32, return_sequences=True, input_shape=(timesteps, data_dim)))
148 |
149 | model.add(LSTM(24, return_sequences=True))
150 |
151 | #model.add(Dropout(0.2))
152 | model.add(LSTM(16, return_sequences=True))
153 | model.add(Dropout(0.2))
154 |
155 | # apply softmax to output
156 | model.add(Flatten())
157 | model.add(Dense(num_classes, activation='softmax'))
158 | return model
159 |
160 | def model_train_evaluate(model, number_epoch):
161 |     optimizer = RMSprop(lr=0.001, rho=0.01, epsilon=None, decay=0.0)
162 |
163 | # a stopping function should the validation loss stop improving
164 |     earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=0, mode='auto')  # pass via callbacks=[...] in fit() to enable early stopping
165 |
166 |
167 |     rnn_model = build_LSTM()
168 |     rnn_model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=optimizer)
169 | tensorboardRNN = TensorBoard(log_dir="RNN_logs/{}".format(time()))
170 | rnn_model.fit(train_x, train_y, validation_data=(valid_x, valid_y), callbacks=[tensorboardRNN], batch_size=128, epochs=int(number_epoch))
171 | print(rnn_model.summary())
172 |
173 | y_prob = rnn_model.predict(test_x)
174 | y_pred = y_prob.argmax(axis=-1)
175 | y_true = np.argmax(test_y, 1)
176 |
177 | roc = roc_auc_score(test_y, y_prob)
178 | print ("ROC:", round(roc,3))
179 |
180 | # evaluate the model
181 | score, accuracy = rnn_model.evaluate(test_x, test_y, batch_size=32)
182 | print("\nAccuracy = {:.2f}".format(accuracy))
183 |
184 |     # the F-score gives a similar value to the accuracy score, but is useful for cross-checking
185 | p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')
186 | print ("F-Score:", round(f,2))
187 | print ("Precision:", round(p,2))
188 | print ("Recall:", round(r,2))
189 | print ("F-Score:", round(f,2))
190 |
191 | # Compute confusion matrix
192 | cnf_matrix = confusion_matrix(y_true, y_pred)
193 | np.set_printoptions(precision=2)
194 |
195 | class_names = ["FIN", "GBR", "ASW", "CHB", "CLM"]
196 |
197 | # Plot non-normalized confusion matrix
198 | plt.figure()
199 | plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix: true vs predicted label')
200 | plt.show()
201 |
202 | model = build_LSTM()  # note: model_train_evaluate() builds its own LSTM internally
203 | model_train_evaluate(model, 1000)
204 | import gc; gc.collect()
205 |
--------------------------------------------------------------------------------
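The one-hot encoding and the reshape to (samples, timesteps=1, features) above are what give the LSTM its expected 3-D input. A toy NumPy sketch (illustrative shapes, not the repo's data) of the same preparation:

```python
# One-hot encode integer labels and reshape flat feature rows to
# (samples, timesteps=1, features), as done for the LSTM above.
import numpy as np

labels = np.array([0, 2, 1, 2])            # integer class ids
features = np.arange(8.0).reshape(4, 2)    # 4 samples x 2 features

one_hot = np.zeros((labels.size, np.unique(labels).size))
one_hot[np.arange(labels.size), labels] = 1  # same trick as one_hot_encode()

x = features.reshape(features.shape[0], 1, features.shape[1])
print(one_hot.shape, x.shape)              # (4, 3) (4, 1, 2)
```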
/DEC_GenotypeClustering_Keras/genome.csv:
--------------------------------------------------------------------------------
1 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3 | CHB,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4 | CHB,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7 | CHB,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8 | ASW,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
9 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
10 | ASW,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0
11 | ASW,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
12 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
13 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
15 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
16 | CHB,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
17 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
19 | GBR,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
20 | GBR,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
21 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
22 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
24 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
26 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
28 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
32 | CLM,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
33 | CHB,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
34 | CLM,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
35 | CLM,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
36 | FIN,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
38 | FIN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
41 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
43 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
44 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
45 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
46 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
48 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
49 | ASW,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0
50 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
52 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
53 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
55 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
56 | CLM,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
57 | CHB,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
59 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
60 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
62 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
63 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
64 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
65 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
66 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
69 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
70 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
72 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
73 | CHB,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
74 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
75 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
77 | ASW,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
78 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
79 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
80 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
81 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
82 | CHB,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
83 | CLM,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
84 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
85 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
86 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
87 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88 | FIN,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
89 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
90 | ASW,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
91 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
92 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
94 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
95 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
96 | FIN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
97 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
98 | FIN,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
99 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
100 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
102 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
104 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
105 | FIN,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
106 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
107 | GBR,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
108 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
109 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
110 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
111 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
112 | FIN,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
113 | CHB,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
114 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
115 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
116 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
117 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
118 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
119 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
120 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
121 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
122 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
123 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
124 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
125 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
126 | CLM,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
127 | ASW,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
128 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
129 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
130 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
131 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
132 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
133 | CLM,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
134 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
136 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
137 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
138 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
139 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
140 | ASW,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
141 | FIN,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
142 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
143 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
144 | CLM,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
145 | FIN,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
146 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
147 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
148 | CLM,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
149 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
150 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
151 | ASW,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
152 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
153 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
154 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
155 | GBR,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
156 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
157 | CLM,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
158 | CHB,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
159 | ASW,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
160 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
161 | CLM,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
162 | CLM,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
163 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
164 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
165 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
166 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
167 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
168 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
169 | CLM,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
170 | ASW,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
171 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
172 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
173 | ASW,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
174 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
175 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
176 | CLM,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
177 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
178 | CHB,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
179 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
180 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
181 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
182 | CHB,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
183 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
184 | ASW,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
185 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
186 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
187 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
188 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
189 | GBR,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0
190 | FIN,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
191 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
192 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
193 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
194 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
195 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
196 | CLM,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
197 | CHB,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
198 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
199 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
200 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/output_1.txt:
--------------------------------------------------------------------------------
1 | Found 199 samples
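The count above matches the 199 data rows in genome.csv; a quick, illustrative way to reproduce it (the path is a placeholder):

import pandas as pd

# count the samples in the genotype matrix; path is a placeholder
df = pd.read_csv('genome.csv', header=None)
print("Found %d samples" % len(df))  # Found 199 samples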
--------------------------------------------------------------------------------
/PopulationClustering_v2/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>com.deri.sels</groupId>
7 |   <artifactId>PopulationClustering_v2</artifactId>
8 |   <version>0.1-SNAPSHOT</version>
9 |
10 |   <properties>
11 |     <spark.version>2.2.1</spark.version>
12 |     <scala.version>2.11.8</scala.version>
13 |     <h2o.version>3.16.0.2</h2o.version>
14 |     <sparkling.water.version>2.2.6</sparkling.water.version>  <!-- property name inferred; only the value survives -->
15 |     <adam.version>0.23.0</adam.version>  <!-- property name inferred; only the value survives -->
16 |   </properties>
17 |
18 |   <repositories>
19 |     <repository>
20 |       <id>scala-tools.org</id>
21 |       <name>Scala-tools Maven2 Repository</name>
22 |       <url>http://scala-tools.org/repo-releases</url>
23 |     </repository>
24 |     <repository>
25 |       <id>snapshots-repo</id>
26 |       <url>https://oss.sonatype.org/content/repositories/snapshots</url>
27 |       <releases>
28 |         <enabled>false</enabled>
29 |       </releases>
30 |       <snapshots>
31 |         <enabled>true</enabled>
32 |         <updatePolicy>daily</updatePolicy>
33 |       </snapshots>
34 |     </repository>
35 |   </repositories>
36 |
37 |   <dependencies>
38 |     <dependency>
39 |       <groupId>org.bdgenomics.adam</groupId>
40 |       <artifactId>adam-core_2.11</artifactId>
41 |       <version>0.23.0</version>
42 |     </dependency>
43 |     <dependency>
44 |       <groupId>ai.h2o</groupId>
45 |       <artifactId>sparkling-water-core_2.11</artifactId>
46 |       <version>2.2.6</version>
47 |     </dependency>
48 |     <dependency>
49 |       <groupId>ai.h2o</groupId>
50 |       <artifactId>sparkling-water-examples_2.11</artifactId>
51 |       <version>2.2.6</version>
52 |     </dependency>
53 |     <dependency>
54 |       <groupId>org.apache.directory.studio</groupId>
55 |       <artifactId>org.apache.commons.io</artifactId>
56 |       <version>2.4</version>
57 |     </dependency>
58 |     <dependency>
59 |       <groupId>org.apache.spark</groupId>
60 |       <artifactId>spark-core_2.11</artifactId>
61 |       <version>${spark.version}</version>
62 |     </dependency>
63 |     <dependency>
64 |       <groupId>ai.h2o</groupId>
65 |       <artifactId>h2o-core</artifactId>
66 |       <version>${h2o.version}</version>
67 |     </dependency>
68 |     <dependency>
69 |       <groupId>ai.h2o</groupId>
70 |       <artifactId>h2o-scala_2.11</artifactId>
71 |       <version>${h2o.version}</version>
72 |     </dependency>
73 |     <dependency>
74 |       <groupId>ai.h2o</groupId>
75 |       <artifactId>h2o-algos</artifactId>
76 |       <version>${h2o.version}</version>
77 |     </dependency>
78 |     <dependency>
79 |       <groupId>ai.h2o</groupId>
80 |       <artifactId>h2o-app</artifactId>
81 |       <version>${h2o.version}</version>
82 |     </dependency>
83 |     <dependency>
84 |       <groupId>ai.h2o</groupId>
85 |       <artifactId>h2o-persist-hdfs</artifactId>
86 |       <version>${h2o.version}</version>
87 |     </dependency>
88 |     <dependency>
89 |       <groupId>org.scala-lang</groupId>
90 |       <artifactId>scala-library</artifactId>
91 |       <version>${scala.version}</version>
92 |     </dependency>
93 |     <dependency>
94 |       <groupId>ai.h2o</groupId>
95 |       <artifactId>google-analytics-java</artifactId>
96 |       <version>1.1.2-H2O-CUSTOM</version>
97 |     </dependency>
98 |     <dependency>
99 |       <groupId>joda-time</groupId>
100 |       <artifactId>joda-time</artifactId>
101 |       <version>2.9.9</version>
102 |     </dependency>
103 |   </dependencies>
104 |
105 |   <build>
106 |     <plugins>
107 |       <plugin>
108 |         <groupId>org.apache.maven.plugins</groupId>
109 |         <artifactId>maven-eclipse-plugin</artifactId>
110 |         <version>2.9</version>
111 |         <configuration>
112 |           <downloadSources>true</downloadSources>  <!-- element names inferred from the surviving true/false values -->
113 |           <downloadJavadocs>false</downloadJavadocs>
114 |         </configuration>
115 |       </plugin>
116 |       <plugin>
117 |         <groupId>org.apache.maven.plugins</groupId>
118 |         <artifactId>maven-compiler-plugin</artifactId>
119 |         <version>3.5.1</version>
120 |         <configuration>
121 |           <source>${jdk.version}</source>
122 |           <target>${jdk.version}</target>
123 |         </configuration>
124 |       </plugin>
125 |       <plugin>
126 |         <artifactId>maven-shade-plugin</artifactId>
127 |         <version>2.4.3</version>
128 |         <executions>
129 |           <execution>
130 |             <phase>package</phase>
131 |             <goals>
132 |               <goal>shade</goal>
133 |             </goals>
134 |             <configuration>
135 |               <createDependencyReducedPom>false</createDependencyReducedPom>  <!-- element name inferred -->
136 |               <filters>
137 |                 <filter>
138 |                   <artifact>*:*</artifact>
139 |                   <excludes>
140 |                     <exclude>META-INF/*.SF</exclude>
141 |                     <exclude>META-INF/*.DSA</exclude>
142 |                     <exclude>META-INF/*.RSA</exclude>
143 |                   </excludes>
144 |                 </filter>
145 |               </filters>
146 |             </configuration>
147 |           </execution>
148 |         </executions>
149 |       </plugin>
150 |       <plugin>
151 |         <groupId>org.apache.maven.plugins</groupId>
152 |         <artifactId>maven-assembly-plugin</artifactId>
153 |         <version>2.4.1</version>
154 |         <configuration>
155 |           <descriptorRefs>
156 |             <descriptorRef>jar-with-dependencies</descriptorRef>
157 |           </descriptorRefs>
158 |           <archive>
159 |             <manifest>
160 |               <mainClass>org.fit.genomics.PopStratClassification</mainClass>
161 |             </manifest>
162 |             <manifestEntries>
163 |               <oozie.launcher.mapreduce.job.user.classpath.first>true</oozie.launcher.mapreduce.job.user.classpath.first>
164 |             </manifestEntries>
165 |           </archive>
166 |         </configuration>
167 |         <executions>
168 |           <execution>
169 |             <id>make-assembly</id>
170 |             <phase>package</phase>
171 |             <goals>
172 |               <goal>single</goal>
173 |             </goals>
174 |           </execution>
175 |         </executions>
176 |       </plugin>
177 |     </plugins>
178 |   </build>
179 | </project>
--------------------------------------------------------------------------------
/PopulationClustering_v2/results/train.csv/DEC_Genotype_Clustering.py:
--------------------------------------------------------------------------------
1 | from keras.datasets import mnist
2 | import numpy as np
3 | import pandas as pd
4 | np.random.seed(10)
5 |
6 | from time import time
7 | import numpy as np
8 | import keras.backend as K
9 | from keras.engine.topology import Layer, InputSpec
10 | from keras.layers import Dense, Input
11 | from keras.models import Model
12 | from keras.optimizers import RMSprop
13 | from keras import callbacks
14 | from keras.initializers import VarianceScaling
15 | from sklearn.cluster import KMeans
16 | from sklearn import metrics
17 | from sklearn.metrics.cluster import normalized_mutual_info_score
18 | from sklearn.metrics.cluster import adjusted_rand_score
19 | from sklearn.metrics import accuracy_score
20 | from sklearn import manifold
21 | import keras.layers.normalization as bn
22 |
23 | df1 = pd.read_csv('/home/asif/genome.csv', header=None)
24 | print(df1.head())
25 |
26 | label = df1[0]
27 | print(label.head())
28 |
29 | from sklearn import preprocessing
30 | le = preprocessing.LabelEncoder()
31 | lbl = le.fit(label)
32 | labelss = lbl.transform(label)
33 | labelDF = pd.DataFrame(labelss)
34 |
35 | #labelArr =
36 | print(labelDF.head())
37 |
38 | feature = df1.drop(0, axis=1)
39 | print(feature.head())
40 |
41 | from sklearn.preprocessing import MinMaxScaler
42 | scaler = MinMaxScaler()
43 | x1 = feature.iloc[:,1:]  # note: this also drops the first SNP column, leaving 52 of the 53 feature columns
44 | df_scaled = pd.DataFrame(scaler.fit_transform(x1), columns=x1.columns)
45 | df_scaled.head()
46 |
47 | y = labelss
48 | x = df_scaled.values
49 |
50 | print(y.shape)
51 | print(x.shape)
52 |
53 | print(np.isnan(np.min(x)))
54 |
55 | #y.shape
56 | #x.shape
57 | #print(x)
58 | #print(y)
59 |
60 | n_clusters = len(np.unique(y))
61 | print(n_clusters)
62 |
63 | kmeans = KMeans(n_clusters=n_clusters, n_init=5)
64 | y_pred_kmeans = kmeans.fit_predict(x)
65 |
66 | print(accuracy_score(y, y_pred_kmeans))  # note: k-means cluster IDs are arbitrary, so this raw accuracy is only meaningful after matching clusters to true labels
67 |
68 | dims = [x.shape[-1], 16, 16, 32, 5]
69 | init = VarianceScaling(scale=1. / 3., mode='fan_in', distribution='uniform')
70 | pretrain_optimizer = RMSprop(lr=0.001, rho=0.01, epsilon=None, decay=0.0)
71 | pretrain_epochs = 100
72 | batch_size = 32
73 | save_dir = 'result/'
74 |
75 | def autoencoder(dims, act='relu', init='glorot_uniform'):
76 | """
77 | Fully connected auto-encoder model, symmetric.
78 | Arguments:
79 | dims: list of number of units in each layer of encoder. dims[0] is input dim, dims[-1] is units in hidden layer.
80 | The decoder is symmetric with encoder. So number of layers of the auto-encoder is 2*len(dims)-1
81 | act: activation for the internal layers; not applied to the input, the hidden (embedding) layer, or the output layer
82 | return:
83 | (ae_model, encoder_model), Model of autoencoder and model of encoder
84 | """
85 | n_stacks = len(dims) - 1
86 | # input
87 | input_img = Input(shape=(dims[0],), name='input')
88 | x = input_img
89 | # internal layers in encoder
90 | for i in range(n_stacks-1):
91 | x = Dense(dims[i + 1], activation=act, kernel_initializer=init, name='encoder_%d' % i)(x)
92 | #bn.BatchNormalization(momentum=0.9, epsilon=1e-06, weights=None)
93 |
94 | # hidden layer
95 | encoded = Dense(dims[-1], kernel_initializer=init, name='encoder_%d' % (n_stacks - 1))(x) # hidden layer, features are extracted from here
96 | bn.BatchNormalization(momentum=0.9, epsilon=1e-06, weights=None)  # note: this constructs a BatchNormalization layer but never applies it to the graph, so it is a no-op
97 |
98 | x = encoded
99 | # internal layers in decoder
100 | for i in range(n_stacks-1, 0, -1):
101 | x = Dense(dims[i], activation=act, kernel_initializer=init, name='decoder_%d' % i)(x)
102 |
103 | # output
104 | x = Dense(dims[0], kernel_initializer=init, name='decoder_0')(x)
105 | bn.BatchNormalization(momentum=0.9, epsilon=1e-06, weights=None)  # note: constructed but never applied; a no-op as written
106 |
107 | decoded = x
108 | return Model(inputs=input_img, outputs=decoded, name='AE'), Model(inputs=input_img, outputs=encoded, name='encoder')
109 |
110 | autoencoder, encoder = autoencoder(dims, init=init)
111 | autoencoder.compile(optimizer=pretrain_optimizer, loss='mse')
112 | autoencoder.fit(x, x, batch_size=batch_size, epochs=pretrain_epochs) #, callbacks=cb)
113 | autoencoder.save_weights(save_dir + '/ThesisDEC_weights.h5')
114 |
115 | # reload the pretrained weights before attaching the clustering layer
116 | autoencoder.load_weights(save_dir + '/ThesisDEC_weights.h5')
117 |
118 |
119 | class ClusteringLayer(Layer):
120 | """
121 | Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the
122 | sample belonging to each cluster. The probability is calculated with student's t-distribution.
123 |
124 | # Example
125 | ```
126 | model.add(ClusteringLayer(n_clusters=10))
127 | ```
128 | # Arguments
129 | n_clusters: number of clusters.
130 | weights: list of Numpy arrays with shape `(n_clusters, n_features)` which represent the initial cluster centers.
131 | alpha: degrees of freedom parameter in Student's t-distribution. Default to 1.0.
132 | # Input shape
133 | 2D tensor with shape: `(n_samples, n_features)`.
134 | # Output shape
135 | 2D tensor with shape: `(n_samples, n_clusters)`.
136 | """
137 |
138 | def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
139 | if 'input_shape' not in kwargs and 'input_dim' in kwargs:
140 | kwargs['input_shape'] = (kwargs.pop('input_dim'),)
141 | super(ClusteringLayer, self).__init__(**kwargs)
142 | self.n_clusters = n_clusters
143 | self.alpha = alpha
144 | self.initial_weights = weights
145 | self.input_spec = InputSpec(ndim=2)
146 |
147 | def build(self, input_shape):
148 | assert len(input_shape) == 2
149 | input_dim = input_shape[1]
150 | self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim))
151 | self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform', name='clusters')  # pass shape by keyword so it is not mistaken for the name argument on newer Keras
152 | if self.initial_weights is not None:
153 | self.set_weights(self.initial_weights)
154 | del self.initial_weights
155 | self.built = True
156 |
157 | def call(self, inputs, **kwargs):
158 | """ student t-distribution, as same as used in t-SNE algorithm.
159 | Measure the similarity between embedded point z_i and centroid µ_j.
160 | q_ij = 1/(1+dist(x_i, µ_j)^2), then normalize it.
161 | q_ij can be interpreted as the probability of assigning sample i to cluster j.
162 | (i.e., a soft assignment)
163 | Arguments:
164 | inputs: the variable containing data, shape=(n_samples, n_features)
165 | Return:
166 | q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
167 | """
168 | q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
169 | q **= (self.alpha + 1.0) / 2.0
170 | q = K.transpose(K.transpose(q) / K.sum(q, axis=1)) # normalize so each sample's soft assignments sum to 1
171 | return q
172 |
173 | def compute_output_shape(self, input_shape):
174 | assert input_shape and len(input_shape) == 2
175 | return input_shape[0], self.n_clusters
176 |
177 | def get_config(self):
178 | config = {'n_clusters': self.n_clusters}
179 | base_config = super(ClusteringLayer, self).get_config()
180 | return dict(list(base_config.items()) + list(config.items()))
181 |
182 | clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
183 | model = Model(inputs=encoder.input, outputs=clustering_layer)
184 | model.compile(optimizer=RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0), loss='kld')
185 |
186 | kmeans = KMeans(n_clusters=n_clusters, n_init=n_clusters)
187 | y_pred = kmeans.fit_predict(encoder.predict(x))
188 |
189 | y_pred_last = np.copy(y_pred)
190 |
191 | model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])
192 |
193 | # computing an auxiliary target distribution
194 | def target_distribution(q):
195 | weight = q ** 2 / q.sum(0)
196 | return (weight.T / weight.sum(1)).T
197 |
198 | loss = 0
199 | index = 0
200 | maxiter = 20000
201 | update_interval = 500
202 | index_array = np.arange(x.shape[0])
203 |
204 | tol = 0.001 # tolerance threshold to stop training
205 |
206 | for ite in range(int(maxiter)):
207 | if ite % update_interval == 0:
208 | q = model.predict(x, verbose=0)
209 | p = target_distribution(q) # update the auxiliary target distribution p
210 |
211 | # evaluate the clustering performance
212 | y_pred = q.argmax(1)
213 | if y is not None:
214 | acc = np.round(accuracy_score(y, y_pred), 5)
215 | nmi = np.round(normalized_mutual_info_score(y, y_pred), 5)
216 | ari = np.round(adjusted_rand_score(y, y_pred), 5)
217 | loss = np.round(loss, 5)
218 | print('Iter %d: acc = %.5f, nmi = %.5f, ari = %.5f' % (ite, acc, nmi, ari), ' ; loss=', loss)
219 |
220 | # check stop criterion - model convergence
221 | delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
222 | y_pred_last = np.copy(y_pred)
223 | if ite > 0 and delta_label < tol:
224 | print('delta_label ', delta_label, '< tol ', tol)
225 | print('Reached tolerance threshold. Stopping training.')
226 | break
227 | idx = index_array[index * batch_size: min((index+1) * batch_size, x.shape[0])]
228 | model.train_on_batch(x=x[idx], y=p[idx])
229 | index = index + 1 if (index + 1) * batch_size <= x.shape[0] else 0
230 |
231 | model.save_weights(save_dir + '/Thesis_DEC_model_final.h5')
232 | model.load_weights(save_dir + '/Thesis_DEC_model_final.h5')
233 |
234 | # Eval.
235 | q = model.predict(x, verbose=0)
236 | p = target_distribution(q) # update the auxiliary target distribution p
237 |
238 | # evaluate the clustering performance
239 | y_pred = q.argmax(1)
240 | if y is not None:
241 | acc = np.round(accuracy_score(y, y_pred), 5)
242 | nmi = np.round(normalized_mutual_info_score(y, y_pred), 5)
243 | ari = np.round(adjusted_rand_score(y, y_pred), 5)
244 | loss = np.round(loss, 5)
245 | print('Acc = %.5f, nmi = %.5f, ari = %.5f' % (acc, nmi, ari), ' ; loss=', loss)
246 |
247 | import seaborn as sns
248 | import sklearn.metrics
249 | import matplotlib.pyplot as plt
250 | sns.set(font_scale=1.5)
251 | confusion_matrix = sklearn.metrics.confusion_matrix(y, y_pred)
252 |
253 | plt.figure(figsize=(12, 11))
254 | sns.heatmap(confusion_matrix, annot=True, fmt="d", annot_kws={"size": 15});
255 | plt.title("Confusion matrix", fontsize=25)
256 | plt.ylabel('True label', fontsize=25)
257 | plt.xlabel('Clustering label', fontsize=25)
258 | plt.show()
259 |
260 | def visualizeData(Z, labels, num_clusters, title):
261 | '''
262 | TSNE visualization of the points in latent space Z
263 | :param Z: Numpy array containing points in latent space in which clustering was performed
264 | :param labels: True labels - used for coloring points
265 | :param num_clusters: Total number of clusters
266 | :param title: filename where the plot should be saved
267 | :return: None - (side effect) saves clustering visualization plot in specified location
268 | '''
269 | labels = labels.astype(int)
270 | tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
271 | Z_tsne = tsne.fit_transform(Z)
272 | fig = plt.figure()
273 | plt.scatter(Z_tsne[:, 0], Z_tsne[:, 1], s=2, c=labels, cmap=plt.cm.get_cmap("jet", num_clusters))
274 | plt.colorbar(ticks=range(num_clusters))
275 | fig.savefig(title, dpi=fig.dpi)
276 |
277 | from sklearn.manifold import TSNE
278 | import seaborn as sn
279 | import matplotlib.pyplot as plt
280 |
281 | data_1000 = x[0:1000,:]  # note: these 1000-sample slices are not used below; t-SNE runs on the full x
282 | labels_1000 = y[0:1000]
283 |
284 | tsne_model = TSNE(n_components = 2, random_state = 0)
285 |
286 | tsne_data = tsne_model.fit_transform(x)
287 | #y_pred
288 |
289 | tsne_data = np.vstack((tsne_data.T, y)).T
290 | tsne_df = pd.DataFrame(data= tsne_data, columns= ("Dim_1","Dim_2","label"))
291 |
292 | sn.FacetGrid(tsne_df, hue= "label", size = 6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
293 | plt.show()
294 |
295 | visualizeData(x, y, n_clusters, "t_SNE_graph_original.png")
296 | visualizeData(x, y_pred, n_clusters, "t_SNE_graph_predicted.png")
297 |
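To make the two pieces of math above concrete (the soft assignment in ClusteringLayer.call and the auxiliary target in target_distribution), here is a small NumPy-only sketch of the same formulas on toy numbers, together with the Hungarian-matching accuracy that is the usual way to score a clustering against true labels; the raw accuracy_score calls above compare arbitrary cluster IDs directly to labels. All numbers are illustrative, and only numpy and scipy are assumed:

import numpy as np
from scipy.optimize import linear_sum_assignment

# toy embedded points (3 samples in a 2-D latent space) and 2 cluster centres
z = np.array([[0.0, 0.0], [1.0, 1.0], [0.9, 1.1]])
mu = np.array([[0.0, 0.0], [1.0, 1.0]])
alpha = 1.0

# soft assignment, as in ClusteringLayer.call:
# q_ij = (1 + ||z_i - mu_j||^2 / alpha)^(-(alpha+1)/2), then row-normalised
d2 = ((z[:, None, :] - mu[None, :, :]) ** 2).sum(axis=2)
q = (1.0 + d2 / alpha) ** (-(alpha + 1.0) / 2.0)
q = q / q.sum(axis=1, keepdims=True)

# auxiliary target, as in target_distribution:
# p_ij = (q_ij^2 / f_j) / sum_j'(q_ij'^2 / f_j') with f_j = sum_i q_ij;
# squaring sharpens confident assignments, which is the self-training
# signal the 'kld' loss pulls q towards
w = q ** 2 / q.sum(axis=0)
p = (w.T / w.sum(axis=1)).T

print(np.round(q, 3))  # each row sums to 1
print(np.round(p, 3))  # sharper than q

# clustering "accuracy" needs a cluster-to-label matching first; a standard
# choice is the Hungarian algorithm over the contingency table
def cluster_acc(y_true, y_pred):
    k = int(max(y_pred.max(), y_true.max())) + 1
    cost = np.zeros((k, k), dtype=int)
    for yt, yp in zip(y_true, y_pred):
        cost[yp, yt] += 1
    row, col = linear_sum_assignment(-cost)  # maximise matched counts
    return cost[row, col].sum() / y_true.size

y_true = np.array([0, 0, 1, 1, 1])
y_kmeans = np.array([1, 1, 0, 0, 2])
print(cluster_acc(y_true, y_kmeans))  # 0.8 after optimal matching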
--------------------------------------------------------------------------------
/PopulationClustering_v2/results/train.csv/LSTM_EthnicityPrediction.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import glob
4 | import numpy as np
5 | import sys
6 | from time import time
7 |
8 | from sklearn.preprocessing import LabelEncoder
9 | from sklearn.model_selection import train_test_split
10 | from sklearn.metrics import precision_recall_fscore_support
11 | from sklearn import metrics
12 |
13 | from keras.models import Sequential
14 | from keras.layers import LSTM, Dense, Dropout, Activation, Flatten
15 | from keras.callbacks import TensorBoard
16 | from keras.optimizers import RMSprop
17 | from keras.regularizers import l2
18 | from keras.callbacks import EarlyStopping
19 | from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
20 | from keras.utils import np_utils
21 |
22 | from keras import backend as K
23 | K.set_image_dim_ordering('tf')
24 | import matplotlib.pyplot as plt
25 | import itertools
26 |
27 | import numpy as np
28 | import pandas as pd
29 | np.random.seed(10)
30 |
31 | from time import time
32 | import numpy as np
33 | import keras.backend as K
34 | from keras.engine.topology import Layer, InputSpec
35 | from keras.layers import Dense, Input
36 | from keras.models import Model
37 | from keras.optimizers import RMSprop
38 | from keras import callbacks
39 | from keras.initializers import VarianceScaling
40 | from sklearn.cluster import KMeans
41 | from sklearn import metrics
42 | from sklearn.metrics.cluster import normalized_mutual_info_score
43 | from sklearn.metrics.cluster import adjusted_rand_score
44 | from sklearn.metrics import accuracy_score
45 | from sklearn import manifold
46 | import keras.layers.normalization as bn
47 |
48 | from sklearn.metrics import confusion_matrix
49 |
50 | df1 = pd.read_csv('/home/asif/genome.csv', header=None)
51 | print(df1.head())
52 |
53 | label = df1[0]
54 | print(label.head())
55 |
56 | from sklearn import preprocessing
57 | le = preprocessing.LabelEncoder()
58 | lbl = le.fit(label)
59 | labelss = lbl.transform(label)
60 | labelDF = pd.DataFrame(labelss)
61 |
62 | #labelArr =
63 | print(labelDF.head())
64 |
65 | feature = df1.drop(0, axis=1)
66 | print(feature.head())
67 |
68 | from sklearn.preprocessing import MinMaxScaler
69 | scaler = MinMaxScaler()
70 | x1 = feature.iloc[:,1:]  # note: this also drops the first SNP column, leaving 52 of the 53 feature columns
71 | df_scaled = pd.DataFrame(scaler.fit_transform(x1), columns=x1.columns)
72 | df_scaled.head()
73 |
74 | y = labelss
75 | x = df_scaled.values
76 |
77 | features = x
78 | labels = y
79 |
80 | def prepare_test_train_valid():
81 | # Hold out 25% of the data, then split the held-out part into test and validation halves
82 | train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size=0.25, random_state=100)
83 | test_x, valid_x, test_y, valid_y = train_test_split(test_x, test_y, test_size=0.50, random_state=100)
84 |
85 | return train_x, test_x, train_y, test_y, valid_x, valid_y
86 |
87 | def one_hot_encode(labels):
88 | n_labels = len(labels)
89 | n_unique_labels = len(np.unique(labels))
90 | one_hot_encode = np.zeros((n_labels,n_unique_labels))
91 | one_hot_encode[np.arange(n_labels), labels] = 1
92 | return one_hot_encode
93 |
94 | labels = one_hot_encode(labels)
95 |
96 | # Extract feature
97 | train_x, test_x, train_y, test_y, valid_x, valid_y = prepare_test_train_valid()
98 |
99 | print('X_train shape:', train_x.shape)
100 | print('Y_train shape:', train_y.shape)
101 |
102 | num_classes = 5
103 | data_dim = 52
104 | timesteps = 1
105 |
106 | train_x = np.reshape(train_x,(train_x.shape[0], 1, train_x.shape[1]))
107 | test_x = np.reshape(test_x,(test_x.shape[0], 1, test_x.shape[1]))
108 | valid_x = np.reshape(valid_x,(valid_x.shape[0], 1, valid_x.shape[1]))
109 |
110 | def plot_confusion_matrix(cm, classes,
111 | normalize=False,
112 | title='Confusion matrix',
113 | cmap=plt.cm.Blues):
114 | """
115 | This function prints and plots the confusion matrix.
116 | Normalization can be applied by setting `normalize=True`.
117 | """
118 | if normalize:
119 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
120 | print("Normalized confusion matrix")
121 | else:
122 | print('Confusion matrix, without normalization')
123 |
124 | print(cm)
125 |
126 | plt.imshow(cm, interpolation='nearest', cmap=cmap)
127 | plt.title(title)
128 | plt.colorbar()
129 | tick_marks = np.arange(len(classes))
130 | plt.xticks(tick_marks, classes, rotation=45)
131 | plt.yticks(tick_marks, classes)
132 |
133 | fmt = '.2f' if normalize else 'd'
134 | thresh = cm.max() / 2.
135 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
136 | plt.text(j, i, format(cm[i, j], fmt),
137 | horizontalalignment="center",
138 | color="white" if cm[i, j] > thresh else "black")
139 |
140 | plt.tight_layout()
141 | plt.ylabel('True label')
142 | plt.xlabel('Predicted label')
143 |
144 | def build_LSTM(): #OK
145 | # expected input data shape: (batch_size, timesteps, data_dim)
146 | model = Sequential()
147 | model.add(LSTM(32, return_sequences=True, input_shape=(timesteps, data_dim)))
148 |
149 | model.add(LSTM(24, return_sequences=True))
150 |
151 | #model.add(Dropout(0.2))
152 | model.add(LSTM(16, return_sequences=True))
153 | model.add(Dropout(0.2))
154 |
155 | # apply softmax to output
156 | model.add(Flatten())
157 | model.add(Dense(num_classes, activation='softmax'))
158 | return model
159 |
160 | def model_train_evaluate(model, number_epoch):
161 | rmsprop = RMSprop(lr=0.001, rho=0.01, epsilon=None, decay=0.0)
162 |
163 | # stop training early should the validation loss stop improving
164 | earlystop = EarlyStopping(monitor='val_loss', patience=1, verbose=0, mode='auto')
165 |
166 | #if model in ['RNN']:
167 | rnn_model = build_LSTM() #OK
168 | rnn_model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=rmsprop)
169 | tensorboardRNN = TensorBoard(log_dir="RNN_logs/{}".format(time()))
170 | rnn_model.fit(train_x, train_y, validation_data=(valid_x, valid_y), callbacks=[tensorboardRNN, earlystop], batch_size=128, epochs=int(number_epoch))
171 | print(rnn_model.summary())
172 |
173 | y_prob = rnn_model.predict(test_x)
174 | y_pred = y_prob.argmax(axis=-1)
175 | y_true = np.argmax(test_y, 1)
176 |
177 | roc = roc_auc_score(test_y, y_prob)
178 | print ("ROC:", round(roc,3))
179 |
180 | # evaluate the model
181 | score, accuracy = rnn_model.evaluate(test_x, test_y, batch_size=32)
182 | print("\nAccuracy = {:.2f}".format(accuracy))
183 |
184 | # the F-score gives a similar value to the accuracy score, but is useful for cross-checking
185 | p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')
186 | print ("Precision:", round(p,2))
187 | print ("Recall:", round(r,2))
188 | print ("F-Score:", round(f,2))
189 |
190 |
191 | # Compute confusion matrix
192 | cnf_matrix = confusion_matrix(y_true, y_pred)
193 | np.set_printoptions(precision=2)
194 |
195 | class_names = ["FIN", "GBR", "ASW", "CHB", "CLM"]
196 |
197 | # Plot non-normalized confusion matrix
198 | plt.figure()
199 | plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix: true vs predicted label')
200 | plt.show()
201 |
202 | model = build_LSTM()
203 | model_train_evaluate(model, 1000)  # note: model_train_evaluate() builds its own LSTM internally, so the model argument is effectively unused
204 | import gc; gc.collect()
205 |
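Since the reshape at lines 106-108 is what lets these flat 52-column SNP vectors feed an LSTM at all, a short shape walk-through may help. The arrays below are random stand-ins for the real features; sizes match the script, values are illustrative:

import numpy as np

num_samples, num_classes, data_dim, timesteps = 8, 5, 52, 1

# stand-ins for the scaled SNP matrix and integer population labels
x = np.random.rand(num_samples, data_dim)
y = np.random.randint(0, num_classes, size=num_samples)

# one-hot encoding, mirroring one_hot_encode() above
onehot = np.zeros((num_samples, num_classes))
onehot[np.arange(num_samples), y] = 1

# each sample becomes a length-1 "sequence" of 52 features, matching
# input_shape=(timesteps, data_dim) in build_LSTM(); with timesteps=1 the
# LSTM sees a single step, so it acts like a gated dense stack here
x_seq = x.reshape(num_samples, timesteps, data_dim)
print(x_seq.shape, onehot.shape)  # (8, 1, 52) (8, 5)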
--------------------------------------------------------------------------------
/PopulationClustering_v2/results/train.csv/genome.csv:
--------------------------------------------------------------------------------
1 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3 | CHB,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4 | CHB,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7 | CHB,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8 | ASW,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
9 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
10 | ASW,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0
11 | ASW,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
12 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
13 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
15 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
16 | CHB,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
17 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
19 | GBR,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
20 | GBR,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
21 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
22 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
24 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
26 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
28 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
32 | CLM,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
33 | CHB,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
34 | CLM,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
35 | CLM,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
36 | FIN,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
38 | FIN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
41 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
43 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
44 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
45 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
46 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
48 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
49 | ASW,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0
50 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
52 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
53 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
55 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
56 | CLM,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
57 | CHB,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
59 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
60 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
62 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
63 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
64 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
65 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
66 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
69 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
70 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
72 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
73 | CHB,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
74 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
75 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
77 | ASW,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
78 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
79 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
80 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
81 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
82 | CHB,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
83 | CLM,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
84 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
85 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
86 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
87 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88 | FIN,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
89 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
90 | ASW,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
91 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
92 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
94 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
95 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
96 | FIN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
97 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
98 | FIN,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
99 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
100 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
102 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
104 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
105 | FIN,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
106 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
107 | GBR,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
108 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
109 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
110 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
111 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
112 | FIN,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
113 | CHB,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
114 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
115 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
116 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
117 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
118 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
119 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
120 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
121 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
122 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
123 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
124 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
125 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
126 | CLM,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
127 | ASW,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
128 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
129 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
130 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
131 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
132 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
133 | CLM,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
134 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
136 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
137 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
138 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
139 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
140 | ASW,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
141 | FIN,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
142 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
143 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
144 | CLM,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
145 | FIN,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
146 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
147 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
148 | CLM,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
149 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
150 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
151 | ASW,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
152 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
153 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
154 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
155 | GBR,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
156 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
157 | CLM,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
158 | CHB,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
159 | ASW,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
160 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
161 | CLM,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
162 | CLM,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
163 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
164 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
165 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
166 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
167 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
168 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
169 | CLM,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
170 | ASW,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
171 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
172 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
173 | ASW,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
174 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
175 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
176 | CLM,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
177 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
178 | CHB,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
179 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
180 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
181 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
182 | CHB,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
183 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
184 | ASW,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
185 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
186 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
187 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
188 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
189 | GBR,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0
190 | FIN,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
191 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
192 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
193 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
194 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
195 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
196 | CLM,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
197 | CHB,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
198 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
199 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
200 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/results/train.csv/part-00000-2c4830b2-4c39-48fc-909d-4868a1164190-c000.csv:
--------------------------------------------------------------------------------
1 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3 | CHB,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4 | CHB,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7 | CHB,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
8 | ASW,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
9 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
10 | ASW,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0
11 | ASW,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
12 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
13 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
15 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
16 | CHB,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
17 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
19 | GBR,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
20 | GBR,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
21 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
22 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
23 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
24 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
26 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
28 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
29 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
32 | CLM,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
33 | CHB,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
34 | CLM,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
35 | CLM,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
36 | FIN,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
38 | FIN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
41 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
43 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
44 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
45 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
46 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
48 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
49 | ASW,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0
50 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
51 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
52 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
53 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
54 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
55 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
56 | CLM,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
57 | CHB,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
58 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
59 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
60 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
62 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
63 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
64 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
65 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
66 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
68 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
69 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
70 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
72 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
73 | CHB,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
74 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
75 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
77 | ASW,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
78 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
79 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
80 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
81 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
82 | CHB,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
83 | CLM,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
84 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
85 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
86 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
87 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88 | FIN,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
89 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
90 | ASW,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0
91 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
92 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
94 | ASW,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
95 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
96 | FIN,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
97 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
98 | FIN,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
99 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
100 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
102 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
104 | GBR,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
105 | FIN,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
106 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
107 | GBR,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
108 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1
109 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
110 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
111 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
112 | FIN,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
113 | CHB,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
114 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
115 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
116 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
117 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
118 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
119 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
120 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
121 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
122 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
123 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
124 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
125 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
126 | CLM,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
127 | ASW,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
128 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
129 | FIN,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
130 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
131 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
132 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
133 | CLM,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
134 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
135 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
136 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
137 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
138 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
139 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
140 | ASW,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1
141 | FIN,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
142 | CLM,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
143 | GBR,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
144 | CLM,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
145 | FIN,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
146 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
147 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
148 | CLM,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
149 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
150 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
151 | ASW,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0
152 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
153 | CLM,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
154 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
155 | GBR,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
156 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
157 | CLM,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
158 | CHB,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
159 | ASW,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
160 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
161 | CLM,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
162 | CLM,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
163 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
164 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
165 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
166 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
167 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
168 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
169 | CLM,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
170 | ASW,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
171 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
172 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
173 | ASW,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
174 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
175 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
176 | CLM,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
177 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
178 | CHB,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
179 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
180 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
181 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
182 | CHB,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
183 | FIN,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
184 | ASW,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
185 | GBR,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
186 | GBR,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
187 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
188 | GBR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
189 | GBR,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,1,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0
190 | FIN,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
191 | FIN,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
192 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
193 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0
194 | CHB,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
195 | CLM,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
196 | CLM,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
197 | CHB,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
198 | ASW,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
199 | CHB,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
200 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/src/main/scala/org/fit/genomics/PopGenomicsClassificationSpark.scala:
--------------------------------------------------------------------------------
1 | package org.fit.genomics
2 |
3 | import hex.FrameSplitter
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.h2o.H2OContext
6 | import org.bdgenomics.adam.rdd.ADAMContext._
7 | import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele }
8 | import water.{ Job, Key }
9 | import water.fvec.Frame
10 |
11 | import org.apache.spark.h2o._
12 | import java.io.File
13 | import java.io._
14 | import scala.collection.JavaConverters._
15 | import scala.collection.immutable.Range.inclusive
16 | import scala.io.Source
17 |
18 | import org.apache.spark.rdd.RDD
19 | import org.apache.spark.sql._
20 | import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType }
21 | import org.apache.spark.ml.feature.{ VectorAssembler, Normalizer }
22 | import org.apache.spark.ml.Pipeline
23 | import org.apache.spark.ml.feature.VectorIndexer
24 | import org.apache.spark.ml.feature.StringIndexer
25 | import org.apache.spark.ml.feature.PCA
26 |
27 | import org.apache.spark.ml.classification.{ RandomForestClassifier, RandomForestClassificationModel }
28 | import org.apache.spark.ml.evaluation.{ MulticlassClassificationEvaluator }
29 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
30 |
31 | object PopGenomicsClassificationSpark {
32 | def main(args: Array[String]): Unit = {
33 | val genotypeFile = "C:/Users/admin-karim/Downloads/genotypes.vcf"
34 | val panelFile = "C:/Users/admin-karim/Downloads/genotypes.panel"
35 |
36 | val spark:SparkSession = SparkSession
37 | .builder()
38 | .appName("PopStrat")
39 | .master("local[*]")
40 | .config("spark.sql.warehouse.dir", "C:/Exp/")
41 | .getOrCreate()
42 |
43 | val sc: SparkContext = spark.sparkContext
44 |
45 | // Create a set of the populations that we want to predict
46 | // Then create a map of sample ID -> population so that we can filter out the samples we're not interested in
47 | //val populations = Set("GBR", "ASW", "FIN", "CHB", "CLM")
48 | val populations = Set("FIN", "GBR", "ASW", "CHB", "CLM")
49 |
50 | def extract(file: String,
51 | filter: (String, String) => Boolean): Map[String, String] = {
52 | Source
53 | .fromFile(file)
54 | .getLines()
55 | .map(line => {
56 | val tokens = line.split(Array('\t', ' ')).toList
57 | tokens(0) -> tokens(1)
58 | })
59 | .toMap
60 | .filter(tuple => filter(tuple._1, tuple._2))
61 | }
62 |
63 | val panel: Map[String, String] = extract(
64 | panelFile,
65 | (sampleID: String, pop: String) => populations.contains(pop))
66 |
67 | // Load the ADAM genotypes from the parquet file(s)
68 | // Next, filter the genotypes so that we're left with only those in the populations we're interested in
69 | val allGenotypes: RDD[Genotype] = sc.loadGenotypes(genotypeFile).rdd
70 | //allGenotypes.adamParquetSave("output")
71 | val genotypes: RDD[Genotype] = allGenotypes.filter(genotype => {
72 | panel.contains(genotype.getSampleId)
73 | })
74 |
75 | // Convert the Genotype objects to our own SampleVariant objects to try and conserve memory
76 | case class SampleVariant(sampleId: String,
77 | variantId: Int,
78 | alternateCount: Int)
79 |
80 | def variantId(genotype: Genotype): String = {
81 | val name = genotype.getVariant.getContigName
82 | val start = genotype.getVariant.getStart
83 | val end = genotype.getVariant.getEnd
84 | s"$name:$start:$end"
85 | }
86 |
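   |     // Counts the ALT (non-reference) alleles in a call: for diploid genotypes
   |     // this yields 0, 1 or 2; note that NO_CALL alleles are counted as well.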
87 | def alternateCount(genotype: Genotype): Int = {
88 | genotype.getAlleles.asScala.count(_ != GenotypeAllele.REF)
89 | }
90 |
91 | def toVariant(genotype: Genotype): SampleVariant = {
92 | // Intern sample IDs as they will be repeated a lot
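   |       // Hashing the "name:start:end" string down to an Int keeps each record
   |       // small, at the (unhandled) risk of hash collisions between variant IDs.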
93 | new SampleVariant(genotype.getSampleId.intern(),
94 | variantId(genotype).hashCode(),
95 | alternateCount(genotype))
96 | }
97 |
98 | val variantsRDD: RDD[SampleVariant] = genotypes.map(toVariant)
99 | //println(s"Variant RDD: " + variantsRDD.first())
100 |
101 | // Group the variants by sample ID so we can process the variants sample-by-sample
102 | // Then get the total number of samples. This will be used to find variants that are missing for some samples.
103 | // Group the variants by variant ID and filter out those variants that are missing from some samples
104 | val variantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
105 | variantsRDD.groupBy(_.sampleId)
106 | val sampleCount: Long = variantsBySampleId.count()
107 | println("Found " + sampleCount + " samples")
108 |
109 | val variantsByVariantId: RDD[(Int, Iterable[SampleVariant])] =
110 | variantsRDD.groupBy(_.variantId).filter {
111 | case (_, sampleVariants) => sampleVariants.size == sampleCount
112 | }
113 |
114 | // Make a map of variant ID -> count of samples with an alternate count of greater than zero
115 | // then filter out those variants that are not in our desired frequency range. The objective here is simply to
116 | // reduce the number of dimensions in the data set to make it easier to train the model.
117 | // The specified range is fairly arbitrary and was chosen based on the fact that it includes a reasonable
118 | // number of variants, but not too many.
119 | val variantFrequencies: collection.Map[Int, Int] = variantsByVariantId
120 | .map {
121 | case (variantId, sampleVariants) =>
122 | (variantId, sampleVariants.count(_.alternateCount > 0))
123 | }
124 | .collectAsMap()
125 |
126 | val permittedRange = inclusive(11, 11)
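   |     // Only variants whose alternate allele occurs in exactly 11 samples survive
   |     // this filter; widening the range would keep more feature dimensions.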
127 | val filteredVariantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
128 | variantsBySampleId.map {
129 | case (sampleId, sampleVariants) =>
130 | val filteredSampleVariants = sampleVariants.filter(
131 | variant =>
132 | permittedRange.contains(
133 | variantFrequencies.getOrElse(variant.variantId, -1)))
134 | (sampleId, filteredSampleVariants)
135 | }
136 |
137 | //println(s"Filtered Variant RDD: " + filteredVariantsBySampleId.first())
138 |
139 | // Sort the variants for each sample ID. Each sample should now have the same number of sorted variants.
140 | // All items in the RDD should now have the same variants in the same order so we can just use the first
141 | // one to construct our header
142 | // Next construct the rows of our SchemaRDD from the variants
143 | val sortedVariantsBySampleId: RDD[(String, Array[SampleVariant])] =
144 | filteredVariantsBySampleId.map {
145 | case (sampleId, variants) =>
146 | (sampleId, variants.toArray.sortBy(_.variantId))
147 | }
148 |
149 | println(s"Sorted by Sample ID RDD: " + sortedVariantsBySampleId.first())
150 |
151 | val header = StructType(
152 | Seq(StructField("Region", StringType)) ++
153 | sortedVariantsBySampleId
154 | .first()
155 | ._2
156 | .map(variant => {
157 | StructField(variant.variantId.toString, IntegerType)
158 | }))
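   |     // The resulting schema is: Region (string) followed by one IntegerType
   |     // column per retained variant, each named by its hashed variant ID.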
159 |
160 | val rowRDD: RDD[Row] = sortedVariantsBySampleId.map {
161 | case (sampleId, sortedVariants) =>
162 | val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown"))
163 | val alternateCounts: Array[Int] = sortedVariants.map(_.alternateCount)
164 | Row.fromSeq(region ++ alternateCounts)
165 | }
166 |
167 | // Create the SchemaRDD from the header and rows and convert the SchemaRDD into a H2O dataframe
168 | val sqlContext = spark.sqlContext
169 | val schemaDF = sqlContext.createDataFrame(rowRDD, header)
170 | schemaDF.printSchema()
171 | schemaDF.show(10)
172 |
173 | val featureCols = schemaDF.columns.drop(1)
174 |
175 | val assembler = new VectorAssembler()
176 | .setInputCols(featureCols)
177 | .setOutputCol("features")
178 |
179 | val assembleDF = assembler.transform(schemaDF).select("features", "Region")
180 | assembleDF.show()
181 |
182 | /*
183 | val pca = new PCA()
184 | .setInputCol("features")
185 | .setOutputCol("pcaFeatures")
186 | .setK(50)
187 | .fit(assembleDF)
188 |
189 | val pcaDF = pca.transform(assembleDF).select("pcaFeatures", "Region").withColumnRenamed("pcaFeatures", "features")//.withColumnRenamed("Region", "label")
190 | pcaDF.show()
191 | *
192 | */
193 |
194 |
195 | val indexer = new StringIndexer()
196 | .setInputCol("Region")
197 | .setOutputCol("label")
198 |
199 | val indexedDF = indexer.fit(assembleDF).transform(assembleDF).select("features", "label")
200 | println("Indeexed: ")
201 | indexedDF.show(10)
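   |     // Example (commented out): to map numeric predictions back to region names
   |     // after scoring, keep the fitted StringIndexerModel and use IndexToString
   |     // (import org.apache.spark.ml.feature.IndexToString):
   |     //val labelModel = indexer.fit(assembleDF)
   |     //val labelConverter = new IndexToString()
   |     //  .setInputCol("prediction")
   |     //  .setOutputCol("predictedRegion")
   |     //  .setLabels(labelModel.labels)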
202 |
203 | val seed = 12345L
204 | val splits = indexedDF.randomSplit(Array(0.75, 0.25), seed)
205 | val (trainDF, testDF) = (splits(0), splits(1))
206 |
207 | trainDF.cache
208 | testDF.cache
209 |
210 | val rf = new RandomForestClassifier()
211 | .setLabelCol("label")
212 | .setFeaturesCol("features")
213 | .setSeed(1234567L)
214 |
215 | // Search through decision tree's maxDepth parameter for best model
216 | val paramGrid = new ParamGridBuilder()
217 | .addGrid(rf.maxDepth, 3 :: 5 :: 15 :: 20 :: 25 :: 30 :: Nil)
218 | .addGrid(rf.featureSubsetStrategy, "auto" :: "all" :: Nil)
219 | .addGrid(rf.impurity, "gini" :: "entropy" :: Nil)
220 | .addGrid(rf.maxBins, 3 :: 5 :: 10 :: 15 :: 25 :: 35 :: 45 :: Nil)
221 | .addGrid(rf.numTrees, 5 :: 10 :: 15 :: 20 :: 30 :: Nil)
222 | .build()
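   |     // This grid spans 6 * 2 * 2 * 7 * 5 = 840 parameter combinations; with the
   |     // 10-fold cross-validation below that is 8,400 model fits, so expect a long
   |     // runtime on a single machine.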
223 |
224 | val evaluator = new MulticlassClassificationEvaluator()
225 | .setLabelCol("label")
226 | .setPredictionCol("prediction")
227 |
228 | // Set up 10-fold cross validation
229 | val numFolds = 10
230 | val crossval = new CrossValidator()
231 | .setEstimator(rf)
232 | .setEvaluator(evaluator)
233 | .setEstimatorParamMaps(paramGrid)
234 | .setNumFolds(numFolds)
235 |
236 | val cvModel = crossval.fit(trainDF)
237 |
238 | // Save the workflow
239 | //cvModel.write.overwrite().save("model/RF_model_churn")
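   |     // Example (commented out): inspect the winning hyper-parameters, and reload
   |     // a previously saved model. CrossValidatorModel.load needs an extra import
   |     // of org.apache.spark.ml.tuning.CrossValidatorModel; the path is illustrative.
   |     //println("Best params: " + cvModel.bestModel.extractParamMap())
   |     //val sameModel = CrossValidatorModel.load("model/RF_model_churn")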
240 |
241 | val predictions = cvModel.transform(testDF)
242 | predictions.show(10)
243 |
244 | val metric = new MulticlassClassificationEvaluator()
245 | .setLabelCol("label")
246 | .setPredictionCol("prediction")
247 |
248 |     // Note: setMetricName mutates and returns this same evaluator instance, so
249 |     // binding its result to four separate vals would alias one object and end up
250 |     // evaluating the last metric ("f1") four times.
251 | 
252 |     // Compute the classification accuracy, precision, recall and F1 measure on
253 |     // the test data, setting the metric name just before each evaluation.
254 |     val accuracy = metric.setMetricName("accuracy").evaluate(predictions)
255 |     val precision = metric.setMetricName("weightedPrecision").evaluate(predictions)
256 |     val recall = metric.setMetricName("weightedRecall").evaluate(predictions)
257 |     val f1 = metric.setMetricName("f1").evaluate(predictions)
258 |
259 | // Print the performance metrics
260 | println("Accuracy = " + accuracy);
261 | println("Precision = " + precision)
262 | println("Recall = " + recall)
263 | println("F1 = " + f1)
264 | println(s"Test Error = ${1 - accuracy}")
265 |
266 | // Shutdown Spark cluster and H2O
267 | spark.stop()
268 | }
269 |
270 | }
271 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/src/main/scala/org/fit/genomics/PopStratClassification.scala:
--------------------------------------------------------------------------------
1 | package org.fit.genomics
2 |
3 | import java.io._
4 |
5 | import hex.FrameSplitter
6 | import hex.deeplearning.DeepLearning
7 | import hex.deeplearning.DeepLearningModel.DeepLearningParameters
8 | import hex.deeplearning.DeepLearningModel.DeepLearningParameters.Activation
9 | import org.apache.spark.SparkContext
10 | import org.apache.spark.h2o.H2OContext
11 | import org.apache.spark.rdd.RDD
12 | import org.apache.spark.sql._
13 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
14 |
15 | import org.bdgenomics.adam.rdd.ADAMContext._
16 | import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele}
17 | import water.{Job, Key}
18 | import water.support.ModelMetricsSupport
19 | import water.fvec.Frame
20 |
21 | import org.apache.spark.h2o._
22 | import java.io.File
23 |
24 | import htsjdk.samtools.ValidationStringency
25 |
26 | import _root_.hex.{ModelMetrics, ModelMetricsSupervised, ModelMetricsMultinomial}
27 |
28 | import scala.collection.JavaConverters._
29 | import scala.collection.immutable.Range.inclusive
30 | import scala.io.Source
31 |
32 | object PopStratClassification {
33 | def main(args: Array[String]): Unit = {
34 | val genotypeFile = "C:/Users/admin-karim/Downloads/ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf"
35 | val panelFile = "C:/Users/admin-karim/Downloads/genotypes.panel"
36 |
37 | val sparkSession: SparkSession =
38 | SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate()
39 | val sc: SparkContext = sparkSession.sparkContext
40 |
41 | // Create a set of the populations that we want to predict
42 | // Then create a map of sample ID -> population so that we can filter out the samples we're not interested in
43 | //val populations = Set("GBR", "ASW", "FIN", "CHB", "CLM")
44 | val populations = Set("FIN", "GBR", "ASW", "CHB", "CLM")
45 |
46 | def extract(file: String,
47 | filter: (String, String) => Boolean): Map[String, String] = {
48 | Source
49 | .fromFile(file)
50 | .getLines()
51 | .map(line => {
52 | val tokens = line.split(Array('\t', ' ')).toList
53 | tokens(0) -> tokens(1)
54 | })
55 | .toMap
56 | .filter(tuple => filter(tuple._1, tuple._2))
57 | }
58 |
59 | val panel: Map[String, String] = extract(
60 | panelFile,
61 | (sampleID: String, pop: String) => populations.contains(pop))
62 |
63 | // Load the ADAM genotypes from the parquet file(s)
64 | // Next, filter the genotypes so that we're left with only those in the populations we're interested in
65 | //val allGenotypes: RDD[Genotype] = sc.loadGenotypes(genotypeFile, stringency = ValidationStringency.SILENT).rdd
66 | val genotypes0 = sc.loadGenotypes("C:/Users/admin-karim/Downloads/VCF_files/1.vcf", stringency = ValidationStringency.SILENT)
67 |
68 | //val genotypes0 = sc.loadGenotypes("sample0.vcf")
69 | val genotypes1 = sc.loadGenotypes("C:/Users/admin-karim/Downloads/VCF_files/2.vcf")
70 | val union = genotypes0.union(genotypes1)
71 | val rdd: RDD[Genotype] = union.rdd
72 |
73 |     val allGenotypes: RDD[Genotype] = rdd // rdd is already an RDD[Genotype]; a second .rdd call would not compile
74 |
75 | //allGenotypes.adamParquetSave("output")
76 | val genotypesFiltered: RDD[Genotype] = allGenotypes.filter(genotype => {
77 | panel.contains(genotype.getSampleId)
78 | })
79 |
80 | // Convert the Genotype objects to our own SampleVariant objects to try and conserve memory
81 | case class SampleVariant(sampleId: String,
82 | variantId: Int,
83 | alternateCount: Int)
84 | def variantId(genotype: Genotype): String = {
85 | val name = genotype.getVariant.getContigName
86 | val start = genotype.getVariant.getStart
87 | val end = genotype.getVariant.getEnd
88 | s"$name:$start:$end"
89 | }
90 |
91 | def alternateCount(genotype: Genotype): Int = {
92 | genotype.getAlleles.asScala.count(_ != GenotypeAllele.REF)
93 | }
94 |
95 | def toVariant(genotype: Genotype): SampleVariant = {
96 | // Intern sample IDs as they will be repeated a lot
97 | new SampleVariant(genotype.getSampleId.intern(),
98 | variantId(genotype).hashCode(),
99 | alternateCount(genotype))
100 | }
101 |
102 | val variantsRDD: RDD[SampleVariant] = genotypesFiltered.map(toVariant)
103 | //println(s"Variant RDD: " + variantsRDD.first())
104 |
105 | // Group the variants by sample ID so we can process the variants sample-by-sample
106 | // Then get the total number of samples. This will be used to find variants that are missing for some samples.
107 | // Group the variants by variant ID and filter out those variants that are missing from some samples
108 | val variantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
109 | variantsRDD.groupBy(_.sampleId)
110 | val sampleCount: Long = variantsBySampleId.count()
111 | println("Found " + sampleCount + " samples")
112 |
113 | val writer_0 = new PrintWriter(new File("output_1.txt"))
114 | writer_0.write("Found " + sampleCount + " samples")
115 | //writer.write(s"Confusion Matrix"+ cm)
116 | //writer.write("Prediction Matrix"+ result)
117 | writer_0.close()
118 |
119 | val variantsByVariantId: RDD[(Int, Iterable[SampleVariant])] =
120 | variantsRDD.groupBy(_.variantId).filter {
121 | case (_, sampleVariants) => sampleVariants.size == sampleCount
122 | }
123 |
124 | // Make a map of variant ID -> count of samples with an alternate count of greater than zero
125 | // then filter out those variants that are not in our desired frequency range. The objective here is simply to
126 | // reduce the number of dimensions in the data set to make it easier to train the model.
127 | // The specified range is fairly arbitrary and was chosen based on the fact that it includes a reasonable
128 | // number of variants, but not too many.
129 | val variantFrequencies: collection.Map[Int, Int] = variantsByVariantId
130 | .map {
131 | case (variantId, sampleVariants) =>
132 | (variantId, sampleVariants.count(_.alternateCount > 0))
133 | }
134 | .collectAsMap()
135 |
136 | println(variantFrequencies.max)
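   |     // Note: Map.max compares (key, value) pairs and orders by key first, so this
   |     // prints the entry with the largest variant-ID hash, not the highest count.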
137 |
138 | val permittedRange = inclusive(11, 11)
139 | val filteredVariantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
140 | variantsBySampleId.map {
141 | case (sampleId, sampleVariants) =>
142 | val filteredSampleVariants = sampleVariants.filter(
143 | variant =>
144 | permittedRange.contains(
145 | variantFrequencies.getOrElse(variant.variantId, -1)))
146 | (sampleId, filteredSampleVariants)
147 | }
148 |
149 | //println(s"Filtered Variant RDD: " + filteredVariantsBySampleId.first())
150 |
151 | // Sort the variants for each sample ID. Each sample should now have the same number of sorted variants.
152 | // All items in the RDD should now have the same variants in the same order so we can just use the first
153 | // one to construct our header
154 | // Next construct the rows of our SchemaRDD from the variants
155 | val sortedVariantsBySampleId: RDD[(String, Array[SampleVariant])] =
156 | filteredVariantsBySampleId.map {
157 | case (sampleId, variants) =>
158 | (sampleId, variants.toArray.sortBy(_.variantId))
159 | }
160 |
161 | println(s"Sorted by Sample ID RDD: " + sortedVariantsBySampleId.first())
162 |
163 | val header = StructType(
164 | Seq(StructField("Region", StringType)) ++
165 | sortedVariantsBySampleId
166 | .first()
167 | ._2
168 | .map(variant => {
169 | StructField(variant.variantId.toString, IntegerType)
170 | }))
171 |
172 | val rowRDD: RDD[Row] = sortedVariantsBySampleId.map {
173 | case (sampleId, sortedVariants) =>
174 | val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown"))
175 | val alternateCounts: Array[Int] = sortedVariants.map(_.alternateCount)
176 | Row.fromSeq(region ++ alternateCounts)
177 | }
178 |
179 | // Create the SchemaRDD from the header and rows and convert the SchemaRDD into a H2O dataframe
180 | val sqlContext = sparkSession.sqlContext
181 | val schemaDF = sqlContext.createDataFrame(rowRDD, header)
182 | schemaDF.coalesce(1).write.format("com.databricks.spark.csv").csv("results/train.csv")
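   |     // coalesce(1) forces a single output partition, which is why results/train.csv
   |     // is a directory holding exactly one part-00000-*.csv file (included above).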
183 | //testData.write.format("com.databricks.spark.csv").csv("results/test.csv")
184 |
185 | schemaDF.show()
186 |
187 | val h2oContext = H2OContext.getOrCreate(sparkSession)
188 | import h2oContext.implicits._
189 |
190 | val dataFrame = h2oContext.asH2OFrame(schemaDF)
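   |     // H2O expects the response column to be a categorical (enum) vector for
   |     // multinomial classification, so the "Region" vec is swapped for a
   |     // categorical copy below and the original vec is removed.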
191 | dataFrame
192 | .replace(dataFrame.find("Region"),
193 | dataFrame.vec("Region").toCategoricalVec())
194 | .remove()
195 | dataFrame.update()
196 |
197 |     // Split the dataframe into 50% training, 30% test, and 20% validation data
198 | val frameSplitter = new FrameSplitter(
199 | dataFrame,
200 | Array(0.50, 0.30),
201 | Array("training", "test", "validation").map(Key.make[Frame]),
202 | null)
203 |
204 | water.H2O.submitTask(frameSplitter)
205 | val splits = frameSplitter.getResult
206 | val training = splits(0)
207 | val test = splits(1)
208 | val validation = splits(2)
209 |
210 | // Set the parameters for our deep learning model.
211 | val deepLearningParameters = new DeepLearningParameters()
212 | deepLearningParameters._train = training
213 | deepLearningParameters._valid = validation
214 | deepLearningParameters._response_column = "Region"
215 | deepLearningParameters._epochs = 2
216 | deepLearningParameters._l2 = 0.01
217 | deepLearningParameters._seed = 1234567
218 | deepLearningParameters._activation = Activation.RectifierWithDropout
219 | deepLearningParameters._hidden = Array[Int](32, 64, 128)
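   |     // Note: _epochs = 2 keeps this run short and is best treated as a smoke
   |     // test; train longer and tune _l2 and _hidden for a serious model.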
220 |
221 | // Train the deep learning model
222 | val deepLearning = new DeepLearning(deepLearningParameters)
223 | val deepLearningTrained = deepLearning.trainModel
224 | val trainedModel = deepLearningTrained.get
225 |
226 | val error = trainedModel.classification_error()
227 | println("Training Error: " + error)
228 |
229 | //val predict = trainedModel.score(test)('predict)
230 | //trainedModel.score(test)('predict)
231 |
232 | trainedModel.score(dataFrame)('predict)
233 | println(variantFrequencies.max)
234 |
235 |
236 | /*
237 | val h2oContext2 = H2OContext.getOrCreate(sc)
238 | import h2oContext2._
239 | import h2oContext2.implicits._
240 |
241 | val predictionsFromModel = asRDD[DoubleHolder](predict).collect.map(_.result.getOrElse(Double.NaN))
242 | predictionsFromModel.foreach{ value => println(value)}
243 | *
244 | */
245 |
246 | // Collect model metrics and evaluate model quality
247 | //val trainMetrics = ModelMetricsSupport.modelMetrics[ModelMetricsMultinomial](trainedModel, test)
248 | //val met = trainMetrics.cm()
249 | //println("Accuracy: "+ met.accuracy())
250 | //println("MSE: "+ trainMetrics.mse)
251 | //println("RMSE: "+ trainMetrics.rmse)
252 | //println("R2: " + trainMetrics.r2)
253 |
254 | // Shutdown Spark cluster and H2O
255 | h2oContext.stop(stopSparkContext = true)
256 | sparkSession.stop()
257 | }
258 | }
259 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/src/main/scala/org/fit/genomics/PopStratClustering.scala:
--------------------------------------------------------------------------------
1 | package org.fit.genomics
2 |
3 | import hex.FrameSplitter
4 | import org.apache.spark.{ SparkConf, SparkContext }
5 | import org.apache.spark.h2o.H2OContext
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.spark.sql._
8 | import org.bdgenomics.adam.rdd.ADAMContext._
9 | import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele}
10 | import org.apache.spark._
11 |
12 | import org.apache.spark.mllib.linalg.{ Vectors, Vector }
13 | import org.apache.spark.ml.clustering.KMeans
14 | import water.fvec.Frame
15 | import java.io._
16 |
17 |
18 | import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType }
19 |
20 | import org.apache.spark.ml.feature.{ VectorAssembler, Normalizer }
21 | import org.apache.spark.ml.Pipeline
22 | import org.apache.spark.ml.feature.VectorIndexer
23 | import org.apache.spark.ml.feature.PCA
24 |
25 | import water.{ Job, Key }
26 |
27 |
28 | import org.apache.spark.h2o._
29 | import java.io.File
30 | import water._
31 |
32 | import scala.collection.JavaConverters._
33 | import scala.collection.immutable.Range.inclusive
34 | import scala.io.Source
35 |
36 | object PopStratClusterings {
37 | def main(args: Array[String]): Unit = {
38 | val genotypeFile = "C:/Users/admin-karim/Downloads/genotypes.vcf"
39 | val panelFile = "C:/Users/admin-karim/Downloads/genotypes.panel"
40 |
41 | val sparkSession: SparkSession =
42 | SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate()
43 | val sc: SparkContext = sparkSession.sparkContext
44 |
45 | val populations = Set("GBR", "MXL", "ASW", "CHB", "CLM")
46 | def extract(file: String, filter: (String, String) => Boolean): Map[String, String] = {
47 | Source
48 | .fromFile(file)
49 | .getLines()
50 | .map(line => {
51 | val tokens = line.split(Array('\t', ' ')).toList
52 | tokens(0) -> tokens(1)
53 | })
54 | .toMap
55 | .filter(tuple => filter(tuple._1, tuple._2))
56 | }
57 |
58 | val panel: Map[String, String] = extract(
59 | panelFile,
60 | (sampleID: String, pop: String) => populations.contains(pop))
61 | val allGenotypes: RDD[Genotype] = sc.loadGenotypes(genotypeFile).rdd
62 | val genotypes: RDD[Genotype] = allGenotypes.filter(genotype => {
63 | panel.contains(genotype.getSampleId)
64 | })
65 |
66 | // Convert the Genotype objects to our own SampleVariant objects to try and conserve memory
67 | case class SampleVariant(sampleId: String,
68 | variantId: Int,
69 | alternateCount: Int)
70 |
71 | def variantId(genotype: Genotype): String = {
72 | val name = genotype.getVariant.getContigName
73 | val start = genotype.getVariant.getStart
74 | val end = genotype.getVariant.getEnd
75 | s"$name:$start:$end"
76 | }
77 |
78 | def alternateCount(genotype: Genotype): Int = {
79 | genotype.getAlleles.asScala.count(_ != GenotypeAllele.REF)
80 | }
81 |
82 | def toVariant(genotype: Genotype): SampleVariant = {
83 | // Intern sample IDs as they will be repeated a lot
84 | new SampleVariant(genotype.getSampleId.intern(),
85 | variantId(genotype).hashCode(),
86 | alternateCount(genotype))
87 | }
88 |
89 | val variantsRDD: RDD[SampleVariant] = genotypes.map(toVariant)
90 | val variantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
91 | variantsRDD.groupBy(_.sampleId)
92 | val sampleCount: Long = variantsBySampleId.count()
93 | println("Found " + sampleCount + " samples")
94 |
95 | val variantsByVariantId: RDD[(Int, Iterable[SampleVariant])] =
96 | variantsRDD.groupBy(_.variantId).filter {
97 | case (_, sampleVariants) => sampleVariants.size == sampleCount
98 | }
99 |
100 | val variantFrequencies: collection.Map[Int, Int] = variantsByVariantId
101 | .map {
102 | case (variantId, sampleVariants) =>
103 | (variantId, sampleVariants.count(_.alternateCount > 0))
104 | }
105 | .collectAsMap()
106 |
107 | val permittedRange = inclusive(11, 11) // keep only variants whose alternate allele occurs in exactly 11 samples, purely to reduce dimensionality
108 | val filteredVariantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
109 | variantsBySampleId.map {
110 | case (sampleId, sampleVariants) =>
111 | val filteredSampleVariants = sampleVariants.filter(
112 | variant =>
113 | permittedRange.contains(
114 | variantFrequencies.getOrElse(variant.variantId, -1)))
115 | (sampleId, filteredSampleVariants)
116 | }
117 |
118 | val sortedVariantsBySampleId: RDD[(String, Array[SampleVariant])] =
119 | filteredVariantsBySampleId.map {
120 | case (sampleId, variants) =>
121 | (sampleId, variants.toArray.sortBy(_.variantId))
122 | }
123 |
124 | println("Sorted by Sample ID RDD: " + sortedVariantsBySampleId.first())
125 |
126 | val header = StructType(
127 | Array(StructField("Region", StringType)) ++
128 | sortedVariantsBySampleId
129 | .first()
130 | ._2
131 | .map(variant => {
132 | StructField(variant.variantId.toString, IntegerType)
133 | }))
134 |
135 | val rowRDD: RDD[Row] = sortedVariantsBySampleId.map {
136 | case (sampleId, sortedVariants) =>
137 | val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown"))
138 | val alternateCounts: Array[Int] = sortedVariants.map(_.alternateCount)
139 | Row.fromSeq(region ++ alternateCounts)
140 | }
141 |
142 | //val featureVectorsRDD = rowRDD.map { x: Row => x.getAs[Vector](0) }
143 |
144 | // Create the SchemaRDD from the header and rows and convert the SchemaRDD into a Spark dataframe
145 | val sqlContext = sparkSession.sqlContext
146 | val schemaDF = sqlContext.createDataFrame(rowRDD, header).drop("Region")
147 | schemaDF.printSchema()
148 | schemaDF.show(10)
149 |
150 | val featureCols = schemaDF.columns
151 |
152 | val assembler = new VectorAssembler()
153 | .setInputCols(featureCols)
154 | .setOutputCol("features")
155 |
156 | val assembleDF = assembler.transform(schemaDF).select("features")
157 | assembleDF.show()
158 |
159 | val pca = new PCA()
160 | .setInputCol("features")
161 | .setOutputCol("pcaFeatures")
162 | .setK(50)
163 | .fit(assembleDF)
164 |
165 | val pcaDF = pca.transform(assembleDF).select("pcaFeatures").withColumnRenamed("pcaFeatures", "features")
166 | pcaDF.show()
167 |
168 | val maxK = 20
169 | for (k <- 2 to maxK) {
170 | // Train a k-means model with k clusters
171 | val kmeans = new KMeans().setK(k).setSeed(12345L)
172 | val model = kmeans.fit(pcaDF)
173 |
174 | // Evaluate the clustering by computing the Within Set Sum of Squared Errors; the "elbow" of this curve suggests a good k
175 | val WSSSE = model.computeCost(pcaDF)
176 | println("Within Set Sum of Squared Errors for k = " + k + " is " + WSSSE)
177 | }
178 | sparkSession.stop()
179 | }
180 | }
--------------------------------------------------------------------------------
/PopulationClustering_v2/src/main/scala/org/fit/genomics/featureExtractor.scala:
--------------------------------------------------------------------------------
1 | package org.fit.genomics
2 |
3 | import java.io._
4 |
5 | import hex.FrameSplitter
6 | import hex.deeplearning.DeepLearning
7 | import hex.deeplearning.DeepLearningModel.DeepLearningParameters
8 | import hex.deeplearning.DeepLearningModel.DeepLearningParameters.Activation
9 | import org.apache.spark.SparkContext
10 | import org.apache.spark.h2o.H2OContext
11 | import org.apache.spark.rdd.RDD
12 | import org.apache.spark.sql._
13 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
14 |
15 | import org.bdgenomics.adam.rdd.ADAMContext._
16 | import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele}
17 | import water.{Job, Key}
18 | import water.support.ModelMetricsSupport
19 | import water.fvec.Frame
20 |
21 | import org.apache.spark.h2o._
22 | import java.io.File
23 |
24 | import htsjdk.samtools.ValidationStringency
25 |
26 | import _root_.hex.{ModelMetrics, ModelMetricsSupervised, ModelMetricsMultinomial}
27 |
28 | import scala.collection.JavaConverters._
29 | import scala.collection.immutable.Range.inclusive
30 | import scala.io.Source
31 |
32 | object featureExtractor {
33 | def main(args: Array[String]): Unit = {
34 | val genotypeFile = "ALL.chrY.phase3_integrated_v2a.20130502.genotypes.vcf"
35 | val panelFile = "genotypes.panel"
36 |
37 | val sparkSession: SparkSession =
38 | SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate()
39 | val sc: SparkContext = sparkSession.sparkContext
40 |
41 | // Create a set of the populations that we want to predict
42 | // Then create a map of sample ID -> population so that we can filter out the samples we're not interested in
43 | //val populations = Set("GBR", "ASW", "FIN", "CHB", "CLM")
44 | val populations = Set("FIN", "GBR", "ASW", "CHB", "CLM")
45 |
46 | def extract(file: String,
47 | filter: (String, String) => Boolean): Map[String, String] = {
48 | Source
49 | .fromFile(file)
50 | .getLines()
51 | .map(line => {
52 | val tokens = line.split(Array('\t', ' ')).toList
53 | tokens(0) -> tokens(1)
54 | })
55 | .toMap
56 | .filter(tuple => filter(tuple._1, tuple._2))
57 | }
58 |
59 | val panel: Map[String, String] = extract(
60 | panelFile,
61 | (sampleID: String, pop: String) => populations.contains(pop))
62 |
63 | // Load the ADAM genotypes from the parquet file(s)
64 |
65 | // Next, filter the genotypes so that we're left with only those in the populations we're interested in
66 | //val allGenotypes: RDD[Genotype] = sc.loadGenotypes(genotypeFile, stringency = ValidationStringency.SILENT).rdd
67 |
68 | // If you want to generate an RDD from multiple VCF files, use a glob pattern:
69 | //val allGenotypes: RDD[Genotype] = sc.loadGenotypes("VCF_files/*.vcf", stringency = ValidationStringency.SILENT).rdd
70 |
71 | val genotypes0 = sc.loadGenotypes("VCF_files/1.vcf", stringency = ValidationStringency.SILENT)
72 |
73 | //val genotypes0 = sc.loadGenotypes("sample0.vcf")
74 | val genotypes1 = sc.loadGenotypes("VCF_files/2.vcf")
75 | val union = genotypes0.union(genotypes1)
76 | val rdd: RDD[Genotype] = union.rdd
77 |
78 | val allGenotypes: RDD[Genotype] = rdd
79 |
80 | //allGenotypes.adamParquetSave("output")
81 | val genotypesFiltered: RDD[Genotype] = allGenotypes.filter(genotype => {
82 | panel.contains(genotype.getSampleId)
83 | })
84 |
85 | // Convert the Genotype objects to our own SampleVariant objects to try and conserve memory
86 | case class SampleVariant(sampleId: String,
87 | variantId: Int,
88 | alternateCount: Int)
89 | def variantId(genotype: Genotype): String = {
90 | val name = genotype.getVariant.getContigName
91 | val start = genotype.getVariant.getStart
92 | val end = genotype.getVariant.getEnd
93 | s"$name:$start:$end"
94 | }
95 |
96 | def alternateCount(genotype: Genotype): Int = {
97 | genotype.getAlleles.asScala.count(_ != GenotypeAllele.REF)
98 | }
99 |
100 | def toVariant(genotype: Genotype): SampleVariant = {
101 | // Intern sample IDs as they will be repeated a lot
102 | new SampleVariant(genotype.getSampleId.intern(),
103 | variantId(genotype).hashCode(),
104 | alternateCount(genotype))
105 | }
106 |
107 | val variantsRDD: RDD[SampleVariant] = genotypesFiltered.map(toVariant)
108 | //println(s"Variant RDD: " + variantsRDD.first())
109 |
110 | // Group the variants by sample ID so we can process the variants sample-by-sample
111 | // Then get the total number of samples. This will be used to find variants that are missing for some samples.
112 | // Group the variants by variant ID and filter out those variants that are missing from some samples
113 | val variantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
114 | variantsRDD.groupBy(_.sampleId)
115 | val sampleCount: Long = variantsBySampleId.count()
116 | println("Found " + sampleCount + " samples")
117 |
118 | val writer_0 = new PrintWriter(new File("output_1.txt"))
119 | writer_0.write("Found " + sampleCount + " samples")
120 | //writer.write(s"Confusion Matrix"+ cm)
121 | //writer.write("Prediction Matrix"+ result)
122 | writer_0.close()
123 |
124 | val variantsByVariantId: RDD[(Int, Iterable[SampleVariant])] =
125 | variantsRDD.groupBy(_.variantId).filter {
126 | case (_, sampleVariants) => sampleVariants.size == sampleCount
127 | }
128 |
129 | // Make a map of variant ID -> count of samples with an alternate count of greater than zero
130 | // then filter out those variants that are not in our desired frequency range. The objective here is simply to
131 | // reduce the number of dimensions in the data set to make it easier to train the model.
132 | // The specified range is fairly arbitrary and was chosen based on the fact that it includes a reasonable
133 | // number of variants, but not too many.
134 | val variantFrequencies: collection.Map[Int, Int] = variantsByVariantId
135 | .map {
136 | case (variantId, sampleVariants) =>
137 | (variantId, sampleVariants.count(_.alternateCount > 0))
138 | }
139 | .collectAsMap()
140 |
141 | println(variantFrequencies.max)
142 |
143 | val permittedRange = inclusive(11, 11)
144 | val filteredVariantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
145 | variantsBySampleId.map {
146 | case (sampleId, sampleVariants) =>
147 | val filteredSampleVariants = sampleVariants.filter(
148 | variant =>
149 | permittedRange.contains(
150 | variantFrequencies.getOrElse(variant.variantId, -1)))
151 | (sampleId, filteredSampleVariants)
152 | }
153 |
154 | //println(s"Filtered Variant RDD: " + filteredVariantsBySampleId.first())
155 |
156 | // Sort the variants for each sample ID. Each sample should now have the same number of sorted variants.
157 | // All items in the RDD should now have the same variants in the same order so we can just use the first
158 | // one to construct our header
159 | // Next construct the rows of our SchemaRDD from the variants
160 | val sortedVariantsBySampleId: RDD[(String, Array[SampleVariant])] =
161 | filteredVariantsBySampleId.map {
162 | case (sampleId, variants) =>
163 | (sampleId, variants.toArray.sortBy(_.variantId))
164 | }
165 |
166 | println("Sorted by Sample ID RDD: " + sortedVariantsBySampleId.first())
167 |
168 | val header = StructType(
169 | Seq(StructField("Region", StringType)) ++
170 | sortedVariantsBySampleId
171 | .first()
172 | ._2
173 | .map(variant => {
174 | StructField(variant.variantId.toString, IntegerType)
175 | }))
176 |
177 | val rowRDD: RDD[Row] = sortedVariantsBySampleId.map {
178 | case (sampleId, sortedVariants) =>
179 | val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown"))
180 | val alternateCounts: Array[Int] = sortedVariants.map(_.alternateCount)
181 | Row.fromSeq(region ++ alternateCounts)
182 | }
183 |
184 | // Create the Spark DataFrame from the header and rows (written out as CSV below; no H2O frame is involved here)
185 | val sqlContext = sparkSession.sqlContext
186 | val schemaDF = sqlContext.createDataFrame(rowRDD, header)
187 |
188 | // Write the resultant DataFrame as CSV file to be used by Keras-based DEC algorithm
189 | schemaDF.coalesce(1).write.format("com.databricks.spark.csv").csv("results/train.csv")
190 | //testData.write.format("com.databricks.spark.csv").csv("results/test.csv")
191 |
192 | // Shut down the Spark session (this script never creates an H2OContext, so there is no H2O cluster to stop)
193 | sparkSession.stop()
194 |
195 | }
196 | }
197 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/target/classes/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Built-By: admin-karim
3 | Build-Jdk: 1.8.0_171
4 | Created-By: Maven Integration for Eclipse
5 |
6 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/target/classes/META-INF/maven/com.deri.sels/PopulationClustering_v2/pom.properties:
--------------------------------------------------------------------------------
1 | #Generated by Maven Integration for Eclipse
2 | #Fri Aug 17 13:58:22 CEST 2018
3 | version=0.1-SNAPSHOT
4 | groupId=com.deri.sels
5 | m2e.projectName=PopulationClustering_v2
6 | m2e.projectLocation=C\:\\Users\\admin-karim\\Downloads\\WS\\PopulationClustering_v2
7 | artifactId=PopulationClustering_v2
8 |
--------------------------------------------------------------------------------
/PopulationClustering_v2/target/classes/META-INF/maven/com.deri.sels/PopulationClustering_v2/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |   <groupId>com.deri.sels</groupId>
5 |   <artifactId>PopulationClustering_v2</artifactId>
6 |   <version>0.1-SNAPSHOT</version>
7 |
8 |   <properties>
9 |     <spark.version>2.2.1</spark.version>
10 |     <scala.version>2.11.8</scala.version>
11 |     <h2o.version>3.16.0.2</h2o.version>
12 |     <sparkling.water.version>2.2.6</sparkling.water.version>
13 |     <adam.version>0.23.0</adam.version>
14 |   </properties>
15 |
16 |   <repositories>
17 |     <repository>
18 |       <id>scala-tools.org</id>
19 |       <name>Scala-tools Maven2 Repository</name>
20 |       <url>http://scala-tools.org/repo-releases</url>
21 |     </repository>
22 |     <repository>
23 |       <id>snapshots-repo</id>
24 |       <url>https://oss.sonatype.org/content/repositories/snapshots</url>
25 |       <releases>
26 |         <enabled>false</enabled>
27 |       </releases>
28 |       <snapshots>
29 |         <enabled>true</enabled>
30 |         <updatePolicy>daily</updatePolicy>
31 |       </snapshots>
32 |     </repository>
33 |   </repositories>
34 |
35 |   <dependencies>
36 |     <dependency>
37 |       <groupId>org.bdgenomics.adam</groupId>
38 |       <artifactId>adam-core_2.11</artifactId>
39 |       <version>0.23.0</version>
40 |     </dependency>
41 |     <dependency>
42 |       <groupId>ai.h2o</groupId>
43 |       <artifactId>sparkling-water-core_2.11</artifactId>
44 |       <version>2.2.6</version>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>ai.h2o</groupId>
48 |       <artifactId>sparkling-water-examples_2.11</artifactId>
49 |       <version>2.2.6</version>
50 |     </dependency>
51 |     <dependency>
52 |       <groupId>org.apache.directory.studio</groupId>
53 |       <artifactId>org.apache.commons.io</artifactId>
54 |       <version>2.4</version>
55 |     </dependency>
56 |     <dependency>
57 |       <groupId>org.apache.spark</groupId>
58 |       <artifactId>spark-core_2.11</artifactId>
59 |       <version>${spark.version}</version>
60 |     </dependency>
61 |     <dependency>
62 |       <groupId>ai.h2o</groupId>
63 |       <artifactId>h2o-core</artifactId>
64 |       <version>${h2o.version}</version>
65 |     </dependency>
66 |     <dependency>
67 |       <groupId>ai.h2o</groupId>
68 |       <artifactId>h2o-scala_2.11</artifactId>
69 |       <version>${h2o.version}</version>
70 |     </dependency>
71 |     <dependency>
72 |       <groupId>ai.h2o</groupId>
73 |       <artifactId>h2o-algos</artifactId>
74 |       <version>${h2o.version}</version>
75 |     </dependency>
76 |     <dependency>
77 |       <groupId>ai.h2o</groupId>
78 |       <artifactId>h2o-app</artifactId>
79 |       <version>${h2o.version}</version>
80 |     </dependency>
81 |     <dependency>
82 |       <groupId>ai.h2o</groupId>
83 |       <artifactId>h2o-persist-hdfs</artifactId>
84 |       <version>${h2o.version}</version>
85 |     </dependency>
86 |     <dependency>
87 |       <groupId>org.scala-lang</groupId>
88 |       <artifactId>scala-library</artifactId>
89 |       <version>${scala.version}</version>
90 |     </dependency>
91 |     <dependency>
92 |       <groupId>ai.h2o</groupId>
93 |       <artifactId>google-analytics-java</artifactId>
94 |       <version>1.1.2-H2O-CUSTOM</version>
95 |     </dependency>
96 |     <dependency>
97 |       <groupId>joda-time</groupId>
98 |       <artifactId>joda-time</artifactId>
99 |       <version>2.9.9</version>
100 |     </dependency>
101 |   </dependencies>
102 |
103 |   <build>
104 |     <plugins>
105 |       <plugin>
106 |         <groupId>org.apache.maven.plugins</groupId>
107 |         <artifactId>maven-eclipse-plugin</artifactId>
108 |         <version>2.9</version>
109 |         <configuration>
110 |           <downloadSources>true</downloadSources>
111 |           <downloadJavadocs>false</downloadJavadocs>
112 |         </configuration>
113 |       </plugin>
114 |       <plugin>
115 |         <groupId>org.apache.maven.plugins</groupId>
116 |         <artifactId>maven-compiler-plugin</artifactId>
117 |         <version>3.5.1</version>
118 |         <configuration>
119 |           <source>${jdk.version}</source>
120 |           <target>${jdk.version}</target>
121 |         </configuration>
122 |       </plugin>
123 |       <plugin>
124 |         <artifactId>maven-shade-plugin</artifactId>
125 |         <version>2.4.3</version>
126 |         <executions>
127 |           <execution>
128 |             <phase>package</phase>
129 |             <goals>
130 |               <goal>shade</goal>
131 |             </goals>
132 |             <configuration>
133 |               <shadedArtifactAttached>false</shadedArtifactAttached>
134 |               <filters>
135 |                 <filter>
136 |                   <artifact>*:*</artifact>
137 |                   <excludes>
138 |                     <exclude>META-INF/*.SF</exclude>
139 |                     <exclude>META-INF/*.DSA</exclude>
140 |                     <exclude>META-INF/*.RSA</exclude>
141 |                   </excludes>
142 |                 </filter>
143 |               </filters>
144 |             </configuration>
145 |           </execution>
146 |         </executions>
147 |       </plugin>
148 |       <plugin>
149 |         <groupId>org.apache.maven.plugins</groupId>
150 |         <artifactId>maven-assembly-plugin</artifactId>
151 |         <version>2.4.1</version>
152 |         <configuration>
153 |           <descriptorRefs>
154 |             <descriptorRef>jar-with-dependencies</descriptorRef>
155 |           </descriptorRefs>
156 |           <archive>
157 |             <manifest>
158 |               <mainClass>org.fit.genomics.PopStratClassification</mainClass>
159 |             </manifest>
160 |             <manifestEntries>
161 |               <oozie.launcher.mapreduce.job.user.classpath.first>true</oozie.launcher.mapreduce.job.user.classpath.first>
162 |             </manifestEntries>
163 |           </archive>
164 |         </configuration>
165 |         <executions>
166 |           <execution>
167 |             <id>make-assembly</id>
168 |             <phase>package</phase>
169 |             <goals>
170 |               <goal>single</goal>
171 |             </goals>
172 |           </execution>
173 |         </executions>
174 |       </plugin>
175 |     </plugins>
176 |   </build>
177 | </project>
--------------------------------------------------------------------------------
/PopulationClustering_v2/target/maven-archiver/pom.properties:
--------------------------------------------------------------------------------
1 | #Generated by Maven
2 | #Thu Aug 02 01:47:00 CEST 2018
3 | version=0.1-SNAPSHOT
4 | groupId=com.deri.sels
5 | artifactId=PopulationClustering_v2
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repository contains the implementation of our papers "Recurrent Deep Embedding Networks for Genotype Clustering and Ethnicity Prediction" and "Convolutional Embedded Networks for Population Scale Clustering and Bio-ancestry Inferencing". The former is available as a pre-print on arXiv (https://arxiv.org/pdf/1805.12218.pdf). The latter has been submitted to IEEE/ACM Transactions on Computational Biology and Bioinformatics and is under review.
2 |
3 | This repo contains two different implementations: i) Deep Embedding Networks (DEC) and Recurrent Deep Embedding Networks (CDEC) in Keras, and ii) Spark and H2O implementations of our paper titled "Recurrent Deep Embedding Networks for Genotype Clustering and Ethnicity Prediction".
4 |
5 | ## Implementation details
6 | The proof of concept of our approach is implemented in Spark, ADAM, and Keras. In particular, for scalable and faster preprocessing of the huge number of genetic variants across all the chromosomes (i.e. 870 GB of data), we used ADAM and Spark to convert the genetic variants from VCF format into a Spark DataFrame. We then convert the Spark DataFrame into NumPy arrays. Finally, we use Keras to implement the Conv-LSTM and CDEC networks for population-scale clustering and ancestry inference, respectively.
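
As a rough illustration of the first half of this pipeline (VCF to per-sample variant features), here is a condensed, self-contained Scala sketch pared down from featureExtractor.scala in this repo; the input path and object name are placeholders, and the DataFrame-to-NumPy/Keras half is not shown:

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele }
import scala.collection.JavaConverters._

object FeatureSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate()

    // Load genotypes from a VCF file with ADAM (placeholder path)
    val genotypes: RDD[Genotype] = spark.sparkContext.loadGenotypes("VCF_files/1.vcf").rdd

    // One record per (sample, variant): count the alleles that differ from the reference
    val features = genotypes.map { g =>
      val v = g.getVariant
      val variantId = s"${v.getContigName}:${v.getStart}:${v.getEnd}"
      val altCount = g.getAlleles.asScala.count(_ != GenotypeAllele.REF)
      (g.getSampleId.intern(), variantId, altCount)
    }

    features.take(5).foreach(println)
    spark.stop()
  }
}
```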
7 |
8 | Experiments were carried out on a computing cluster with 32 cores running 64-bit Ubuntu 14.04, with a software stack consisting of Apache Spark v2.3.0, H2O v3.14.0.1, Sparkling Water v1.2.5, ADAM v0.22.0, and Keras v2.0.9 with the TensorFlow backend. We compare our approach with the state-of-the-art such as ADMIXTURE and VariationSpark.
9 |
10 | ### CDEC implementation in Python with Keras
11 | Refer to https://github.com/rezacsedu/Recurrent-Deep-Embedding-Networks/tree/master/CDEC for more details. Network training was carried out on an Nvidia TitanX GPU with CUDA and cuDNN enabled to make the overall pipeline faster.
12 |
13 | #### Step 1: Feature extraction using Scala, Adam, and Spark
14 | For this, first, download the VCF files (containing the variants) and the panel file (containing the labels) from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/.
15 |
16 | Then go to https://github.com/rezacsedu/VariationDEC/tree/master/PopulationClustering_v2 and use featureExtractor.scala
17 | to extract the features and save them as a DataFrame in CSV format, to be used by the Keras-based DEC.
18 |
19 | For this, make sure that you've configured Spark correctly on your machine. Alternatively, execute this script as a standalone Scala project from Eclipse or IntelliJ IDEA.
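
As a rough example, assuming the jar-with-dependencies produced by the Maven assembly plugin configured in pom.xml, a run might look like `spark-submit --class org.fit.genomics.featureExtractor target/PopulationClustering_v2-0.1-SNAPSHOT-jar-with-dependencies.jar` (the exact jar name depends on your build, and the VCF/panel paths inside the script should be adjusted first).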
20 |
21 | #### Step 2: This is the CDEC part in Keras
22 | Go to https://github.com/rezacsedu/Recurrent-Deep-Embedding-Networks/tree/master/CDEC. There you will find several Python scripts and a sample genetic-variant feature file in CSV format for clustering and classification, respectively.
23 |
24 | - genome.csv: the sample genetic-variant features
25 | - customlayers.py: creates the custom clustering layer in Keras
26 | - keras_unpooling.py: performs the unpooling operation for the Conv autoencoder part of the network
27 | - misc.py: contains the data-preparation helper modules
28 | - network.py: builds the CDEC network for the clustering
29 | - main.py: the main class that encapsulates all the steps.
30 |
33 | #### Acknowledgement
34 | This implementation is partly based on https://github.com/elieJalbout/Clustering-with-Deep-learning.
35 |
36 | ### DEC implementation in Python
37 | A modified version of the Keras-based DEC implementation (https://github.com/XifengGuo/DEC-keras) of the algorithm proposed by Xie et al. is used in our approach. Network training was carried out on an Nvidia TitanX GPU with CUDA and cuDNN enabled to make the overall pipeline faster.
38 |
39 | #### Step 1: Feature extraction using Scala, Adam and Spark
40 | For this, first, download the VCF files (containing the variants) and the panel file (containing the labels) from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/.
41 |
42 | Then go to https://github.com/rezacsedu/VariationDEC/tree/master/PopulationClustering_v2 and use featureExtractor.scala
43 | to extract the features and save them as a DataFrame in CSV format, to be used by the Keras-based DEC.
44 |
45 | For this, make sure that you've configured Spark correctly on your machine. Alternatively, execute this script as a standalone Scala project from Eclipse or IntelliJ IDEA.
46 |
47 | #### Step 2: This is the DEC part in Keras/Python
48 | Go to https://github.com/rezacsedu/VariationDEC/tree/master/DEC_GenotypeClustering_Keras. There you will find two Python scripts and a sample genetic-variant feature file in CSV format for clustering and classification, respectively.
49 |
50 | - genome.csv: the sample genetic-variant features
51 | - DEC_Genotype_Clustering.py: for the clustering
52 | - LSTM_EthnicityPrediction.py: for the classification
53 |
54 | ### Spark and H2O implementation in Scala
55 | For this, first download the VCF files (containing the variants) and the panel file (containing the labels) from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/. Then go to https://github.com/rezacsedu/VariationDEC/tree/master/PopulationClustering_v2, where you'll find the Scala scripts listed below:
56 |
57 | - PopGenomicsClassificationSpark.scala: the Spark implementation of ethnicity prediction
58 | - PopStratClassification.scala: the H2O implementation of ethnicity prediction
59 | - PopStratClustering.scala: the Spark implementation of genotype clustering using K-means (with PCA for dimensionality reduction)
60 |
61 | For this, make sure that you've configured Spark and ADAM (see https://github.com/bigdatagenomics/adam) correctly on your machine. Alternatively, execute these scripts as standalone Scala projects from Eclipse or IntelliJ IDEA. A condensed sketch of the clustering step follows.
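
For orientation, the clustering script assembles the numeric variant columns into a single feature vector, projects it onto 50 principal components, and then sweeps k from 2 to 20, printing the Within Set Sum of Squared Errors (WSSSE) so that the "elbow" can be spotted. Below is a condensed sketch of that core, assuming `schemaDF` holds only the numeric variant columns (with the Region label already dropped, as in PopStratClustering.scala):

```scala
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{ PCA, VectorAssembler }
import org.apache.spark.sql.DataFrame

// Condensed from PopStratClustering.scala: PCA down to 50 components,
// then an elbow sweep over k using WSSSE.
def elbowSweep(schemaDF: DataFrame, maxK: Int = 20): Unit = {
  val assembled = new VectorAssembler()
    .setInputCols(schemaDF.columns)
    .setOutputCol("features")
    .transform(schemaDF)
    .select("features")

  val pcaDF = new PCA()
    .setInputCol("features")
    .setOutputCol("pcaFeatures")
    .setK(50)
    .fit(assembled)
    .transform(assembled)
    .select("pcaFeatures")
    .withColumnRenamed("pcaFeatures", "features")

  for (k <- 2 to maxK) {
    val model = new KMeans().setK(k).setSeed(12345L).fit(pcaDF)
    println(s"WSSSE for k = $k is ${model.computeCost(pcaDF)}")
  }
}
```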
62 |
63 | ### Citation request
64 | @inproceedings{karim2018recurrent,
65 |   title={Recurrent Deep Embedding Networks for Genotype Clustering and Ethnicity Prediction},
66 |   author={Karim, Md and Cochez, Michael and Beyan, Oya Deniz and Zappa, Achille and Sahay, Ratnesh and Decker, Stefan and Rebholz-Schuhmann, Dietrich and others},
67 |   booktitle={arXiv preprint arXiv:1805.12218},
68 |   year={2018}
69 | }
70 |
71 | ### Contributing
72 | For any questions, feel free to open an issue or contact us at rezaul.karim@rwth-aachen.de.
73 |
--------------------------------------------------------------------------------