├── Data
│   ├── aid_disease_mapping.csv
│   ├── aid_target_mapping.csv
│   ├── all_molecular_data.zip
│   └── data_reference_list.txt
├── LICENSE
├── LICENSE.cczero
├── LICENSE.mit
├── README.md
├── clustering.py
├── correlation.py
├── fingerprint_extraction.py
├── manual_tagging.py
├── molecular_data_cleaning.py
├── ner.py
├── preprocessing.py
├── splitting.py
└── training.py
/Data/aid_disease_mapping.csv: --------------------------------------------------------------------------------
1 | AID,cancer,nervous System,immune system,cardiovascular,toxicity,obesity,virus,diabetes,metabolic disorders,bacteria,parasite,epigenetics_genetics,pulmonary,infection,aging,fungal 2 | activity_1554,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 3 | activity_2732,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 4 | activity_1085,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 5 | activity_1236,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 6 | activity_1274,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0 7 | activity_781,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 8 | activity_422,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 9 | activity_1224905,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 10 | activity_624256,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 11 | activity_588358,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 12 | activity_1346378,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0 13 | activity_743266,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0 14 | activity_624417,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 15 | activity_624418,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 16 | activity_652104,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 17 | activity_602310,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 18 | activity_588579,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 19 | activity_504847,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0 20 | activity_651635,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 21 | activity_743255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 22 | activity_588591,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 23 | activity_588590,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 24 | activity_504845,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 25 | activity_504327,1,0,1,1,0,1,0,1,0,0,0,0,1,0,0,0 26 | activity_504648,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 27 | activity_602313,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 28 | activity_624172,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 29 | activity_624246,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 30 | activity_652106,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 31 | activity_2676,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0 32 | activity_652025,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0 33 | activity_720504,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 34 | activity_720542,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 35 | activity_651725,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 36 | activity_651768,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 37 | activity_504444,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 38 | activity_743279,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0 39 | activity_651644,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 40 | activity_651550,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 41 | activity_1259400,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 42 | activity_720551,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 43 | activity_720553,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 44 | activity_651965,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 45 | activity_540263,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 46 | activity_485314,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 47 | activity_651724,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 48 | activity_488837,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 49 | activity_485353,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 50 | activity_540303,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 51 | activity_485298,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 52 | activity_504706,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 53 | activity_485313,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 54 | activity_485297,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 55 | activity_2551,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 56 | activity_2546,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 57 | 
activity_540276,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 58 | activity_540256,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 59 | activity_2662,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 60 | activity_2315,0,1,1,1,0,0,0,0,1,1,0,0,0,1,0,0 61 | activity_2675,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0 62 | activity_2326,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 63 | activity_2549,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 64 | activity_880,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 65 | activity_2147,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 66 | activity_485360,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 67 | activity_1458,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 68 | activity_1487,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0 69 | activity_445,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 70 | activity_1259318,0,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 71 | activity_602429,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 72 | activity_624304,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 73 | activity_624352,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 74 | activity_651699,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 75 | activity_651647,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 76 | activity_1347056,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 77 | activity_588405,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 78 | activity_588493,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 79 | activity_1259313,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 80 | activity_489030,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 81 | activity_489031,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 82 | activity_485273,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0 83 | activity_2071,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 84 | activity_485,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 85 | activity_1259388,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 86 | activity_588692,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 87 | activity_720706,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 88 | activity_602393,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 89 | activity_2099,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 90 | activity_504621,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 91 | activity_1813,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 92 | activity_1814,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 93 | activity_1662,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 94 | activity_2221,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0 95 | activity_940,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 96 | activity_1422,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 97 | activity_1021,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 98 | activity_1022,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 99 | activity_817,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0 100 | activity_602162,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0 101 | activity_588499,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0 102 | activity_504490,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 103 | activity_488847,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 104 | activity_2557,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 105 | activity_624170,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 106 | activity_624171,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 107 | activity_624414,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 108 | activity_1347076,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 109 | activity_1347075,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 110 | activity_624415,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 111 | activity_1347120,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0 112 | activity_504339,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 113 | activity_504891,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0 114 | activity_540317,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 115 | activity_624202,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 116 | activity_720707,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 117 | activity_720708,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 118 | activity_720709,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 119 | activity_720711,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 120 | activity_504333,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 121 | activity_504332,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 122 | activity_540253,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 123 | activity_504842,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 124 | 
activity_1463,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 125 | activity_1461,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 126 | activity_1347165,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 127 | activity_1457,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 128 | activity_2417,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 129 | activity_1454,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 130 | activity_1948,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 131 | activity_602449,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0 132 | activity_1224865,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 133 | activity_588473,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 134 | activity_588475,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 135 | activity_493187,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0 136 | activity_504462,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 137 | activity_463195,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 138 | activity_485346,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 139 | activity_1496,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 140 | activity_1012,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0 141 | activity_1216,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 142 | activity_1259310,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 143 | activity_743269,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 144 | activity_652017,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 145 | activity_2642,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 146 | activity_488922,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 147 | activity_2237,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 148 | activity_2247,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 149 | activity_624483,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 150 | activity_1415,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 151 | activity_1423,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 152 | activity_1439,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 153 | activity_1441,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 154 | activity_950,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 155 | activity_951,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 156 | activity_952,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 157 | activity_1007,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 158 | activity_1008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 159 | activity_1009,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 160 | activity_624263,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 161 | activity_651819,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 162 | activity_602438,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 163 | activity_651636,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 164 | activity_493098,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 165 | activity_2521,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0 166 | activity_2520,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0 167 | activity_1656,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 168 | activity_1259374,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 169 | activity_1259422,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 170 | activity_686940,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 171 | activity_720508,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0 172 | activity_720509,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0 173 | activity_720543,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0 174 | activity_720582,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 175 | activity_720648,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 176 | activity_720700,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 177 | activity_720704,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 178 | activity_651958,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 179 | activity_602396,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 180 | activity_624169,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 181 | activity_651602,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 182 | activity_602229,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 183 | activity_588352,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 184 | activity_588726,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 185 | activity_588354,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 186 | activity_651719,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 187 | activity_588335,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 188 | activity_504326,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0 189 | activity_504357,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0 190 | 
activity_434989,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 191 | activity_435030,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0 192 | activity_449728,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0 193 | activity_463079,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 194 | activity_463141,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 195 | activity_463210,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 196 | activity_485270,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 197 | activity_485272,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 198 | activity_2435,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 199 | activity_2445,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 200 | activity_2606,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 201 | activity_2796,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 202 | activity_2797,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 203 | activity_2280,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 204 | activity_1987,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 205 | activity_2235,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 206 | activity_2300,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 207 | activity_1899,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 208 | activity_1950,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0 209 | activity_1962,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 210 | activity_1527,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 211 | activity_1556,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 212 | activity_1700,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 213 | activity_1706,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 214 | activity_1789,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0 215 | activity_1800,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 216 | activity_1825,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 217 | activity_1861,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 218 | activity_1304,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0 219 | activity_1359,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0 220 | activity_1416,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 221 | activity_1424,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 222 | activity_1446,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 223 | activity_1448,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 224 | activity_1481,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 225 | activity_1486,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 226 | activity_1509,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 227 | activity_1510,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 228 | activity_1515,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 229 | activity_1040,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0 230 | activity_861,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 231 | activity_793,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 232 | activity_803,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 233 | activity_828,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 234 | activity_623870,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 235 | activity_743287,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 236 | activity_602329,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 237 | activity_651654,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 238 | activity_602405,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 239 | activity_651610,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 240 | activity_652154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 241 | activity_504884,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 242 | activity_602481,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 243 | activity_652163,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 244 | activity_651710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 245 | activity_588549,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 246 | activity_651711,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 247 | activity_652162,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0 248 | activity_588436,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 249 | activity_651687,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,0 250 | activity_651723,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,0 251 | activity_588391,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 252 | activity_686996,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 253 | activity_720511,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0 254 | activity_651640,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 255 | activity_651661,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 256 | activity_743247,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 257 | 
activity_504558,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 258 | activity_652197,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 259 | activity_488895,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 260 | activity_488965,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 261 | activity_493131,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 262 | activity_504523,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 263 | activity_504582,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 264 | activity_504423,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 265 | activity_602346,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 266 | activity_602342,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 267 | activity_623901,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 268 | activity_743397,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 269 | activity_488839,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 270 | activity_602340,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0 271 | activity_488899,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 272 | activity_540336,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 273 | activity_504406,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 274 | activity_488896,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 275 | activity_504775,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 276 | activity_743445,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 277 | activity_743398,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 278 | activity_588334,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 279 | activity_686992,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 280 | activity_2016,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 281 | activity_2025,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 282 | activity_2023,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 283 | activity_2066,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 284 | activity_2029,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 285 | activity_2462,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 286 | activity_2661,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 287 | activity_2716,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 288 | activity_2718,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 289 | activity_2650,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 290 | activity_435005,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 291 | activity_1885,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 292 | activity_1979,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 293 | activity_2097,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 294 | activity_2717,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 295 | activity_1910,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 296 | activity_841,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 297 | activity_1053175,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 298 | activity_1259309,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 299 | activity_1259311,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 300 | activity_720641,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0 301 | activity_588519,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 302 | activity_492967,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 303 | activity_504770,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 304 | activity_485395,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 305 | activity_493033,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 306 | activity_540299,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 307 | activity_651702,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 308 | activity_588497,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 309 | activity_588501,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 310 | activity_504577,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 311 | activity_485275,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1 312 | activity_2690,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 313 | activity_1347041,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 314 | activity_602332,0,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 315 | activity_686979,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 316 | activity_686978,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 317 | activity_624173,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 318 | activity_1347131,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0 319 | activity_1159524,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0 320 | activity_1347071,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 321 | activity_1259415,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0 322 | activity_588453,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 323 | 
activity_588456,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 324 | activity_588795,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 325 | activity_602179,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 326 | activity_686970,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 327 | activity_686971,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 328 | activity_485364,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 329 | activity_2528,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 330 | activity_485290,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 331 | activity_2517,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 332 | activity_485281,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 333 | activity_624296,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 334 | activity_624297,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 335 | activity_651820,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 336 | activity_540267,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 337 | activity_504937,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 338 | activity_624288,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 339 | activity_624287,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 340 | activity_463254,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 341 | activity_720580,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 342 | activity_504466,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 343 | activity_504467,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 344 | activity_652105,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 345 | activity_2101,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 346 | activity_504834,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 347 | activity_485349,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 348 | activity_2685,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0 349 | activity_720579,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 350 | activity_504832,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 351 | activity_2100,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0 352 | activity_2314,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,0 353 | activity_2472,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 354 | activity_2451,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 355 | activity_1868,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0 356 | activity_1460,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 357 | activity_1768,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 358 | activity_1631,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 359 | activity_1634,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 360 | activity_2107,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0 361 | activity_1471,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 362 | activity_1688,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 363 | activity_1030,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0 364 | activity_1379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 365 | activity_2242,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0 366 | activity_1476,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 367 | activity_1478,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 368 | activity_1452,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 369 | activity_894,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 370 | activity_902,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 371 | activity_924,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 372 | activity_1347417,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 373 | activity_881,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 374 | activity_624168,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 375 | activity_624204,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 376 | activity_624354,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 377 | activity_651560,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 378 | activity_602244,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 379 | activity_602261,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 380 | activity_602274,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 381 | activity_602399,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 382 | activity_588458,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 383 | activity_588489,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 384 | activity_588492,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 385 | activity_588621,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 386 | activity_588850,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0 387 | activity_602141,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0 388 | activity_504720,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 389 | 
activity_493091,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 390 | activity_493160,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0 391 | activity_493011,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 392 | activity_493012,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 393 | activity_449763,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0 394 | activity_463104,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0 395 | activity_463190,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 396 | activity_463212,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 397 | activity_435022,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 398 | activity_2380,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 399 | activity_2825,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 400 | activity_435003,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 401 | activity_651582,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 402 | activity_1443,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 403 | activity_1135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 404 | activity_1217,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 405 | activity_1029,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0 406 | activity_720637,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 407 | activity_1347034,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 408 | activity_1347037,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 409 | activity_602233,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 410 | activity_485367,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 411 | activity_1490,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0 412 | activity_1468,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 413 | activity_602440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 414 | activity_504690,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 415 | activity_588413,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 416 | activity_652067,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 417 | activity_652126,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 418 | activity_652257,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0 419 | activity_686964,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 420 | activity_687014,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 421 | activity_687016,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 422 | activity_720596,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 423 | activity_720702,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 424 | activity_743126,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 425 | activity_1053197,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 426 | activity_651821,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 427 | activity_651957,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 428 | activity_652010,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 429 | activity_652039,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 430 | activity_602281,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 431 | activity_624267,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0 432 | activity_624268,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 433 | activity_624377,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 434 | activity_624416,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 435 | activity_651718,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 436 | activity_651800,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 437 | activity_651572,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 438 | activity_602163,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 439 | activity_602123,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0 440 | activity_540364,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 441 | activity_504411,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 442 | activity_504700,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 443 | activity_504707,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 444 | activity_504734,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 445 | activity_504766,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 446 | activity_504803,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0 447 | activity_492953,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 448 | activity_492956,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 449 | activity_492972,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 450 | activity_493008,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 451 | activity_493087,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 452 | activity_493244,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 453 | activity_463082,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 454 | activity_2751,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 455 | 
activity_434962,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 456 | activity_2130,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 457 | activity_2174,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 458 | activity_2177,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 459 | activity_2234,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 460 | activity_2057,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 461 | activity_2129,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 462 | activity_1906,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0 463 | activity_1947,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 464 | activity_1974,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 465 | activity_1822,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0 466 | activity_1845,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 467 | activity_1321,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 468 | activity_631,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 469 | activity_731,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 470 | activity_1032,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 471 | activity_1203,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 472 | activity_920,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 473 | activity_932,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 474 | activity_862,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 475 | activity_871,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 476 | activity_743093,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 477 | activity_2052,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 478 | activity_1529,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 479 | activity_1530,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 480 | activity_1531,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 481 | activity_758,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 482 | activity_760,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 483 | activity_761,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 484 | activity_757,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 485 | activity_759,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 486 | activity_764,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 487 | activity_1325,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 488 | activity_1326,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 489 | activity_493014,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 490 | activity_588664,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 491 | activity_602410,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0 492 | activity_623877,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 493 | activity_488975,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 494 | activity_488977,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 495 | activity_588511,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0 496 | activity_588627,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 497 | activity_588675,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 498 | activity_588676,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 499 | activity_1511,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 500 | activity_1672,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 501 | activity_2550,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 502 | activity_2553,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 503 | activity_2648,0,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 504 | activity_463111,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0 505 | activity_463165,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0 506 | activity_2156,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 507 | activity_2227,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 508 | activity_2239,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 509 | activity_624037,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 510 | activity_624038,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 511 | activity_624040,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 512 | activity_624125,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 513 | activity_624126,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 514 | activity_624127,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 515 | activity_624466,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 516 | activity_624467,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 517 | activity_602247,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 518 | activity_602248,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 519 | activity_602250,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 520 | activity_588814,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 521 | activity_588819,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 522 | 
activity_588852,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 523 | activity_504634,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 524 | activity_504692,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 525 | activity_504454,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 526 | activity_652048,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 527 | activity_652051,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 528 | activity_493056,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 529 | activity_493084,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 530 | activity_504660,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 531 | activity_652054,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 532 | activity_624463,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 533 | activity_504651,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 534 | activity_624465,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 535 | activity_504652,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 536 | activity_624464,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 537 | activity_624291,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0 538 | activity_485347,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 539 | activity_485344,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 540 | activity_485358,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 541 | activity_492947,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 542 | activity_504810,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0 543 | activity_504812,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0 544 | activity_540275,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 545 | activity_540277,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 546 | activity_493036,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 547 | activity_1469,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 548 | activity_1479,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 549 | activity_720552,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 550 | activity_720719,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 551 | activity_720725,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 552 | activity_743053,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 553 | activity_743054,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 554 | activity_743063,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 555 | activity_743067,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0 556 | activity_743077,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 557 | activity_743078,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 558 | activity_743091,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 559 | activity_743122,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 560 | activity_743139,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 561 | activity_743140,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 562 | activity_1159523,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 563 | activity_1159528,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 564 | activity_1159531,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0 565 | activity_1159555,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 566 | activity_1224892,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 567 | activity_1224893,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 568 | activity_1224894,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 569 | activity_1224895,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 570 | activity_1224896,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 571 | activity_1259247,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 572 | activity_1259248,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 573 | activity_1259387,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 574 | activity_1259390,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 575 | activity_1259391,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 576 | activity_1259392,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 577 | activity_1259393,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 578 | activity_1259394,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 579 | activity_1259395,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 580 | activity_1259396,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 581 | activity_1259401,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 582 | activity_1259402,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 583 | activity_1259403,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 584 | activity_1259404,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 585 | activity_1347030,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 586 | activity_1347031,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 587 | 
activity_1347032,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 588 | activity_1347033,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 589 | activity_1347036,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 590 | activity_1347038,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 591 | activity_1159518,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 592 | activity_1159519,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0 593 | activity_743199,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 594 | activity_743219,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 595 | activity_743226,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 596 | activity_743227,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 597 | activity_743228,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 598 | activity_743239,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0 599 | activity_743240,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0 600 | activity_743241,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 601 | activity_743242,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 602 | -------------------------------------------------------------------------------- /Data/aid_target_mapping.csv: -------------------------------------------------------------------------------- 1 | AID,Membrane receptor,Enzyme (other),Nuclear receptor,Hydrolase,Protease,Transcription factor,Kinase,Epigenetic regulator,Ion channel,Transferase,Oxidoreductase,Transporter,NTPase,Phosphatase 2 | activity_1554,0,0,0,0,0,0,0,0,0,0,0,0,0,0 3 | activity_2732,0,0,0,0,0,1,0,0,0,0,0,0,0,0 4 | activity_1085,0,0,0,0,0,0,0,0,0,0,0,0,0,0 5 | activity_1236,0,0,0,0,1,0,0,0,0,0,0,0,0,0 6 | activity_1274,0,0,0,0,0,0,0,0,0,0,0,0,0,0 7 | activity_781,0,0,0,0,0,0,0,0,0,0,0,0,0,0 8 | activity_422,0,0,0,0,0,0,0,0,0,0,0,0,0,0 9 | activity_1224905,0,0,0,0,0,0,0,0,0,0,0,0,0,0 10 | activity_624256,0,0,0,0,0,0,0,0,0,0,0,0,0,0 11 | activity_588358,0,0,0,0,0,0,0,0,0,0,0,0,0,0 12 | activity_1346378,0,0,0,0,0,0,0,0,0,1,0,0,0,0 13 | activity_743266,1,0,0,0,0,0,0,0,0,0,0,0,0,0 14 | activity_624417,1,0,0,0,0,0,0,0,0,0,0,0,0,0 15 | activity_624418,0,0,0,0,0,0,0,0,0,0,0,0,0,0 16 | activity_652104,0,0,0,0,0,0,0,0,0,0,0,0,0,0 17 | activity_602310,0,0,0,1,0,0,0,0,0,0,0,0,0,0 18 | activity_588579,0,1,0,0,0,0,0,0,0,0,0,0,0,0 19 | activity_504847,0,0,1,0,0,0,0,0,0,0,0,0,0,0 20 | activity_651635,0,0,0,0,0,0,0,0,0,0,0,0,0,0 21 | activity_743255,0,0,0,0,1,0,0,0,0,0,0,0,0,0 22 | activity_588591,0,0,0,0,0,0,0,0,0,1,0,0,0,0 23 | activity_588590,0,0,0,0,0,0,0,0,0,1,0,0,0,0 24 | activity_504845,0,0,0,0,0,0,0,0,0,0,0,0,0,0 25 | activity_504327,0,0,0,0,0,0,0,1,0,0,0,0,0,0 26 | activity_504648,0,0,0,0,0,0,0,0,0,0,0,0,0,0 27 | activity_602313,0,0,0,1,0,0,0,0,0,0,0,0,0,0 28 | activity_624172,1,0,0,0,0,0,0,0,0,0,0,0,0,0 29 | activity_624246,0,0,0,0,0,0,0,0,0,0,0,0,0,0 30 | activity_652106,0,0,0,0,0,0,0,0,0,0,0,0,0,0 31 | activity_2676,1,0,0,0,0,0,0,0,0,0,0,0,0,0 32 | activity_652025,0,0,0,0,0,0,0,0,0,0,0,0,0,0 33 | activity_720504,0,0,0,0,0,0,1,0,0,0,0,0,0,0 34 | activity_720542,0,0,0,0,0,0,0,0,0,0,0,0,0,0 35 | activity_651725,0,0,0,0,0,0,0,0,0,0,0,0,0,0 36 | activity_651768,0,1,0,0,0,0,0,0,0,0,0,0,0,0 37 | activity_504444,0,0,0,0,0,1,0,0,0,0,0,0,0,0 38 | activity_743279,0,0,0,0,0,0,0,0,0,0,0,0,0,0 39 | activity_651644,0,0,0,0,0,0,0,0,0,0,0,0,0,0 40 | activity_651550,0,0,0,0,0,0,1,0,0,0,0,0,0,0 41 | activity_1259400,1,0,0,0,0,0,0,0,0,0,0,0,0,0 42 | activity_720551,0,0,0,0,0,0,0,0,0,0,0,0,0,0 43 | activity_720553,0,0,0,0,0,0,0,0,0,0,0,0,0,0 44 | activity_651965,0,0,0,0,1,0,0,0,0,0,0,0,0,0 45 | activity_540263,0,0,0,0,0,0,0,0,0,0,0,0,0,0 46 | activity_485314,0,1,0,0,0,0,0,0,0,0,0,0,0,0 47 | activity_651724,0,0,0,1,0,0,0,0,0,0,0,0,0,0 48 | activity_488837,0,1,0,0,0,0,0,0,0,0,0,0,0,0 49 | activity_485353,0,1,0,0,0,0,0,0,0,0,0,0,0,0 50 | 
activity_540303,1,0,0,0,1,0,0,0,0,0,0,0,0,0 51 | activity_485298,0,0,0,0,0,0,0,0,0,0,0,0,0,0 52 | activity_504706,0,0,0,0,0,1,0,0,0,0,0,0,0,0 53 | activity_485313,0,0,0,0,0,0,0,0,0,0,0,1,0,0 54 | activity_485297,0,0,0,1,0,0,0,0,0,0,0,0,0,0 55 | activity_2551,0,0,1,0,0,0,0,0,0,0,0,0,0,0 56 | activity_2546,0,0,1,0,0,0,0,0,0,0,0,0,0,0 57 | activity_540276,0,0,0,0,0,0,0,0,0,0,0,0,0,0 58 | activity_540256,0,0,0,0,0,0,0,0,0,0,0,0,0,0 59 | activity_2662,0,0,0,0,0,0,0,1,0,0,0,0,0,0 60 | activity_2315,0,0,0,1,0,0,0,0,0,0,0,0,0,0 61 | activity_2675,0,0,0,0,0,0,0,0,0,0,0,0,0,0 62 | activity_2326,0,0,0,0,0,0,0,0,0,0,0,0,0,0 63 | activity_2549,0,1,0,0,0,0,0,0,0,0,0,0,0,0 64 | activity_880,0,1,0,0,0,0,0,0,0,0,0,0,0,0 65 | activity_2147,0,0,0,0,0,0,0,1,0,0,0,0,0,0 66 | activity_485360,0,0,0,0,0,0,0,1,0,0,0,0,0,0 67 | activity_1458,0,0,0,0,0,0,0,1,0,0,0,0,0,0 68 | activity_1487,0,0,0,0,0,0,0,0,0,0,0,0,0,0 69 | activity_445,0,0,0,0,0,0,0,0,0,0,0,0,0,0 70 | activity_1259318,0,0,0,0,0,0,0,0,0,0,0,0,0,0 71 | activity_602429,0,0,0,0,0,0,0,0,0,0,0,0,0,0 72 | activity_624304,0,0,0,0,0,0,0,0,0,0,0,0,0,0 73 | activity_624352,0,0,0,0,0,1,0,0,0,0,0,0,0,0 74 | activity_651699,0,0,0,0,0,0,0,0,0,1,0,0,0,0 75 | activity_651647,0,0,0,0,1,0,0,0,0,0,0,0,0,0 76 | activity_1347056,0,0,0,0,0,0,0,0,0,0,0,0,0,0 77 | activity_588405,0,0,0,0,0,0,0,0,0,0,0,0,0,0 78 | activity_588493,0,0,0,0,0,0,0,0,0,0,0,0,0,0 79 | activity_1259313,0,0,0,0,0,0,0,0,0,0,0,0,0,0 80 | activity_489030,0,0,0,0,0,0,0,0,0,0,0,0,0,0 81 | activity_489031,0,0,0,0,0,0,0,0,0,0,0,0,0,0 82 | activity_485273,0,0,0,0,0,0,0,0,0,1,0,0,0,0 83 | activity_2071,0,0,0,0,0,0,0,0,0,0,0,0,0,0 84 | activity_485,1,0,0,0,0,0,0,0,0,0,0,0,0,0 85 | activity_1259388,0,0,0,0,0,0,0,1,0,0,0,0,0,0 86 | activity_588692,0,0,0,0,0,0,0,0,0,0,0,0,0,0 87 | activity_720706,0,0,0,0,0,0,0,0,0,0,0,0,0,0 88 | activity_602393,0,0,0,0,0,0,0,1,0,0,0,0,0,0 89 | activity_2099,0,0,0,0,0,0,0,0,0,0,0,0,0,0 90 | activity_504621,0,0,0,0,0,0,0,0,0,0,0,0,0,0 91 | activity_1813,0,0,0,0,0,0,0,0,0,0,0,0,0,0 92 | activity_1814,0,0,0,0,0,0,0,0,0,0,0,0,0,0 93 | activity_1662,0,0,0,0,0,0,1,0,0,0,0,0,0,0 94 | activity_2221,0,1,0,0,0,0,0,0,0,0,0,0,0,0 95 | activity_940,1,0,0,0,0,0,0,0,0,0,0,0,0,0 96 | activity_1422,1,0,0,0,0,0,0,0,0,0,0,0,0,0 97 | activity_1021,0,0,0,0,0,0,0,0,0,0,0,0,0,0 98 | activity_1022,0,0,0,0,0,0,0,0,0,0,0,0,0,0 99 | activity_817,0,0,0,0,0,0,0,0,0,0,0,0,0,0 100 | activity_602162,0,0,0,0,0,0,0,0,0,0,0,1,0,0 101 | activity_588499,0,0,0,0,1,0,0,0,0,0,0,0,0,0 102 | activity_504490,0,0,0,0,0,0,0,0,0,0,0,0,0,0 103 | activity_488847,0,0,0,0,0,0,1,0,0,0,0,0,0,0 104 | activity_2557,1,0,0,0,0,0,0,0,0,0,0,0,0,0 105 | activity_624170,0,0,0,1,0,0,0,0,0,0,0,0,0,0 106 | activity_624171,0,0,0,0,0,1,0,0,0,0,0,0,0,0 107 | activity_624414,0,0,0,0,0,0,0,0,1,0,0,0,0,0 108 | activity_1347076,0,0,0,0,0,0,0,0,0,0,0,0,0,0 109 | activity_1347075,0,0,0,0,0,0,0,0,0,0,0,0,0,0 110 | activity_624415,0,0,0,0,0,0,0,0,1,0,0,0,0,0 111 | activity_1347120,0,0,0,0,0,0,0,0,0,0,0,0,0,0 112 | activity_504339,0,0,0,0,0,0,0,1,0,0,0,0,0,0 113 | activity_504891,0,0,0,0,0,0,0,0,0,0,0,0,0,0 114 | activity_540317,0,0,0,0,0,0,0,1,0,0,0,0,0,0 115 | activity_624202,0,1,0,0,0,0,0,0,0,0,0,0,0,0 116 | activity_720707,0,0,0,0,0,0,0,0,0,0,0,0,0,0 117 | activity_720708,0,0,0,0,0,0,0,0,0,0,0,0,0,0 118 | activity_720709,0,0,0,0,0,0,0,0,0,0,0,0,0,0 119 | activity_720711,0,0,0,0,0,0,0,0,0,0,0,0,0,0 120 | activity_504333,0,0,0,0,0,0,0,1,0,0,0,0,0,0 121 | activity_504332,0,0,0,0,0,0,0,0,0,0,0,0,0,0 122 | activity_540253,0,0,0,0,0,0,0,0,0,0,0,0,0,0 123 | 
activity_504842,0,0,0,0,0,0,0,0,0,0,0,0,0,0 124 | activity_1463,0,0,0,0,0,0,0,0,0,0,0,0,0,0 125 | activity_1461,1,0,0,0,0,0,0,0,0,0,0,0,0,0 126 | activity_1347165,0,0,0,0,0,0,0,0,0,0,0,0,0,0 127 | activity_1457,0,0,0,1,0,0,0,0,0,0,0,0,0,0 128 | activity_2417,0,0,0,0,0,0,0,0,0,0,0,0,0,0 129 | activity_1454,0,0,0,0,0,0,1,0,0,0,0,0,0,0 130 | activity_1948,0,0,0,0,0,0,0,0,0,0,0,0,0,0 131 | activity_602449,0,0,0,0,0,0,0,0,0,0,0,0,0,0 132 | activity_1224865,0,0,0,0,0,0,0,0,0,0,0,0,0,0 133 | activity_588473,1,0,0,0,0,0,0,0,0,0,0,0,0,0 134 | activity_588475,1,0,0,0,0,0,0,0,0,0,0,0,0,0 135 | activity_493187,0,0,0,0,0,0,0,0,0,1,0,0,0,0 136 | activity_504462,0,1,0,0,0,0,0,0,0,0,0,0,0,0 137 | activity_463195,0,0,0,0,0,0,0,0,0,0,0,0,0,0 138 | activity_485346,0,0,0,0,0,0,0,0,0,0,0,0,0,0 139 | activity_1496,0,0,0,0,0,1,0,0,0,0,0,0,0,0 140 | activity_1012,0,1,0,0,0,0,0,0,0,0,0,0,0,0 141 | activity_1216,0,0,0,0,0,0,0,0,0,0,0,0,0,0 142 | activity_1259310,0,0,0,0,0,0,0,0,0,0,0,0,0,0 143 | activity_743269,0,0,0,1,0,0,0,1,0,0,0,0,0,0 144 | activity_652017,0,0,0,0,0,0,0,1,0,0,0,0,0,0 145 | activity_2642,0,0,0,0,0,0,0,0,1,0,0,0,0,0 146 | activity_488922,0,0,0,0,0,0,0,0,1,0,0,0,0,0 147 | activity_2237,0,0,0,0,0,0,0,0,0,0,0,0,0,0 148 | activity_2247,0,0,0,0,0,0,0,0,0,0,0,0,0,0 149 | activity_624483,0,0,0,0,0,0,0,0,0,0,0,0,0,0 150 | activity_1415,0,0,0,0,0,0,0,0,0,0,0,0,0,0 151 | activity_1423,0,1,0,0,0,0,0,0,0,0,0,0,0,0 152 | activity_1439,0,0,0,0,0,0,0,0,0,0,0,0,0,0 153 | activity_1441,0,1,0,0,0,0,0,0,0,0,0,0,0,0 154 | activity_950,0,0,0,0,0,0,0,0,1,0,0,0,0,0 155 | activity_951,0,0,0,0,0,0,0,0,0,0,0,0,0,0 156 | activity_952,0,0,0,0,0,0,0,0,0,0,0,0,0,0 157 | activity_1007,0,0,0,0,0,0,0,0,1,0,0,0,0,0 158 | activity_1008,0,0,0,0,0,0,0,0,0,0,0,0,0,0 159 | activity_1009,0,0,0,0,0,0,0,0,0,0,0,0,0,0 160 | activity_624263,0,0,0,0,0,0,0,0,0,0,0,0,0,0 161 | activity_651819,0,0,0,0,0,0,0,0,0,0,0,0,0,0 162 | activity_602438,0,0,0,0,0,0,0,0,0,0,0,0,0,0 163 | activity_651636,1,0,0,0,0,0,0,0,0,0,0,0,0,0 164 | activity_493098,1,0,0,0,0,0,0,0,0,0,0,0,0,0 165 | activity_2521,1,0,0,0,0,0,0,0,0,0,0,0,0,0 166 | activity_2520,1,0,0,0,0,0,0,0,0,0,0,0,0,0 167 | activity_1656,0,0,0,0,0,0,0,0,0,0,0,0,0,0 168 | activity_1259374,0,0,0,0,0,0,0,0,0,0,0,0,0,0 169 | activity_1259422,0,0,0,0,0,0,0,0,0,0,0,0,0,0 170 | activity_686940,0,0,1,0,0,0,0,0,0,0,0,0,0,0 171 | activity_720508,0,0,0,0,0,0,0,0,0,0,0,0,0,0 172 | activity_720509,0,0,0,0,0,0,0,0,0,0,0,0,0,0 173 | activity_720543,0,0,0,0,0,0,0,0,0,1,0,0,0,0 174 | activity_720582,0,0,0,0,1,0,0,0,0,0,0,0,0,0 175 | activity_720648,0,0,0,0,1,0,0,0,0,0,0,0,0,0 176 | activity_720700,0,1,0,0,0,0,0,0,0,0,0,0,0,0 177 | activity_720704,0,1,0,0,0,0,0,0,0,0,0,0,0,0 178 | activity_651958,0,1,0,0,0,0,0,0,0,0,0,0,0,0 179 | activity_602396,0,0,1,0,0,0,0,0,0,0,0,0,0,0 180 | activity_624169,1,0,0,0,0,0,0,0,0,0,0,0,0,0 181 | activity_651602,0,1,0,0,0,0,0,0,0,0,0,0,0,0 182 | activity_602229,0,0,1,0,0,0,0,0,0,0,0,0,0,0 183 | activity_588352,0,0,0,0,0,0,0,1,0,0,0,0,0,0 184 | activity_588726,0,0,0,0,0,0,0,0,0,0,0,0,0,0 185 | activity_588354,0,0,0,0,0,0,0,1,0,0,0,0,0,0 186 | activity_651719,1,0,0,0,0,0,0,0,0,0,0,0,0,0 187 | activity_588335,0,0,0,0,0,0,0,0,0,0,0,0,0,0 188 | activity_504326,1,0,0,0,0,0,0,0,0,0,0,0,0,0 189 | activity_504357,1,0,0,0,0,0,0,0,0,0,0,0,0,0 190 | activity_434989,1,0,0,0,0,0,0,0,0,0,0,0,0,0 191 | activity_435030,0,0,0,0,0,0,0,0,0,0,0,0,1,0 192 | activity_449728,0,0,0,0,0,0,0,0,0,0,0,0,0,0 193 | activity_463079,0,0,0,0,0,0,0,0,0,0,0,0,0,0 194 | activity_463141,0,0,0,0,1,0,0,0,0,0,0,0,0,0 195 | 
activity_463210,0,0,0,0,1,0,0,0,0,0,0,0,0,0 196 | activity_485270,1,0,0,0,0,0,0,0,0,0,0,0,0,0 197 | activity_485272,0,1,0,0,0,0,0,0,0,0,0,0,0,0 198 | activity_2435,1,0,0,0,0,0,0,0,0,0,0,0,0,0 199 | activity_2445,1,0,0,0,0,0,0,0,0,0,0,0,0,0 200 | activity_2606,0,0,0,0,1,0,0,0,0,0,0,0,0,0 201 | activity_2796,0,0,0,0,0,1,0,0,0,0,0,0,0,0 202 | activity_2797,1,0,0,0,0,0,0,0,0,0,0,0,0,0 203 | activity_2280,0,0,0,0,0,0,0,0,0,0,0,0,0,0 204 | activity_1987,0,0,0,0,0,0,0,0,0,0,0,0,0,1 205 | activity_2235,0,0,0,0,0,0,0,0,0,0,0,0,0,1 206 | activity_2300,0,0,1,0,0,0,0,0,0,0,0,0,0,0 207 | activity_1899,0,1,0,0,0,0,0,0,0,0,0,0,0,0 208 | activity_1950,0,0,0,0,0,0,0,0,0,0,0,0,0,0 209 | activity_1962,0,1,0,0,0,0,0,0,0,0,0,0,0,0 210 | activity_1527,0,0,0,1,0,0,0,0,0,0,0,0,0,0 211 | activity_1556,0,0,0,1,0,0,0,0,0,0,0,0,0,0 212 | activity_1700,0,0,0,0,0,1,0,0,0,0,0,0,0,0 213 | activity_1706,0,1,0,0,0,0,0,0,0,0,0,0,0,0 214 | activity_1789,0,0,0,0,0,0,0,0,0,0,0,0,0,0 215 | activity_1800,0,0,0,0,0,0,0,0,0,0,0,0,0,0 216 | activity_1825,0,0,0,0,0,0,0,0,0,0,0,0,0,0 217 | activity_1861,1,0,0,0,0,0,0,0,0,0,0,0,0,0 218 | activity_1304,1,0,0,0,0,0,0,0,0,0,0,0,0,0 219 | activity_1359,1,0,0,0,0,0,0,0,0,0,0,0,0,0 220 | activity_1416,0,0,0,0,0,0,1,0,0,0,0,0,0,0 221 | activity_1424,0,0,0,0,0,0,0,0,1,0,0,0,0,0 222 | activity_1446,0,0,0,0,0,0,1,0,0,0,0,0,0,0 223 | activity_1448,0,0,0,0,0,0,0,0,0,0,0,0,0,0 224 | activity_1481,0,0,0,0,0,0,0,0,0,0,0,1,0,0 225 | activity_1486,0,0,0,0,0,0,0,0,0,0,0,0,0,0 226 | activity_1509,1,0,0,0,0,0,0,0,0,0,0,0,0,0 227 | activity_1510,1,0,0,0,0,0,0,0,0,0,0,0,0,0 228 | activity_1515,0,0,0,1,0,0,0,0,0,0,0,0,0,0 229 | activity_1040,1,0,0,0,0,0,0,0,0,0,0,0,0,0 230 | activity_861,0,0,0,0,0,0,0,0,0,0,0,0,0,0 231 | activity_793,1,0,0,0,0,0,0,0,0,0,0,0,0,0 232 | activity_803,1,0,0,0,0,0,0,0,0,0,0,0,0,0 233 | activity_828,1,0,0,0,0,0,0,0,0,0,0,0,0,0 234 | activity_623870,0,0,0,0,0,0,0,0,0,0,0,0,0,0 235 | activity_743287,0,0,0,0,0,0,0,0,0,0,0,0,0,0 236 | activity_602329,0,1,0,0,0,0,0,0,0,0,0,0,0,0 237 | activity_651654,0,0,0,0,0,0,0,0,0,0,0,0,0,0 238 | activity_602405,0,0,0,0,0,0,0,0,0,0,0,0,0,0 239 | activity_651610,0,0,0,0,0,0,0,0,0,0,0,0,0,0 240 | activity_652154,0,0,0,0,0,1,0,0,0,0,0,0,0,0 241 | activity_504884,0,0,0,0,0,0,0,0,0,0,0,0,0,0 242 | activity_602481,0,0,0,0,0,0,0,0,0,1,0,0,0,0 243 | activity_652163,0,0,0,0,0,0,0,0,0,0,0,0,0,0 244 | activity_651710,0,0,0,0,0,0,0,0,0,0,0,0,0,0 245 | activity_588549,0,1,0,0,0,0,0,0,0,0,0,0,0,0 246 | activity_651711,0,1,0,0,0,0,0,0,0,0,0,0,0,0 247 | activity_652162,0,0,0,1,0,0,0,0,0,0,0,0,0,0 248 | activity_588436,0,0,0,0,0,0,0,0,0,0,0,0,0,0 249 | activity_651687,0,0,0,0,0,0,0,0,0,0,0,0,0,0 250 | activity_651723,0,0,0,0,0,0,0,0,0,0,0,0,0,0 251 | activity_588391,0,0,0,0,0,0,0,0,0,0,0,0,0,0 252 | activity_686996,0,0,0,0,1,0,0,0,0,0,0,0,0,0 253 | activity_720511,0,0,0,0,0,0,0,0,1,0,0,0,0,0 254 | activity_651640,0,0,0,0,0,0,0,0,0,0,0,0,0,0 255 | activity_651661,0,0,0,0,0,0,0,0,0,0,0,0,0,0 256 | activity_743247,0,0,0,0,0,0,0,0,0,0,0,0,0,0 257 | activity_504558,0,0,0,0,0,0,0,0,0,0,0,0,0,0 258 | activity_652197,0,0,0,0,1,0,0,0,0,0,0,0,0,0 259 | activity_488895,0,0,0,0,0,0,0,0,0,0,0,0,0,0 260 | activity_488965,0,0,0,1,0,0,0,0,0,0,0,0,0,0 261 | activity_493131,0,0,0,0,0,0,0,0,0,0,0,0,0,0 262 | activity_504523,0,0,0,0,0,1,0,0,0,0,0,0,0,0 263 | activity_504582,0,0,0,0,0,0,0,0,0,0,0,0,0,0 264 | activity_504423,0,0,0,0,0,0,0,0,0,0,0,0,0,0 265 | activity_602346,0,0,0,0,0,0,0,0,0,0,0,0,0,0 266 | activity_602342,0,0,0,0,0,0,0,0,0,0,0,0,0,0 267 | 
activity_623901,0,0,0,0,0,0,0,0,0,0,0,0,0,0 268 | activity_743397,0,0,0,0,0,0,0,0,0,0,0,0,0,0 269 | activity_488839,0,0,0,0,0,0,1,0,0,0,0,0,0,0 270 | activity_602340,0,0,0,0,0,0,0,0,0,0,0,0,0,0 271 | activity_488899,0,0,0,0,0,1,0,0,0,0,0,0,0,0 272 | activity_540336,0,0,0,0,0,0,0,0,0,0,0,0,0,0 273 | activity_504406,0,0,0,0,0,0,0,0,0,0,0,0,0,0 274 | activity_488896,1,0,0,0,0,0,0,0,0,0,0,0,0,0 275 | activity_504775,1,0,0,0,0,0,0,0,0,0,0,0,0,0 276 | activity_743445,0,0,0,0,0,0,0,1,0,0,0,0,0,0 277 | activity_743398,0,0,0,0,0,0,0,0,0,0,0,0,0,0 278 | activity_588334,0,0,0,0,0,0,0,0,0,0,0,0,0,0 279 | activity_686992,0,0,0,0,0,0,0,0,0,0,0,0,0,0 280 | activity_2016,0,0,0,0,0,0,0,0,0,0,0,1,0,0 281 | activity_2025,0,0,0,0,0,0,0,0,0,0,0,0,0,0 282 | activity_2023,0,0,0,0,1,0,0,0,0,0,0,0,0,0 283 | activity_2066,0,0,0,0,0,0,0,0,0,0,0,1,0,0 284 | activity_2029,0,0,0,0,0,0,0,0,0,1,0,0,0,0 285 | activity_2462,0,0,0,0,0,0,0,0,0,0,0,0,0,0 286 | activity_2661,0,0,0,0,0,0,1,0,0,0,0,0,0,0 287 | activity_2716,0,0,0,0,0,0,0,0,0,0,0,0,0,0 288 | activity_2718,0,0,0,0,0,0,0,1,0,0,0,0,0,0 289 | activity_2650,0,0,0,0,0,0,1,0,0,0,0,0,0,0 290 | activity_435005,0,0,0,0,0,0,0,0,0,0,0,0,0,0 291 | activity_1885,0,0,0,0,0,0,0,0,0,0,0,0,0,0 292 | activity_1979,0,0,0,0,0,0,0,0,0,0,0,0,0,0 293 | activity_2097,0,0,0,0,0,0,1,0,0,0,0,0,0,0 294 | activity_2717,0,0,0,0,0,0,0,0,0,0,0,0,0,0 295 | activity_1910,0,0,0,0,0,0,0,0,0,0,0,0,0,0 296 | activity_841,0,0,0,0,0,0,0,0,0,0,0,0,0,0 297 | activity_1053175,0,0,0,0,0,0,0,0,0,0,0,0,0,0 298 | activity_1259309,0,0,0,0,0,0,0,0,0,0,0,0,0,0 299 | activity_1259311,0,0,0,0,0,0,0,0,0,0,0,0,0,0 300 | activity_720641,0,1,0,0,0,0,0,0,0,0,0,0,0,0 301 | activity_588519,0,0,0,0,1,0,0,0,0,0,0,0,0,0 302 | activity_492967,0,0,0,0,0,0,0,0,0,0,0,0,0,0 303 | activity_504770,0,0,0,0,0,0,0,0,0,0,0,0,0,0 304 | activity_485395,0,1,0,0,0,0,0,0,0,0,0,0,0,0 305 | activity_493033,0,1,0,0,0,0,0,0,0,0,0,0,0,0 306 | activity_540299,0,0,0,0,0,0,0,0,0,0,0,0,0,0 307 | activity_651702,0,0,0,0,0,0,0,0,0,0,0,0,0,0 308 | activity_588497,0,1,0,0,0,0,0,0,0,0,0,0,0,0 309 | activity_588501,0,0,0,0,1,0,0,0,0,0,0,0,0,0 310 | activity_504577,0,0,0,1,0,0,0,0,0,0,0,0,0,0 311 | activity_485275,0,0,0,0,0,0,0,0,0,0,0,0,0,0 312 | activity_2690,0,0,0,0,0,0,0,0,0,0,0,0,0,0 313 | activity_1347041,0,0,0,0,0,0,0,0,0,0,0,0,0,0 314 | activity_602332,0,0,0,0,0,0,0,0,0,0,0,0,0,0 315 | activity_686979,0,0,0,0,0,0,0,0,0,0,0,0,1,0 316 | activity_686978,0,0,0,0,0,0,0,0,0,0,0,0,1,0 317 | activity_624173,0,0,0,0,0,0,0,0,0,0,0,0,0,0 318 | activity_1347131,0,0,0,0,0,0,0,0,0,1,0,0,0,0 319 | activity_1159524,0,0,0,0,0,0,0,0,0,0,0,0,0,0 320 | activity_1347071,1,0,0,0,0,0,0,0,0,0,0,0,0,0 321 | activity_1259415,0,0,0,0,0,0,0,0,0,0,0,0,0,0 322 | activity_588453,0,0,0,0,0,0,0,0,0,0,1,0,0,0 323 | activity_588456,0,0,0,0,0,0,0,0,0,0,1,0,0,0 324 | activity_588795,0,0,0,1,0,0,0,0,0,0,0,0,0,0 325 | activity_602179,0,0,0,0,0,0,0,0,0,0,1,0,0,0 326 | activity_686970,0,0,0,0,0,0,0,0,0,0,1,0,0,0 327 | activity_686971,0,0,0,0,0,0,0,0,0,0,0,0,0,0 328 | activity_485364,0,1,0,0,0,0,0,0,0,0,0,0,0,0 329 | activity_2528,0,1,0,0,0,0,0,0,0,0,0,0,0,0 330 | activity_485290,0,0,0,0,0,0,0,0,0,0,0,0,1,0 331 | activity_2517,0,1,0,0,0,0,0,0,0,0,0,0,0,0 332 | activity_485281,0,0,0,0,0,0,0,0,0,0,0,0,0,0 333 | activity_624296,0,0,0,0,0,0,0,0,0,0,0,0,0,0 334 | activity_624297,0,0,0,0,0,0,0,0,0,0,0,0,0,0 335 | activity_651820,0,0,0,0,0,0,0,0,0,0,0,0,0,0 336 | activity_540267,0,0,0,0,0,0,0,0,0,0,0,0,0,0 337 | activity_504937,0,0,0,0,0,0,0,0,0,0,0,0,1,0 338 | 
activity_624288,0,0,0,0,0,0,0,0,0,0,0,0,0,0 339 | activity_624287,0,0,0,0,0,0,0,0,0,0,0,0,0,0 340 | activity_463254,0,0,0,0,1,0,0,0,0,0,0,0,0,0 341 | activity_720580,0,0,0,0,0,0,0,0,0,0,0,0,0,0 342 | activity_504466,0,0,0,0,0,0,0,0,0,0,0,0,0,0 343 | activity_504467,0,0,0,0,0,0,0,0,0,0,0,0,0,0 344 | activity_652105,0,0,0,0,0,0,0,0,0,1,0,0,0,0 345 | activity_2101,0,1,0,0,0,0,0,0,0,0,0,0,0,0 346 | activity_504834,0,0,0,0,0,0,0,0,0,0,0,0,0,0 347 | activity_485349,0,0,0,0,0,0,1,0,0,0,0,0,0,0 348 | activity_2685,0,0,0,0,0,0,0,0,0,0,0,0,0,0 349 | activity_720579,0,0,0,0,0,0,0,0,0,0,0,0,0,0 350 | activity_504832,0,0,0,0,0,0,0,0,0,0,0,0,0,0 351 | activity_2100,0,0,0,1,0,0,0,0,0,0,0,0,0,0 352 | activity_2314,0,0,0,1,0,0,0,0,0,0,0,0,0,0 353 | activity_2472,0,0,0,0,0,0,0,0,0,0,1,0,0,0 354 | activity_2451,0,1,0,0,0,0,0,0,0,0,0,0,0,0 355 | activity_1868,0,0,0,0,0,0,0,0,0,1,0,0,0,0 356 | activity_1460,0,0,0,0,0,0,0,0,0,0,0,0,0,0 357 | activity_1768,0,0,0,0,0,0,0,1,0,0,0,0,0,0 358 | activity_1631,0,1,0,0,0,0,0,0,0,0,0,0,0,0 359 | activity_1634,0,1,0,0,0,0,0,0,0,0,0,0,0,0 360 | activity_2107,0,0,0,1,0,0,0,0,0,0,0,0,0,0 361 | activity_1471,0,0,0,0,0,0,0,0,0,0,0,0,0,0 362 | activity_1688,0,0,0,0,0,0,0,0,0,0,0,0,0,0 363 | activity_1030,0,0,0,0,0,0,0,0,0,0,1,0,0,0 364 | activity_1379,0,1,0,0,0,0,0,0,0,0,0,0,0,0 365 | activity_2242,0,0,0,1,0,0,0,0,0,0,0,0,0,0 366 | activity_1476,0,0,0,0,1,0,0,0,0,0,0,0,0,0 367 | activity_1478,0,0,0,0,1,0,0,0,0,0,0,0,0,0 368 | activity_1452,0,1,0,0,0,0,0,0,0,0,0,0,0,0 369 | activity_894,0,1,0,0,0,0,0,0,0,0,0,0,0,0 370 | activity_902,0,0,0,0,0,1,0,0,0,0,0,0,0,0 371 | activity_924,0,0,0,0,0,1,0,0,0,0,0,0,0,0 372 | activity_1347417,0,0,0,0,0,0,0,0,0,0,0,0,0,0 373 | activity_881,0,0,0,0,0,0,0,0,0,0,1,0,0,0 374 | activity_624168,0,1,0,0,0,0,0,0,0,0,0,0,0,0 375 | activity_624204,0,0,0,0,1,0,0,0,0,0,0,0,0,0 376 | activity_624354,1,0,0,0,0,0,0,0,0,0,0,0,0,0 377 | activity_651560,0,0,0,0,0,0,0,0,0,0,0,0,0,1 378 | activity_602244,1,0,0,0,0,0,0,0,0,0,0,0,0,0 379 | activity_602261,0,0,0,0,0,0,0,0,0,1,0,0,0,0 380 | activity_602274,0,0,0,0,0,0,0,0,0,0,0,0,0,0 381 | activity_602399,0,0,0,0,0,0,0,0,0,1,0,0,0,0 382 | activity_588458,0,0,0,0,0,0,0,1,0,0,0,0,0,0 383 | activity_588489,0,0,0,0,0,0,0,0,0,0,0,0,0,0 384 | activity_588492,0,0,0,0,0,0,0,0,0,0,0,0,0,0 385 | activity_588621,0,0,0,0,0,0,0,0,0,0,0,0,0,1 386 | activity_588850,0,0,0,0,0,0,0,0,1,0,0,0,0,0 387 | activity_602141,0,0,0,0,0,0,0,0,0,0,0,0,0,0 388 | activity_504720,0,1,0,0,0,0,0,0,0,0,0,0,0,0 389 | activity_493091,0,0,0,0,0,0,0,0,0,0,0,0,0,1 390 | activity_493160,0,0,0,0,0,0,0,0,0,1,0,0,0,0 391 | activity_493011,0,0,0,1,0,0,0,0,0,0,0,0,0,0 392 | activity_493012,0,0,0,1,0,0,0,0,0,0,0,0,0,0 393 | activity_449763,0,0,0,0,0,0,0,0,0,0,0,0,0,0 394 | activity_463104,0,0,0,0,0,0,0,0,0,0,0,0,0,0 395 | activity_463190,0,0,0,0,0,0,0,0,0,0,0,0,0,0 396 | activity_463212,0,0,0,0,0,0,0,0,0,0,0,0,0,0 397 | activity_435022,0,0,0,0,0,0,0,0,0,0,0,0,0,0 398 | activity_2380,0,0,0,0,0,0,0,0,0,0,0,0,0,0 399 | activity_2825,0,0,0,0,0,0,0,0,0,0,0,0,0,0 400 | activity_435003,0,0,0,0,0,0,0,0,0,0,0,0,0,0 401 | activity_651582,0,0,0,0,0,0,0,0,0,0,0,0,0,0 402 | activity_1443,0,0,0,0,0,0,0,0,0,0,0,0,0,0 403 | activity_1135,0,1,0,0,0,0,0,0,0,0,0,0,0,0 404 | activity_1217,0,0,0,0,0,0,0,0,0,0,1,0,0,0 405 | activity_1029,0,0,0,0,0,0,0,0,0,0,0,0,0,0 406 | activity_720637,0,0,0,0,0,0,0,0,0,0,0,0,0,0 407 | activity_1347034,0,0,0,0,1,0,0,0,0,0,0,0,0,0 408 | activity_1347037,0,0,0,0,1,0,0,0,0,0,0,0,0,0 409 | activity_602233,0,1,0,0,0,0,0,0,0,0,0,0,0,0 410 | 
activity_485367,0,1,0,0,0,0,0,0,0,0,0,0,0,0 411 | activity_1490,0,0,0,0,0,0,0,0,0,1,0,0,0,0 412 | activity_1468,0,0,0,0,0,0,0,0,0,0,0,0,0,0 413 | activity_602440,0,0,0,0,1,0,0,0,0,0,0,0,0,0 414 | activity_504690,0,1,0,0,0,0,0,0,0,0,0,0,0,0 415 | activity_588413,0,0,0,0,0,1,0,0,0,0,0,0,0,0 416 | activity_652067,0,0,1,0,0,0,0,0,0,0,0,0,0,0 417 | activity_652126,0,0,1,0,0,0,0,0,0,0,0,0,0,0 418 | activity_652257,0,0,0,0,0,0,0,1,0,0,0,0,0,0 419 | activity_686964,0,0,0,0,0,0,0,1,0,0,0,0,0,0 420 | activity_687014,0,0,1,0,0,0,0,0,0,0,0,0,0,0 421 | activity_687016,0,0,0,0,0,0,0,1,0,0,0,0,0,0 422 | activity_720596,0,0,0,0,0,0,0,0,0,0,0,0,0,0 423 | activity_720702,0,0,0,0,0,0,0,0,0,0,0,0,1,0 424 | activity_743126,0,0,0,1,0,0,0,0,0,0,0,0,0,0 425 | activity_1053197,0,0,0,1,0,0,0,0,0,0,0,0,0,0 426 | activity_651821,0,0,0,0,0,0,0,0,0,0,0,0,0,0 427 | activity_651957,0,0,1,0,0,0,0,0,0,0,0,0,0,0 428 | activity_652010,0,0,1,0,0,0,0,0,0,0,0,0,0,0 429 | activity_652039,0,0,0,0,1,0,0,0,0,0,0,0,0,0 430 | activity_602281,0,0,0,0,0,0,0,0,0,1,0,0,0,0 431 | activity_624267,0,0,0,0,0,0,1,0,0,0,0,0,0,0 432 | activity_624268,0,1,0,0,0,0,0,0,0,0,0,0,0,0 433 | activity_624377,0,0,0,0,0,0,0,0,0,0,0,0,0,0 434 | activity_624416,0,0,0,0,0,0,0,0,0,0,0,0,0,0 435 | activity_651718,0,0,0,0,0,0,0,0,0,0,1,0,0,0 436 | activity_651800,1,0,0,0,0,0,0,0,0,0,0,0,0,0 437 | activity_651572,0,0,0,0,0,0,0,0,0,0,0,0,0,0 438 | activity_602163,0,0,0,0,0,0,0,0,0,0,1,0,0,0 439 | activity_602123,0,0,0,0,1,0,0,0,0,0,0,0,0,0 440 | activity_540364,0,0,0,0,0,0,0,0,0,0,0,0,0,0 441 | activity_504411,0,1,0,0,0,0,0,0,0,0,0,0,0,0 442 | activity_504700,0,0,0,0,0,0,1,0,0,0,0,0,0,0 443 | activity_504707,0,0,0,0,0,0,1,0,0,0,0,0,0,0 444 | activity_504734,1,0,0,0,0,0,0,0,0,0,0,0,0,0 445 | activity_504766,0,0,1,0,0,0,0,0,0,0,0,0,0,0 446 | activity_504803,0,0,0,1,0,0,0,0,0,0,0,0,0,0 447 | activity_492953,0,1,0,0,0,0,0,0,0,0,0,0,0,0 448 | activity_492956,0,0,0,1,0,0,0,0,0,0,0,0,0,0 449 | activity_492972,0,0,0,1,0,0,0,0,0,0,0,0,0,0 450 | activity_493008,0,0,0,0,0,0,0,0,0,0,0,0,0,0 451 | activity_493087,0,1,0,0,0,0,0,0,0,0,0,0,0,0 452 | activity_493244,0,0,0,0,0,0,0,0,0,0,0,0,0,0 453 | activity_463082,0,0,0,1,0,0,0,0,0,0,0,0,0,0 454 | activity_2751,0,1,0,0,0,0,0,0,0,0,0,0,0,0 455 | activity_434962,0,1,0,0,0,0,0,0,0,0,0,0,0,0 456 | activity_2130,0,0,0,1,0,0,0,0,0,0,0,0,0,0 457 | activity_2174,0,0,0,1,0,0,0,0,0,0,0,0,0,0 458 | activity_2177,0,0,0,1,0,0,0,0,0,0,0,0,0,0 459 | activity_2234,0,0,0,0,0,1,0,0,0,0,0,0,0,0 460 | activity_2057,0,0,0,0,0,0,0,0,0,0,0,0,0,0 461 | activity_2129,0,0,0,0,0,0,0,0,1,0,0,0,0,0 462 | activity_1906,0,0,0,0,1,0,0,0,0,0,0,0,0,0 463 | activity_1947,0,0,0,1,0,0,0,0,0,0,0,0,0,0 464 | activity_1974,0,1,0,0,0,0,0,0,0,0,0,0,0,0 465 | activity_1822,0,1,0,0,0,0,0,0,0,0,0,0,0,0 466 | activity_1845,0,0,0,0,0,0,0,0,0,0,0,0,0,0 467 | activity_1321,0,0,0,0,0,0,1,0,0,0,0,0,0,0 468 | activity_631,0,0,1,0,0,0,0,1,0,0,0,0,0,0 469 | activity_731,0,0,1,0,0,0,0,1,0,0,0,0,0,0 470 | activity_1032,0,0,1,0,0,0,0,0,0,0,0,0,0,0 471 | activity_1203,0,0,0,0,0,0,0,0,0,0,0,0,0,0 472 | activity_920,0,0,0,0,0,1,0,0,0,0,0,0,0,0 473 | activity_932,0,0,0,0,0,1,0,0,0,0,0,0,0,0 474 | activity_862,0,0,0,0,0,0,0,0,0,0,0,0,0,0 475 | activity_871,0,0,0,0,0,0,0,0,0,0,0,0,0,0 476 | activity_743093,0,0,0,0,0,0,0,0,0,0,0,0,0,0 477 | activity_2052,1,0,0,0,0,0,0,0,0,0,0,0,0,0 478 | activity_1529,0,0,0,0,0,0,1,0,0,0,0,0,0,0 479 | activity_1530,0,0,0,0,0,0,1,0,0,0,0,0,0,0 480 | activity_1531,0,0,0,0,0,0,1,0,0,0,0,0,0,0 481 | activity_758,0,0,0,1,0,0,0,0,0,0,0,0,0,0 482 | 
activity_760,0,0,0,1,0,0,0,0,0,0,0,0,0,0 483 | activity_761,0,0,0,0,0,0,0,0,0,0,0,0,0,0 484 | activity_757,0,0,0,1,0,0,0,0,0,0,0,0,0,0 485 | activity_759,0,0,0,1,0,0,0,0,0,0,0,0,0,0 486 | activity_764,0,0,0,1,0,0,0,0,0,0,0,0,0,0 487 | activity_1325,0,0,0,0,0,0,0,0,0,0,0,1,0,0 488 | activity_1326,0,0,0,0,0,0,0,0,0,0,0,1,0,0 489 | activity_493014,0,0,0,0,0,0,0,0,0,0,0,0,0,0 490 | activity_588664,0,0,0,0,0,0,1,0,0,0,0,0,0,0 491 | activity_602410,0,0,0,0,0,0,0,0,1,0,0,0,0,0 492 | activity_623877,0,0,0,0,0,0,0,0,1,0,0,0,0,0 493 | activity_488975,0,0,0,0,0,0,0,0,0,0,0,1,0,0 494 | activity_488977,0,0,0,0,0,0,0,0,0,0,0,1,0,0 495 | activity_588511,0,0,0,0,0,0,0,0,1,0,0,0,0,0 496 | activity_588627,1,0,0,0,0,0,0,0,0,0,0,0,0,0 497 | activity_588675,1,0,0,0,0,0,0,0,0,0,0,0,0,0 498 | activity_588676,1,0,0,0,0,0,0,0,0,0,0,0,0,0 499 | activity_1511,0,0,0,0,0,0,0,0,1,0,0,0,0,0 500 | activity_1672,0,0,0,0,0,0,0,0,1,0,0,0,0,0 501 | activity_2550,0,0,0,0,0,0,0,0,1,0,0,0,0,0 502 | activity_2553,0,0,0,0,0,0,0,0,1,0,0,0,0,0 503 | activity_2648,0,0,0,0,0,0,0,0,1,0,0,0,0,0 504 | activity_463111,0,0,0,0,0,0,0,0,0,0,0,0,0,0 505 | activity_463165,0,0,0,0,0,0,0,0,0,0,0,0,0,0 506 | activity_2156,0,0,0,0,0,0,0,0,1,0,0,0,0,0 507 | activity_2227,0,0,0,0,0,0,0,0,0,0,0,0,0,0 508 | activity_2239,0,0,0,0,0,0,0,0,1,0,0,0,0,0 509 | activity_624037,1,0,0,0,0,0,0,0,0,0,0,0,0,0 510 | activity_624038,1,0,0,0,0,0,0,0,0,0,0,0,0,0 511 | activity_624040,1,0,0,0,0,0,0,0,0,0,0,0,0,0 512 | activity_624125,1,0,0,0,0,0,0,0,0,0,0,0,0,0 513 | activity_624126,1,0,0,0,0,0,0,0,0,0,0,0,0,0 514 | activity_624127,1,0,0,0,0,0,0,0,0,0,0,0,0,0 515 | activity_624466,1,0,0,0,0,0,0,0,0,0,0,0,0,0 516 | activity_624467,1,0,0,0,0,0,0,0,0,0,0,0,0,0 517 | activity_602247,0,0,0,0,0,0,0,0,0,0,0,0,0,0 518 | activity_602248,0,0,0,0,0,0,0,0,0,0,0,0,0,0 519 | activity_602250,0,0,0,0,0,0,0,0,0,0,0,0,0,0 520 | activity_588814,1,0,0,0,0,0,0,0,0,0,0,0,0,0 521 | activity_588819,1,0,0,0,0,0,0,0,0,0,0,0,0,0 522 | activity_588852,1,0,0,0,0,0,0,0,0,0,0,0,0,0 523 | activity_504634,1,0,0,0,0,0,0,0,0,0,0,0,0,0 524 | activity_504692,1,0,0,0,0,0,0,0,0,0,0,0,0,0 525 | activity_504454,1,0,0,0,0,0,0,0,0,0,0,0,0,0 526 | activity_652048,1,0,0,0,0,0,0,0,0,0,0,0,0,0 527 | activity_652051,1,0,0,0,0,0,0,0,0,0,0,0,0,0 528 | activity_493056,1,0,0,0,0,0,0,0,0,0,0,0,0,0 529 | activity_493084,1,0,0,0,0,0,0,0,0,0,0,0,0,0 530 | activity_504660,1,0,0,0,0,0,0,0,0,0,0,0,0,0 531 | activity_652054,1,0,0,0,0,0,0,0,0,0,0,0,0,0 532 | activity_624463,1,0,0,0,0,0,0,0,0,0,0,0,0,0 533 | activity_504651,1,0,0,0,0,0,0,0,0,0,0,0,0,0 534 | activity_624465,1,0,0,0,0,0,0,0,0,0,0,0,0,0 535 | activity_504652,1,0,0,0,0,0,0,0,0,0,0,0,0,0 536 | activity_624464,1,0,0,0,0,0,0,0,0,0,0,0,0,0 537 | activity_624291,0,0,0,0,0,0,0,0,0,0,0,0,0,0 538 | activity_485347,1,0,0,0,0,0,0,0,0,0,0,0,0,0 539 | activity_485344,1,0,0,0,0,0,0,0,0,0,0,0,0,0 540 | activity_485358,1,0,0,0,0,0,0,0,0,0,0,0,0,0 541 | activity_492947,1,0,0,0,0,0,0,0,0,0,0,0,0,0 542 | activity_504810,1,0,0,0,0,0,0,0,0,0,0,0,0,0 543 | activity_504812,1,0,0,0,0,0,0,0,0,0,0,0,0,0 544 | activity_540275,0,0,0,0,0,0,0,0,1,0,0,0,0,0 545 | activity_540277,0,0,0,0,0,0,0,0,1,0,0,0,0,0 546 | activity_493036,1,0,0,0,0,0,0,0,0,0,0,0,0,0 547 | activity_1469,0,0,1,0,0,0,0,0,0,0,0,0,0,0 548 | activity_1479,0,0,1,0,0,0,0,0,0,0,0,0,0,0 549 | activity_720552,0,0,0,0,0,1,0,0,0,0,0,0,0,0 550 | activity_720719,0,0,1,0,0,0,0,0,0,0,0,0,0,0 551 | activity_720725,0,0,1,0,0,0,0,0,0,0,0,0,0,0 552 | activity_743053,0,0,1,0,0,0,0,0,0,0,0,0,0,0 553 | activity_743054,0,0,1,0,0,0,0,0,0,0,0,0,0,0 
554 | activity_743063,0,0,1,0,0,0,0,0,0,0,0,0,0,0
555 | activity_743067,0,0,1,0,0,0,0,0,0,0,0,0,0,0
556 | activity_743077,0,0,1,0,0,0,0,0,0,0,0,0,0,0
557 | activity_743078,0,0,1,0,0,0,0,0,0,0,0,0,0,0
558 | activity_743091,0,0,1,0,0,0,0,0,0,0,0,0,0,0
559 | activity_743122,0,0,0,0,0,1,0,0,0,0,0,0,0,0
560 | activity_743139,0,0,0,0,0,0,0,0,0,0,0,0,0,0
561 | activity_743140,0,0,1,0,0,0,0,0,0,0,0,0,0,0
562 | activity_1159523,0,0,1,0,0,0,0,0,0,0,0,0,0,0
563 | activity_1159528,0,0,0,0,0,1,0,0,0,0,0,0,0,0
564 | activity_1159531,0,0,1,0,0,0,0,0,0,0,0,0,0,0
565 | activity_1159555,0,0,1,0,0,0,0,0,0,0,0,0,0,0
566 | activity_1224892,0,0,1,0,0,0,0,0,0,0,0,0,0,0
567 | activity_1224893,0,0,1,0,0,0,0,0,0,0,0,0,0,0
568 | activity_1224894,0,0,0,0,0,1,0,0,0,0,0,0,0,0
569 | activity_1224895,1,0,0,0,0,0,0,0,0,0,0,0,0,0
570 | activity_1224896,0,0,0,0,0,0,0,0,0,0,0,0,0,0
571 | activity_1259247,0,0,1,0,0,0,0,0,0,0,0,0,0,0
572 | activity_1259248,0,0,1,0,0,0,0,0,0,0,0,0,0,0
573 | activity_1259387,0,0,1,0,0,0,0,0,0,0,0,0,0,0
574 | activity_1259390,0,0,0,0,0,1,0,0,0,0,0,0,0,0
575 | activity_1259391,0,0,1,0,0,0,0,0,0,0,0,0,0,0
576 | activity_1259392,0,0,0,0,0,1,0,0,0,0,0,0,0,0
577 | activity_1259393,1,0,0,0,0,0,0,0,0,0,0,0,0,0
578 | activity_1259394,0,0,1,0,0,0,0,0,0,0,0,0,0,0
579 | activity_1259395,1,0,0,0,0,0,0,0,0,0,0,0,0,0
580 | activity_1259396,0,0,1,0,0,0,0,0,0,0,0,0,0,0
581 | activity_1259401,0,0,1,0,0,0,0,0,0,0,0,0,0,0
582 | activity_1259402,0,0,1,0,0,0,0,0,0,0,0,0,0,0
583 | activity_1259403,0,0,1,0,0,0,0,0,0,0,0,0,0,0
584 | activity_1259404,0,0,1,0,0,0,0,0,0,0,0,0,0,0
585 | activity_1347030,1,0,0,0,0,0,0,0,0,0,0,0,0,0
586 | activity_1347031,0,0,1,0,0,0,0,0,0,0,0,0,0,0
587 | activity_1347032,0,0,0,0,0,1,0,0,0,0,0,0,0,0
588 | activity_1347033,0,0,0,0,0,0,0,0,0,0,0,0,0,0
589 | activity_1347036,0,0,1,0,0,0,0,0,0,0,0,0,0,0
590 | activity_1347038,1,0,0,0,0,0,0,0,0,0,0,0,0,0
591 | activity_1159518,0,0,0,0,0,0,0,0,0,0,0,0,0,0
592 | activity_1159519,0,0,0,0,0,1,0,0,0,0,0,0,0,0
593 | activity_743199,0,0,1,0,0,0,0,0,0,0,0,0,0,0
594 | activity_743219,0,0,0,0,0,1,0,0,0,0,0,0,0,0
595 | activity_743226,0,0,1,0,0,0,0,0,0,0,0,0,0,0
596 | activity_743227,0,0,1,0,0,0,0,0,0,0,0,0,0,0
597 | activity_743228,0,0,0,0,0,0,0,0,0,0,0,0,0,0
598 | activity_743239,0,0,1,0,0,0,0,0,0,0,0,0,0,0
599 | activity_743240,0,0,1,0,0,0,0,0,0,0,0,0,0,0
600 | activity_743241,0,0,1,0,0,0,0,0,0,0,0,0,0,0
601 | activity_743242,0,0,1,0,0,0,0,0,0,0,0,0,0,0
602 |
--------------------------------------------------------------------------------
/Data/all_molecular_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LumosBio/MolData/73002723e0d20d0a2fdbd3950e738ebff214eede/Data/all_molecular_data.zip
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The code and the software for this project are licensed under the MIT license, as described in the LICENSE.mit file.
2 |
3 | The data for this project is licensed under Creative Commons Zero or Creative Commons BY-SA, as described in the LICENSE.cczero file.
4 |
5 | The data sources used for the creation of the MolData dataset are referenced in "data/data_reference_list.txt" as well as in the license file for the data.
6 |
--------------------------------------------------------------------------------
/LICENSE.mit:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Transilico
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MolData - A Molecular Benchmark for Disease and Target Based Machine Learning
2 |
3 | Deep learning’s automatic feature extraction has been a revolutionary addition to computational drug discovery, providing both the capability of learning abstract features and of discovering complex molecular patterns directly from molecular data. Since biological and chemical knowledge is necessary for overcoming the challenges of data curation, balancing, training, and evaluation, it is important for databases to contain meaningful information regarding the exact target and disease of each bioassay. Existing repositories such as PubChem or ChEMBL offer screening data for millions of molecules against a variety of cells and targets; however, their bioassays contain complex biological descriptions which can hinder their usage by the machine learning community. In this work, a comprehensive disease- and target-based dataset is collected from PubChem in order to facilitate and accelerate molecular machine learning for better drug discovery. MolData is one of the largest efforts to date for democratizing molecular machine learning, with roughly 170 million drug screening results from 1.4 million unique molecules assigned to specific diseases and targets. It also provides 30 unique categories of targets and diseases. Correlation analysis of the MolData bioassays unveils valuable information for drug repurposing across multiple diseases, including cancer, metabolic disorders, and infectious diseases. Finally, we provide a benchmark of more than 30 models trained on each category using multitask learning. MolData aims to pave the way for computational drug discovery and to accelerate the advancement of molecular artificial intelligence in a practical manner.
4 |
5 | # Requirements
6 | Requirements for training the models and running the benchmark:\
7 | deepchem==2.5.0\
8 | rdkit==2020.09.1\
9 | tensorflow==2.5.0
10 |
11 | Requirements for clustering the bioassay text descriptions:\
12 | biobert-embedding==0.1.1\
13 | transformers\
14 | kneed\
15 | python-docx
16 |
17 | # How to Use
18 | After cloning, this repository can be used to perform training on the MolData dataset, or to create a molecular dataset from bioassays and their descriptions. To get benchmark results and to train models on the molecular data:\
19 | 1- Unzip the data within the data directory.\
20 | 2- Run training.py for training and evaluating a Graph Convolutional Neural Network, or a traditional ECFP-based fully connected network.\
21 | If you plan to work with bioassay descriptions, move forward to the "Preprocessing Bioassay Descriptions" section. In short, you would need to preprocess the descriptions, download the molecular data, and clean and partition the molecular data.
22 |
23 | # Data
24 | The MolData dataset can be accessed from the data directory after unzipping. all_molecular_data includes 1.4 million molecules, 600 columns of binary bioactivity labels, and the splits (training, validation, or test) the molecules belong to. aid_disease_mapping contains the mapping between bioassays and their related category of diseases, and aid_target_mapping contains the same for categories of targets. To have accurate and comparable results, please use the provided split labels for training and evaluation, as shown in the loading sketch below.
25 |
26 | The data sources used for the creation of the MolData dataset, gathered from the PubChem database, are referenced within "data/data_reference_list.txt".
27 |
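As a quick orientation, here is a minimal loading sketch. The CSV filename is an assumption (use whatever file all_molecular_data.zip actually extracts to); the `split` values and the `activity_*` column convention follow the repository's own scripts.

```python
import pandas as pd

# Hypothetical filename -- point this at the CSV extracted from all_molecular_data.zip.
df = pd.read_csv('Data/all_molecular_data.csv', low_memory=False)

# Binary bioactivity labels live in the 'activity_<AID>' columns (1, 0, or missing).
activity_columns = [c for c in df.columns if 'activity' in c]

# Use the provided split labels instead of re-splitting the molecules.
valid_df = df[df['split'] == 'validation']
test_df = df[df['split'] == 'test']
train_df = df[~df['split'].isin(['validation', 'test'])]  # remaining rows form the training set
```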
28 | # Training on the Molecular Data
29 | The training.py script offers simple training on the molecular data for all benchmarks, evaluates the trained models, and saves the results for each model. To start the training:\
30 | 1- Select the data type (disease or target) within the code.\
31 | 2- Specify the featurizer (GraphConv or ECFP). GraphConv triggers training of a Graph Convolutional Neural Network, while ECFP trains a simple fully connected neural network.\
32 | 3- Specify the training data directory (default is at data/ where you unzip the main dataset).\
33 | 4- Start the training. A condensed sketch of this workflow is shown below.
34 |
35 | Training happens on a transformed training set to overcome imbalance, where positive data points have higher weights than the negative data points for the loss function. However, evaluation is done on untransformed validation and test sets, so that the transformed weights do not affect the metric calculations and so that missing values are not counted toward them.
36 |
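For readers who want to see the moving parts before opening training.py, the following is a condensed sketch of the ECFP variant, assembled from the same DeepChem calls used elsewhere in this repository (CSVLoader, CircularFingerprint, SpecifiedSplitter, BalancingTransformer). The data filename, the model choice, and the epoch count are illustrative assumptions; training.py remains the reference implementation.

```python
import deepchem as dc
import numpy as np
import pandas as pd

# Placeholder path -- point this at the unzipped molecular data CSV.
data_file = 'data/all_molecular_data.csv'
df = pd.read_csv(data_file, low_memory=False)
tasks = [c for c in df.columns if 'activity' in c]

# Respect the provided split labels through a SpecifiedSplitter.
valid_indices = df.index[df['split'] == 'validation'].tolist()
test_indices = df.index[df['split'] == 'test'].tolist()

featurizer = dc.feat.CircularFingerprint(size=1024, chiral=True)  # ECFP features
loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer)
dataset = loader.create_dataset([data_file], shard_size=8192)

splitter = dc.splits.SpecifiedSplitter(valid_indices=valid_indices, test_indices=test_indices)
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

# Reweight positive data points on the training set only,
# so the validation and test sets stay untransformed.
train_dataset = dc.trans.BalancingTransformer(dataset=train_dataset).transform(train_dataset)

model = dc.models.MultitaskClassifier(n_tasks=len(tasks), n_features=1024)
model.fit(train_dataset, nb_epoch=10)  # epoch count is illustrative

metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
print(model.evaluate(test_dataset, [metric]))
```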
37 | # Preprocessing Bioassay Descriptions (optional)
38 | Bioassay descriptions and summaries are downloaded from PubChem as text files for 9 different sources. The scripts follow this order:\
39 | 1- Preprocessing.py: Cleans the descriptions and extracts useful information from them using pre-defined rules.\
40 | 2- Clustering.py: Uses BioBERT to extract features from the cleaned descriptions and titles, then uses KMeans to cluster them. The cluster numbers are only used as recommendations during the tagging of each bioassay.\
41 | 3- Ner.py: Uses a model trained for disease entity recognition to find all disease-related words within the descriptions. These words do not have an effect on the tagging, since the detected words were too broad.\
42 | 4- Manual_tag.py: After a human expert highlights the disease- and target-related words in all descriptions in Word files, these highlighted words are read and used for finding disease and target tags for each bioassay.
43 |
44 | # Preprocessing Molecular Data (optional)
45 | After the assays are found and tagged, the molecular data for each assay is downloaded from PubChem using PubChem's bulk download interface. The scripts regarding this section follow this order:\
46 | 1- Molecular_data_cleaning.py: Canonicalizes the SMILES, cleans duplicate SMILES, and adds binary labels to the SMILES.\
47 | 2- Fingerprint_extraction.py: Extracts ECFP4 fingerprints from the data, then uses the Tanimoto coefficient to calculate the diversity within the dataset.\
48 | 3- Correlation.py: Finds linear correlations between the labels of all datasets (bioassays); this can be a starting step for drug repurposing.\
49 | 4- Splitting.py: Splits the molecular data into train, validation, and test splits using the molecular scaffolds.
50 |
--------------------------------------------------------------------------------
/clustering.py:
--------------------------------------------------------------------------------
1 | from biobert_embedding.embedding import BiobertEmbedding
2 | import pandas as pd
3 | import os
4 | import torch
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | from kneed import KneeLocator
8 | from sklearn.cluster import KMeans
9 | from sklearn.metrics import silhouette_score
10 | from sklearn.preprocessing import StandardScaler
11 | from sklearn.preprocessing import LabelEncoder
12 | from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
13 | # from docx import Document
14 | from sklearn.decomposition import PCA
15 | import seaborn as sns
16 |
17 | def intersection(lst1, lst2):
18 |     lst3 = [value for value in lst1 if value in lst2]
19 |     return lst3
20 |
21 | def sentence_vector(tokenized_text, biobert):
22 |     encoded_layers = biobert.eval_fwdprop_biobert(tokenized_text)
23 |
24 |     # `encoded_layers` has shape [12 x 1 x 22 x 768]
25 |     # `token_vecs` is a tensor with shape [22 x 768]
26 |     token_vecs = encoded_layers[11][0]
27 |
28 |     # Calculate the average of all 22 token vectors.
29 |     sentence_embedding = torch.mean(token_vecs, dim=0)
30 |     return sentence_embedding
31 |
32 |
33 | def clean_punctuation(sample):
34 |     sample = sample.replace('-', ' ')
35 |     sample = sample.replace('/', ' ')
36 |     sample = sample.replace('(', ' ')
37 |     sample = sample.replace(')', ' ')
38 |     sample = sample.replace('\'', ' ')
39 |     sample = sample.replace('.', ' ')
40 |     sample = sample.replace(':', ' ')
41 |     sample = sample.replace(',', ' ')
42 |     sample = sample.replace(';', ' ')
43 |     sample = sample.replace('_', ' ')
44 |     return sample
45 |
46 |
47 | def delete_punctuation(sample):
48 |     sample = sample.replace('-', '')
49 |     sample = sample.replace('/', '')
50 |     sample = sample.replace('(', '')
51 |     sample = sample.replace(')', '')
52 |     sample = sample.replace('\'', '')
53 |     sample = sample.replace('.', '')
54 |     sample = sample.replace(':', '')
55 |     sample = sample.replace(',', '')
56 |     sample = sample.replace(';', '')
57 |     sample = sample.replace('_', '')
58 |     return sample
59 |
60 |
61 | df = pd.read_csv('merged.csv')
62 | df = df[df['substance_num'] >= 100]
63 | df_tox = df[df['source'] == 'Tox21']
64 | df = df[df['substance_num'] >= 100000]
65 | df = df.merge(df_tox, how='outer')
66 | biobert = BiobertEmbedding()
67 |
68 | sources = np.array(df['source'])
69 | print(np.unique(sources))
70 |
71 |
72 | embs = []
73 | long_counter = 0
74 | descs = np.array(df['description'])
75 | titles = np.array(df['name'])
76 | # Extract embeddings for descriptions
77 | for desc in descs:
78 |     desc = biobert.process_text(desc.lower())
79 |     embs.append(sentence_vector(desc[:512], biobert))
80 |     if len(desc) > 512:
81 |         long_counter += 1
82 | print(long_counter, 'out of', len(descs), 'descriptions were truncated (Max 512 tokens).')
83 |
84 | embs_np = []
85 | for e in embs:
86 |     embs_np.append(e.numpy())
87 | embs_np = np.array(embs_np)
88 | print(embs_np.shape)
89 |
90 | # Extract embeddings for titles
91 | embs_title = []
92 | long_counter = 0
93 | for title in titles:
94 |     title = biobert.process_text(title.lower())
95 |     embs_title.append(sentence_vector(title[:512], biobert))
96 |     if len(title) > 512:
97 |         long_counter += 1
98 | print(long_counter, 'out of', len(titles), 'titles were truncated (Max 512 tokens).')
99 |
100 | embs_title_np = []
101 | for e in embs_title:
102 |     embs_title_np.append(e.numpy())
103 | embs_title_np = np.array(embs_title_np)
104 | print(embs_title_np.shape)
105 |
106 | # Concatenate embeddings for both titles and descriptions
107 | features = np.concatenate((embs_np, embs_title_np), axis=1)
108 | print(features.shape)
109 |
110 | # Standardize the features
111 | scaler = StandardScaler()
112 | scaled_features = scaler.fit_transform(features)
113 |
114 | kmeans_kwargs = {
115 |     "init": "k-means++",
116 |     "n_init": 10,
117 |     "max_iter": 1000,
118 |     "random_state": 42}
119 |
120 | # Cluster the text features and find optimum number of clusters
121 | sse = []
122 | total_k = 51
123 | for k in range(1, total_k):
124 |     kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
125 |     kmeans.fit(scaled_features)
126 |     sse.append(kmeans.inertia_)
127 |     print(k)
128 | plt.style.use("fivethirtyeight")
129 | plt.plot(range(1, total_k), sse)
130 | plt.xticks(range(1, total_k))
131 | plt.xlabel("Number of Clusters")
132 | plt.ylabel("SSE")
133 | plt.tight_layout()
134 | plt.savefig('knee.png', format='png', dpi=300)
135 | plt.show()
136 |
137 | kl = KneeLocator(range(1, total_k), sse, curve="convex", direction="decreasing")
138 | optimum_k = kl.elbow
139 | print(optimum_k)
140 | kmeans = 
KMeans(n_clusters=optimum_k, **kmeans_kwargs) 141 | kmeans.fit(scaled_features) 142 | 143 | # Perform PCA to be able to display the clusters 144 | pca = PCA(n_components=2, random_state=42) 145 | pca_features = pca.fit_transform(scaled_features) 146 | label_encoder = LabelEncoder() 147 | true_labels = label_encoder.fit_transform(sources) 148 | 149 | df['cluster'] = kmeans.labels_ 150 | for i in range(len(features[0])): 151 | df['feature'+str(i)] = features[:,i] 152 | 153 | # Add cluster information to the data 154 | df.to_csv('merged_features_clustered.csv', header=True, index=False) 155 | 156 | # Display the clusters 157 | pcadf = pd.DataFrame(pca_features,columns=["Principal Component 1", "Principal Component 2"]) 158 | pcadf["Cluster"] = kmeans.labels_ 159 | pcadf["Data Source"] = label_encoder.inverse_transform(true_labels) 160 | 161 | pcadf = pcadf.replace({'Data Source': {'Broad Institute': 'Broad Institute', 'Burnham Center for Chemical Genomics': 'Burnham Center', 162 | 'Emory University Molecular Libraries Screening Center': 'Emory University', 163 | 'ICCB-Longwood Screening Facility, Harvard Medical School': 'ICCB-Longwood', 164 | 'Johns Hopkins Ion Channel Center': 'Johns Hopkins', 'NMMLSC':'NMMLSC', 165 | 'National Center for Advancing Translational Sciences (NCATS)': 'NCATS', 166 | 'The Scripps Research Institute Molecular Screening Center': 'Scripps', 'Tox21': 'Tox21'}}) 167 | 168 | # plt.style.use("fivethirtyeight") 169 | plt.style.use("default") 170 | 171 | plt.figure(figsize=(10, 8)) 172 | # fix color wheel 173 | scat = sns.scatterplot("Principal Component 1", "Principal Component 2", s=100,data=pcadf, 174 | hue="Cluster",style="Data Source", palette=sns.color_palette("tab10",len(np.unique(df['cluster'])))) 175 | # scat.set_title("Clustering results") 176 | # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0) 177 | plt.xlabel("Principal Component 1", fontsize=20) 178 | plt.ylabel("Principal Component 2", fontsize=20) 179 | plt.tick_params(axis="y",direction="in") 180 | plt.tick_params(axis="x",direction="in") 181 | plt.xticks(fontsize=15) 182 | plt.yticks(fontsize=15) 183 | plt.legend(fontsize=10, ncol=2, bbox_to_anchor=(0.44, 0.62)) 184 | # plt.legend(ncol=2) 185 | plt.tight_layout() 186 | plt.savefig('clusters.png', format='png') 187 | plt.show() 188 | 189 | ############################################################### 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import math 5 | 6 | mol_df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified_pca.csv') 7 | activity_columns = [c for c in list(mol_df.columns) if 'activity' in c] 8 | 9 | corr = mol_df[activity_columns].corr() 10 | corr.to_csv('results/correlation_all.csv', header=True, index=True) 11 | 12 | corr = pd.read_csv('results/correlation_all.csv', index_col=0) 13 | map_df = pd.read_csv('aid_tag_mapping.csv') 14 | desc_df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted_threshold.csv') 15 | desc_info = {} 16 | for i in range(len(desc_df)): 17 | desc_info['activity_' + str(desc_df.iloc[i]['AID'])] = [desc_df.iloc[i]['name'], desc_df.iloc[i]['source'], desc_df.iloc[i]['target'], desc_df.iloc[i]['benchmark_tag']] 18 | all_categories = ['all'] + list(map_df.columns)[1:] 19 | for current_tag in 
all_categories:
20 |     # current_tag = 'toxicity'
21 |     print(current_tag)
22 |     if current_tag == 'all':
23 |         current_corr = corr
24 |     else:
25 |         current_aids = np.array(list(map_df[map_df[current_tag] == 1]['AID']))
26 |         current_corr = corr[current_aids]
27 |         current_corr = current_corr[current_corr.index.isin(current_aids)]
28 |     current_corr.to_csv('results/correlation_'+current_tag+'.csv', header=True, index=True)
29 |     if current_tag in ['all','cancer']:
30 |         fontsize = 1
31 |     else:
32 |         fontsize = 4
33 |     current_columns = [c.lstrip('activity_') for c in list(current_corr.columns)]
34 |     current_corr = current_corr.reindex(list(current_corr.columns))
35 |     # current_corr.to_csv('toxicity_correlation_matrix.csv', index=True)
36 |     # current_corr = np.array(current_corr)
37 |     # dummy = []
38 |     # for i in range(len(current_corr)):
39 |     #     for j in range(len(current_corr)):
40 |     #         dummy.append([i,j,current_corr[i,j], current_columns[i], current_columns[j]])
41 |     # dummy = np.array(dummy)
42 |     # dummy_df = pd.DataFrame(dummy, columns=['x', 'y', 'correlation', 'AID_x', 'AID_y'])
43 |     # dummy_df.to_csv('toxicity_correlation_matrix.csv', index=False)
44 |     fig = plt.figure(figsize=(8,4), dpi=300)
45 |     ax = fig.add_subplot(111)
46 |     cax = ax.matshow(current_corr,cmap='coolwarm', vmin=-1, vmax=1)
47 |     fig.colorbar(cax)
48 |     ticks = np.arange(0,len(current_columns),1)
49 |     ax.set_xticks(ticks)
50 |     plt.xticks(rotation=90)
51 |     ax.set_yticks(ticks)
52 |     # plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
53 |     ax.set_xticklabels(current_columns, fontsize=fontsize)
54 |     ax.set_yticklabels(current_columns, fontsize=fontsize)
55 |     plt.xlabel('Bioassay AID', fontsize=fontsize+7)
56 |     ax.xaxis.set_label_position('top')
57 |     plt.ylabel('Bioassay AID', fontsize=fontsize+7)
58 |     plt.tight_layout()
59 |     plt.savefig('results/correlation_'+current_tag+'.png', format='png', dpi=300)
60 |     plt.show()
61 |     plt.close()
62 |
63 | corr_array = np.array(corr)
64 | corr_array_abs = np.absolute(np.array(corr))
65 | all_aids = list(corr.columns)
66 | threshold = 0.5
67 | interesting_aids = []
68 | interesting_corr = []
69 | for i in range(len(corr_array)):
70 |     first_aid = all_aids[i]
71 |     first_source = desc_info[first_aid][1]
72 |     for j in range(len(corr_array)):
73 |         if i == j:
74 |             continue
75 |         second_aid = all_aids[j]
76 |         second_source = desc_info[second_aid][1]
77 |         if first_source == 'Tox21' or second_source == 'Tox21':
78 |             continue
79 |         current_corr = corr_array[i, j]
80 |         if abs(current_corr) >= threshold:
81 |             if abs(int(first_aid.lstrip('activity_')) - int(second_aid.lstrip('activity_'))) > 5:
82 |                 if first_source == second_source:
83 |                     source_overlap = 1
84 |                 else:
85 |                     source_overlap = 0
86 |                 if first_aid not in interesting_aids:
87 |                     interesting_aids.append(first_aid)
88 |                 if second_aid not in interesting_aids:
89 |                     interesting_aids.append(second_aid)
90 |                 interesting_corr.append([first_aid, second_aid, current_corr, source_overlap])
91 | print(len(interesting_aids))
92 | interesting_corr = pd.DataFrame(data=interesting_corr, columns=['first_AID', 'second_AID', 'corr', 'same_source'])
93 | interesting_corr = interesting_corr.sort_values(by=['corr'], ascending=False)
94 | added_info = []
95 | for i in range(len(interesting_corr)):
96 |     added_info.append(desc_info[interesting_corr.iloc[i]['first_AID']] + desc_info[interesting_corr.iloc[i]['second_AID']])
97 | interesting_corr[['first_name', 'first_source', 'first_target', 'first_benchmark_tag', 'second_name', 'second_source', 'second_target', 
'second_benchmark_tag']] = np.array(added_info) 98 | interesting_corr_unique = interesting_corr.iloc[list(np.arange(0,len(interesting_corr), 2))] 99 | 100 | 101 | overlap = [] 102 | active_overlap = [] 103 | sizes = [] 104 | actives = [] 105 | for i in range(len(interesting_corr_unique)): 106 | first_aid = interesting_corr_unique.iloc[i]['first_AID'] 107 | second_aid = interesting_corr_unique.iloc[i]['second_AID'] 108 | current_df = mol_df[[first_aid, second_aid]].dropna(how='any') 109 | overlap.append(len(current_df)) 110 | current_df_1 = current_df[current_df[first_aid] == 1] 111 | current_df_2 = current_df[current_df[second_aid] == 1] 112 | actives.append([len(current_df_1), len(current_df_2)]) 113 | current_df = current_df[current_df[first_aid] == 1] 114 | current_df = current_df[current_df[second_aid] == 1] 115 | active_overlap.append(len(current_df)) 116 | sizes.append([len(mol_df[first_aid].dropna(how='any')), len(mol_df[second_aid].dropna(how='any'))]) 117 | 118 | 119 | interesting_corr_unique.insert(loc=3, column='overlap', value=overlap) 120 | interesting_corr_unique[['first_size', 'second_size']] = sizes 121 | interesting_corr_unique[['first_active', 'second_active']] = actives 122 | 123 | interesting_corr_unique.insert(loc=4, column='overlap_active', value=active_overlap) 124 | expert_df = pd.read_csv('results/correlation_interesting_expert.csv') 125 | previous = [] 126 | for i in range(len(expert_df)): 127 | first_aid = expert_df.iloc[i]['first_AID'] 128 | second_aid = expert_df.iloc[i]['second_AID'] 129 | previous.append([first_aid, second_aid]) 130 | previous.append([second_aid, first_aid]) 131 | previous_checked = [] 132 | for i in range(len(interesting_corr_unique)): 133 | first_aid = interesting_corr_unique.iloc[i]['first_AID'] 134 | second_aid = interesting_corr_unique.iloc[i]['second_AID'] 135 | if [first_aid, second_aid] in previous: 136 | previous_checked.append(1) 137 | else: 138 | previous_checked.append(0) 139 | interesting_corr_unique.insert(loc=3, column='previously_checked', value=previous_checked) 140 | 141 | interesting_corr_unique.to_csv('results/correlation_interesting.csv', header=True, index=False) 142 | # interesting_corr_unique.to_csv('results/correlation_interesting_diff_source.csv', header=True, index=False) 143 | 144 | expert_df = pd.read_csv('results/correlation_interesting_expert.csv') 145 | 146 | overlap = [] 147 | active_overlap = [] 148 | sizes = [] 149 | actives = [] 150 | for i in range(len(expert_df)): 151 | first_aid = expert_df.iloc[i]['first_AID'] 152 | second_aid = expert_df.iloc[i]['second_AID'] 153 | current_df = mol_df[[first_aid, second_aid]].dropna(how='any') 154 | overlap.append(len(current_df)) 155 | current_df_1 = current_df[current_df[first_aid] == 1] 156 | current_df_2 = current_df[current_df[second_aid] == 1] 157 | actives.append([len(current_df_1), len(current_df_2)]) 158 | current_df = current_df[current_df[first_aid] == 1] 159 | current_df = current_df[current_df[second_aid] == 1] 160 | active_overlap.append(len(current_df)) 161 | sizes.append([len(mol_df[first_aid].dropna(how='any')), len(mol_df[second_aid].dropna(how='any'))]) 162 | # a = mol_df['activity_1259404'] 163 | # len(a.dropna(how='any')) 164 | 165 | expert_df.insert(loc=3, column='overlap', value=overlap) 166 | expert_df[['first_size', 'second_size']] = sizes 167 | expert_df[['first_active', 'second_active']] = actives 168 | 169 | expert_df.insert(loc=4, column='overlap_active', value=active_overlap) 170 | 171 | # 
interesting_corr_unique.to_csv('results/correlation_interesting.csv', header=True, index=False) 172 | expert_df.to_csv('results/correlation_interesting_expert_overlap.csv', header=True, index=False) 173 | 174 | current_corr = corr[interesting_aids] 175 | current_corr = current_corr[current_corr.index.isin(interesting_aids)] 176 | current_columns = [c.lstrip('activity_') for c in list(current_corr.columns)] 177 | current_corr = current_corr.reindex(list(current_corr.columns)) 178 | fig = plt.figure(figsize=(8,4), dpi=300) 179 | ax = fig.add_subplot(111) 180 | cax = ax.matshow(current_corr,cmap='coolwarm', vmin=-1, vmax=1) 181 | fig.colorbar(cax) 182 | ticks = np.arange(0,len(current_columns),1) 183 | ax.set_xticks(ticks) 184 | plt.xticks(rotation=90) 185 | ax.set_yticks(ticks) 186 | ax.set_xticklabels(current_columns, fontsize=5) 187 | ax.set_yticklabels(current_columns, fontsize=5) 188 | plt.tight_layout() 189 | # plt.savefig('results/correlation_interesting.png', format='png', dpi=300) 190 | plt.savefig('results/correlation_interesting_diff_source.png', format='png', dpi=300) 191 | plt.show() 192 | plt.close() 193 | 194 | 195 | final_df = pd.read_csv('results/correlation_interesting_final.csv') 196 | first_aids = list(final_df['first']) 197 | second_aids = list(final_df['second']) 198 | final_aids = [] 199 | for i in range(len(first_aids)): 200 | if first_aids[i] not in final_aids: 201 | final_aids.append(first_aids[i]) 202 | if second_aids[i] not in final_aids: 203 | final_aids.append(second_aids[i]) 204 | 205 | 206 | current_corr = corr[final_aids] 207 | current_corr = current_corr[current_corr.index.isin(final_aids)] 208 | current_columns = [c.lstrip('activity_') for c in list(current_corr.columns)] 209 | current_corr = current_corr.reindex(list(current_corr.columns)) 210 | # 211 | # current_corr = np.array(current_corr) 212 | # dummy = [] 213 | # for i in range(len(current_corr)): 214 | # for j in range(len(current_corr)): 215 | # dummy.append([i,j,current_corr[i,j], current_columns[i], current_columns[j]]) 216 | # dummy = np.array(dummy) 217 | # dummy_df = pd.DataFrame(dummy, columns=['x', 'y', 'correlation', 'AID_x', 'AID_y']) 218 | # dummy_df.to_csv('interesting_correlation_matrix.csv', index=False) 219 | 220 | fig = plt.figure(figsize=(8,4), dpi=300) 221 | ax = fig.add_subplot(111) 222 | cax = ax.matshow(current_corr,cmap='coolwarm', vmin=-1, vmax=1) 223 | fig.colorbar(cax) 224 | ticks = np.arange(0,len(current_columns),1) 225 | ax.set_xticks(ticks) 226 | plt.xticks(rotation=90) 227 | ax.set_yticks(ticks) 228 | ax.set_xticklabels(current_columns, fontsize=5) 229 | ax.set_yticklabels(current_columns, fontsize=5) 230 | plt.xlabel('Bioassay AID', fontsize=5 + 7) 231 | ax.xaxis.set_label_position('top') 232 | plt.ylabel('Bioassay AID', fontsize=5 + 7) 233 | plt.tight_layout() 234 | # plt.savefig('results/correlation_interesting.png', format='png', dpi=300) 235 | plt.savefig('results/correlation_interesting_final.png', format='png', dpi=300) 236 | plt.show() 237 | plt.close() 238 | 239 | activity_columns = [c for c in list(mol_df.columns) if 'activity' in c] 240 | all_sums = mol_df[activity_columns].sum(axis=1) 241 | all_counts = mol_df[activity_columns].count(axis=1) 242 | plt.hist(all_sums, bins=range(10,141)) 243 | plt.hist(all_counts, bins='auto') 244 | count_df = pd.DataFrame(data=all_sums, columns=['active']) 245 | count_df['total'] = all_counts 246 | count_df['activity_percentage'] = 100 * count_df['active']/count_df['total'] 247 | 248 | plt.scatter(count_df['total'], 
count_df['active']) 249 | 250 | from scipy.stats import gaussian_kde 251 | x1 = np.array(count_df['total']) 252 | y1 = np.array(count_df['active']) 253 | xy1 = np.vstack([x1, y1]) 254 | z1 = gaussian_kde(xy1)(xy1) 255 | 256 | len(z1) 257 | 258 | efficacy_df = pd.DataFrame(data=x1, columns=["Number of Screens"]) 259 | efficacy_df["Number of Active Results"] = y1 260 | efficacy_df["Density"] = z1 261 | efficacy_df.to_csv("gaussian_efficacy.csv", index=False) 262 | smaller_efficacy_df = efficacy_df.sample(n=200000, random_state=42) 263 | smaller_efficacy_df.to_csv("gaussian_efficacy_small.csv", index=False) 264 | 265 | x1 = np.array(count_df['total']) 266 | y1 = np.array(count_df['activity_percentage']) 267 | xy1 = np.vstack([x1, y1]) 268 | z1 = gaussian_kde(xy1)(xy1) 269 | 270 | efficacy_label = [] 271 | for i in range(len(count_df)): 272 | current_data = count_df.iloc[i] 273 | current_total = current_data['total'] 274 | current_active = current_data['active'] 275 | current_percent = current_data['activity_percentage'] 276 | if current_percent <= 2: 277 | efficacy_label.append('Wasted') 278 | elif current_active >= 50: 279 | efficacy_label.append('Possible Toxic') 280 | elif current_total > 100: 281 | efficacy_label.append('Familiar Molecules') 282 | else: 283 | efficacy_label.append('New Molecules') 284 | 285 | toxic_line = [] 286 | for i in range(50,600,10): 287 | toxic_line.append([int(i), int(5000/i)]) 288 | import pandas as pd 289 | toxic_line_df = pd.DataFrame(data=toxic_line, columns=['x','y']) 290 | toxic_line_df.to_csv("toxic_line.csv", header=True, index=False) 291 | 292 | efficacy_df = pd.DataFrame(data=x1, columns=["Number of Screens"]) 293 | efficacy_df["Active Percentage"] = y1 294 | efficacy_df["Density"] = z1 295 | efficacy_df["Efficacy Type"] = efficacy_label 296 | efficacy_df.to_csv("gaussian_efficacy_percentage.csv", index=False) 297 | smaller_efficacy_df = efficacy_df.sample(n=300000, random_state=42) 298 | smaller_efficacy_df.to_csv("gaussian_efficacy_percentage_small.csv", index=False) 299 | 300 | all_sources = np.array(desc_df['source']) 301 | all_aids = np.array(desc_df['AID']) 302 | # aid_source = {'activity_' + str(s):[] for s in all_aids} 303 | aid_source = {} 304 | for i in range(len(all_aids)): 305 | aid_source['activity_' + str(all_aids[i])] = all_sources[i] 306 | 307 | # for s in aid_source: 308 | # print(len(aid_source[s])) 309 | 310 | source_index = {} 311 | unique_sources = np.unique(all_sources) 312 | for s in range(len(unique_sources)): 313 | source_index[unique_sources[s]] = s 314 | 315 | mol_source = [] 316 | for m in range(len(mol_df)): 317 | current_data = mol_df.iloc[m][activity_columns] 318 | current_aids = list(current_data[pd.notna(current_data)].index) 319 | current_sources = np.unique([source_index[aid_source[aid]] for aid in current_aids]) 320 | mol_source.append(current_sources) 321 | if m % 100000 == 0: 322 | print(m) 323 | 324 | source_connection = np.zeros((len(unique_sources), len(unique_sources)), dtype=int) 325 | counter = 0 326 | for s in mol_source: 327 | if len(s) == 1: 328 | source_connection[int(s[0]), int(s[0])] += 1 329 | else: 330 | for i in range(len(s)): 331 | if i == len(s) - 1: 332 | break 333 | for j in range(i + 1, len(s)): 334 | source_connection[int(s[i]), int(s[j])] += 1 335 | source_connection[int(s[j]), int(s[i])] += 1 336 | if counter % 10000 == 0: 337 | print(counter) 338 | counter += 1 339 | 340 | source_con_df = pd.DataFrame(data=source_connection, columns=unique_sources) 341 | source_con_df.insert(loc=0, 
column='source', value=unique_sources) 342 | source_con_df.to_csv('source_molecular_overlap.csv', header=True, index=False) -------------------------------------------------------------------------------- /fingerprint_extraction.py: -------------------------------------------------------------------------------- 1 | import deepchem as dc 2 | import numpy as np 3 | import pandas as pd 4 | import logging 5 | import os 6 | import matplotlib.pyplot as plt 7 | input_data = 'merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv' 8 | input_columns = list(pd.read_csv(input_data).columns) 9 | input_tasks = list(np.array(input_columns)[[True if 'activity' in c else False for c in input_columns]]) 10 | print(input_tasks) 11 | split = 'specified' 12 | featurizer = 'ECFP' 13 | 14 | 15 | data_dir = input_data 16 | 17 | # assign data and tasks 18 | dataset_file = data_dir 19 | tasks = input_tasks 20 | valid_indices, test_indices = None, None 21 | if split == 'specified': 22 | dummy_df = pd.read_csv(data_dir, low_memory=False) 23 | valid_indices = dummy_df.index[dummy_df['split'] == 'validation'].tolist() 24 | test_indices = dummy_df.index[dummy_df['split'] == 'test'].tolist() 25 | print("About to load the dataset.") 26 | 27 | # create featurizer, loader, transformers, and splitter 28 | if featurizer == 'ECFP': 29 | featurizer = dc.feat.CircularFingerprint(size=1024, chiral=True) 30 | elif featurizer == 'GraphConv': 31 | featurizer = dc.feat.ConvMolFeaturizer(use_chirality=True) 32 | loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer) 33 | splitters = { 34 | 'scaffold': dc.splits.ScaffoldSplitter(), 35 | 'specified': dc.splits.SpecifiedSplitter(valid_indices=valid_indices, test_indices=test_indices) 36 | } 37 | splitter = splitters[split] 38 | 39 | 40 | if not os.path.exists(dataset_file): 41 | print("Dataset not found") 42 | 43 | print("About to featurize the dataset.") 44 | dataset = loader.create_dataset([dataset_file], shard_size=8192) 45 | 46 | print("About to transform data") 47 | transformers = [dc.trans.BalancingTransformer(dataset=dataset)] 48 | for transformer in transformers: 49 | dataset = transformer.transform(dataset) 50 | train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset) 51 | # dc.utils.data_utils.save_dataset_to_disk(data_save_dir, train=train_dataset, valid=valid_dataset, test=test_dataset, transformers=transformers) 52 | 53 | # Extract the smiles for each split 54 | train_smiles = np.array(train_dataset.ids) 55 | valid_smiles = np.array(valid_dataset.ids) 56 | test_smiles = np.array(test_dataset.ids) 57 | 58 | train_features = np.array(train_dataset.X) 59 | valid_features = np.array(valid_dataset.X) 60 | test_features = np.array(test_dataset.X) 61 | 62 | all_features = np.concatenate((train_features, valid_features, test_features)) 63 | print(all_features.shape) 64 | # Save the smiles back into the CSV file 65 | all_smiles = np.array(list(train_smiles) + list(valid_smiles) + list(test_smiles)) 66 | labels_smiles = np.array(['train'] * len(train_smiles) + ['validation'] * len(valid_smiles) + ['test'] * len(test_smiles)) 67 | smiles_df = pd.DataFrame(data=all_features, columns=['feature_' + str(i) for i in range(1024)]) 68 | smiles_df.insert(loc=0,column='smiles', value=all_smiles) 69 | smiles_df.insert(loc=1,column='split', value=labels_smiles) 70 | smiles_df.to_csv('smiles_ecfp.csv', header=True, index=False) 71 | 72 | 73 | # Read the SMILES again for Tanimoto Coefficient calculations 74 | smiles_df = 
pd.read_csv('smiles_ecfp.csv') 75 | feature_columns = [] 76 | for c in smiles_df.columns: 77 | if 'feature' in c: 78 | feature_columns.append(c) 79 | 80 | 81 | def largest_tanimoto_similarity(f1_bool, f2_bool): 82 | # f1 is one boolean numpy array, containing the molecular fingerprint for one molecule 83 | # f2 is (N-1)*M boolean numpy matrix, containing molecular fingerprints for all molecules except f1 84 | # Returns the largest Tanimoto Coefficient between f1 and the rest of the fingerprint (most similar) 85 | f1_bool = np.tile(f1_bool, (len(f2_bool), 1)) 86 | # Overlap between "ones" from f1 and "ones" from the rest of the dataset 87 | overlap = np.sum(np.logical_and(f1_bool, f2_bool), axis=1) 88 | # Union between "ones" from f1 and "ones" from the rest of the dataset 89 | union = np.sum(np.logical_or(f1_bool, f2_bool), axis=1) 90 | return np.max(overlap/union) 91 | 92 | 93 | tanimoto_scores = [] 94 | fingerprint_array = np.array(smiles_df[feature_columns].sample(n=200000, random_state=42), dtype=bool) 95 | # fingerprint_array = np.array(smiles_df[feature_columns], dtype=bool) 96 | for i in range(len(fingerprint_array)): 97 | indices = np.arange(len(fingerprint_array)) 98 | dummy_score = largest_tanimoto_similarity(fingerprint_array[i, :], fingerprint_array[indices != i, :]) 99 | tanimoto_scores.append(dummy_score) 100 | print(i, dummy_score) 101 | 102 | np.save('results/tanimoto_scores', np.array(tanimoto_scores)) 103 | 104 | 105 | fontsize = 13 106 | fig, ax = plt.subplots(figsize=(5, 4), dpi=300) 107 | # plot the cumulative histogram 108 | n, bins, patches = ax.hist(tanimoto_scores, 200, density=True, histtype='step', 109 | cumulative=True) 110 | # Find the percentage where tanimoto score is 0.5 111 | index_tanimoto_7 = 1 - np.sum(np.array(tanimoto_scores) > 0.7) / len(tanimoto_scores) 112 | index_tanimoto_5 = 1 - np.sum(np.array(tanimoto_scores) > 0.5) / len(tanimoto_scores) 113 | 114 | # tidy up the figure 115 | ax.grid(True) 116 | plt.yticks([0,0.2,0.4,0.6,0.8,1], [0,20,40,60,80,100], fontsize=fontsize) 117 | plt.xticks(fontsize=fontsize) 118 | ax.set_ylabel('Cumulative % of Molecules', labelpad=0, fontsize=fontsize) 119 | ax.set_xlabel('Largest Tanimoto Coefficient', labelpad=0, fontsize=fontsize) 120 | 121 | # add one point in the 50 percentile 122 | plt.annotate('(0.7,'+str(round(index_tanimoto_7*100, 2))+')', # this is the text 123 | (0.70, index_tanimoto_7), # these are the coordinates to position the label 124 | textcoords="offset points", # how to position the text 125 | xytext=(0, 3), # distance from text to points (x,y) 126 | ha='right', 127 | fontsize=fontsize) # horizontal alignment can be left, right or center 128 | ax.scatter([0.7], [index_tanimoto_7], c='black', s=7, zorder=3) 129 | plt.annotate('(0.5,'+str(round(index_tanimoto_5*100, 2))+')', # this is the text 130 | (0.50, index_tanimoto_5), # these are the coordinates to position the label 131 | textcoords="offset points", # how to position the text 132 | xytext=(0, 3), # distance from text to points (x,y) 133 | ha='right', 134 | fontsize=fontsize) # horizontal alignment can be left, right or center 135 | ax.scatter([0.5], [index_tanimoto_5], c='black', s=7, zorder=3) 136 | plt.tight_layout() 137 | plt.savefig('results/cumulative_tanimoto.png', format='png', dpi=300) 138 | plt.show() 139 | -------------------------------------------------------------------------------- /manual_tagging.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | 
import torch
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from sklearn.cluster import KMeans
7 | from sklearn.metrics import silhouette_score
8 | from sklearn.preprocessing import StandardScaler
9 | from sklearn.preprocessing import LabelEncoder
10 | from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
11 | from docx import Document  # needed below to read the expert-labeled .docx files
12 | from sklearn.decomposition import PCA
13 | import seaborn as sns
14 | from fuzzywuzzy import fuzz
15 |
16 |
17 | def intersection(lst1, lst2):
18 |     lst3 = [value for value in lst1 if value in lst2]
19 |     return lst3
20 |
21 |
22 | def clean_punctuation(sample):
23 |     sample = sample.replace('-', ' ')
24 |     sample = sample.replace('/', ' ')
25 |     sample = sample.replace('(', ' ')
26 |     sample = sample.replace(')', ' ')
27 |     sample = sample.replace('\'', ' ')
28 |     sample = sample.replace('.', ' ')
29 |     sample = sample.replace(':', ' ')
30 |     sample = sample.replace(',', ' ')
31 |     sample = sample.replace(';', ' ')
32 |     sample = sample.replace('_', ' ')
33 |     return sample
34 |
35 |
36 | def delete_punctuation(sample):
37 |     sample = sample.replace('-', '')
38 |     sample = sample.replace('/', '')
39 |     sample = sample.replace('(', '')
40 |     sample = sample.replace(')', '')
41 |     sample = sample.replace('\'', '')
42 |     sample = sample.replace('.', '')
43 |     sample = sample.replace(':', '')
44 |     sample = sample.replace(',', '')
45 |     sample = sample.replace(';', '')
46 |     sample = sample.replace('_', '')
47 |     return sample
48 |
49 |
50 | df = pd.read_csv('merged_features_clustered_ner_cleaned.csv')
51 | all_tagged_dfs = []
52 | disease_clusters = []
53 | target_clusters = []
54 | total_unhighlighted = []
55 | total_merged = []
56 | for cluster_number in range(10):
57 |     cluster_df = df[df['cluster'] == cluster_number]
58 |     cluster_df.reset_index(inplace=True)
59 |     descs = np.array(cluster_df['description'])
60 |     # print(len(descs))
61 |
62 |     document = Document('labeled/'+str(cluster_number)+'.docx')
63 |     words = document._element.xpath('//w:r')
64 |
65 |     all_words = []
66 |     all_props = []
67 |     for w in words:
68 |         dummy = w.xml
69 |         dummy = dummy.replace('<w:t xml:space="preserve">', '<w:t>')
70 |         if "<w:t>" not in dummy:
71 |             continue
72 |         current_word = dummy.split("<w:t>")[1].split("</w:t>")[0].lower().strip()
73 |         if len(current_word) == 0:
74 |             continue
75 |         all_words.append(current_word)
76 |         if "\"yellow\"" in dummy:
77 |             all_props.append('disease')
78 |         elif "FF0000" in dummy:
79 |             all_props.append('target')
80 |         else:
81 |             all_props.append('normal')
82 |     # print(len(all_props), len(all_words))
83 |
84 |     text = document._element.xml
85 |     text = text.replace('<w:t xml:space="preserve">', '<w:t>')
86 |     words = text.split('<w:t>')
87 |     lines = []
88 |     all_words2 = []
89 |     all_paragraph = []
90 |     paragraph_counter = 0
91 |     high_light_counter = {}
92 |     for w in range(len(words)):
93 |         if "</w:t>" not in words[w]:
94 |             continue
95 |         current_word = words[w].split("</w:t>")[0].lower().strip()
96 |         if len(current_word) != 0:
97 |             all_words2.append(current_word)
98 |             all_paragraph.append(paragraph_counter)
99 |             if "\"yellow\"" in words[w]:
100 |                 high_light_counter[paragraph_counter] = 1
101 |         if 'paraId' in words[w]:
102 |             paragraph_counter += 1
103 |         # if "by the brm gene" in words[w].lower():
104 |         #     print(words[w], w)
105 |     # print(len(all_words2))
106 |     word_df = pd.DataFrame(data={'text': all_words,'text2': all_words2, 'paragraph': all_paragraph, 'properties': all_props})
107 |     merged = []
108 |     current_counter = 0
109 |     dummy = ''
110 |     dummy_disease = []
111 |     dummy_target = []
112 |     all_disease = []
113 |     all_target = []
114 | 
previous_i = -1 115 | for i in range(len(all_words2)): 116 | if all_paragraph[i] != current_counter: 117 | current_counter = all_paragraph[i+1] 118 | merged.append(dummy) 119 | all_disease.append(dummy_disease) 120 | all_target.append(dummy_target) 121 | dummy = '' 122 | dummy_disease = [] 123 | dummy_target = [] 124 | if all_props[i] == 'disease': 125 | if '(' in all_words2[i] and not all_words2[i].startswith('('): 126 | current_words = all_words2[i].split('(') 127 | for c in current_words: 128 | current_word = delete_punctuation(c) 129 | dummy_disease.append(current_word.strip().lower()) 130 | else: 131 | current_word = delete_punctuation(all_words2[i]) 132 | dummy_disease.append(current_word.strip().lower()) 133 | elif all_props[i] == 'target': 134 | # if '(' in all_words2[i] and not all_words2[i].startswith('('): 135 | # current_words = all_words2[i].split('(') 136 | # for c in current_words: 137 | # current_word = delete_punctuation(c) 138 | # dummy_target.append(current_word.strip().lower()) 139 | # else: 140 | # current_word = delete_punctuation(all_words2[i]) 141 | # dummy_target.append(current_word.strip().lower()) 142 | # current_word = delete_punctuation(all_words2[i]) 143 | # dummy_target.append(current_word.strip().lower()) 144 | if previous_i == i - 1: 145 | # print(all_words2[i-1].strip().lower(), all_words2[i].strip().lower()) 146 | dummy_target[-1] = dummy_target[-1] + all_words2[i].strip().lower() 147 | previous_i = i 148 | else: 149 | dummy_target.append(all_words2[i].strip().lower()) 150 | previous_i = i 151 | dummy = dummy + all_words2[i] + ' ' 152 | merged.append(dummy) 153 | all_disease.append(dummy_disease) 154 | all_target.append(dummy_target) 155 | print(cluster_number) 156 | print(len(all_target), len(all_disease), len(merged)) 157 | print(len(cluster_df['description'].drop_duplicates())) 158 | print(len(high_light_counter)) 159 | total_unhighlighted.append(len(merged) - len(high_light_counter)) 160 | total_merged.append(len(merged)) 161 | ordered_merged = [] 162 | ordered_disease = [] 163 | ordered_target = [] 164 | match_scores = [] 165 | mapping = {} 166 | for d in range(len(descs)): 167 | dummy = '' 168 | dummy_disease = [] 169 | dummy_target = [] 170 | overlap_counter = 0 171 | for m in range(len(merged)): 172 | current_counter = fuzz.token_set_ratio(merged[m], descs[d].lower()) 173 | # current_counter = len(np.unique(intersection(np.unique(merged[m].split()), np.unique(descs[d].lower().split())))) 174 | if current_counter > overlap_counter: 175 | overlap_counter = current_counter 176 | dummy = merged[m] 177 | dummy_disease = all_disease[m] 178 | dummy_target = all_target[m] 179 | mapping[d] = m 180 | # match_scores.append(overlap_counter/len(descs[d].split())) 181 | match_scores.append(overlap_counter) 182 | # print(d, match_scores[-1]) 183 | ordered_merged.append(dummy) 184 | ordered_disease.append(dummy_disease) 185 | ordered_target.append(dummy_target) 186 | 187 | ordered_disease_str = [] 188 | ordered_target_str = [] 189 | for i in range(len(ordered_disease)): 190 | dummy = '' 191 | for j in ordered_disease[i]: 192 | dummy += j + ', ' 193 | ordered_disease_str.append(dummy.rstrip(', ')) 194 | dummy = '' 195 | for j in ordered_target[i]: 196 | dummy += j + ', ' 197 | ordered_target_str.append(dummy.rstrip(', ')) 198 | 199 | # new_df = pd.DataFrame(data={'description': descs, 'new_description': ordered_merged, 'disease':ordered_disease_str, 'target': ordered_target_str}) 200 | cluster_df['recovered_description'] = ordered_merged 201 | 
cluster_df['recovery_score'] = match_scores 202 | cluster_df['disease_tags_ground_truth'] = ordered_disease_str 203 | cluster_df['target_tags_ground_truth'] = ordered_target_str 204 | all_tagged_dfs.append(cluster_df) 205 | disease_clusters.append(ordered_disease) 206 | target_clusters.append(ordered_target) 207 | merged_df = all_tagged_dfs[0] 208 | for i in range(1, len(all_tagged_dfs)): 209 | merged_df = merged_df.merge(all_tagged_dfs[i], how='outer') 210 | 211 | merged_df.to_csv('merged_features_clustered_ner_cleaned_extracted.csv', header=True, index=False) 212 | 213 | 214 | 215 | disease_clusters_merged = [] 216 | disease_merged = [] 217 | for i in disease_clusters: 218 | dummy = [] 219 | for j in i: 220 | for k in j: 221 | dummy.append(k) 222 | disease_clusters_merged.append(dummy) 223 | disease_merged.extend(dummy) 224 | 225 | disease_counter = {} 226 | for i in disease_merged: 227 | if i in disease_counter: 228 | disease_counter[i] += 1 229 | else: 230 | disease_counter[i] = 1 231 | 232 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 233 | x = list(disease_counter.keys()) 234 | y = list(disease_counter.values()) 235 | x.reverse() 236 | y.reverse() 237 | disease_counter_df = pd.DataFrame(data={'all_disease': x, 'count_all_disease': y}) 238 | 239 | all_x = [] 240 | all_y = [] 241 | for cluster_number in range(10): 242 | disease_counter = {} 243 | for i in disease_clusters_merged[cluster_number]: 244 | if i in disease_counter: 245 | disease_counter[i] += 1 246 | else: 247 | disease_counter[i] = 1 248 | 249 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 250 | x = list(disease_counter.keys()) 251 | y = list(disease_counter.values()) 252 | x.reverse() 253 | y.reverse() 254 | disease_counter_df_dummy = pd.DataFrame(data={str(cluster_number)+'_disease': x, str(cluster_number)+'_count': y}) 255 | disease_counter_df = pd.concat([disease_counter_df,disease_counter_df_dummy], axis=1) 256 | 257 | disease_counter_df.to_csv('count_all_diseases.csv', index=False, header=True) 258 | 259 | target_clusters_merged = [] 260 | target_merged = [] 261 | for i in target_clusters: 262 | dummy = [] 263 | for j in i: 264 | for k in j: 265 | dummy.append(k) 266 | target_clusters_merged.append(dummy) 267 | target_merged.extend(dummy) 268 | 269 | target_counter = {} 270 | for i in target_merged: 271 | if i in target_counter: 272 | target_counter[i] += 1 273 | else: 274 | target_counter[i] = 1 275 | 276 | target_counter = dict(sorted(target_counter.items(), key=lambda item: item[1])) 277 | x = list(target_counter.keys()) 278 | y = list(target_counter.values()) 279 | x.reverse() 280 | y.reverse() 281 | target_counter_df = pd.DataFrame(data={'all_target': x, 'count_all_target': y}) 282 | 283 | all_x = [] 284 | all_y = [] 285 | for cluster_number in range(10): 286 | target_counter = {} 287 | for i in target_clusters_merged[cluster_number]: 288 | if i in target_counter: 289 | target_counter[i] += 1 290 | else: 291 | target_counter[i] = 1 292 | 293 | target_counter = dict(sorted(target_counter.items(), key=lambda item: item[1])) 294 | x = list(target_counter.keys()) 295 | y = list(target_counter.values()) 296 | x.reverse() 297 | y.reverse() 298 | target_counter_df_dummy = pd.DataFrame(data={str(cluster_number)+'_target': x, str(cluster_number)+'_count': y}) 299 | # target_counter_df[str(cluster_number)+'target'] = x 300 | # target_counter_df[str(cluster_number)+'count'] = y 301 | target_counter_df = 
pd.concat([target_counter_df,target_counter_df_dummy], axis=1) 302 | 303 | target_counter_df.to_csv('count_all_targets.csv', index=False, header=True) 304 | 305 | ner_results = np.array(merged_df['ner_tags']) 306 | ner_results_cleaned = [] 307 | for n in ner_results: 308 | ner_results_cleaned.append(np.unique(n.strip('[]').replace('\'', '').split(','))) 309 | ner_results_str = [] 310 | for n in ner_results_cleaned: 311 | dummy_tag = '' 312 | for t in n: 313 | dummy_tag += t.strip() + ',' 314 | ner_results_str.append(dummy_tag.rstrip(',')) 315 | merged_df.pop('ner_tags') 316 | merged_df['ner_tags'] = ner_results_str 317 | 318 | merged_df.to_csv('merged_features_clustered_ner_cleaned_extracted.csv', header=True, index=False) 319 | 320 | ner_results_str = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted.csv')['ner_tags']) 321 | cluster_nums = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted.csv')['cluster']) 322 | 323 | disease_merged = [] 324 | disease_clusters_merged = [[],[],[],[],[],[],[],[],[],[]] 325 | for i in range(len(ner_results_str)): 326 | n = ner_results_str[i] 327 | if pd.isna(n): 328 | continue 329 | dummy = n.split(',') 330 | if len(dummy) > 0: 331 | disease_merged.extend(dummy) 332 | disease_clusters_merged[cluster_nums[i]].extend(dummy) 333 | 334 | disease_counter = {} 335 | for i in disease_merged: 336 | if i in disease_counter: 337 | disease_counter[i] += 1 338 | else: 339 | disease_counter[i] = 1 340 | 341 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 342 | x = list(disease_counter.keys()) 343 | y = list(disease_counter.values()) 344 | x.reverse() 345 | y.reverse() 346 | disease_counter_df = pd.DataFrame(data={'all_disease': x, 'count_all_disease': y}) 347 | 348 | all_x = [] 349 | all_y = [] 350 | for cluster_number in range(10): 351 | disease_counter = {} 352 | for i in disease_clusters_merged[cluster_number]: 353 | if i in disease_counter: 354 | disease_counter[i] += 1 355 | else: 356 | disease_counter[i] = 1 357 | 358 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 359 | x = list(disease_counter.keys()) 360 | y = list(disease_counter.values()) 361 | x.reverse() 362 | y.reverse() 363 | disease_counter_df_dummy = pd.DataFrame(data={str(cluster_number)+'_disease': x, str(cluster_number)+'_count': y}) 364 | # disease_counter_df[str(cluster_number)+'disease'] = x 365 | # disease_counter_df[str(cluster_number)+'count'] = y 366 | disease_counter_df = pd.concat([disease_counter_df,disease_counter_df_dummy], axis=1) 367 | 368 | disease_counter_df.to_csv('count_all_diseases_ner.csv', index=False, header=True) 369 | 370 | import pandas as pd 371 | import numpy as np 372 | annot_df = pd.read_csv('key.csv') 373 | annot_classes = list(np.unique(np.array(annot_df['class']))) 374 | annot_map = {} 375 | for a in range(len(annot_classes)): 376 | annot_map[a] = annot_classes[a] 377 | 378 | disease_class = [] 379 | for a in annot_classes: 380 | current_df = annot_df[annot_df['class'] == a] 381 | current_diseases = list(np.unique(np.array(current_df['name']))) 382 | disease_class.append(current_diseases) 383 | annot_diseases = list(np.unique(np.array(annot_df['name']))) 384 | df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted.csv') 385 | diseases = np.array(df['disease_tags_ground_truth']) 386 | unannotated = [] 387 | benchmark_tag = [] 388 | for i in range(len(diseases)): 389 | n = diseases[i] 390 | if pd.isna(n): 391 | benchmark_tag.append('') 392 | continue 
393 | dummy = n.split(',') 394 | tag_dummy = [] 395 | if len(dummy) == 0: 396 | benchmark_tag.append('') 397 | continue 398 | for dc in dummy: 399 | disease = dc.strip() 400 | for d_class in range(len(disease_class)): 401 | if disease in disease_class[d_class]: 402 | if annot_map[d_class] not in tag_dummy: 403 | tag_dummy.append(annot_map[d_class]) 404 | if disease not in annot_diseases: 405 | if disease not in unannotated: 406 | unannotated.append(disease) 407 | print(disease) 408 | tag_dummy_str = '' 409 | for tag_d in tag_dummy: 410 | tag_dummy_str += tag_d + ',' 411 | benchmark_tag.append(tag_dummy_str.rstrip(',')) 412 | df['benchmark_tag'] = benchmark_tag 413 | df.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked.csv', header=True, index=False) 414 | 415 | dummy = df[df['benchmark_tag'] == ''] 416 | 417 | print(len(dummy), len(dummy.dropna(subset=['disease_tags_ground_truth'])),len(dummy.dropna(subset=['target_tags_ground_truth'])), len(df)) 418 | print(np.sum(total_unhighlighted)) 419 | print(np.sum(total_merged)) 420 | for i in range(10): 421 | print('Cluster', str(i), 'has', str(total_unhighlighted[i]), 'not highlighted out of the total', 422 | str(total_merged[i]), 'paragraphs and', str(len(dummy[dummy['cluster'] == i])), 423 | 'untagged descriptions out of', str(len(df[df['cluster'] == i])), 'descriptions.') 424 | print('Total', str(np.sum(total_unhighlighted)), 'not highlighted paragraphs and', str(len(dummy)), 425 | 'untagged descriptions and', str(len(df)-len(dummy)), 'tagged descriptions.') 426 | 427 | 428 | 429 | 430 | useless_tags = [ 431 | 'gpr119', 432 | 'damage', 433 | 'all', 434 | 'aml', 435 | 'syndrome', 436 | 'mbt', 437 | 'anthelmintics', 438 | 'antithrombic', 439 | 'embryonic lethality', 440 | 'transcriptional and translational regulation', 441 | 'sds', 442 | 'type iv', 443 | 'ftld', 444 | 'epigenetics', 445 | 'hnf4', 446 | 'metabolic', 447 | 'neonatal', 448 | 'hh', 449 | 'autophagy', 450 | 'colorectal', 451 | 'trust', 452 | 'ra', 453 | 'cellular proliferation', 454 | 'liver', 455 | 'mpds', 456 | 'adult bone formation', 457 | 'firefly', 458 | 'firefly luciferase', 459 | 'luminescent', 460 | 'diseases', 461 | 'disease', 462 | 'dyrk1 kinase', 463 | 'cell survival', 464 | 'mv', 465 | 'of mycobacterium tuberculosis', 466 | 'diarrhea', 467 | 'mas', 468 | 'at', 469 | 'gsdii', 470 | 'omim 230400', 471 | 'muscle', 472 | 'pca', 473 | 'proliferation stimuli', 474 | 'ad', 475 | 'cf', 476 | 'luciferase', 477 | 'gpcr', 478 | 'cjd', 479 | 'hd', 480 | 'skeletal muscle', 481 | 'frda', 482 | 'hcs', 483 | 'mm', 484 | 'cll', 485 | 'cellular senescence', 486 | 'tyrosine kinases', 487 | 'kappab', 488 | 'cml', 489 | 'drg', 490 | 'alqts', 491 | 'relapse', 492 | 'vascular smooth muscle', 493 | 'muscle diseases', 494 | 'cytotoxic', 495 | 'tgfbeta antagonists', 496 | 'pxrluc', 497 | 'liver regeneration' 498 | ] 499 | for u in unannotated: 500 | if u not in useless_tags: 501 | print(u) 502 | 503 | # unused_aids.extend(list(df[df['benchmark_tag'] == '']['AID'])) 504 | unused_aids = [ 505 | 588856, 506 | 588855, 507 | 1663, 508 | 2216, 509 | 1832, 510 | 782, 511 | 588342, 512 | 1865, 513 | 2599, 514 | 540295, 515 | 540308, 516 | 720647, 517 | 743238, 518 | 588674, 519 | 602363, 520 | 651704, 521 | 651658, 522 | 488862, 523 | 504414, 524 | 652115, 525 | 504441, 526 | 504408, 527 | 602252, 528 | 485317, 529 | 2288, 530 | 2289, 531 | 2629, 532 | 1875, 533 | 2094, 534 | 2098, 535 | 2563, 536 | 588478, 537 | 1159583, 538 | 485294, 539 | 485341, 540 | 1721, 541 | 1722, 542 | 651999, 
543 | 2805, 544 | 2806, 545 | 434973, 546 | 2524, 547 | 2540, 548 | 2544, 549 | 1016, 550 | 1006, 551 | 1020, 552 | 1027, 553 | 1136, 554 | 720516] 555 | used_aids = [aid for aid in df['AID'] if aid not in unused_aids] 556 | print(len(df)) 557 | df_cleaned = df[df['AID'].isin(used_aids)] 558 | print(len(df_cleaned)) 559 | df_cleaned.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned.csv', header=True, index=False) 560 | 561 | -------------------------------------------------------------------------------- /molecular_data_cleaning.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from rdkit.Chem import MolFromSmiles, MolToSmiles 4 | np.random.seed(123) 5 | 6 | 7 | def canon_smile(smile): 8 | return MolToSmiles(MolFromSmiles(smile), isomericSmiles=True) 9 | 10 | # get all AIDs 11 | aids = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked.csv')['AID']) 12 | 13 | # Create download links for all datasets 14 | str_aid = '' 15 | for aid in aids: 16 | str_aid+='https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?query=download&record_type=datatable&actvty=all&response_type=save&aid='+str(aid) + '\n' 17 | 18 | text_file = open('download_links.txt', "w+") 19 | n = text_file.write(str_aid) 20 | text_file.close() 21 | 22 | # After the datasets are downloaded, place them under the datasets directory (this needs to be done manually) 23 | 24 | # get all CIDs for molecules 25 | all_cids = [] 26 | counter = 0 27 | for aid in aids: 28 | all_cids.extend(np.array(pd.read_csv('datasets/AID_'+str(aid)+'_datatable_all.csv', usecols=['PUBCHEM_CID'])['PUBCHEM_CID'], dtype=int)) 29 | print(counter, aid) 30 | counter += 1 31 | 32 | print(len(all_cids)) 33 | # Find unique CIDs 34 | unique_cids = np.unique(np.array(all_cids)) 35 | print(len(unique_cids)) 36 | 37 | # Split the CIDs into 3 lists for download, since PubChem limits the number of IDs per request 38 | for chunk in range(3): 39 | str_cid = '' 40 | for cid in unique_cids[499999 * chunk: 499999 * (chunk + 1)]: 41 | str_cid += str(cid) + '\n' 42 | text_file = open('unique_cids_'+str(chunk)+'.txt', "w+") 43 | n = text_file.write(str_cid) 44 | text_file.close() 45 | 46 | # Manually download the SMILES from PubChem using the three text files and place them under the smiles directory 47 | 48 | # Read all SMILES 49 | li = [] 50 | for filename in range(3): 51 | df = pd.read_csv('smiles/'+str(filename)+'.txt', delimiter='\t') 52 | li.append(df) 53 | smiles_df = pd.concat(li, axis=0, ignore_index=True) 54 | 55 | # Make all SMILES canonical 56 | canon_list = [] 57 | counter = 0 58 | error_counter = 0 59 | all_smiles = np.array(smiles_df['smiles']) 60 | for s in all_smiles: 61 | try: 62 | canon_dummy = canon_smile(s) 63 | except Exception: # SMILES that RDKit cannot parse are stored as empty strings 64 | canon_dummy = '' 65 | error_counter += 1 66 | canon_list.append(canon_dummy) 67 | if counter % 100000 == 0: 68 | print(counter, error_counter) 69 | counter += 1 70 | smiles_df['canon_smiles'] = np.array(canon_list) 71 | # Save mapping between input SMILES and canonical SMILES 72 | smiles_df.to_csv('smiles/canon_map.csv', header=True, index=False) 73 | 74 | 75 | # Find bioactivity labels from the datasets 76 | aids = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked.csv')['AID']) 77 | all_df = [] 78 | for aid in aids: 79 | df = pd.read_csv('datasets/AID_'+str(aid)+'_datatable_all.csv', low_memory=False, usecols=['PUBCHEM_RESULT_TAG']) 80 | # find the row where the data begins 81 | correct_column = 3
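# NOTE: PubChem assay datatables put several metadata rows (result type, description, etc.) between the CSV header and the first record; the first real data row is the one whose PUBCHEM_RESULT_TAG equals '1'. The loop below locates that row (defaulting to row 3) so the metadata rows can be skipped when each table is re-read.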
82 | for i in range(8): 83 | if df['PUBCHEM_RESULT_TAG'][i] == '1': 84 | correct_column = i 85 | print(aid, correct_column) 86 | all_df.append(pd.read_csv('datasets/AID_'+str(aid)+'_datatable_all.csv', low_memory=False, skiprows=list(range(1,correct_column+1)), 87 | usecols=lambda c: c in set(['PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CID', 'Phenotype', 'Activity Summary']))) 88 | 89 | # Make a dictionary from the mapping for canonical SMILES 90 | canon_df = pd.read_csv('smiles/canon_map.csv') 91 | canon_map = pd.Series(canon_df['canon_smiles'].values,index=canon_df['PUBCHEM_CID']).to_dict() 92 | 93 | 94 | all_out_keys = ['Active', 'Inactive', 'Inconclusive', 'Unspecified'] 95 | all_pheno_keys = ['Activator', 'Active', 'Cytotoxic', 'Fluorescent', 'Inactive', 'Inconclusive', 'Inhibitor', 96 | 'Quencher', 'Signal activator', 'Signal inhibitor', 'ikB active'] 97 | all_summary_keys = ['active agonist', 'active antagonist', 'inactive', 'inconclusive', 'inconclusive agonist', 98 | 'inconclusive agonist (cytotoxic)', 'inconclusive agonist (fluorescent)', 'inconclusive antagonist', 99 | 'inconclusive antagonist (cytotoxic)', 'inconclusive antagonist (fluorescent)'] 100 | 101 | def check_type_exist(type_, dict_): 102 | if type_ in dict_: 103 | return dict_[type_] 104 | else: 105 | return 0 106 | 107 | # go through all datasets and clean the data 108 | all_count_type = [] 109 | all_df_cleaned = [] 110 | # flag = 0 111 | for file_counter in range(len(all_df)): 112 | # file_counter = 0 113 | df = all_df[file_counter] 114 | print(aids[file_counter]) 115 | print('Initial shape: ', df.shape) 116 | # print(df.columns) 117 | 118 | # Delete empty or duplicate smiles 119 | df = df.dropna(subset=['PUBCHEM_CID']) 120 | df = df.drop_duplicates(subset='PUBCHEM_CID', keep='first') 121 | # df.reset_index(inplace=True) 122 | print('Shape after deleting empty or duplicate smiles: ', df.shape) 123 | 124 | # Delete data points with inconclusive or unspecified bioactivity labels 125 | df = df[df['PUBCHEM_ACTIVITY_OUTCOME'] != 'Inconclusive'] 126 | df = df[df['PUBCHEM_ACTIVITY_OUTCOME'] != 'Unspecified'] 127 | print('Shape after deleting inconclusive and unspecified: ', df.shape) 128 | # df = df[df[phenotype_column] != 'Cytotoxic'] 129 | 130 | # Find unique values in columns 131 | df_dropped = df.dropna(subset=['PUBCHEM_ACTIVITY_OUTCOME']) 132 | phenotype_column = None 133 | outcome_column = None 134 | summary_column = None 135 | pheno_dict = {} 136 | outcome_dict = {} 137 | summary_dict = {} 138 | for column in df.columns: 139 | if 'Phenotype' in column: 140 | phenotype_column = column 141 | # print(column, np.unique(df_dropped[column])) 142 | pheno_dict = df[phenotype_column].value_counts().to_dict() 143 | # all_pheno_keys.extend(pheno_dict.keys()) 144 | print(pheno_dict) 145 | if 'OUTCOME' in column: 146 | outcome_column = column 147 | # print(column, np.unique(df_dropped[column])) 148 | outcome_dict = df[outcome_column].value_counts().to_dict() 149 | # all_out_keys.extend(outcome_dict.keys()) 150 | print(outcome_dict) 151 | if 'Summary' in column: 152 | summary_column = column 153 | # print(column, np.unique(df_dropped[column])) 154 | summary_dict = df[summary_column].value_counts().to_dict() 155 | # all_summary_keys.extend(summary_dict.keys()) 156 | print(summary_dict) 157 | count_type = [] 158 | for k in all_out_keys: 159 | count_type.append(check_type_exist(k, outcome_dict)) 160 | for k in all_pheno_keys: 161 | count_type.append(check_type_exist(k, pheno_dict)) 162 | for k in all_summary_keys: 163 | count_type.append(check_type_exist(k, summary_dict))
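# (check_type_exist(k, d) is equivalent to d.get(k, 0); the per-assay counts of every outcome/phenotype/summary label collected here feed the commented-out type_count export below)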
164 | all_count_type.append(count_type) 165 | 166 | # convert bioactivity to binary labels 167 | df[outcome_column] = df[outcome_column].replace({'Active': 1, 'Inactive': 0}) 168 | # print(df.columns) 169 | # Save the bioactivity labels under a column with name "activity_" + AID number 170 | df.rename({outcome_column: 'activity_'+str(aids[file_counter])}, axis=1, inplace=True) 171 | 172 | # Insert canonical SMILES 173 | df['smiles'] = df['PUBCHEM_CID'].map(canon_map) 174 | # Delete duplicate SMILES 175 | df = df.drop_duplicates(subset='smiles', keep='first') 176 | # df.reset_index(inplace=True) 177 | print('Shape after deleting duplicate canon smiles: ', df.shape) 178 | 179 | # Shuffle and save 180 | df = df.sample(frac=1, random_state=42).reset_index(drop=True) 181 | all_df_cleaned.append(df) 182 | df.to_csv('cleaned_datasets/'+ str(aids[file_counter])+'_cleaned.csv', header=True, index=False) 183 | # all_count_type = np.array(all_count_type) 184 | # count_df = pd.DataFrame(data=all_count_type, columns=all_out_keys+all_pheno_keys+all_summary_keys) 185 | # count_df.insert(0, 'AID', aids) 186 | # count_df.to_csv('type_count_wo_inconclusive.csv', index=False, header=True) 187 | 188 | # Merge all datasets together 189 | all_df = [pd.read_csv('cleaned_datasets/'+ str(aid)+'_cleaned.csv', low_memory=False) for aid in aids] 190 | for file_counter in range(len(all_df)): 191 | df = all_df[file_counter] 192 | outcome_column = None 193 | for column in df.columns: 194 | if 'activity' in column: 195 | outcome_column = column 196 | # print(column, np.unique(df[column])) 197 | # if len(np.unique(df[outcome_column])) == 0: 198 | # continue 199 | df = df[['smiles', outcome_column]] 200 | if file_counter == 0: 201 | merged = df 202 | else: 203 | merged = merged.merge(df, on=['smiles'], how='outer') 204 | print(file_counter) 205 | print(merged.head()) 206 | print(merged.shape) 207 | print(merged.iloc[0]) 208 | smiles_canon = merged['smiles'] 209 | # merged.nunique(axis=0) 210 | # print('smiles and unique smiles: ', len(smiles_canon), len(np.unique(smiles_canon))) 211 | print('number of rows with missing data: ', merged.shape[0] - merged.dropna().shape[0]) 212 | 213 | # Insert CIDs into the data 214 | canon_map_reversed = {} 215 | for k in canon_map: 216 | if canon_map[k] not in canon_map_reversed: 217 | canon_map_reversed[canon_map[k]] = str(k) 218 | else: 219 | canon_map_reversed[canon_map[k]] += ','+str(k) 220 | cleaned_ids = [] 221 | for s in smiles_canon: 222 | cleaned_ids.append(canon_map_reversed[s]) 223 | merged.insert(1,'PUBCHEM_CID', np.array(cleaned_ids)) 224 | # Save the cleaned dataset 225 | merged.to_csv('merged_cleaned.csv', header=True, index=False) 226 | 227 | merged = pd.read_csv('merged_cleaned.csv') 228 | # These AIDs were found during manual labeling not to belong to any disease category, which is why the list is hardcoded 229 | unused_aids = [ 230 | 588856, 231 | 588855, 232 | 1663, 233 | 2216, 234 | 1832, 235 | 782, 236 | 588342, 237 | 1865, 238 | 2599, 239 | 540295, 240 | 540308, 241 | 720647, 242 | 743238, 243 | 588674, 244 | 602363, 245 | 651704, 246 | 651658, 247 | 488862, 248 | 504414, 249 | 652115, 250 | 504441, 251 | 504408, 252 | 602252, 253 | 485317, 254 | 2629, 255 | 1875, 256 | 2094, 257 | 2098, 258 | 2288, 259 | 2289, 260 | 2563, 261 | 588478, 262 | 1159583, 263 | 485294, 264 | 485341, 265 | 1721, 266 | 1722, 267 | 651999, 268 | 2805, 269 | 2806, 270 | 434973, 271 | 2524, 272 | 2540, 273 | 2544, 274 | 1016, 275 | 1006, 276 | 1020, 277 | 1027, 278 | 1136, 279 | 720516]
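# Dropping these assay columns can leave molecules with no remaining activity labels; such all-NaN rows are removed right after the drop below.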
280 | print(len(merged.columns)) 281 | # drop unused datasets 282 | merged.drop(['activity_'+str(a) for a in unused_aids], axis=1, inplace=True) 283 | print(len(merged.columns)) 284 | print(len(merged)) 285 | activity_columns = [c for c in merged.columns if 'activity' in c] 286 | # drop empty rows 287 | merged.dropna(subset=activity_columns, how='all', inplace=True) 288 | print(merged.shape) 289 | # Save the new merged datasets with unused bioassays deleted 290 | merged.to_csv('merged_cleaned_benchmarked.csv', header=True, index=False) 291 | 292 | # Read the description file for bioassays information 293 | df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned.csv') 294 | # Compare the molecule counts recovered from the merged dataset against the counts reported in the descriptions; they should be close to each other 295 | active_num = [] 296 | total_num = [] 297 | for aid in activity_columns: 298 | aid_number = int(aid.replace('activity_', '')) # replace() avoids the character-set pitfall of str.lstrip 299 | df_dummy = df[df['AID'] == aid_number] 300 | active_num.append(len(merged[merged[aid] == 1])) 301 | total_num.append(len(merged[aid].dropna())) 302 | if not pd.isna(df_dummy['active_num']).bool(): 303 | print(aid_number, int(df_dummy['substance_num']), int(df_dummy['active_num']), len(merged[aid].dropna()), len(merged[merged[aid] == 1])) 304 | else: 305 | print(aid_number, int(df_dummy['substance_num']), len(merged[aid].dropna()), len(merged[merged[aid] == 1])) 306 | 307 | 308 | print(len(active_num) - np.sum(np.array(active_num)>=50)) 309 | 310 | # Number of molecules after cleaning 311 | df['recovered_substance_num'] = np.array(total_num) 312 | df['recovered_active_num'] = np.array(active_num) 313 | 314 | # Save molecular data with a column for count 315 | df.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted.csv', header=True, index=False) 316 | 317 | # Delete bioassays that have fewer than 15 active data points 318 | df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted.csv') 319 | print(len(df)) 320 | df = df[df['recovered_active_num'] >= 15] 321 | print(len(df)) 322 | 323 | # Save the molecular data after cleaning, verifying the count, and deleting unused bioassays or bioassays with fewer than 15 active molecules 324 | df.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted_threshold.csv', header=True, index=False) 325 | 326 | -------------------------------------------------------------------------------- /ner.py: -------------------------------------------------------------------------------- 1 | from transformers.pipelines import pipeline 2 | import transformers 3 | import torch 4 | import pandas as pd 5 | import numpy as np 6 | print(transformers.__version__) 7 | print(torch.__version__) 8 | 9 | classifier = pipeline("ner", model='alvaroalon2/biobert_diseases_ner') 10 | df = pd.read_csv('merged.csv') 11 | df = df[df['substance_num'] >= 100] 12 | df_tox = df[df['source'] == 'Tox21'] 13 | df = df[df['substance_num'] >= 100000] 14 | df = df.merge(df_tox, how='outer') 15 | # biobert = BiobertEmbedding() 16 | 17 | sources = np.array(df['source']) 18 | print(np.unique(sources)) 19 | def clean_punctuation(sample): 20 | sample = sample.replace('-', ' ') 21 | sample = sample.replace('/', ' ') 22 | sample = sample.replace('(', ' ') 23 | sample = sample.replace(')', ' ') 24 | sample = sample.replace('\'', ' ') 25 | sample = sample.replace('.', ' ') 26 | sample = sample.replace(':', ' ') 27 | sample = sample.replace(',', ' ')
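# (this whole replace chain is equivalent to a single regex pass, assuming re is imported: sample = re.sub(r"[-/()'.:,;_]", ' ', sample))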
28 | sample = sample.replace(';', ' ') 29 | sample = sample.replace('_', ' ') 30 | return sample 31 | descs = np.array(df['description']) 32 | titles = np.array(df['name']) 33 | descs = np.array([clean_punctuation(d) for d in descs]) 34 | titles = np.array([clean_punctuation(d) for d in titles]) 35 | 36 | 37 | disease_tags = [] 38 | for i in range(len(descs)): 39 | dummy = titles[i].lower() + ' . ' + descs[i].lower() 40 | # dummy = dummy.split() 41 | ner = classifier(dummy) 42 | disease_dummy = [] 43 | for j in ner: 44 | if 'DISEASE' in j['entity']: 45 | w = j['word'] 46 | if '##' in w and len(disease_dummy) != 0: # '##' marks a WordPiece continuation token; append it to the previous word 47 | disease_dummy[-1] = disease_dummy[-1] + w.lstrip('##') 48 | else: 49 | disease_dummy.append(w) 50 | disease_tags.append(disease_dummy) 51 | df = pd.read_csv('merged_features_clustered.csv') 52 | print(len(df), len(disease_tags)) 53 | 54 | 55 | df.insert(10, 'ner_tags', disease_tags) 56 | df.to_csv('merged_features_clustered_ner.csv') 57 | 58 | df = pd.read_csv('merged_features_clustered_ner.csv') 59 | for column in df.columns: 60 | if 'feature' in column or 'Unnamed' in column: 61 | df.drop(column, axis=1, inplace=True) 62 | df.to_csv('merged_features_clustered_ner_cleaned.csv', header=True, index=False) 63 | 64 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # Data from multiple sources is downloaded from PubChem and placed in the data directory in separate folders 6 | data_dir = 'data/' 7 | folder_names = sorted(listdir(data_dir)) 8 | summs = [data_dir+f+'/summary.txt' for f in folder_names] 9 | descs = [data_dir+f+'/description.txt' for f in folder_names] 10 | print(len(summs)) 11 | data = [] 12 | for file_counter in range(len(summs)): 13 | # file_counter = 0 14 | summ = summs[file_counter] 15 | with open(summ) as file: 16 | lines = [] 17 | for line in file: 18 | lines.append(line.rstrip('\n')) 19 | block_idx = [] 20 | for i in range(len(lines)): 21 | if lines[i] == '': 22 | block_idx.append(i) 23 | for block_counter in range(len(block_idx)): 24 | # block_counter = 0 25 | if block_counter != len(block_idx) - 1: 26 | current_lines = lines[block_idx[block_counter]+1:block_idx[block_counter+1]] 27 | else: 28 | current_lines = lines[block_idx[block_counter]+1:] 29 | block_number = current_lines[0].split('.')[0] 30 | if int(block_number) != block_counter + 1: 31 | print('Error in parsing block '+str(block_number)+' in '+summ) 32 | name = current_lines[0].lstrip(block_number+'.').lstrip() # NOTE: lstrip strips a character set, not a prefix; safe only while no name begins with these digits or '.' 33 | source = np.NaN 34 | aid = np.NaN 35 | sub_num = np.NaN 36 | active_num = np.NaN 37 | target = np.NaN 38 | for line in current_lines: 39 | if line == current_lines[0]: 40 | continue 41 | # extract data source 42 | if line.startswith('Source'): 43 | source = line.split(':')[1].lstrip() 44 | # extract bioassay AID 45 | elif line.startswith('AID'): 46 | aid = line.split(':')[1].lstrip() 47 | # extract number of molecules and active molecules 48 | elif line.startswith('Substance BioActivity'): 49 | dummy = line.split(':')[1].lstrip() 50 | dummy_num = dummy.split() 51 | for num_counter in range(len(dummy_num)): 52 | if 'Active' in dummy_num[num_counter]: 53 | active_num = int(dummy_num[num_counter-1]) 54 | if 'Tested' in dummy_num[num_counter]: 55 | sub_num = int(dummy_num[num_counter-1]) 56 | # extract target 57 | elif line.startswith('Protein
Targets') or line.startswith('Protein Target'): 58 | target = line.split(':')[1].lstrip() 59 | else: 60 | print('UNUSED DATA:', line) 61 | data.append([aid, name, source, block_number, target, sub_num, active_num]) 62 | 63 | # Parse descriptions 64 | data_desc = [] 65 | for file_counter in range(len(descs)): 66 | desc = descs[file_counter] 67 | with open(desc) as file: 68 | lines = [] 69 | for line in file: 70 | lines.append(line.rstrip('\n')) 71 | block_idx = [] 72 | for i in range(len(lines)): 73 | if lines[i] == '': 74 | block_idx.append(i) 75 | for block_counter in range(len(block_idx)): 76 | if block_counter != len(block_idx) - 1: 77 | current_lines = lines[block_idx[block_counter] + 1:block_idx[block_counter + 1]] 78 | else: 79 | current_lines = lines[block_idx[block_counter] + 1:] 80 | if '.' not in current_lines[0]: 81 | continue 82 | block_number = current_lines[0].split('.')[0] 83 | if int(block_number) != block_counter + 1: 84 | print('Error in parsing block ' + str(block_number) + ' in ' + desc) 85 | name = current_lines[0].lstrip(block_number + '.').lstrip() 86 | source = np.NaN 87 | aid = np.NaN 88 | description = np.NaN 89 | for line in current_lines: 90 | # line=current_lines[1] 91 | if line == current_lines[0]: 92 | continue 93 | if line.startswith('Source:') and '_||_' not in line: 94 | source = line.split(':')[1].lstrip() 95 | elif line.startswith('AID:') and '_||_' not in line: 96 | aid = line.split(':')[1].lstrip() 97 | else: 98 | # Rules for parsing descriptions from different data sources 99 | dummy_lines = line.split('_||_') 100 | if folder_names[file_counter] == 'Broad_Ins': 101 | description = line.replace('_||_', ' ') 102 | elif folder_names[file_counter] == 'Emory': 103 | description = '' 104 | useless = ['Assay Overview', 'NIH Molecular Libraries Screening Centers Network [MLSCN]',\ 105 | 'Emory Chemical Biology Discovery Center in MLSCN','Assay provider','MLSCN Grant'] 106 | for dummy_line in dummy_lines: 107 | if not any(x in dummy_line for x in useless): 108 | description = description + ' ' + dummy_line 109 | else: 110 | print('UNUSED DATA:', dummy_line) 111 | elif folder_names[file_counter] == 'ICCB': 112 | description = '' 113 | useless = ['This screen was conducted by'] 114 | for dummy_line in dummy_lines: 115 | if not any(x in dummy_line for x in useless): 116 | description = description + ' ' + dummy_line 117 | else: 118 | print('UNUSED DATA:', dummy_line) 119 | elif folder_names[file_counter] == 'John_Hopkins': 120 | description = '' 121 | useless = ['Data Source', 'BioAssay Type', 'Source (MLPCN Center Name)', 'Screening Center PI',\ 122 | 'Center Affiliation', 'Network:', 'Assay provider:','Assay Provider:','Grant Proposal Number', 123 | 'Grant Proposal PI', 'Assay Implementation', 'Name:', 'External Assay ID:'] 124 | reference_flag = 0 125 | for dummy_line in dummy_lines: 126 | if 'References' in dummy_line or 'Reference' in dummy_line: 127 | reference_flag = 1 128 | if not reference_flag and not any(x in dummy_line for x in useless): 129 | description = description + ' ' + dummy_line 130 | else: 131 | print('UNUSED DATA:', dummy_line) 132 | if 'Keywords' in dummy_line: 133 | reference_flag = 0 134 | elif folder_names[file_counter] == 'Ncats': 135 | description = '' 136 | reference_flag = 0 137 | useless = ['NIH Molecular Libraries Probe Centers Network [MLPCN]', 'MLPCN Grant', 138 | 'Assay Provider', 'Assay Submitter (PI)', 'NIH Chemical Genomics Center [NCGC]'] 139 | for dummy_line in dummy_lines: 140 | if 'References' in dummy_line or 
'Reference' in dummy_line: 141 | reference_flag = 1 142 | if not reference_flag and not any(x in dummy_line for x in useless): 143 | description = description + ' ' + dummy_line 144 | else: 145 | print('UNUSED DATA:', dummy_line) 146 | if 'Keywords' in dummy_line: 147 | reference_flag = 0 148 | elif folder_names[file_counter] == 'NMMLSC': 149 | description = '' 150 | reference_flag = 0 151 | useless = ['University of New Mexico Assay Overview:', 'Assay Support:', 152 | 'PI:', 'PI Affiliation:', 'Screening Center PI:', 'Screening Lead:', 153 | 'Assay Implementation:', 'UNM Cheminformatics:', 'Chemistry:', 154 | 'Vanderbilt Specialized Chemistry Center PI:', 'Vanderbilt Chemistry Lead:', 155 | 'Assay Background and Significance:', 'Project Title:', 'Screening Center Manager:', 156 | 'Screening Center/PI:', 'Lead Biologist:', 'Screening Operations Team:', 157 | 'Chemistry Lead:', 'Specialized Chemistry Center:', 'Assay Support:', 158 | 'University of New Mexico Center for Molecular Discovery PI:', 'Center PI:', 159 | 'Target Team Leader for the Center:', 'KU SCC Project Manager:', 160 | 'KU SCC Chemists on this project:', 'Assay provider:','Assay Provider:', 'Co-PI:', 'KU Specialized Chemistry Center PI:'] 161 | for dummy_line in dummy_lines: 162 | if not any(x in dummy_line for x in useless): 163 | description = description + ' ' + dummy_line 164 | else: 165 | print('UNUSED DATA:', dummy_line) 166 | elif folder_names[file_counter] == 'Sanford_Burnam': 167 | description = '' 168 | reference_flag = 0 169 | useless = ['Data Source:', 'Source Affiliation:', 'Network:', 170 | 'NIH Molecular Libraries Probe Production Centers Network (MLPCN)', 171 | 'Grant Number:', 'Assay Provider:', 'Grant Proposal Number:'] 172 | for dummy_line in dummy_lines: 173 | if 'REFERENCES' in dummy_line or 'References' in dummy_line: 174 | reference_flag = 1 175 | if not reference_flag and not any(x in dummy_line for x in useless): 176 | description = description + ' ' + dummy_line 177 | else: 178 | print('UNUSED DATA:', dummy_line) 179 | elif folder_names[file_counter] == 'Scripps': 180 | description = '' 181 | reference_flag = 0 182 | useless = ['Source (MLPCN Center Name):','Source (MLSCN Center Name):', 'Center Affiliation:', 'Assay Provider:', 183 | 'Network:', 'Grant Proposal Number', 'Grant Proposal PI:', 'External Assay ID:', 184 | 'Name:', 'Source:', 185 | 'Center Affiliation:', 'Affiliation:'] 186 | for dummy_line in dummy_lines: 187 | # if block_counter == 128: 188 | # print(128, description) 189 | if 'References' in dummy_line or 'Reference' in dummy_line: 190 | reference_flag = 1 191 | if not reference_flag and not any(x in dummy_line for x in useless): 192 | description = description + ' ' + dummy_line 193 | # else: 194 | # print('UNUSED DATA:', dummy_line) 195 | if 'Keywords' in dummy_line: 196 | reference_flag = 0 197 | elif folder_names[file_counter] == 'Tox21': 198 | description = dummy_lines[-1] 199 | else: 200 | print('ERROR! 
Parsing rules have not been defined for ' + folder_names[file_counter]) 201 | data_desc.append([aid, name, source, block_number, description]) 202 | # print(name) 203 | # print(source) 204 | # print(aid) 205 | # print(sub_num) 206 | # print(target) 207 | data = np.array(data) 208 | column_names = ['AID', 'name', 'source', 'block_number', 'target', 'substance_num', 'active_num'] 209 | data_dict = {} 210 | for i in range(len(column_names)): 211 | data_dict[column_names[i]] = data[:,i] 212 | df = pd.DataFrame(data=data_dict) 213 | 214 | data_desc = np.array(data_desc) 215 | column_names = ['AID', 'name', 'source', 'block_number', 'description'] 216 | data_dict = {} 217 | for i in range(len(column_names)): 218 | data_dict[column_names[i]] = data_desc[:,i] 219 | df_desc = pd.DataFrame(data=data_dict) 220 | 221 | 222 | merged = df.merge(df_desc, how='outer') 223 | print(len(df), len(df_desc), len(merged)) 224 | print(len(df_desc.dropna())) 225 | 226 | # Save bioassays' information and descriptions 227 | merged.to_csv('merged.csv', header=True, index=False) 228 | 229 | -------------------------------------------------------------------------------- /splitting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from typing import Any, Dict, List, Iterator, Optional, Sequence, Tuple 5 | 6 | 7 | 8 | from rdkit import Chem 9 | from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles 10 | 11 | 12 | 13 | 14 | def split(smiles, 15 | frac_train: float = 0.8, 16 | frac_valid: float = 0.1, 17 | frac_test: float = 0.1, 18 | seed: Optional[int] = None, 19 | log_every_n: Optional[int] = 1000 20 | ) -> Tuple[List[int], List[int], List[int]]: 21 | """ 22 | Splits internal compounds into train/validation/test by scaffold. 23 | 24 | Parameters 25 | ---------- 26 | smiles: Sequence[str] 27 | SMILES strings to be split. 28 | frac_train: float, optional (default 0.8) 29 | The fraction of data to be used for the training split. 30 | frac_valid: float, optional (default 0.1) 31 | The fraction of data to be used for the validation split. 32 | frac_test: float, optional (default 0.1) 33 | The fraction of data to be used for the test split. 34 | seed: int, optional (default None) 35 | Random seed to use (currently unused). 36 | log_every_n: int, optional (default 1000) 37 | Controls the logger by dictating how often logger outputs 38 | will be produced (currently unused). 39 | 40 | Returns 41 | ------- 42 | Tuple[List[int], List[int], List[int]] 43 | A tuple of train indices, valid indices, and test indices. 44 | Each element is a list of integer indices. 45 | """ 46 | np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.) 47 | scaffold_sets = generate_scaffolds(smiles) 48 | 49 | train_cutoff = frac_train * len(smiles) 50 | valid_cutoff = (frac_train + frac_valid) * len(smiles) 51 | train_inds: List[int] = [] 52 | valid_inds: List[int] = [] 53 | test_inds: List[int] = [] 54 | 55 | # logger.info("About to sort in scaffold sets") 56 | for scaffold_set in scaffold_sets: 57 | if len(train_inds) + len(scaffold_set) > train_cutoff: 58 | if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff: 59 | test_inds += scaffold_set 60 | else: 61 | valid_inds += scaffold_set 62 | else: 63 | train_inds += scaffold_set 64 | return train_inds, valid_inds, test_inds 65 | 66 | 67 | def generate_scaffolds(smiles_list, 68 | log_every_n: int = 1000) -> List[List[int]]: 69 | """Returns all scaffolds from the dataset. 70 | 71 | Parameters 72 | ---------- 73 | smiles_list: Sequence[str] 74 | SMILES strings to be split. 75 | log_every_n: int, optional (default 1000) 76 | Controls the logger by dictating how often logger outputs 77 | will be produced. 78 | 79 | Returns 80 | ------- 81 | scaffold_sets: List[List[int]] 82 | List of indices of each scaffold in the dataset. 83 | """
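# For intuition (an illustrative example, not part of the original script): MurckoScaffoldSmiles(mol=Chem.MolFromSmiles('c1ccccc1CCN'), includeChirality=True) returns 'c1ccccc1'; side chains are stripped, so molecules are grouped by their ring framework.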
84 | scaffolds = {} 85 | data_len = len(smiles_list) 86 | 87 | # logger.info("About to generate scaffolds") 88 | for ind, smiles in enumerate(smiles_list): 89 | # if ind % log_every_n == 0: 90 | # logger.info("Generating scaffold %d/%d" % (ind, data_len)) 91 | scaffold = MurckoScaffoldSmiles(mol=Chem.MolFromSmiles(smiles), includeChirality=True) 92 | if scaffold not in scaffolds: 93 | scaffolds[scaffold] = [ind] 94 | else: 95 | scaffolds[scaffold].append(ind) 96 | 97 | # Sort from largest to smallest scaffold sets 98 | scaffolds = {key: sorted(value) for key, value in scaffolds.items()} 99 | scaffold_sets = [ 100 | scaffold_set for (scaffold, scaffold_set) in sorted( 101 | scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True) 102 | ] 103 | return scaffold_sets 104 | 105 | input_data = 'data/all_molecular_data.csv' 106 | smiles = np.array(pd.read_csv(input_data)['smiles']) 107 | scaffold_list = [] 108 | counter = 0 109 | for s in smiles: 110 | try: 111 | scaffold_list.append(MurckoScaffoldSmiles(mol=Chem.MolFromSmiles(s), includeChirality=True)) 112 | except Exception: # fall back to the raw SMILES when RDKit cannot parse it 113 | scaffold_list.append(s) 114 | print(s, counter) 115 | counter += 1 116 | df = pd.read_csv(input_data) 117 | df['scaffold'] = np.array(scaffold_list) 118 | 119 | df.dropna(subset=['smiles'], inplace=True) 120 | print(len(df)) 121 | df.to_csv('merged_cleaned_benchmarked_threshold_scaffold.csv', header=True, index=False) 122 | 123 | scaffold_list = np.array(df['scaffold']) 124 | smiles = np.array(df['smiles']) 125 | 126 | scaffolds = {} 127 | for ind, scaffold in enumerate(scaffold_list): 128 | if scaffold not in scaffolds: 129 | scaffolds[scaffold] = [ind] 130 | else: 131 | scaffolds[scaffold].append(ind) 132 | 133 | scaffolds = {key: sorted(value) for key, value in scaffolds.items()} 134 | scaffold_sets = [ 135 | scaffold_set for (scaffold, scaffold_set) in sorted( 136 | scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True) 137 | ] 138 | 139 | frac_train = 0.8 140 | frac_valid = 0.1 141 | frac_test = 0.1 142 | train_cutoff = frac_train * len(smiles) 143 | valid_cutoff = (frac_train + frac_valid) * len(smiles) 144 | train_inds = [] 145 | valid_inds = [] 146 | test_inds = [] 147 | 148 | for scaffold_set in scaffold_sets: 149 | if len(train_inds) + len(scaffold_set) > train_cutoff: 150 | if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff: 151 | test_inds.extend(scaffold_set) 152 | else: 153 | valid_inds.extend(scaffold_set) 154 | else: 155 | train_inds.extend(scaffold_set) 156 | 157 | split_list = np.empty(len(smiles), dtype=object) 158 | for i in train_inds: 159 | split_list[i] = 'train' 160 | for i in valid_inds: 161 | split_list[i] = 'validation' 162 | for i in test_inds: 163 | split_list[i] = 'test' 164 | 165 | print(len(split_list)) 166 | df['split'] = split_list 167 | df.to_csv('merged_cleaned_benchmarked_threshold_scaffold_split.csv', header=True, index=False) 168 | 169 | # df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split.csv') 170 | df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv') # NOTE: the stratified file is only written further below; on a first run, start from the '_split.csv' file on the commented line above 171 | 172 | activity_columns = [c for c in df.columns if 'activity' in c] 173 | train_dummy =
df[df['split'] == 'train'] 174 | valid_dummy = df[df['split'] == 'validation'] 175 | test_dummy = df[df['split'] == 'test'] 176 | 177 | problem_aids = [] 178 | for a in activity_columns: 179 | train_sum = int(np.sum(train_dummy[a])) 180 | valid_sum = int(np.sum(valid_dummy[a])) 181 | test_sum = int(np.sum(test_dummy[a])) 182 | if train_sum < 1 or valid_sum < 1 or test_sum <1: 183 | print(a, int(np.sum(train_dummy[a])), int(np.sum(valid_dummy[a])), int(np.sum(test_dummy[a]))) 184 | problem_aids.append(a) 185 | 186 | for a in activity_columns: 187 | train_sum = int(np.sum(list(train_dummy[a]==0))) 188 | valid_sum = int(np.sum(list(valid_dummy[a]==0))) 189 | test_sum = int(np.sum(list(test_dummy[a]==0))) 190 | if train_sum < 1 or valid_sum < 1 or test_sum <1: 191 | print(a, train_sum, valid_sum, test_sum) 192 | problem_aids.append(a) 193 | 194 | for a in activity_columns: 195 | train_sum = int(np.sum(list(train_dummy[a]==0))) 196 | valid_sum = int(np.sum(list(valid_dummy[a]==0))) 197 | test_sum = int(np.sum(list(test_dummy[a]==0))) 198 | print(a, train_sum, int(np.sum(train_dummy[a])), valid_sum, int(np.sum(valid_dummy[a])), test_sum, int(np.sum(test_dummy[a]))) 199 | 200 | import random 201 | extra_valid_ind = [] 202 | extra_test_ind = [] 203 | for a in problem_aids: 204 | dummy_df = df[df[a] == 1] 205 | aid_inds = dummy_df.index[dummy_df['split'] == 'train'].tolist() 206 | d1,d2 = random.sample(aid_inds, 2) 207 | extra_valid_ind.append(d1) 208 | extra_test_ind.append(d2) 209 | 210 | for i in extra_valid_ind: 211 | df.at[i, 'split'] = 'validation' 212 | 213 | for i in extra_test_ind: 214 | df.at[i, 'split'] = 'test' 215 | 216 | train_dummy = df[df['split'] == 'train'] 217 | valid_dummy = df[df['split'] == 'validation'] 218 | test_dummy = df[df['split'] == 'test'] 219 | 220 | for a in activity_columns: 221 | train_sum = int(np.sum(train_dummy[a])) 222 | valid_sum = int(np.sum(valid_dummy[a])) 223 | test_sum = int(np.sum(test_dummy[a])) 224 | if train_sum < 1 or valid_sum < 1 or test_sum <1: 225 | print(a, int(np.sum(train_dummy[a])), int(np.sum(valid_dummy[a])), int(np.sum(test_dummy[a]))) 226 | df.to_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv', header=True, index=False) 227 | 228 | df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv') 229 | mini = df[:20000] 230 | mini.to_csv('mini.csv', header=True, index=False) 231 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import tensorflow as tf 4 | import deepchem as dc 5 | from deepchem.models import GraphConvModel 6 | import matplotlib.pyplot as plt 7 | from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, recall_score, average_precision_score, precision_score 8 | from deepchem.metrics.score_function import bedroc_score 9 | import time 10 | import os 11 | from rdkit.Chem import MolFromSmiles, MolToSmiles 12 | import shutil 13 | import logging 14 | import itertools 15 | from typing import Any, Dict, List, Iterator, Optional, Sequence, Tuple 16 | from deepchem.splits import Splitter 17 | from deepchem.data import Dataset, DiskDataset 18 | 19 | 20 | class MolDataSpecifiedSplitter(Splitter): 21 | """Split data in the fashion specified by user. Uses DeepChem's specified 22 | splitter but also takes as input the training splits. 
23 | 24 | For some applications, you will already know how you'd like to split the 25 | dataset. In this splitter, you simply specify `train_indices`, `valid_indices` and 26 | `test_indices` and the datapoints at those indices are pulled out of the 27 | dataset. Note that this is different from `IndexSplitter` which only splits 28 | based on the existing dataset ordering, while this `SpecifiedSplitter` can 29 | split on any specified ordering. 30 | """ 31 | 32 | def __init__(self, 33 | train_indices: Optional[List[int]] = None, 34 | valid_indices: Optional[List[int]] = None, 35 | test_indices: Optional[List[int]] = None 36 | ): 37 | """ 38 | Parameters 39 | ----------- 40 | train_indices, valid_indices, test_indices: List[int] 41 | Lists of indices of the samples in the train, valid and test sets 42 | (the stored index lists fully determine the split; the frac_* 43 | arguments of split() are ignored) 44 | """ 45 | self.train_indices = train_indices 46 | self.valid_indices = valid_indices 47 | self.test_indices = test_indices 48 | 49 | def split(self, 50 | dataset: Dataset, 51 | frac_train: float = 0.8, 52 | frac_valid: float = 0.1, 53 | frac_test: float = 0.1, 54 | seed: Optional[int] = None, 55 | log_every_n: Optional[int] = None 56 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: 57 | """ 58 | Splits internal compounds into train/validation/test in the designated order. 59 | 60 | Parameters 61 | ---------- 62 | dataset: Dataset 63 | Dataset to be split. 64 | frac_train: float, optional (default 0.8) 65 | Fraction of dataset put into training data (ignored by this splitter). 66 | frac_valid: float, optional (default 0.1) 67 | Fraction of dataset put into validation data (ignored by this splitter). 68 | frac_test: float, optional (default 0.1) 69 | Fraction of dataset put into test data (ignored by this splitter). 70 | seed: int, optional (default None) 71 | Random seed to use. 72 | log_every_n: int, optional (default None) 73 | Log every n examples (not currently used). 74 | 75 | Returns 76 | ------- 77 | Tuple[np.ndarray, np.ndarray, np.ndarray] 78 | A tuple of train indices, valid indices, and test indices. 79 | Each element is a numpy array of indices.
80 | """ 81 | if self.train_indices is None: 82 | self.train_indices = [] 83 | if self.valid_indices is None: 84 | self.valid_indices = [] 85 | if self.test_indices is None: 86 | self.test_indices = [] 87 | 88 | return (np.array(self.train_indices), np.array(self.valid_indices), 89 | np.array(self.test_indices)) 90 | 91 | 92 | def MolDataLoadData(data_dir, tasks, featurizer): 93 | dummy_df = pd.read_csv(data_dir, low_memory=False) 94 | train_indices = dummy_df.index[dummy_df['split'] == 'train'].tolist() 95 | valid_indices = dummy_df.index[dummy_df['split'] == 'validation'].tolist() 96 | test_indices = dummy_df.index[dummy_df['split'] == 'test'].tolist() 97 | print(len(dummy_df), len(train_indices) + len(valid_indices) + len(test_indices)) 98 | print("About to load the dataset.") 99 | 100 | # create featurizer, loader, transformers, and splitter 101 | if featurizer == 'ECFP': 102 | featurizer = dc.feat.CircularFingerprint(size=1024, chiral=True) 103 | elif featurizer == 'GraphConv': 104 | featurizer = dc.feat.ConvMolFeaturizer(use_chirality=True) 105 | loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer) 106 | splitters = { 107 | 'specified': MolDataSpecifiedSplitter(train_indices=train_indices, valid_indices=valid_indices, test_indices=test_indices) 108 | } 109 | splitter = splitters['specified'] 110 | 111 | if not os.path.exists(data_dir): 112 | print("Dataset not found") 113 | print("About to featurize the dataset.") 114 | dataset = loader.create_dataset([data_dir], shard_size=8192, data_dir='tmp/loader/') 115 | 116 | # Initialize transformers 117 | print("About to split data") 118 | untransformed_train_dataset, untransformed_valid_dataset, untransformed_test_dataset = \ 119 | splitter.train_valid_test_split(dataset, train_dir='tmp/train_un/', 120 | valid_dir='tmp/valid_un/', 121 | test_dir='tmp/test_un/') 122 | print("About to transform data") 123 | transformers = [dc.trans.BalancingTransformer(dataset=dataset)] 124 | # Only transform the train dataset 125 | for transformer in transformers: 126 | train_dataset = transformer.transform(untransformed_train_dataset, out_dir='tmp/train/') 127 | 128 | shutil.rmtree('tmp/loader/', ignore_errors=True) 129 | return train_dataset, untransformed_valid_dataset, untransformed_test_dataset, transformers 130 | 131 | 132 | ####################################################################### 133 | # Select operation mode, disease benchmarks or target benchmarks 134 | data_type = 'disease' 135 | # data_type = 'target' 136 | 137 | # Select model and featurizer type 138 | featurizer = 'GraphConv' 139 | # featurizer = 'ECFP' 140 | 141 | # Specify data directory 142 | data_dir = 'data/all_molecular_data.csv' 143 | map_df = pd.read_csv('data/aid_'+data_type+'_mapping.csv') 144 | print(map_df.columns) 145 | 146 | epochnb = 50 147 | graph_conv_layers = [512, 512, 512] 148 | dropout = 0.1 149 | learning_rate = 0.0001 150 | batch_size = 128 151 | dense_layer_size = 1024 152 | if data_type == 'disease': 153 | all_categories = ['all', 'cancer', 'nervous System', 'immune system', 'cardiovascular', 154 | 'toxicity', 'obesity', 'virus', 'diabetes', 'metabolic disorders', 'bacteria', 155 | 'parasite', 'epigenetics_genetics', 'pulmonary', 'infection', 'aging', 'fungal'] 156 | if data_type == 'target': 157 | all_categories = ['all_target', 'Membrane receptor', 'Enzyme (other)', 'Nuclear receptor', 158 | 'Hydrolase', 'Protease', 'Transcription factor', 'Kinase', 159 | 'Epigenetic regulator', 'Ion channel', 'Transferase', 
'Oxidoreductase', 160 | 'Transporter', 'NTPase', 'Phosphatase'] 161 | 162 | logging.basicConfig(level=logging.INFO) 163 | 164 | for run_type in all_categories: 165 | np.random.seed(42) 166 | tf.compat.v1.set_random_seed(42) 167 | if run_type == 'all' or run_type == 'all_target': 168 | tasks = list(np.array(map_df['AID'])[[True if t > 0 else False for t in np.sum(np.array(map_df[all_categories[1:]]), axis=1)]]) 169 | else: 170 | tasks = list(map_df[map_df[run_type] == 1]['AID']) 171 | # Select tasks based on the operation mode 172 | print(run_type, tasks) 173 | print(len(tasks)) 174 | 175 | timestr = time.strftime("%m%d-%H%M") 176 | model_dir = 'built_models/moldata/'+featurizer+'/' + timestr + '/' 177 | if os.path.isdir(model_dir): 178 | timestr = timestr.split('-')[0] + '-' + timestr.split('-')[1][:2] + str(int(timestr.split('-')[1][2:])+60) 179 | os.makedirs(model_dir, exist_ok=True) 180 | 181 | # Load the data from the splits, transform only the train split 182 | train_dataset, untransformed_valid_dataset, untransformed_test_dataset, transformers = MolDataLoadData(data_dir=data_dir, tasks=tasks,featurizer=featurizer) 183 | training_data_len = len(train_dataset.y) 184 | 185 | 186 | metric = [ 187 | dc.metrics.Metric(dc.metrics.accuracy_score, mode="classification", classification_handling_mode='threshold', threshold_value=0.5, n_tasks=len(tasks)), 188 | dc.metrics.Metric(dc.metrics.recall_score, mode="classification", classification_handling_mode='threshold', threshold_value=0.5, n_tasks=len(tasks)), 189 | dc.metrics.Metric(dc.metrics.precision_score, mode="classification", classification_handling_mode='threshold', 190 | threshold_value=0.5, n_tasks=len(tasks)), 191 | dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification", n_tasks=len(tasks))] 192 | if featurizer == 'GraphConv': 193 | model = None 194 | model = GraphConvModel( 195 | len(tasks), 196 | batch_size=batch_size, 197 | mode='classification', 198 | number_atom_features=78, 199 | tensorboard=False, 200 | use_queue=True, 201 | graph_conv_layers=graph_conv_layers, 202 | dense_layer_size=dense_layer_size, 203 | dropout=dropout, 204 | learning_rate=learning_rate, 205 | model_dir=model_dir) 206 | 207 | for epoch_num in range(epochnb): 208 | loss = model.fit(train_dataset, nb_epoch=1, checkpoint_interval=2*(training_data_len // batch_size), 209 | max_checkpoints_to_keep=1000) 210 | print(epoch_num) 211 | elif featurizer == 'ECFP': 212 | model = None 213 | model = dc.models.MultitaskClassifier( 214 | len(tasks), 215 | n_features=1024, 216 | layer_sizes=[dense_layer_size], 217 | dropouts=[dropout], 218 | learning_rate=learning_rate, 219 | batch_size=batch_size, 220 | use_queue=False, 221 | model_dir=model_dir) 222 | loss = model.fit(train_dataset, nb_epoch=epochnb) 223 | results = model.evaluate(untransformed_test_dataset, metrics=metric, transformers=[], use_sample_weights=True, per_task_metrics=True) 224 | r = [list(results[0].values())] 225 | keys = list(results[0].keys()) 226 | for i in range(len(results[1][keys[0]])): 227 | dummy = [results[1][k][i] for k in keys] 228 | r.append(dummy) 229 | r = np.array(r) 230 | print(r.shape) 231 | keys = [k + '_test' for k in keys] 232 | results_df = pd.DataFrame(data=np.array(r), columns=keys) 233 | results_df.insert(loc=0, column='AID', value=['all'] + tasks) 234 | results_df.insert(loc=len(results_df.columns), column='model_dir', value=[model_dir] * len(results_df)) 235 | results_valid = model.evaluate(untransformed_valid_dataset, metrics=metric, transformers=[], 236 | 
use_sample_weights=True, per_task_metrics=True) 237 | r_valid = [list(results_valid[0].values())] 238 | keys_valid = list(results_valid[0].keys()) 239 | for i in range(len(results_valid[1][keys_valid[0]])): 240 | dummy = [results_valid[1][k][i] for k in keys_valid] 241 | r_valid.append(dummy) 242 | r_valid = np.array(r_valid) 243 | print(r_valid.shape) 244 | keys_valid = [k + '_valid' for k in keys_valid] 245 | # results_df_valid = pd.DataFrame(data=np.array(r_valid), columns=keys_valid) 246 | for col in range(len(keys_valid)): 247 | results_df[keys_valid[col]] = np.array(r_valid)[:, col] 248 | # results_df_valid.insert(loc=0, column='AID', value=['all'] + input_tasks) 249 | 250 | results_df.to_csv('results/'+run_type+'_results.csv', header=True, index=False) 251 | shutil.rmtree('tmp/train_un/', ignore_errors=True) 252 | shutil.rmtree('tmp/valid_un/', ignore_errors=True) 253 | shutil.rmtree('tmp/test_un/', ignore_errors=True) 254 | shutil.rmtree('tmp/train/', ignore_errors=True) 255 | shutil.rmtree('tmp/valid/', ignore_errors=True) 256 | shutil.rmtree('tmp/test/', ignore_errors=True) 257 | 258 | 259 | all_results = [] 260 | for c in all_categories: 261 | results_df = pd.read_csv('results/'+c+'_results.csv') 262 | results_df = results_df[results_df['AID'] == 'all'] 263 | all_results.append(results_df[['accuracy_score_test', 'recall_score_test', 'precision_score_test', 264 | 'roc_auc_score_test', 'accuracy_score_valid', 'recall_score_valid', 265 | 'precision_score_valid', 'roc_auc_score_valid']].iloc[0]) 266 | 267 | all_results_df = pd.DataFrame(data=np.array(all_results), columns=['accuracy_score_test', 'recall_score_test', 'precision_score_test', 268 | 'roc_auc_score_test', 'accuracy_score_valid', 'recall_score_valid', 269 | 'precision_score_valid', 'roc_auc_score_valid']) 270 | all_results_df.insert(loc=0, column='benchmark', value=all_categories) 271 | all_results_df["accuracy_score_test"] = 100 * all_results_df["accuracy_score_test"] 272 | all_results_df["recall_score_test"] = 100 * all_results_df["recall_score_test"] 273 | all_results_df["precision_score_test"] = 100 * all_results_df["precision_score_test"] 274 | all_results_df["accuracy_score_valid"] = 100 * all_results_df["accuracy_score_valid"] 275 | all_results_df["recall_score_valid"] = 100 * all_results_df["recall_score_valid"] 276 | all_results_df["precision_score_valid"] = 100 * all_results_df["precision_score_valid"] 277 | all_results_df = all_results_df.round({'accuracy_score_test': 2, 'recall_score_test': 2, 'precision_score_test': 2, 'roc_auc_score_test': 4}) 278 | all_results_df = all_results_df.round({'accuracy_score_valid': 2, 'recall_score_valid': 2, 'precision_score_valid': 2, 'roc_auc_score_valid': 4}) 279 | all_results_df.to_csv('results/final_results_'+data_type+'.csv', header=True, index=False) 280 | 281 | 282 | import pandas as pd 283 | import numpy as np 284 | all_disease_categories = ['all', 'cancer', 'nervous System', 'immune system', 'cardiovascular', 285 | 'toxicity', 'obesity', 'virus', 'diabetes', 'metabolic disorders', 'bacteria', 286 | 'parasite', 'epigenetics_genetics', 'pulmonary', 'infection', 'aging', 'fungal'] +\ 287 | ['all_target', 'Membrane receptor', 'Enzyme (other)', 'Nuclear receptor', 288 | 'Hydrolase', 'Protease', 'Transcription factor', 'Kinase', 289 | 'Epigenetic regulator', 'Ion channel', 'Transferase', 'Oxidoreductase', 290 | 'Transporter', 'NTPase', 'Phosphatase'] 291 | 292 | writer = pd.ExcelWriter('results/detailed_results_combined.xlsx', engine = 'xlsxwriter') 293 | for c in 
['final_results_disease', 'final_results_target']: 294 | current_df = pd.read_csv('results/'+c+'.csv') 295 | if c == 'final_results_disease': 296 | c = 'benchmark_results_disease' 297 | if c == 'final_results_target': 298 | c = 'benchmark_results_target' 299 | current_df.insert(loc=1, column='run_type', value=c) 300 | current_df.to_excel(writer, sheet_name=c, index=False) 301 | 302 | for c in all_disease_categories: 303 | current_df = pd.read_csv('results/'+c+'_results.csv') 304 | if c == 'all': 305 | c = 'all_disease' 306 | current_df.at[0, 'AID'] = c 307 | current_df.insert(loc=1, column='run_type', value=c) 308 | current_df.to_excel(writer, sheet_name=c, index=False) 309 | if c == 'all_disease': 310 | all_df = current_df 311 | else: 312 | all_df = all_df.merge(current_df, how='outer') 313 | 314 | all_df = all_df.sort_values(by='AID', ignore_index=True) 315 | all_df.to_csv('results/sorted_detailed_results_combined.csv', header=True, index=False) 316 | all_df.to_excel(writer, sheet_name='everything', index=False) 317 | 318 | # ExcelWriter.close() saves the workbook, so a separate save() call is redundant (and deprecated in newer pandas) 319 | writer.close() 320 | --------------------------------------------------------------------------------
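A minimal usage sketch of the split helper defined in splitting.py above (the SMILES below are illustrative, and note that splitting.py also executes script code at module level, so in practice the function would be copied out or the script body guarded before importing):

smiles = ['c1ccccc1O', 'c1ccccc1N', 'C1CCCCC1', 'CCO']
train_inds, valid_inds, test_inds = split(smiles, frac_train=0.5, frac_valid=0.25, frac_test=0.25)
# Molecules sharing a Murcko scaffold are never separated across partitions:
# the two benzene-scaffold molecules land in train together.
print(train_inds, valid_inds, test_inds)  # [0, 1] [2] [3]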