├── Data
│   ├── aid_disease_mapping.csv
│   ├── aid_target_mapping.csv
│   ├── all_molecular_data.zip
│   └── data_reference_list.txt
├── LICENSE
├── LICENSE.cczero
├── LICENSE.mit
├── README.md
├── clustering.py
├── correlation.py
├── fingerprint_extraction.py
├── manual_tagging.py
├── molecular_data_cleaning.py
├── ner.py
├── preprocessing.py
├── splitting.py
└── training.py
/Data/aid_disease_mapping.csv: --------------------------------------------------------------------------------
1 | AID,cancer,nervous System,immune system,cardiovascular,toxicity,obesity,virus,diabetes,metabolic disorders,bacteria,parasite,epigenetics_genetics,pulmonary,infection,aging,fungal 2 | activity_1554,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 3 | activity_2732,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 4 | activity_1085,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 5 | activity_1236,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 6 | activity_1274,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0 7 | activity_781,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 8 | activity_422,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 9 | activity_1224905,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 10 | activity_624256,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 11 | activity_588358,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 12 | activity_1346378,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0 13 | activity_743266,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0 14 | activity_624417,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 15 | activity_624418,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 16 | activity_652104,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 17 | activity_602310,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 18 | activity_588579,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 19 | activity_504847,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0 20 | activity_651635,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 21 | activity_743255,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 22 | activity_588591,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 23 | activity_588590,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 24 | activity_504845,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 25 | activity_504327,1,0,1,1,0,1,0,1,0,0,0,0,1,0,0,0 26 | activity_504648,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 27 | activity_602313,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 28 | activity_624172,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 29 | activity_624246,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 30 | activity_652106,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 31 | activity_2676,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0 32 | activity_652025,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0 33 | activity_720504,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 34 | activity_720542,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 35 | activity_651725,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 36 | activity_651768,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 37 | activity_504444,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 38 | activity_743279,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0 39 | activity_651644,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 40 | activity_651550,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 41 | activity_1259400,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 42 | activity_720551,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 43 | activity_720553,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 44 | activity_651965,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 45 | activity_540263,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 46 | activity_485314,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 47 | activity_651724,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 48 | activity_488837,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 49 | activity_485353,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 50 | activity_540303,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 51 | activity_485298,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 52 | activity_504706,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 53 | activity_485313,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 54 | activity_485297,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 55 | activity_2551,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 56 | activity_2546,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 57 | 
activity_540276,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 58 | activity_540256,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 59 | activity_2662,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 60 | activity_2315,0,1,1,1,0,0,0,0,1,1,0,0,0,1,0,0 61 | activity_2675,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0 62 | activity_2326,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 63 | activity_2549,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 64 | activity_880,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 65 | activity_2147,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 66 | activity_485360,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 67 | activity_1458,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 68 | activity_1487,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0 69 | activity_445,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 70 | activity_1259318,0,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 71 | activity_602429,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 72 | activity_624304,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 73 | activity_624352,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 74 | activity_651699,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 75 | activity_651647,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 76 | activity_1347056,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 77 | activity_588405,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 78 | activity_588493,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 79 | activity_1259313,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 80 | activity_489030,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 81 | activity_489031,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 82 | activity_485273,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0 83 | activity_2071,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 84 | activity_485,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 85 | activity_1259388,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 86 | activity_588692,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 87 | activity_720706,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 88 | activity_602393,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 89 | activity_2099,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 90 | activity_504621,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 91 | activity_1813,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 92 | activity_1814,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 93 | activity_1662,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 94 | activity_2221,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0 95 | activity_940,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 96 | activity_1422,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 97 | activity_1021,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 98 | activity_1022,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 99 | activity_817,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0 100 | activity_602162,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0 101 | activity_588499,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0 102 | activity_504490,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 103 | activity_488847,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 104 | activity_2557,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 105 | activity_624170,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 106 | activity_624171,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 107 | activity_624414,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 108 | activity_1347076,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 109 | activity_1347075,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 110 | activity_624415,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 111 | activity_1347120,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0 112 | activity_504339,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 113 | activity_504891,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0 114 | activity_540317,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 115 | activity_624202,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 116 | activity_720707,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 117 | activity_720708,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 118 | activity_720709,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 119 | activity_720711,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 120 | activity_504333,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 121 | activity_504332,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 122 | activity_540253,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 123 | activity_504842,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 124 | 
activity_1463,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 125 | activity_1461,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 126 | activity_1347165,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 127 | activity_1457,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 128 | activity_2417,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 129 | activity_1454,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 130 | activity_1948,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 131 | activity_602449,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0 132 | activity_1224865,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 133 | activity_588473,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 134 | activity_588475,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 135 | activity_493187,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0 136 | activity_504462,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 137 | activity_463195,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 138 | activity_485346,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 139 | activity_1496,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 140 | activity_1012,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0 141 | activity_1216,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 142 | activity_1259310,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 143 | activity_743269,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 144 | activity_652017,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 145 | activity_2642,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 146 | activity_488922,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 147 | activity_2237,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 148 | activity_2247,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 149 | activity_624483,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 150 | activity_1415,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 151 | activity_1423,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 152 | activity_1439,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 153 | activity_1441,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 154 | activity_950,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 155 | activity_951,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 156 | activity_952,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 157 | activity_1007,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 158 | activity_1008,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 159 | activity_1009,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 160 | activity_624263,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 161 | activity_651819,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 162 | activity_602438,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 163 | activity_651636,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 164 | activity_493098,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 165 | activity_2521,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0 166 | activity_2520,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0 167 | activity_1656,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 168 | activity_1259374,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 169 | activity_1259422,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 170 | activity_686940,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 171 | activity_720508,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0 172 | activity_720509,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0 173 | activity_720543,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0 174 | activity_720582,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 175 | activity_720648,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 176 | activity_720700,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 177 | activity_720704,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 178 | activity_651958,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 179 | activity_602396,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 180 | activity_624169,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 181 | activity_651602,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 182 | activity_602229,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 183 | activity_588352,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 184 | activity_588726,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 185 | activity_588354,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 186 | activity_651719,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 187 | activity_588335,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 188 | activity_504326,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0 189 | activity_504357,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0 190 | 
activity_434989,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 191 | activity_435030,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0 192 | activity_449728,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0 193 | activity_463079,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 194 | activity_463141,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 195 | activity_463210,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 196 | activity_485270,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 197 | activity_485272,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 198 | activity_2435,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 199 | activity_2445,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 200 | activity_2606,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 201 | activity_2796,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 202 | activity_2797,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 203 | activity_2280,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 204 | activity_1987,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 205 | activity_2235,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 206 | activity_2300,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 207 | activity_1899,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 208 | activity_1950,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0 209 | activity_1962,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 210 | activity_1527,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 211 | activity_1556,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 212 | activity_1700,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 213 | activity_1706,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 214 | activity_1789,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0 215 | activity_1800,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 216 | activity_1825,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 217 | activity_1861,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 218 | activity_1304,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0 219 | activity_1359,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0 220 | activity_1416,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 221 | activity_1424,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 222 | activity_1446,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 223 | activity_1448,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 224 | activity_1481,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 225 | activity_1486,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 226 | activity_1509,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 227 | activity_1510,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 228 | activity_1515,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 229 | activity_1040,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0 230 | activity_861,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 231 | activity_793,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 232 | activity_803,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 233 | activity_828,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 234 | activity_623870,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 235 | activity_743287,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 236 | activity_602329,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 237 | activity_651654,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 238 | activity_602405,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 239 | activity_651610,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 240 | activity_652154,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 241 | activity_504884,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 242 | activity_602481,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 243 | activity_652163,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 244 | activity_651710,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 245 | activity_588549,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 246 | activity_651711,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 247 | activity_652162,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0 248 | activity_588436,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 249 | activity_651687,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,0 250 | activity_651723,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,0 251 | activity_588391,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 252 | activity_686996,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 253 | activity_720511,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0 254 | activity_651640,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 255 | activity_651661,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 256 | activity_743247,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 257 | 
activity_504558,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 258 | activity_652197,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 259 | activity_488895,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 260 | activity_488965,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 261 | activity_493131,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 262 | activity_504523,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 263 | activity_504582,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 264 | activity_504423,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 265 | activity_602346,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 266 | activity_602342,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 267 | activity_623901,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 268 | activity_743397,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 269 | activity_488839,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 270 | activity_602340,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0 271 | activity_488899,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 272 | activity_540336,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 273 | activity_504406,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 274 | activity_488896,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 275 | activity_504775,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 276 | activity_743445,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 277 | activity_743398,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 278 | activity_588334,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 279 | activity_686992,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 280 | activity_2016,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 281 | activity_2025,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 282 | activity_2023,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 283 | activity_2066,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 284 | activity_2029,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 285 | activity_2462,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 286 | activity_2661,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 287 | activity_2716,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 288 | activity_2718,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 289 | activity_2650,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 290 | activity_435005,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 291 | activity_1885,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 292 | activity_1979,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 293 | activity_2097,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 294 | activity_2717,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 295 | activity_1910,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 296 | activity_841,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 297 | activity_1053175,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 298 | activity_1259309,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 299 | activity_1259311,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 300 | activity_720641,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0 301 | activity_588519,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 302 | activity_492967,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 303 | activity_504770,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 304 | activity_485395,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 305 | activity_493033,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 306 | activity_540299,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 307 | activity_651702,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 308 | activity_588497,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 309 | activity_588501,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 310 | activity_504577,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 311 | activity_485275,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1 312 | activity_2690,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 313 | activity_1347041,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 314 | activity_602332,0,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 315 | activity_686979,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 316 | activity_686978,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 317 | activity_624173,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 318 | activity_1347131,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0 319 | activity_1159524,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0 320 | activity_1347071,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 321 | activity_1259415,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0 322 | activity_588453,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 323 | 
activity_588456,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 324 | activity_588795,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 325 | activity_602179,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 326 | activity_686970,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 327 | activity_686971,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 328 | activity_485364,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 329 | activity_2528,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 330 | activity_485290,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 331 | activity_2517,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 332 | activity_485281,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 333 | activity_624296,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 334 | activity_624297,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 335 | activity_651820,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 336 | activity_540267,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 337 | activity_504937,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 338 | activity_624288,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 339 | activity_624287,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 340 | activity_463254,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 341 | activity_720580,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 342 | activity_504466,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 343 | activity_504467,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 344 | activity_652105,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 345 | activity_2101,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 346 | activity_504834,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 347 | activity_485349,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 348 | activity_2685,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0 349 | activity_720579,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 350 | activity_504832,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 351 | activity_2100,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0 352 | activity_2314,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,0 353 | activity_2472,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 354 | activity_2451,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 355 | activity_1868,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0 356 | activity_1460,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 357 | activity_1768,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 358 | activity_1631,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 359 | activity_1634,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 360 | activity_2107,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0 361 | activity_1471,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 362 | activity_1688,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 363 | activity_1030,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0 364 | activity_1379,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 365 | activity_2242,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0 366 | activity_1476,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 367 | activity_1478,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 368 | activity_1452,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 369 | activity_894,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 370 | activity_902,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 371 | activity_924,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 372 | activity_1347417,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 373 | activity_881,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 374 | activity_624168,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 375 | activity_624204,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 376 | activity_624354,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 377 | activity_651560,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 378 | activity_602244,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 379 | activity_602261,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 380 | activity_602274,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 381 | activity_602399,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 382 | activity_588458,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 383 | activity_588489,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 384 | activity_588492,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 385 | activity_588621,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 386 | activity_588850,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0 387 | activity_602141,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0 388 | activity_504720,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0 389 | 
activity_493091,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 390 | activity_493160,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0 391 | activity_493011,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 392 | activity_493012,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 393 | activity_449763,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0 394 | activity_463104,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0 395 | activity_463190,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 396 | activity_463212,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 397 | activity_435022,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 398 | activity_2380,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 399 | activity_2825,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 400 | activity_435003,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 401 | activity_651582,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 402 | activity_1443,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 403 | activity_1135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 404 | activity_1217,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 405 | activity_1029,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0 406 | activity_720637,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 407 | activity_1347034,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 408 | activity_1347037,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 409 | activity_602233,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 410 | activity_485367,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 411 | activity_1490,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0 412 | activity_1468,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 413 | activity_602440,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 414 | activity_504690,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 415 | activity_588413,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 416 | activity_652067,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 417 | activity_652126,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 418 | activity_652257,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0 419 | activity_686964,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 420 | activity_687014,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 421 | activity_687016,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 422 | activity_720596,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 423 | activity_720702,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 424 | activity_743126,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 425 | activity_1053197,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0 426 | activity_651821,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 427 | activity_651957,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 428 | activity_652010,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 429 | activity_652039,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 430 | activity_602281,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 431 | activity_624267,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0 432 | activity_624268,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 433 | activity_624377,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 434 | activity_624416,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 435 | activity_651718,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 436 | activity_651800,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0 437 | activity_651572,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 438 | activity_602163,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0 439 | activity_602123,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0 440 | activity_540364,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0 441 | activity_504411,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 442 | activity_504700,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 443 | activity_504707,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 444 | activity_504734,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 445 | activity_504766,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 446 | activity_504803,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0 447 | activity_492953,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 448 | activity_492956,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 449 | activity_492972,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 450 | activity_493008,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 451 | activity_493087,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 452 | activity_493244,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 453 | activity_463082,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 454 | activity_2751,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0 455 | 
activity_434962,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0 456 | activity_2130,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 457 | activity_2174,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 458 | activity_2177,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 459 | activity_2234,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 460 | activity_2057,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 461 | activity_2129,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 462 | activity_1906,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0 463 | activity_1947,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 464 | activity_1974,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 465 | activity_1822,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0 466 | activity_1845,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 467 | activity_1321,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 468 | activity_631,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 469 | activity_731,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0 470 | activity_1032,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 471 | activity_1203,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 472 | activity_920,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 473 | activity_932,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 474 | activity_862,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 475 | activity_871,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 476 | activity_743093,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0 477 | activity_2052,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0 478 | activity_1529,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 479 | activity_1530,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 480 | activity_1531,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0 481 | activity_758,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 482 | activity_760,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 483 | activity_761,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 484 | activity_757,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 485 | activity_759,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 486 | activity_764,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 487 | activity_1325,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 488 | activity_1326,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 489 | activity_493014,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 490 | activity_588664,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 491 | activity_602410,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0 492 | activity_623877,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 493 | activity_488975,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 494 | activity_488977,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 495 | activity_588511,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0 496 | activity_588627,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 497 | activity_588675,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 498 | activity_588676,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 499 | activity_1511,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 500 | activity_1672,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0 501 | activity_2550,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 502 | activity_2553,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0 503 | activity_2648,0,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0 504 | activity_463111,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0 505 | activity_463165,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0 506 | activity_2156,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 507 | activity_2227,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 508 | activity_2239,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0 509 | activity_624037,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 510 | activity_624038,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 511 | activity_624040,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 512 | activity_624125,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 513 | activity_624126,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 514 | activity_624127,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 515 | activity_624466,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 516 | activity_624467,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 517 | activity_602247,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 518 | activity_602248,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 519 | activity_602250,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 520 | activity_588814,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 521 | activity_588819,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 522 | 
activity_588852,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 523 | activity_504634,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 524 | activity_504692,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 525 | activity_504454,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 526 | activity_652048,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 527 | activity_652051,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 528 | activity_493056,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 529 | activity_493084,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 530 | activity_504660,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 531 | activity_652054,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 532 | activity_624463,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 533 | activity_504651,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 534 | activity_624465,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 535 | activity_504652,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 536 | activity_624464,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 537 | activity_624291,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0 538 | activity_485347,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 539 | activity_485344,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 540 | activity_485358,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 541 | activity_492947,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0 542 | activity_504810,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0 543 | activity_504812,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0 544 | activity_540275,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 545 | activity_540277,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0 546 | activity_493036,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 547 | activity_1469,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 548 | activity_1479,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0 549 | activity_720552,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 550 | activity_720719,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 551 | activity_720725,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 552 | activity_743053,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 553 | activity_743054,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 554 | activity_743063,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 555 | activity_743067,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0 556 | activity_743077,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 557 | activity_743078,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 558 | activity_743091,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 559 | activity_743122,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 560 | activity_743139,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 561 | activity_743140,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 562 | activity_1159523,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 563 | activity_1159528,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 564 | activity_1159531,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0 565 | activity_1159555,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 566 | activity_1224892,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 567 | activity_1224893,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 568 | activity_1224894,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 569 | activity_1224895,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 570 | activity_1224896,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 571 | activity_1259247,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 572 | activity_1259248,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 573 | activity_1259387,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 574 | activity_1259390,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 575 | activity_1259391,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 576 | activity_1259392,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 577 | activity_1259393,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 578 | activity_1259394,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 579 | activity_1259395,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0 580 | activity_1259396,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 581 | activity_1259401,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 582 | activity_1259402,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 583 | activity_1259403,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 584 | activity_1259404,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0 585 | activity_1347030,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 586 | activity_1347031,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 587 | 
activity_1347032,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 588 | activity_1347033,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 589 | activity_1347036,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 590 | activity_1347038,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 591 | activity_1159518,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 592 | activity_1159519,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0 593 | activity_743199,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 594 | activity_743219,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 595 | activity_743226,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 596 | activity_743227,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 597 | activity_743228,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 598 | activity_743239,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0 599 | activity_743240,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0 600 | activity_743241,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 601 | activity_743242,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0 602 | -------------------------------------------------------------------------------- /Data/aid_target_mapping.csv: -------------------------------------------------------------------------------- 1 | AID,Membrane receptor,Enzyme (other),Nuclear receptor,Hydrolase,Protease,Transcription factor,Kinase,Epigenetic regulator,Ion channel,Transferase,Oxidoreductase,Transporter,NTPase,Phosphatase 2 | activity_1554,0,0,0,0,0,0,0,0,0,0,0,0,0,0 3 | activity_2732,0,0,0,0,0,1,0,0,0,0,0,0,0,0 4 | activity_1085,0,0,0,0,0,0,0,0,0,0,0,0,0,0 5 | activity_1236,0,0,0,0,1,0,0,0,0,0,0,0,0,0 6 | activity_1274,0,0,0,0,0,0,0,0,0,0,0,0,0,0 7 | activity_781,0,0,0,0,0,0,0,0,0,0,0,0,0,0 8 | activity_422,0,0,0,0,0,0,0,0,0,0,0,0,0,0 9 | activity_1224905,0,0,0,0,0,0,0,0,0,0,0,0,0,0 10 | activity_624256,0,0,0,0,0,0,0,0,0,0,0,0,0,0 11 | activity_588358,0,0,0,0,0,0,0,0,0,0,0,0,0,0 12 | activity_1346378,0,0,0,0,0,0,0,0,0,1,0,0,0,0 13 | activity_743266,1,0,0,0,0,0,0,0,0,0,0,0,0,0 14 | activity_624417,1,0,0,0,0,0,0,0,0,0,0,0,0,0 15 | activity_624418,0,0,0,0,0,0,0,0,0,0,0,0,0,0 16 | activity_652104,0,0,0,0,0,0,0,0,0,0,0,0,0,0 17 | activity_602310,0,0,0,1,0,0,0,0,0,0,0,0,0,0 18 | activity_588579,0,1,0,0,0,0,0,0,0,0,0,0,0,0 19 | activity_504847,0,0,1,0,0,0,0,0,0,0,0,0,0,0 20 | activity_651635,0,0,0,0,0,0,0,0,0,0,0,0,0,0 21 | activity_743255,0,0,0,0,1,0,0,0,0,0,0,0,0,0 22 | activity_588591,0,0,0,0,0,0,0,0,0,1,0,0,0,0 23 | activity_588590,0,0,0,0,0,0,0,0,0,1,0,0,0,0 24 | activity_504845,0,0,0,0,0,0,0,0,0,0,0,0,0,0 25 | activity_504327,0,0,0,0,0,0,0,1,0,0,0,0,0,0 26 | activity_504648,0,0,0,0,0,0,0,0,0,0,0,0,0,0 27 | activity_602313,0,0,0,1,0,0,0,0,0,0,0,0,0,0 28 | activity_624172,1,0,0,0,0,0,0,0,0,0,0,0,0,0 29 | activity_624246,0,0,0,0,0,0,0,0,0,0,0,0,0,0 30 | activity_652106,0,0,0,0,0,0,0,0,0,0,0,0,0,0 31 | activity_2676,1,0,0,0,0,0,0,0,0,0,0,0,0,0 32 | activity_652025,0,0,0,0,0,0,0,0,0,0,0,0,0,0 33 | activity_720504,0,0,0,0,0,0,1,0,0,0,0,0,0,0 34 | activity_720542,0,0,0,0,0,0,0,0,0,0,0,0,0,0 35 | activity_651725,0,0,0,0,0,0,0,0,0,0,0,0,0,0 36 | activity_651768,0,1,0,0,0,0,0,0,0,0,0,0,0,0 37 | activity_504444,0,0,0,0,0,1,0,0,0,0,0,0,0,0 38 | activity_743279,0,0,0,0,0,0,0,0,0,0,0,0,0,0 39 | activity_651644,0,0,0,0,0,0,0,0,0,0,0,0,0,0 40 | activity_651550,0,0,0,0,0,0,1,0,0,0,0,0,0,0 41 | activity_1259400,1,0,0,0,0,0,0,0,0,0,0,0,0,0 42 | activity_720551,0,0,0,0,0,0,0,0,0,0,0,0,0,0 43 | activity_720553,0,0,0,0,0,0,0,0,0,0,0,0,0,0 44 | activity_651965,0,0,0,0,1,0,0,0,0,0,0,0,0,0 45 | activity_540263,0,0,0,0,0,0,0,0,0,0,0,0,0,0 46 | activity_485314,0,1,0,0,0,0,0,0,0,0,0,0,0,0 47 | activity_651724,0,0,0,1,0,0,0,0,0,0,0,0,0,0 48 | activity_488837,0,1,0,0,0,0,0,0,0,0,0,0,0,0 49 | activity_485353,0,1,0,0,0,0,0,0,0,0,0,0,0,0 50 | 
activity_540303,1,0,0,0,1,0,0,0,0,0,0,0,0,0 51 | activity_485298,0,0,0,0,0,0,0,0,0,0,0,0,0,0 52 | activity_504706,0,0,0,0,0,1,0,0,0,0,0,0,0,0 53 | activity_485313,0,0,0,0,0,0,0,0,0,0,0,1,0,0 54 | activity_485297,0,0,0,1,0,0,0,0,0,0,0,0,0,0 55 | activity_2551,0,0,1,0,0,0,0,0,0,0,0,0,0,0 56 | activity_2546,0,0,1,0,0,0,0,0,0,0,0,0,0,0 57 | activity_540276,0,0,0,0,0,0,0,0,0,0,0,0,0,0 58 | activity_540256,0,0,0,0,0,0,0,0,0,0,0,0,0,0 59 | activity_2662,0,0,0,0,0,0,0,1,0,0,0,0,0,0 60 | activity_2315,0,0,0,1,0,0,0,0,0,0,0,0,0,0 61 | activity_2675,0,0,0,0,0,0,0,0,0,0,0,0,0,0 62 | activity_2326,0,0,0,0,0,0,0,0,0,0,0,0,0,0 63 | activity_2549,0,1,0,0,0,0,0,0,0,0,0,0,0,0 64 | activity_880,0,1,0,0,0,0,0,0,0,0,0,0,0,0 65 | activity_2147,0,0,0,0,0,0,0,1,0,0,0,0,0,0 66 | activity_485360,0,0,0,0,0,0,0,1,0,0,0,0,0,0 67 | activity_1458,0,0,0,0,0,0,0,1,0,0,0,0,0,0 68 | activity_1487,0,0,0,0,0,0,0,0,0,0,0,0,0,0 69 | activity_445,0,0,0,0,0,0,0,0,0,0,0,0,0,0 70 | activity_1259318,0,0,0,0,0,0,0,0,0,0,0,0,0,0 71 | activity_602429,0,0,0,0,0,0,0,0,0,0,0,0,0,0 72 | activity_624304,0,0,0,0,0,0,0,0,0,0,0,0,0,0 73 | activity_624352,0,0,0,0,0,1,0,0,0,0,0,0,0,0 74 | activity_651699,0,0,0,0,0,0,0,0,0,1,0,0,0,0 75 | activity_651647,0,0,0,0,1,0,0,0,0,0,0,0,0,0 76 | activity_1347056,0,0,0,0,0,0,0,0,0,0,0,0,0,0 77 | activity_588405,0,0,0,0,0,0,0,0,0,0,0,0,0,0 78 | activity_588493,0,0,0,0,0,0,0,0,0,0,0,0,0,0 79 | activity_1259313,0,0,0,0,0,0,0,0,0,0,0,0,0,0 80 | activity_489030,0,0,0,0,0,0,0,0,0,0,0,0,0,0 81 | activity_489031,0,0,0,0,0,0,0,0,0,0,0,0,0,0 82 | activity_485273,0,0,0,0,0,0,0,0,0,1,0,0,0,0 83 | activity_2071,0,0,0,0,0,0,0,0,0,0,0,0,0,0 84 | activity_485,1,0,0,0,0,0,0,0,0,0,0,0,0,0 85 | activity_1259388,0,0,0,0,0,0,0,1,0,0,0,0,0,0 86 | activity_588692,0,0,0,0,0,0,0,0,0,0,0,0,0,0 87 | activity_720706,0,0,0,0,0,0,0,0,0,0,0,0,0,0 88 | activity_602393,0,0,0,0,0,0,0,1,0,0,0,0,0,0 89 | activity_2099,0,0,0,0,0,0,0,0,0,0,0,0,0,0 90 | activity_504621,0,0,0,0,0,0,0,0,0,0,0,0,0,0 91 | activity_1813,0,0,0,0,0,0,0,0,0,0,0,0,0,0 92 | activity_1814,0,0,0,0,0,0,0,0,0,0,0,0,0,0 93 | activity_1662,0,0,0,0,0,0,1,0,0,0,0,0,0,0 94 | activity_2221,0,1,0,0,0,0,0,0,0,0,0,0,0,0 95 | activity_940,1,0,0,0,0,0,0,0,0,0,0,0,0,0 96 | activity_1422,1,0,0,0,0,0,0,0,0,0,0,0,0,0 97 | activity_1021,0,0,0,0,0,0,0,0,0,0,0,0,0,0 98 | activity_1022,0,0,0,0,0,0,0,0,0,0,0,0,0,0 99 | activity_817,0,0,0,0,0,0,0,0,0,0,0,0,0,0 100 | activity_602162,0,0,0,0,0,0,0,0,0,0,0,1,0,0 101 | activity_588499,0,0,0,0,1,0,0,0,0,0,0,0,0,0 102 | activity_504490,0,0,0,0,0,0,0,0,0,0,0,0,0,0 103 | activity_488847,0,0,0,0,0,0,1,0,0,0,0,0,0,0 104 | activity_2557,1,0,0,0,0,0,0,0,0,0,0,0,0,0 105 | activity_624170,0,0,0,1,0,0,0,0,0,0,0,0,0,0 106 | activity_624171,0,0,0,0,0,1,0,0,0,0,0,0,0,0 107 | activity_624414,0,0,0,0,0,0,0,0,1,0,0,0,0,0 108 | activity_1347076,0,0,0,0,0,0,0,0,0,0,0,0,0,0 109 | activity_1347075,0,0,0,0,0,0,0,0,0,0,0,0,0,0 110 | activity_624415,0,0,0,0,0,0,0,0,1,0,0,0,0,0 111 | activity_1347120,0,0,0,0,0,0,0,0,0,0,0,0,0,0 112 | activity_504339,0,0,0,0,0,0,0,1,0,0,0,0,0,0 113 | activity_504891,0,0,0,0,0,0,0,0,0,0,0,0,0,0 114 | activity_540317,0,0,0,0,0,0,0,1,0,0,0,0,0,0 115 | activity_624202,0,1,0,0,0,0,0,0,0,0,0,0,0,0 116 | activity_720707,0,0,0,0,0,0,0,0,0,0,0,0,0,0 117 | activity_720708,0,0,0,0,0,0,0,0,0,0,0,0,0,0 118 | activity_720709,0,0,0,0,0,0,0,0,0,0,0,0,0,0 119 | activity_720711,0,0,0,0,0,0,0,0,0,0,0,0,0,0 120 | activity_504333,0,0,0,0,0,0,0,1,0,0,0,0,0,0 121 | activity_504332,0,0,0,0,0,0,0,0,0,0,0,0,0,0 122 | activity_540253,0,0,0,0,0,0,0,0,0,0,0,0,0,0 123 | 
activity_504842,0,0,0,0,0,0,0,0,0,0,0,0,0,0 124 | activity_1463,0,0,0,0,0,0,0,0,0,0,0,0,0,0 125 | activity_1461,1,0,0,0,0,0,0,0,0,0,0,0,0,0 126 | activity_1347165,0,0,0,0,0,0,0,0,0,0,0,0,0,0 127 | activity_1457,0,0,0,1,0,0,0,0,0,0,0,0,0,0 128 | activity_2417,0,0,0,0,0,0,0,0,0,0,0,0,0,0 129 | activity_1454,0,0,0,0,0,0,1,0,0,0,0,0,0,0 130 | activity_1948,0,0,0,0,0,0,0,0,0,0,0,0,0,0 131 | activity_602449,0,0,0,0,0,0,0,0,0,0,0,0,0,0 132 | activity_1224865,0,0,0,0,0,0,0,0,0,0,0,0,0,0 133 | activity_588473,1,0,0,0,0,0,0,0,0,0,0,0,0,0 134 | activity_588475,1,0,0,0,0,0,0,0,0,0,0,0,0,0 135 | activity_493187,0,0,0,0,0,0,0,0,0,1,0,0,0,0 136 | activity_504462,0,1,0,0,0,0,0,0,0,0,0,0,0,0 137 | activity_463195,0,0,0,0,0,0,0,0,0,0,0,0,0,0 138 | activity_485346,0,0,0,0,0,0,0,0,0,0,0,0,0,0 139 | activity_1496,0,0,0,0,0,1,0,0,0,0,0,0,0,0 140 | activity_1012,0,1,0,0,0,0,0,0,0,0,0,0,0,0 141 | activity_1216,0,0,0,0,0,0,0,0,0,0,0,0,0,0 142 | activity_1259310,0,0,0,0,0,0,0,0,0,0,0,0,0,0 143 | activity_743269,0,0,0,1,0,0,0,1,0,0,0,0,0,0 144 | activity_652017,0,0,0,0,0,0,0,1,0,0,0,0,0,0 145 | activity_2642,0,0,0,0,0,0,0,0,1,0,0,0,0,0 146 | activity_488922,0,0,0,0,0,0,0,0,1,0,0,0,0,0 147 | activity_2237,0,0,0,0,0,0,0,0,0,0,0,0,0,0 148 | activity_2247,0,0,0,0,0,0,0,0,0,0,0,0,0,0 149 | activity_624483,0,0,0,0,0,0,0,0,0,0,0,0,0,0 150 | activity_1415,0,0,0,0,0,0,0,0,0,0,0,0,0,0 151 | activity_1423,0,1,0,0,0,0,0,0,0,0,0,0,0,0 152 | activity_1439,0,0,0,0,0,0,0,0,0,0,0,0,0,0 153 | activity_1441,0,1,0,0,0,0,0,0,0,0,0,0,0,0 154 | activity_950,0,0,0,0,0,0,0,0,1,0,0,0,0,0 155 | activity_951,0,0,0,0,0,0,0,0,0,0,0,0,0,0 156 | activity_952,0,0,0,0,0,0,0,0,0,0,0,0,0,0 157 | activity_1007,0,0,0,0,0,0,0,0,1,0,0,0,0,0 158 | activity_1008,0,0,0,0,0,0,0,0,0,0,0,0,0,0 159 | activity_1009,0,0,0,0,0,0,0,0,0,0,0,0,0,0 160 | activity_624263,0,0,0,0,0,0,0,0,0,0,0,0,0,0 161 | activity_651819,0,0,0,0,0,0,0,0,0,0,0,0,0,0 162 | activity_602438,0,0,0,0,0,0,0,0,0,0,0,0,0,0 163 | activity_651636,1,0,0,0,0,0,0,0,0,0,0,0,0,0 164 | activity_493098,1,0,0,0,0,0,0,0,0,0,0,0,0,0 165 | activity_2521,1,0,0,0,0,0,0,0,0,0,0,0,0,0 166 | activity_2520,1,0,0,0,0,0,0,0,0,0,0,0,0,0 167 | activity_1656,0,0,0,0,0,0,0,0,0,0,0,0,0,0 168 | activity_1259374,0,0,0,0,0,0,0,0,0,0,0,0,0,0 169 | activity_1259422,0,0,0,0,0,0,0,0,0,0,0,0,0,0 170 | activity_686940,0,0,1,0,0,0,0,0,0,0,0,0,0,0 171 | activity_720508,0,0,0,0,0,0,0,0,0,0,0,0,0,0 172 | activity_720509,0,0,0,0,0,0,0,0,0,0,0,0,0,0 173 | activity_720543,0,0,0,0,0,0,0,0,0,1,0,0,0,0 174 | activity_720582,0,0,0,0,1,0,0,0,0,0,0,0,0,0 175 | activity_720648,0,0,0,0,1,0,0,0,0,0,0,0,0,0 176 | activity_720700,0,1,0,0,0,0,0,0,0,0,0,0,0,0 177 | activity_720704,0,1,0,0,0,0,0,0,0,0,0,0,0,0 178 | activity_651958,0,1,0,0,0,0,0,0,0,0,0,0,0,0 179 | activity_602396,0,0,1,0,0,0,0,0,0,0,0,0,0,0 180 | activity_624169,1,0,0,0,0,0,0,0,0,0,0,0,0,0 181 | activity_651602,0,1,0,0,0,0,0,0,0,0,0,0,0,0 182 | activity_602229,0,0,1,0,0,0,0,0,0,0,0,0,0,0 183 | activity_588352,0,0,0,0,0,0,0,1,0,0,0,0,0,0 184 | activity_588726,0,0,0,0,0,0,0,0,0,0,0,0,0,0 185 | activity_588354,0,0,0,0,0,0,0,1,0,0,0,0,0,0 186 | activity_651719,1,0,0,0,0,0,0,0,0,0,0,0,0,0 187 | activity_588335,0,0,0,0,0,0,0,0,0,0,0,0,0,0 188 | activity_504326,1,0,0,0,0,0,0,0,0,0,0,0,0,0 189 | activity_504357,1,0,0,0,0,0,0,0,0,0,0,0,0,0 190 | activity_434989,1,0,0,0,0,0,0,0,0,0,0,0,0,0 191 | activity_435030,0,0,0,0,0,0,0,0,0,0,0,0,1,0 192 | activity_449728,0,0,0,0,0,0,0,0,0,0,0,0,0,0 193 | activity_463079,0,0,0,0,0,0,0,0,0,0,0,0,0,0 194 | activity_463141,0,0,0,0,1,0,0,0,0,0,0,0,0,0 195 | 
activity_463210,0,0,0,0,1,0,0,0,0,0,0,0,0,0 196 | activity_485270,1,0,0,0,0,0,0,0,0,0,0,0,0,0 197 | activity_485272,0,1,0,0,0,0,0,0,0,0,0,0,0,0 198 | activity_2435,1,0,0,0,0,0,0,0,0,0,0,0,0,0 199 | activity_2445,1,0,0,0,0,0,0,0,0,0,0,0,0,0 200 | activity_2606,0,0,0,0,1,0,0,0,0,0,0,0,0,0 201 | activity_2796,0,0,0,0,0,1,0,0,0,0,0,0,0,0 202 | activity_2797,1,0,0,0,0,0,0,0,0,0,0,0,0,0 203 | activity_2280,0,0,0,0,0,0,0,0,0,0,0,0,0,0 204 | activity_1987,0,0,0,0,0,0,0,0,0,0,0,0,0,1 205 | activity_2235,0,0,0,0,0,0,0,0,0,0,0,0,0,1 206 | activity_2300,0,0,1,0,0,0,0,0,0,0,0,0,0,0 207 | activity_1899,0,1,0,0,0,0,0,0,0,0,0,0,0,0 208 | activity_1950,0,0,0,0,0,0,0,0,0,0,0,0,0,0 209 | activity_1962,0,1,0,0,0,0,0,0,0,0,0,0,0,0 210 | activity_1527,0,0,0,1,0,0,0,0,0,0,0,0,0,0 211 | activity_1556,0,0,0,1,0,0,0,0,0,0,0,0,0,0 212 | activity_1700,0,0,0,0,0,1,0,0,0,0,0,0,0,0 213 | activity_1706,0,1,0,0,0,0,0,0,0,0,0,0,0,0 214 | activity_1789,0,0,0,0,0,0,0,0,0,0,0,0,0,0 215 | activity_1800,0,0,0,0,0,0,0,0,0,0,0,0,0,0 216 | activity_1825,0,0,0,0,0,0,0,0,0,0,0,0,0,0 217 | activity_1861,1,0,0,0,0,0,0,0,0,0,0,0,0,0 218 | activity_1304,1,0,0,0,0,0,0,0,0,0,0,0,0,0 219 | activity_1359,1,0,0,0,0,0,0,0,0,0,0,0,0,0 220 | activity_1416,0,0,0,0,0,0,1,0,0,0,0,0,0,0 221 | activity_1424,0,0,0,0,0,0,0,0,1,0,0,0,0,0 222 | activity_1446,0,0,0,0,0,0,1,0,0,0,0,0,0,0 223 | activity_1448,0,0,0,0,0,0,0,0,0,0,0,0,0,0 224 | activity_1481,0,0,0,0,0,0,0,0,0,0,0,1,0,0 225 | activity_1486,0,0,0,0,0,0,0,0,0,0,0,0,0,0 226 | activity_1509,1,0,0,0,0,0,0,0,0,0,0,0,0,0 227 | activity_1510,1,0,0,0,0,0,0,0,0,0,0,0,0,0 228 | activity_1515,0,0,0,1,0,0,0,0,0,0,0,0,0,0 229 | activity_1040,1,0,0,0,0,0,0,0,0,0,0,0,0,0 230 | activity_861,0,0,0,0,0,0,0,0,0,0,0,0,0,0 231 | activity_793,1,0,0,0,0,0,0,0,0,0,0,0,0,0 232 | activity_803,1,0,0,0,0,0,0,0,0,0,0,0,0,0 233 | activity_828,1,0,0,0,0,0,0,0,0,0,0,0,0,0 234 | activity_623870,0,0,0,0,0,0,0,0,0,0,0,0,0,0 235 | activity_743287,0,0,0,0,0,0,0,0,0,0,0,0,0,0 236 | activity_602329,0,1,0,0,0,0,0,0,0,0,0,0,0,0 237 | activity_651654,0,0,0,0,0,0,0,0,0,0,0,0,0,0 238 | activity_602405,0,0,0,0,0,0,0,0,0,0,0,0,0,0 239 | activity_651610,0,0,0,0,0,0,0,0,0,0,0,0,0,0 240 | activity_652154,0,0,0,0,0,1,0,0,0,0,0,0,0,0 241 | activity_504884,0,0,0,0,0,0,0,0,0,0,0,0,0,0 242 | activity_602481,0,0,0,0,0,0,0,0,0,1,0,0,0,0 243 | activity_652163,0,0,0,0,0,0,0,0,0,0,0,0,0,0 244 | activity_651710,0,0,0,0,0,0,0,0,0,0,0,0,0,0 245 | activity_588549,0,1,0,0,0,0,0,0,0,0,0,0,0,0 246 | activity_651711,0,1,0,0,0,0,0,0,0,0,0,0,0,0 247 | activity_652162,0,0,0,1,0,0,0,0,0,0,0,0,0,0 248 | activity_588436,0,0,0,0,0,0,0,0,0,0,0,0,0,0 249 | activity_651687,0,0,0,0,0,0,0,0,0,0,0,0,0,0 250 | activity_651723,0,0,0,0,0,0,0,0,0,0,0,0,0,0 251 | activity_588391,0,0,0,0,0,0,0,0,0,0,0,0,0,0 252 | activity_686996,0,0,0,0,1,0,0,0,0,0,0,0,0,0 253 | activity_720511,0,0,0,0,0,0,0,0,1,0,0,0,0,0 254 | activity_651640,0,0,0,0,0,0,0,0,0,0,0,0,0,0 255 | activity_651661,0,0,0,0,0,0,0,0,0,0,0,0,0,0 256 | activity_743247,0,0,0,0,0,0,0,0,0,0,0,0,0,0 257 | activity_504558,0,0,0,0,0,0,0,0,0,0,0,0,0,0 258 | activity_652197,0,0,0,0,1,0,0,0,0,0,0,0,0,0 259 | activity_488895,0,0,0,0,0,0,0,0,0,0,0,0,0,0 260 | activity_488965,0,0,0,1,0,0,0,0,0,0,0,0,0,0 261 | activity_493131,0,0,0,0,0,0,0,0,0,0,0,0,0,0 262 | activity_504523,0,0,0,0,0,1,0,0,0,0,0,0,0,0 263 | activity_504582,0,0,0,0,0,0,0,0,0,0,0,0,0,0 264 | activity_504423,0,0,0,0,0,0,0,0,0,0,0,0,0,0 265 | activity_602346,0,0,0,0,0,0,0,0,0,0,0,0,0,0 266 | activity_602342,0,0,0,0,0,0,0,0,0,0,0,0,0,0 267 | 
activity_623901,0,0,0,0,0,0,0,0,0,0,0,0,0,0 268 | activity_743397,0,0,0,0,0,0,0,0,0,0,0,0,0,0 269 | activity_488839,0,0,0,0,0,0,1,0,0,0,0,0,0,0 270 | activity_602340,0,0,0,0,0,0,0,0,0,0,0,0,0,0 271 | activity_488899,0,0,0,0,0,1,0,0,0,0,0,0,0,0 272 | activity_540336,0,0,0,0,0,0,0,0,0,0,0,0,0,0 273 | activity_504406,0,0,0,0,0,0,0,0,0,0,0,0,0,0 274 | activity_488896,1,0,0,0,0,0,0,0,0,0,0,0,0,0 275 | activity_504775,1,0,0,0,0,0,0,0,0,0,0,0,0,0 276 | activity_743445,0,0,0,0,0,0,0,1,0,0,0,0,0,0 277 | activity_743398,0,0,0,0,0,0,0,0,0,0,0,0,0,0 278 | activity_588334,0,0,0,0,0,0,0,0,0,0,0,0,0,0 279 | activity_686992,0,0,0,0,0,0,0,0,0,0,0,0,0,0 280 | activity_2016,0,0,0,0,0,0,0,0,0,0,0,1,0,0 281 | activity_2025,0,0,0,0,0,0,0,0,0,0,0,0,0,0 282 | activity_2023,0,0,0,0,1,0,0,0,0,0,0,0,0,0 283 | activity_2066,0,0,0,0,0,0,0,0,0,0,0,1,0,0 284 | activity_2029,0,0,0,0,0,0,0,0,0,1,0,0,0,0 285 | activity_2462,0,0,0,0,0,0,0,0,0,0,0,0,0,0 286 | activity_2661,0,0,0,0,0,0,1,0,0,0,0,0,0,0 287 | activity_2716,0,0,0,0,0,0,0,0,0,0,0,0,0,0 288 | activity_2718,0,0,0,0,0,0,0,1,0,0,0,0,0,0 289 | activity_2650,0,0,0,0,0,0,1,0,0,0,0,0,0,0 290 | activity_435005,0,0,0,0,0,0,0,0,0,0,0,0,0,0 291 | activity_1885,0,0,0,0,0,0,0,0,0,0,0,0,0,0 292 | activity_1979,0,0,0,0,0,0,0,0,0,0,0,0,0,0 293 | activity_2097,0,0,0,0,0,0,1,0,0,0,0,0,0,0 294 | activity_2717,0,0,0,0,0,0,0,0,0,0,0,0,0,0 295 | activity_1910,0,0,0,0,0,0,0,0,0,0,0,0,0,0 296 | activity_841,0,0,0,0,0,0,0,0,0,0,0,0,0,0 297 | activity_1053175,0,0,0,0,0,0,0,0,0,0,0,0,0,0 298 | activity_1259309,0,0,0,0,0,0,0,0,0,0,0,0,0,0 299 | activity_1259311,0,0,0,0,0,0,0,0,0,0,0,0,0,0 300 | activity_720641,0,1,0,0,0,0,0,0,0,0,0,0,0,0 301 | activity_588519,0,0,0,0,1,0,0,0,0,0,0,0,0,0 302 | activity_492967,0,0,0,0,0,0,0,0,0,0,0,0,0,0 303 | activity_504770,0,0,0,0,0,0,0,0,0,0,0,0,0,0 304 | activity_485395,0,1,0,0,0,0,0,0,0,0,0,0,0,0 305 | activity_493033,0,1,0,0,0,0,0,0,0,0,0,0,0,0 306 | activity_540299,0,0,0,0,0,0,0,0,0,0,0,0,0,0 307 | activity_651702,0,0,0,0,0,0,0,0,0,0,0,0,0,0 308 | activity_588497,0,1,0,0,0,0,0,0,0,0,0,0,0,0 309 | activity_588501,0,0,0,0,1,0,0,0,0,0,0,0,0,0 310 | activity_504577,0,0,0,1,0,0,0,0,0,0,0,0,0,0 311 | activity_485275,0,0,0,0,0,0,0,0,0,0,0,0,0,0 312 | activity_2690,0,0,0,0,0,0,0,0,0,0,0,0,0,0 313 | activity_1347041,0,0,0,0,0,0,0,0,0,0,0,0,0,0 314 | activity_602332,0,0,0,0,0,0,0,0,0,0,0,0,0,0 315 | activity_686979,0,0,0,0,0,0,0,0,0,0,0,0,1,0 316 | activity_686978,0,0,0,0,0,0,0,0,0,0,0,0,1,0 317 | activity_624173,0,0,0,0,0,0,0,0,0,0,0,0,0,0 318 | activity_1347131,0,0,0,0,0,0,0,0,0,1,0,0,0,0 319 | activity_1159524,0,0,0,0,0,0,0,0,0,0,0,0,0,0 320 | activity_1347071,1,0,0,0,0,0,0,0,0,0,0,0,0,0 321 | activity_1259415,0,0,0,0,0,0,0,0,0,0,0,0,0,0 322 | activity_588453,0,0,0,0,0,0,0,0,0,0,1,0,0,0 323 | activity_588456,0,0,0,0,0,0,0,0,0,0,1,0,0,0 324 | activity_588795,0,0,0,1,0,0,0,0,0,0,0,0,0,0 325 | activity_602179,0,0,0,0,0,0,0,0,0,0,1,0,0,0 326 | activity_686970,0,0,0,0,0,0,0,0,0,0,1,0,0,0 327 | activity_686971,0,0,0,0,0,0,0,0,0,0,0,0,0,0 328 | activity_485364,0,1,0,0,0,0,0,0,0,0,0,0,0,0 329 | activity_2528,0,1,0,0,0,0,0,0,0,0,0,0,0,0 330 | activity_485290,0,0,0,0,0,0,0,0,0,0,0,0,1,0 331 | activity_2517,0,1,0,0,0,0,0,0,0,0,0,0,0,0 332 | activity_485281,0,0,0,0,0,0,0,0,0,0,0,0,0,0 333 | activity_624296,0,0,0,0,0,0,0,0,0,0,0,0,0,0 334 | activity_624297,0,0,0,0,0,0,0,0,0,0,0,0,0,0 335 | activity_651820,0,0,0,0,0,0,0,0,0,0,0,0,0,0 336 | activity_540267,0,0,0,0,0,0,0,0,0,0,0,0,0,0 337 | activity_504937,0,0,0,0,0,0,0,0,0,0,0,0,1,0 338 | 
activity_624288,0,0,0,0,0,0,0,0,0,0,0,0,0,0 339 | activity_624287,0,0,0,0,0,0,0,0,0,0,0,0,0,0 340 | activity_463254,0,0,0,0,1,0,0,0,0,0,0,0,0,0 341 | activity_720580,0,0,0,0,0,0,0,0,0,0,0,0,0,0 342 | activity_504466,0,0,0,0,0,0,0,0,0,0,0,0,0,0 343 | activity_504467,0,0,0,0,0,0,0,0,0,0,0,0,0,0 344 | activity_652105,0,0,0,0,0,0,0,0,0,1,0,0,0,0 345 | activity_2101,0,1,0,0,0,0,0,0,0,0,0,0,0,0 346 | activity_504834,0,0,0,0,0,0,0,0,0,0,0,0,0,0 347 | activity_485349,0,0,0,0,0,0,1,0,0,0,0,0,0,0 348 | activity_2685,0,0,0,0,0,0,0,0,0,0,0,0,0,0 349 | activity_720579,0,0,0,0,0,0,0,0,0,0,0,0,0,0 350 | activity_504832,0,0,0,0,0,0,0,0,0,0,0,0,0,0 351 | activity_2100,0,0,0,1,0,0,0,0,0,0,0,0,0,0 352 | activity_2314,0,0,0,1,0,0,0,0,0,0,0,0,0,0 353 | activity_2472,0,0,0,0,0,0,0,0,0,0,1,0,0,0 354 | activity_2451,0,1,0,0,0,0,0,0,0,0,0,0,0,0 355 | activity_1868,0,0,0,0,0,0,0,0,0,1,0,0,0,0 356 | activity_1460,0,0,0,0,0,0,0,0,0,0,0,0,0,0 357 | activity_1768,0,0,0,0,0,0,0,1,0,0,0,0,0,0 358 | activity_1631,0,1,0,0,0,0,0,0,0,0,0,0,0,0 359 | activity_1634,0,1,0,0,0,0,0,0,0,0,0,0,0,0 360 | activity_2107,0,0,0,1,0,0,0,0,0,0,0,0,0,0 361 | activity_1471,0,0,0,0,0,0,0,0,0,0,0,0,0,0 362 | activity_1688,0,0,0,0,0,0,0,0,0,0,0,0,0,0 363 | activity_1030,0,0,0,0,0,0,0,0,0,0,1,0,0,0 364 | activity_1379,0,1,0,0,0,0,0,0,0,0,0,0,0,0 365 | activity_2242,0,0,0,1,0,0,0,0,0,0,0,0,0,0 366 | activity_1476,0,0,0,0,1,0,0,0,0,0,0,0,0,0 367 | activity_1478,0,0,0,0,1,0,0,0,0,0,0,0,0,0 368 | activity_1452,0,1,0,0,0,0,0,0,0,0,0,0,0,0 369 | activity_894,0,1,0,0,0,0,0,0,0,0,0,0,0,0 370 | activity_902,0,0,0,0,0,1,0,0,0,0,0,0,0,0 371 | activity_924,0,0,0,0,0,1,0,0,0,0,0,0,0,0 372 | activity_1347417,0,0,0,0,0,0,0,0,0,0,0,0,0,0 373 | activity_881,0,0,0,0,0,0,0,0,0,0,1,0,0,0 374 | activity_624168,0,1,0,0,0,0,0,0,0,0,0,0,0,0 375 | activity_624204,0,0,0,0,1,0,0,0,0,0,0,0,0,0 376 | activity_624354,1,0,0,0,0,0,0,0,0,0,0,0,0,0 377 | activity_651560,0,0,0,0,0,0,0,0,0,0,0,0,0,1 378 | activity_602244,1,0,0,0,0,0,0,0,0,0,0,0,0,0 379 | activity_602261,0,0,0,0,0,0,0,0,0,1,0,0,0,0 380 | activity_602274,0,0,0,0,0,0,0,0,0,0,0,0,0,0 381 | activity_602399,0,0,0,0,0,0,0,0,0,1,0,0,0,0 382 | activity_588458,0,0,0,0,0,0,0,1,0,0,0,0,0,0 383 | activity_588489,0,0,0,0,0,0,0,0,0,0,0,0,0,0 384 | activity_588492,0,0,0,0,0,0,0,0,0,0,0,0,0,0 385 | activity_588621,0,0,0,0,0,0,0,0,0,0,0,0,0,1 386 | activity_588850,0,0,0,0,0,0,0,0,1,0,0,0,0,0 387 | activity_602141,0,0,0,0,0,0,0,0,0,0,0,0,0,0 388 | activity_504720,0,1,0,0,0,0,0,0,0,0,0,0,0,0 389 | activity_493091,0,0,0,0,0,0,0,0,0,0,0,0,0,1 390 | activity_493160,0,0,0,0,0,0,0,0,0,1,0,0,0,0 391 | activity_493011,0,0,0,1,0,0,0,0,0,0,0,0,0,0 392 | activity_493012,0,0,0,1,0,0,0,0,0,0,0,0,0,0 393 | activity_449763,0,0,0,0,0,0,0,0,0,0,0,0,0,0 394 | activity_463104,0,0,0,0,0,0,0,0,0,0,0,0,0,0 395 | activity_463190,0,0,0,0,0,0,0,0,0,0,0,0,0,0 396 | activity_463212,0,0,0,0,0,0,0,0,0,0,0,0,0,0 397 | activity_435022,0,0,0,0,0,0,0,0,0,0,0,0,0,0 398 | activity_2380,0,0,0,0,0,0,0,0,0,0,0,0,0,0 399 | activity_2825,0,0,0,0,0,0,0,0,0,0,0,0,0,0 400 | activity_435003,0,0,0,0,0,0,0,0,0,0,0,0,0,0 401 | activity_651582,0,0,0,0,0,0,0,0,0,0,0,0,0,0 402 | activity_1443,0,0,0,0,0,0,0,0,0,0,0,0,0,0 403 | activity_1135,0,1,0,0,0,0,0,0,0,0,0,0,0,0 404 | activity_1217,0,0,0,0,0,0,0,0,0,0,1,0,0,0 405 | activity_1029,0,0,0,0,0,0,0,0,0,0,0,0,0,0 406 | activity_720637,0,0,0,0,0,0,0,0,0,0,0,0,0,0 407 | activity_1347034,0,0,0,0,1,0,0,0,0,0,0,0,0,0 408 | activity_1347037,0,0,0,0,1,0,0,0,0,0,0,0,0,0 409 | activity_602233,0,1,0,0,0,0,0,0,0,0,0,0,0,0 410 | 
activity_485367,0,1,0,0,0,0,0,0,0,0,0,0,0,0 411 | activity_1490,0,0,0,0,0,0,0,0,0,1,0,0,0,0 412 | activity_1468,0,0,0,0,0,0,0,0,0,0,0,0,0,0 413 | activity_602440,0,0,0,0,1,0,0,0,0,0,0,0,0,0 414 | activity_504690,0,1,0,0,0,0,0,0,0,0,0,0,0,0 415 | activity_588413,0,0,0,0,0,1,0,0,0,0,0,0,0,0 416 | activity_652067,0,0,1,0,0,0,0,0,0,0,0,0,0,0 417 | activity_652126,0,0,1,0,0,0,0,0,0,0,0,0,0,0 418 | activity_652257,0,0,0,0,0,0,0,1,0,0,0,0,0,0 419 | activity_686964,0,0,0,0,0,0,0,1,0,0,0,0,0,0 420 | activity_687014,0,0,1,0,0,0,0,0,0,0,0,0,0,0 421 | activity_687016,0,0,0,0,0,0,0,1,0,0,0,0,0,0 422 | activity_720596,0,0,0,0,0,0,0,0,0,0,0,0,0,0 423 | activity_720702,0,0,0,0,0,0,0,0,0,0,0,0,1,0 424 | activity_743126,0,0,0,1,0,0,0,0,0,0,0,0,0,0 425 | activity_1053197,0,0,0,1,0,0,0,0,0,0,0,0,0,0 426 | activity_651821,0,0,0,0,0,0,0,0,0,0,0,0,0,0 427 | activity_651957,0,0,1,0,0,0,0,0,0,0,0,0,0,0 428 | activity_652010,0,0,1,0,0,0,0,0,0,0,0,0,0,0 429 | activity_652039,0,0,0,0,1,0,0,0,0,0,0,0,0,0 430 | activity_602281,0,0,0,0,0,0,0,0,0,1,0,0,0,0 431 | activity_624267,0,0,0,0,0,0,1,0,0,0,0,0,0,0 432 | activity_624268,0,1,0,0,0,0,0,0,0,0,0,0,0,0 433 | activity_624377,0,0,0,0,0,0,0,0,0,0,0,0,0,0 434 | activity_624416,0,0,0,0,0,0,0,0,0,0,0,0,0,0 435 | activity_651718,0,0,0,0,0,0,0,0,0,0,1,0,0,0 436 | activity_651800,1,0,0,0,0,0,0,0,0,0,0,0,0,0 437 | activity_651572,0,0,0,0,0,0,0,0,0,0,0,0,0,0 438 | activity_602163,0,0,0,0,0,0,0,0,0,0,1,0,0,0 439 | activity_602123,0,0,0,0,1,0,0,0,0,0,0,0,0,0 440 | activity_540364,0,0,0,0,0,0,0,0,0,0,0,0,0,0 441 | activity_504411,0,1,0,0,0,0,0,0,0,0,0,0,0,0 442 | activity_504700,0,0,0,0,0,0,1,0,0,0,0,0,0,0 443 | activity_504707,0,0,0,0,0,0,1,0,0,0,0,0,0,0 444 | activity_504734,1,0,0,0,0,0,0,0,0,0,0,0,0,0 445 | activity_504766,0,0,1,0,0,0,0,0,0,0,0,0,0,0 446 | activity_504803,0,0,0,1,0,0,0,0,0,0,0,0,0,0 447 | activity_492953,0,1,0,0,0,0,0,0,0,0,0,0,0,0 448 | activity_492956,0,0,0,1,0,0,0,0,0,0,0,0,0,0 449 | activity_492972,0,0,0,1,0,0,0,0,0,0,0,0,0,0 450 | activity_493008,0,0,0,0,0,0,0,0,0,0,0,0,0,0 451 | activity_493087,0,1,0,0,0,0,0,0,0,0,0,0,0,0 452 | activity_493244,0,0,0,0,0,0,0,0,0,0,0,0,0,0 453 | activity_463082,0,0,0,1,0,0,0,0,0,0,0,0,0,0 454 | activity_2751,0,1,0,0,0,0,0,0,0,0,0,0,0,0 455 | activity_434962,0,1,0,0,0,0,0,0,0,0,0,0,0,0 456 | activity_2130,0,0,0,1,0,0,0,0,0,0,0,0,0,0 457 | activity_2174,0,0,0,1,0,0,0,0,0,0,0,0,0,0 458 | activity_2177,0,0,0,1,0,0,0,0,0,0,0,0,0,0 459 | activity_2234,0,0,0,0,0,1,0,0,0,0,0,0,0,0 460 | activity_2057,0,0,0,0,0,0,0,0,0,0,0,0,0,0 461 | activity_2129,0,0,0,0,0,0,0,0,1,0,0,0,0,0 462 | activity_1906,0,0,0,0,1,0,0,0,0,0,0,0,0,0 463 | activity_1947,0,0,0,1,0,0,0,0,0,0,0,0,0,0 464 | activity_1974,0,1,0,0,0,0,0,0,0,0,0,0,0,0 465 | activity_1822,0,1,0,0,0,0,0,0,0,0,0,0,0,0 466 | activity_1845,0,0,0,0,0,0,0,0,0,0,0,0,0,0 467 | activity_1321,0,0,0,0,0,0,1,0,0,0,0,0,0,0 468 | activity_631,0,0,1,0,0,0,0,1,0,0,0,0,0,0 469 | activity_731,0,0,1,0,0,0,0,1,0,0,0,0,0,0 470 | activity_1032,0,0,1,0,0,0,0,0,0,0,0,0,0,0 471 | activity_1203,0,0,0,0,0,0,0,0,0,0,0,0,0,0 472 | activity_920,0,0,0,0,0,1,0,0,0,0,0,0,0,0 473 | activity_932,0,0,0,0,0,1,0,0,0,0,0,0,0,0 474 | activity_862,0,0,0,0,0,0,0,0,0,0,0,0,0,0 475 | activity_871,0,0,0,0,0,0,0,0,0,0,0,0,0,0 476 | activity_743093,0,0,0,0,0,0,0,0,0,0,0,0,0,0 477 | activity_2052,1,0,0,0,0,0,0,0,0,0,0,0,0,0 478 | activity_1529,0,0,0,0,0,0,1,0,0,0,0,0,0,0 479 | activity_1530,0,0,0,0,0,0,1,0,0,0,0,0,0,0 480 | activity_1531,0,0,0,0,0,0,1,0,0,0,0,0,0,0 481 | activity_758,0,0,0,1,0,0,0,0,0,0,0,0,0,0 482 | 
activity_760,0,0,0,1,0,0,0,0,0,0,0,0,0,0 483 | activity_761,0,0,0,0,0,0,0,0,0,0,0,0,0,0 484 | activity_757,0,0,0,1,0,0,0,0,0,0,0,0,0,0 485 | activity_759,0,0,0,1,0,0,0,0,0,0,0,0,0,0 486 | activity_764,0,0,0,1,0,0,0,0,0,0,0,0,0,0 487 | activity_1325,0,0,0,0,0,0,0,0,0,0,0,1,0,0 488 | activity_1326,0,0,0,0,0,0,0,0,0,0,0,1,0,0 489 | activity_493014,0,0,0,0,0,0,0,0,0,0,0,0,0,0 490 | activity_588664,0,0,0,0,0,0,1,0,0,0,0,0,0,0 491 | activity_602410,0,0,0,0,0,0,0,0,1,0,0,0,0,0 492 | activity_623877,0,0,0,0,0,0,0,0,1,0,0,0,0,0 493 | activity_488975,0,0,0,0,0,0,0,0,0,0,0,1,0,0 494 | activity_488977,0,0,0,0,0,0,0,0,0,0,0,1,0,0 495 | activity_588511,0,0,0,0,0,0,0,0,1,0,0,0,0,0 496 | activity_588627,1,0,0,0,0,0,0,0,0,0,0,0,0,0 497 | activity_588675,1,0,0,0,0,0,0,0,0,0,0,0,0,0 498 | activity_588676,1,0,0,0,0,0,0,0,0,0,0,0,0,0 499 | activity_1511,0,0,0,0,0,0,0,0,1,0,0,0,0,0 500 | activity_1672,0,0,0,0,0,0,0,0,1,0,0,0,0,0 501 | activity_2550,0,0,0,0,0,0,0,0,1,0,0,0,0,0 502 | activity_2553,0,0,0,0,0,0,0,0,1,0,0,0,0,0 503 | activity_2648,0,0,0,0,0,0,0,0,1,0,0,0,0,0 504 | activity_463111,0,0,0,0,0,0,0,0,0,0,0,0,0,0 505 | activity_463165,0,0,0,0,0,0,0,0,0,0,0,0,0,0 506 | activity_2156,0,0,0,0,0,0,0,0,1,0,0,0,0,0 507 | activity_2227,0,0,0,0,0,0,0,0,0,0,0,0,0,0 508 | activity_2239,0,0,0,0,0,0,0,0,1,0,0,0,0,0 509 | activity_624037,1,0,0,0,0,0,0,0,0,0,0,0,0,0 510 | activity_624038,1,0,0,0,0,0,0,0,0,0,0,0,0,0 511 | activity_624040,1,0,0,0,0,0,0,0,0,0,0,0,0,0 512 | activity_624125,1,0,0,0,0,0,0,0,0,0,0,0,0,0 513 | activity_624126,1,0,0,0,0,0,0,0,0,0,0,0,0,0 514 | activity_624127,1,0,0,0,0,0,0,0,0,0,0,0,0,0 515 | activity_624466,1,0,0,0,0,0,0,0,0,0,0,0,0,0 516 | activity_624467,1,0,0,0,0,0,0,0,0,0,0,0,0,0 517 | activity_602247,0,0,0,0,0,0,0,0,0,0,0,0,0,0 518 | activity_602248,0,0,0,0,0,0,0,0,0,0,0,0,0,0 519 | activity_602250,0,0,0,0,0,0,0,0,0,0,0,0,0,0 520 | activity_588814,1,0,0,0,0,0,0,0,0,0,0,0,0,0 521 | activity_588819,1,0,0,0,0,0,0,0,0,0,0,0,0,0 522 | activity_588852,1,0,0,0,0,0,0,0,0,0,0,0,0,0 523 | activity_504634,1,0,0,0,0,0,0,0,0,0,0,0,0,0 524 | activity_504692,1,0,0,0,0,0,0,0,0,0,0,0,0,0 525 | activity_504454,1,0,0,0,0,0,0,0,0,0,0,0,0,0 526 | activity_652048,1,0,0,0,0,0,0,0,0,0,0,0,0,0 527 | activity_652051,1,0,0,0,0,0,0,0,0,0,0,0,0,0 528 | activity_493056,1,0,0,0,0,0,0,0,0,0,0,0,0,0 529 | activity_493084,1,0,0,0,0,0,0,0,0,0,0,0,0,0 530 | activity_504660,1,0,0,0,0,0,0,0,0,0,0,0,0,0 531 | activity_652054,1,0,0,0,0,0,0,0,0,0,0,0,0,0 532 | activity_624463,1,0,0,0,0,0,0,0,0,0,0,0,0,0 533 | activity_504651,1,0,0,0,0,0,0,0,0,0,0,0,0,0 534 | activity_624465,1,0,0,0,0,0,0,0,0,0,0,0,0,0 535 | activity_504652,1,0,0,0,0,0,0,0,0,0,0,0,0,0 536 | activity_624464,1,0,0,0,0,0,0,0,0,0,0,0,0,0 537 | activity_624291,0,0,0,0,0,0,0,0,0,0,0,0,0,0 538 | activity_485347,1,0,0,0,0,0,0,0,0,0,0,0,0,0 539 | activity_485344,1,0,0,0,0,0,0,0,0,0,0,0,0,0 540 | activity_485358,1,0,0,0,0,0,0,0,0,0,0,0,0,0 541 | activity_492947,1,0,0,0,0,0,0,0,0,0,0,0,0,0 542 | activity_504810,1,0,0,0,0,0,0,0,0,0,0,0,0,0 543 | activity_504812,1,0,0,0,0,0,0,0,0,0,0,0,0,0 544 | activity_540275,0,0,0,0,0,0,0,0,1,0,0,0,0,0 545 | activity_540277,0,0,0,0,0,0,0,0,1,0,0,0,0,0 546 | activity_493036,1,0,0,0,0,0,0,0,0,0,0,0,0,0 547 | activity_1469,0,0,1,0,0,0,0,0,0,0,0,0,0,0 548 | activity_1479,0,0,1,0,0,0,0,0,0,0,0,0,0,0 549 | activity_720552,0,0,0,0,0,1,0,0,0,0,0,0,0,0 550 | activity_720719,0,0,1,0,0,0,0,0,0,0,0,0,0,0 551 | activity_720725,0,0,1,0,0,0,0,0,0,0,0,0,0,0 552 | activity_743053,0,0,1,0,0,0,0,0,0,0,0,0,0,0 553 | activity_743054,0,0,1,0,0,0,0,0,0,0,0,0,0,0 
554 | activity_743063,0,0,1,0,0,0,0,0,0,0,0,0,0,0
555 | activity_743067,0,0,1,0,0,0,0,0,0,0,0,0,0,0
556 | activity_743077,0,0,1,0,0,0,0,0,0,0,0,0,0,0
557 | activity_743078,0,0,1,0,0,0,0,0,0,0,0,0,0,0
558 | activity_743091,0,0,1,0,0,0,0,0,0,0,0,0,0,0
559 | activity_743122,0,0,0,0,0,1,0,0,0,0,0,0,0,0
560 | activity_743139,0,0,0,0,0,0,0,0,0,0,0,0,0,0
561 | activity_743140,0,0,1,0,0,0,0,0,0,0,0,0,0,0
562 | activity_1159523,0,0,1,0,0,0,0,0,0,0,0,0,0,0
563 | activity_1159528,0,0,0,0,0,1,0,0,0,0,0,0,0,0
564 | activity_1159531,0,0,1,0,0,0,0,0,0,0,0,0,0,0
565 | activity_1159555,0,0,1,0,0,0,0,0,0,0,0,0,0,0
566 | activity_1224892,0,0,1,0,0,0,0,0,0,0,0,0,0,0
567 | activity_1224893,0,0,1,0,0,0,0,0,0,0,0,0,0,0
568 | activity_1224894,0,0,0,0,0,1,0,0,0,0,0,0,0,0
569 | activity_1224895,1,0,0,0,0,0,0,0,0,0,0,0,0,0
570 | activity_1224896,0,0,0,0,0,0,0,0,0,0,0,0,0,0
571 | activity_1259247,0,0,1,0,0,0,0,0,0,0,0,0,0,0
572 | activity_1259248,0,0,1,0,0,0,0,0,0,0,0,0,0,0
573 | activity_1259387,0,0,1,0,0,0,0,0,0,0,0,0,0,0
574 | activity_1259390,0,0,0,0,0,1,0,0,0,0,0,0,0,0
575 | activity_1259391,0,0,1,0,0,0,0,0,0,0,0,0,0,0
576 | activity_1259392,0,0,0,0,0,1,0,0,0,0,0,0,0,0
577 | activity_1259393,1,0,0,0,0,0,0,0,0,0,0,0,0,0
578 | activity_1259394,0,0,1,0,0,0,0,0,0,0,0,0,0,0
579 | activity_1259395,1,0,0,0,0,0,0,0,0,0,0,0,0,0
580 | activity_1259396,0,0,1,0,0,0,0,0,0,0,0,0,0,0
581 | activity_1259401,0,0,1,0,0,0,0,0,0,0,0,0,0,0
582 | activity_1259402,0,0,1,0,0,0,0,0,0,0,0,0,0,0
583 | activity_1259403,0,0,1,0,0,0,0,0,0,0,0,0,0,0
584 | activity_1259404,0,0,1,0,0,0,0,0,0,0,0,0,0,0
585 | activity_1347030,1,0,0,0,0,0,0,0,0,0,0,0,0,0
586 | activity_1347031,0,0,1,0,0,0,0,0,0,0,0,0,0,0
587 | activity_1347032,0,0,0,0,0,1,0,0,0,0,0,0,0,0
588 | activity_1347033,0,0,0,0,0,0,0,0,0,0,0,0,0,0
589 | activity_1347036,0,0,1,0,0,0,0,0,0,0,0,0,0,0
590 | activity_1347038,1,0,0,0,0,0,0,0,0,0,0,0,0,0
591 | activity_1159518,0,0,0,0,0,0,0,0,0,0,0,0,0,0
592 | activity_1159519,0,0,0,0,0,1,0,0,0,0,0,0,0,0
593 | activity_743199,0,0,1,0,0,0,0,0,0,0,0,0,0,0
594 | activity_743219,0,0,0,0,0,1,0,0,0,0,0,0,0,0
595 | activity_743226,0,0,1,0,0,0,0,0,0,0,0,0,0,0
596 | activity_743227,0,0,1,0,0,0,0,0,0,0,0,0,0,0
597 | activity_743228,0,0,0,0,0,0,0,0,0,0,0,0,0,0
598 | activity_743239,0,0,1,0,0,0,0,0,0,0,0,0,0,0
599 | activity_743240,0,0,1,0,0,0,0,0,0,0,0,0,0,0
600 | activity_743241,0,0,1,0,0,0,0,0,0,0,0,0,0,0
601 | activity_743242,0,0,1,0,0,0,0,0,0,0,0,0,0,0
602 |
--------------------------------------------------------------------------------
/Data/all_molecular_data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LumosBio/MolData/73002723e0d20d0a2fdbd3950e738ebff214eede/Data/all_molecular_data.zip
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The code and the software for this project are licensed under the MIT license, as described in the LICENSE.mit file.
2 |
3 | The data for this project is licensed under Creative Commons Zero or Creative Commons BY-SA, as described in the LICENSE.cczero file.
4 |
5 | The data sources used for the creation of the MolData dataset are referenced in "data/data_reference_list.txt" as well as in the license file for the data.
6 |
--------------------------------------------------------------------------------
/LICENSE.mit:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Transilico
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MolData - A Molecular Benchmark for Disease and Target Based Machine Learning
2 |
3 | Deep learning’s automatic feature extraction has been a revolutionary addition to computational drug discovery, providing both the capability of learning abstract features and of discovering complex molecular patterns directly from molecular data. Since biological and chemical knowledge is necessary for overcoming the challenges of data curation, balancing, training, and evaluation, it is important for databases to contain meaningful information regarding the exact target and disease of each bioassay. Existing repositories such as PubChem or ChEMBL offer screening data for millions of molecules against a variety of cells and targets; however, their bioassays contain complex biological descriptions which can hinder their usage by the machine learning community. In this work, a comprehensive disease- and target-based dataset is collected from PubChem in order to facilitate and accelerate molecular machine learning for better drug discovery. MolData is one of the largest efforts to date for democratizing molecular machine learning, with roughly 170 million drug screening results from 1.4 million unique molecules assigned to specific diseases and targets. It also provides 30 unique categories of targets and diseases. Correlation analysis of the MolData bioassays unveils valuable information for drug repurposing across multiple diseases, including cancer, metabolic disorders, and infectious diseases. Finally, we provide a benchmark of more than 30 models trained on each category using multitask learning. MolData aims to pave the way for computational drug discovery and to accelerate the advancement of molecular artificial intelligence in a practical manner.
4 |
5 | # Requirements
6 | Requirements for training the models and running the benchmark:\
7 | deepchem==2.5.0\
8 | rdkit==2020.09.1\
9 | tensorflow==2.5.0
10 |
11 | Requirements for clustering the bioassay text descriptions:\
12 | biobert-embedding==0.1.1\
13 | transformers\
14 | kneed\
15 | python-docx
16 |
17 | # How to Use
18 | After cloning, this repository can be used to perform training on the MolData dataset, or to create a molecular dataset from bioassays and their descriptions. To get benchmark results and to train models on the molecular data:\
19 | 1- Unzip the data within the data directory.\
20 | 2- Run training.py for training and evaluating a Graph Convolutional Neural Network, or a traditional ECFP-based fully connected network.\
21 | If you plan to work with bioassay descriptions, move forward to the "Preprocessing Bioassay Descriptions" section. In short, you would need to preprocess the descriptions, download the molecular data, and clean and partition the molecular data.
22 |
23 | # Data
24 | The MolData dataset can be accessed from the data directory after unzipping. all_molecular_data includes 1.4 million molecules, 600 columns of binary bioactivity labels, and the splits (training, validation, or test) the molecules belong to. aid_disease_mapping contains the mapping between bioassays and their related category of diseases, and aid_target_mapping contains the same for categories of targets. To have accurate and comparable results, please use the provided split labels for training and evaluation, as shown in the loading sketch below.
25 |
26 | The data sources used for the creation of the MolData dataset, gathered from the PubChem database, are referenced within "data/data_reference_list.txt".
27 |
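As a quick orientation, here is a minimal loading sketch. The CSV filename is an assumption (use whatever file all_molecular_data.zip actually extracts to); the `split` values and the `activity_*` column convention follow the repository's own scripts.

```python
import pandas as pd

# Hypothetical filename -- point this at the CSV extracted from all_molecular_data.zip.
df = pd.read_csv('Data/all_molecular_data.csv', low_memory=False)

# Binary bioactivity labels live in the 'activity_<AID>' columns (1, 0, or missing).
activity_columns = [c for c in df.columns if 'activity' in c]

# Use the provided split labels instead of re-splitting the molecules.
valid_df = df[df['split'] == 'validation']
test_df = df[df['split'] == 'test']
train_df = df[~df['split'].isin(['validation', 'test'])]  # remaining rows form the training set
```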
28 | # Training on the Molecular Data
29 | The training.py script offers simple training on the molecular data for all benchmarks, evaluates the trained models, and saves the results for each model. To start the training:\
30 | 1- Select the data type (disease or target) within the code.\
31 | 2- Specify the featurizer (GraphConv or ECFP). GraphConv triggers training of a Graph Convolutional Neural Network, while ECFP trains a simple fully connected neural network.\
32 | 3- Specify the training data directory (default is at data/ where you unzip the main dataset).\
33 | 4- Start the training. A condensed sketch of this workflow is shown below.
34 |
35 | Training happens on a transformed training set to overcome imbalance, where positive data points have higher weights than the negative data points for the loss function. However, evaluation is done on untransformed validation and test sets, so that the transformed weights do not affect the metric calculations and so that missing values are not counted toward them.
36 |
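For readers who want to see the moving parts before opening training.py, the following is a condensed sketch of the ECFP variant, assembled from the same DeepChem calls used elsewhere in this repository (CSVLoader, CircularFingerprint, SpecifiedSplitter, BalancingTransformer). The data filename, the model choice, and the epoch count are illustrative assumptions; training.py remains the reference implementation.

```python
import deepchem as dc
import numpy as np
import pandas as pd

# Placeholder path -- point this at the unzipped molecular data CSV.
data_file = 'data/all_molecular_data.csv'
df = pd.read_csv(data_file, low_memory=False)
tasks = [c for c in df.columns if 'activity' in c]

# Respect the provided split labels through a SpecifiedSplitter.
valid_indices = df.index[df['split'] == 'validation'].tolist()
test_indices = df.index[df['split'] == 'test'].tolist()

featurizer = dc.feat.CircularFingerprint(size=1024, chiral=True)  # ECFP features
loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer)
dataset = loader.create_dataset([data_file], shard_size=8192)

splitter = dc.splits.SpecifiedSplitter(valid_indices=valid_indices, test_indices=test_indices)
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

# Reweight positive data points on the training set only,
# so the validation and test sets stay untransformed.
train_dataset = dc.trans.BalancingTransformer(dataset=train_dataset).transform(train_dataset)

model = dc.models.MultitaskClassifier(n_tasks=len(tasks), n_features=1024)
model.fit(train_dataset, nb_epoch=10)  # epoch count is illustrative

metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
print(model.evaluate(test_dataset, [metric]))
```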
37 | # Preprocessing Bioassay Descriptions (optional)
38 | Bioassay descriptions and summaries are downloaded from PubChem as text files for 9 different sources. The scripts follow this order:\
39 | 1- Preprocessing.py: Cleans the descriptions and extracts useful information from them using pre-defined rules.\
40 | 2- Clustering.py: Uses BioBERT to extract features from the cleaned descriptions and titles, then uses KMeans to cluster them. The cluster numbers are only used as recommendations during the tagging of each bioassay.\
41 | 3- Ner.py: Uses a model trained for disease entity recognition to find all disease-related words within the descriptions. These words do not have an effect on the tagging, since the detected words were too broad.\
42 | 4- Manual_tag.py: After a human expert highlights the disease- and target-related words in all descriptions in Word files, these highlighted words are read and used for finding disease and target tags for each bioassay.
43 |
44 | # Preprocessing Molecular Data (optional)
45 | After the assays are found and tagged, the molecular data for each assay is downloaded from PubChem using PubChem's bulk download interface. The scripts regarding this section follow this order:\
46 | 1- Molecular_data_cleaning.py: Canonicalizes the SMILES, cleans duplicate SMILES, and adds binary labels to the SMILES.\
47 | 2- Fingerprint_extraction.py: Extracts ECFP4 fingerprints from the data, then uses the Tanimoto coefficient to calculate the diversity within the dataset.\
48 | 3- Correlation.py: Finds linear correlations between the labels of all datasets (bioassays); this can be a starting step for drug repurposing.\
49 | 4- Splitting.py: Splits the molecular data into train, validation, and test splits using the molecular scaffolds.
50 |
--------------------------------------------------------------------------------
/clustering.py:
--------------------------------------------------------------------------------
1 | from biobert_embedding.embedding import BiobertEmbedding
2 | import pandas as pd
3 | import os
4 | import torch
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | from kneed import KneeLocator
8 | from sklearn.cluster import KMeans
9 | from sklearn.metrics import silhouette_score
10 | from sklearn.preprocessing import StandardScaler
11 | from sklearn.preprocessing import LabelEncoder
12 | from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
13 | # from docx import Document
14 | from sklearn.decomposition import PCA
15 | import seaborn as sns
16 |
17 | def intersection(lst1, lst2):
18 |     lst3 = [value for value in lst1 if value in lst2]
19 |     return lst3
20 |
21 | def sentence_vector(tokenized_text, biobert):
22 |     encoded_layers = biobert.eval_fwdprop_biobert(tokenized_text)
23 |
24 |     # `encoded_layers` has shape [12 x 1 x 22 x 768]
25 |     # `token_vecs` is a tensor with shape [22 x 768]
26 |     token_vecs = encoded_layers[11][0]
27 |
28 |     # Calculate the average of all 22 token vectors.
29 |     sentence_embedding = torch.mean(token_vecs, dim=0)
30 |     return sentence_embedding
31 |
32 |
33 | def clean_punctuation(sample):
34 |     sample = sample.replace('-', ' ')
35 |     sample = sample.replace('/', ' ')
36 |     sample = sample.replace('(', ' ')
37 |     sample = sample.replace(')', ' ')
38 |     sample = sample.replace('\'', ' ')
39 |     sample = sample.replace('.', ' ')
40 |     sample = sample.replace(':', ' ')
41 |     sample = sample.replace(',', ' ')
42 |     sample = sample.replace(';', ' ')
43 |     sample = sample.replace('_', ' ')
44 |     return sample
45 |
46 |
47 | def delete_punctuation(sample):
48 |     sample = sample.replace('-', '')
49 |     sample = sample.replace('/', '')
50 |     sample = sample.replace('(', '')
51 |     sample = sample.replace(')', '')
52 |     sample = sample.replace('\'', '')
53 |     sample = sample.replace('.', '')
54 |     sample = sample.replace(':', '')
55 |     sample = sample.replace(',', '')
56 |     sample = sample.replace(';', '')
57 |     sample = sample.replace('_', '')
58 |     return sample
59 |
60 |
61 | df = pd.read_csv('merged.csv')
62 | df = df[df['substance_num'] >= 100]
63 | df_tox = df[df['source'] == 'Tox21']
64 | df = df[df['substance_num'] >= 100000]
65 | df = df.merge(df_tox, how='outer')
66 | biobert = BiobertEmbedding()
67 |
68 | sources = np.array(df['source'])
69 | print(np.unique(sources))
70 |
71 |
72 | embs = []
73 | long_counter = 0
74 | descs = np.array(df['description'])
75 | titles = np.array(df['name'])
76 | # Extract embeddings for descriptions
77 | for desc in descs:
78 |     desc = biobert.process_text(desc.lower())
79 |     embs.append(sentence_vector(desc[:512], biobert))
80 |     if len(desc) > 512:
81 |         long_counter += 1
82 | print(long_counter, 'out of', len(descs), 'descriptions were truncated (Max 512 tokens).')
83 |
84 | embs_np = []
85 | for e in embs:
86 |     embs_np.append(e.numpy())
87 | embs_np = np.array(embs_np)
88 | print(embs_np.shape)
89 |
90 | # Extract embeddings for titles
91 | embs_title = []
92 | long_counter = 0
93 | for title in titles:
94 |     title = biobert.process_text(title.lower())
95 |     embs_title.append(sentence_vector(title[:512], biobert))
96 |     if len(title) > 512:
97 |         long_counter += 1
98 | print(long_counter, 'out of', len(titles), 'titles were truncated (Max 512 tokens).')
99 |
100 | embs_title_np = []
101 | for e in embs_title:
102 |     embs_title_np.append(e.numpy())
103 | embs_title_np = np.array(embs_title_np)
104 | print(embs_title_np.shape)
105 |
106 | # Concatenate embeddings for both titles and descriptions
107 | features = np.concatenate((embs_np, embs_title_np), axis=1)
108 | print(features.shape)
109 |
110 | # Standardize the features
111 | scaler = StandardScaler()
112 | scaled_features = scaler.fit_transform(features)
113 |
114 | kmeans_kwargs = {
115 |     "init": "k-means++",
116 |     "n_init": 10,
117 |     "max_iter": 1000,
118 |     "random_state": 42}
119 |
120 | # Cluster the text features and find optimum number of clusters
121 | sse = []
122 | total_k = 51
123 | for k in range(1, total_k):
124 |     kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
125 |     kmeans.fit(scaled_features)
126 |     sse.append(kmeans.inertia_)
127 |     print(k)
128 | plt.style.use("fivethirtyeight")
129 | plt.plot(range(1, total_k), sse)
130 | plt.xticks(range(1, total_k))
131 | plt.xlabel("Number of Clusters")
132 | plt.ylabel("SSE")
133 | plt.tight_layout()
134 | plt.savefig('knee.png', format='png', dpi=300)
135 | plt.show()
136 |
137 | kl = KneeLocator(range(1, total_k), sse, curve="convex", direction="decreasing")
138 | optimum_k = kl.elbow
139 | print(optimum_k)
140 | kmeans = 
KMeans(n_clusters=optimum_k, **kmeans_kwargs) 141 | kmeans.fit(scaled_features) 142 | 143 | # Perform PCA to be able to display the clusters 144 | pca = PCA(n_components=2, random_state=42) 145 | pca_features = pca.fit_transform(scaled_features) 146 | label_encoder = LabelEncoder() 147 | true_labels = label_encoder.fit_transform(sources) 148 | 149 | df['cluster'] = kmeans.labels_ 150 | for i in range(len(features[0])): 151 | df['feature'+str(i)] = features[:,i] 152 | 153 | # Add cluster information to the data 154 | df.to_csv('merged_features_clustered.csv', header=True, index=False) 155 | 156 | # Display the clusters 157 | pcadf = pd.DataFrame(pca_features,columns=["Principal Component 1", "Principal Component 2"]) 158 | pcadf["Cluster"] = kmeans.labels_ 159 | pcadf["Data Source"] = label_encoder.inverse_transform(true_labels) 160 | 161 | pcadf = pcadf.replace({'Data Source': {'Broad Institute': 'Broad Institute', 'Burnham Center for Chemical Genomics': 'Burnham Center', 162 | 'Emory University Molecular Libraries Screening Center': 'Emory University', 163 | 'ICCB-Longwood Screening Facility, Harvard Medical School': 'ICCB-Longwood', 164 | 'Johns Hopkins Ion Channel Center': 'Johns Hopkins', 'NMMLSC':'NMMLSC', 165 | 'National Center for Advancing Translational Sciences (NCATS)': 'NCATS', 166 | 'The Scripps Research Institute Molecular Screening Center': 'Scripps', 'Tox21': 'Tox21'}}) 167 | 168 | # plt.style.use("fivethirtyeight") 169 | plt.style.use("default") 170 | 171 | plt.figure(figsize=(10, 8)) 172 | # fix color wheel 173 | scat = sns.scatterplot("Principal Component 1", "Principal Component 2", s=100,data=pcadf, 174 | hue="Cluster",style="Data Source", palette=sns.color_palette("tab10",len(np.unique(df['cluster'])))) 175 | # scat.set_title("Clustering results") 176 | # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0) 177 | plt.xlabel("Principal Component 1", fontsize=20) 178 | plt.ylabel("Principal Component 2", fontsize=20) 179 | plt.tick_params(axis="y",direction="in") 180 | plt.tick_params(axis="x",direction="in") 181 | plt.xticks(fontsize=15) 182 | plt.yticks(fontsize=15) 183 | plt.legend(fontsize=10, ncol=2, bbox_to_anchor=(0.44, 0.62)) 184 | # plt.legend(ncol=2) 185 | plt.tight_layout() 186 | plt.savefig('clusters.png', format='png') 187 | plt.show() 188 | 189 | ############################################################### 190 | 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import math 5 | 6 | mol_df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified_pca.csv') 7 | activity_columns = [c for c in list(mol_df.columns) if 'activity' in c] 8 | 9 | corr = mol_df[activity_columns].corr() 10 | corr.to_csv('results/correlation_all.csv', header=True, index=True) 11 | 12 | corr = pd.read_csv('results/correlation_all.csv', index_col=0) 13 | map_df = pd.read_csv('aid_tag_mapping.csv') 14 | desc_df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted_threshold.csv') 15 | desc_info = {} 16 | for i in range(len(desc_df)): 17 | desc_info['activity_' + str(desc_df.iloc[i]['AID'])] = [desc_df.iloc[i]['name'], desc_df.iloc[i]['source'], desc_df.iloc[i]['target'], desc_df.iloc[i]['benchmark_tag']] 18 | all_categories = ['all'] + list(map_df.columns)[1:] 19 | for current_tag in 
all_categories:
20 |     # current_tag = 'toxicity'
21 |     print(current_tag)
22 |     if current_tag == 'all':
23 |         current_corr = corr
24 |     else:
25 |         current_aids = np.array(list(map_df[map_df[current_tag] == 1]['AID']))
26 |         current_corr = corr[current_aids]
27 |         current_corr = current_corr[current_corr.index.isin(current_aids)]
28 |     current_corr.to_csv('results/correlation_'+current_tag+'.csv', header=True, index=True)
29 |     if current_tag in ['all','cancer']:
30 |         fontsize = 1
31 |     else:
32 |         fontsize = 4
33 |     current_columns = [c.lstrip('activity_') for c in list(current_corr.columns)]
34 |     current_corr = current_corr.reindex(list(current_corr.columns))
35 |     # current_corr.to_csv('toxicity_correlation_matrix.csv', index=True)
36 |     # current_corr = np.array(current_corr)
37 |     # dummy = []
38 |     # for i in range(len(current_corr)):
39 |     #     for j in range(len(current_corr)):
40 |     #         dummy.append([i,j,current_corr[i,j], current_columns[i], current_columns[j]])
41 |     # dummy = np.array(dummy)
42 |     # dummy_df = pd.DataFrame(dummy, columns=['x', 'y', 'correlation', 'AID_x', 'AID_y'])
43 |     # dummy_df.to_csv('toxicity_correlation_matrix.csv', index=False)
44 |     fig = plt.figure(figsize=(8,4), dpi=300)
45 |     ax = fig.add_subplot(111)
46 |     cax = ax.matshow(current_corr,cmap='coolwarm', vmin=-1, vmax=1)
47 |     fig.colorbar(cax)
48 |     ticks = np.arange(0,len(current_columns),1)
49 |     ax.set_xticks(ticks)
50 |     plt.xticks(rotation=90)
51 |     ax.set_yticks(ticks)
52 |     # plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
53 |     ax.set_xticklabels(current_columns, fontsize=fontsize)
54 |     ax.set_yticklabels(current_columns, fontsize=fontsize)
55 |     plt.xlabel('Bioassay AID', fontsize=fontsize+7)
56 |     ax.xaxis.set_label_position('top')
57 |     plt.ylabel('Bioassay AID', fontsize=fontsize+7)
58 |     plt.tight_layout()
59 |     plt.savefig('results/correlation_'+current_tag+'.png', format='png', dpi=300)
60 |     plt.show()
61 |     plt.close()
62 |
63 | corr_array = np.array(corr)
64 | corr_array_abs = np.absolute(np.array(corr))
65 | all_aids = list(corr.columns)
66 | threshold = 0.5
67 | interesting_aids = []
68 | interesting_corr = []
69 | for i in range(len(corr_array)):
70 |     first_aid = all_aids[i]
71 |     first_source = desc_info[first_aid][1]
72 |     for j in range(len(corr_array)):
73 |         if i == j:
74 |             continue
75 |         second_aid = all_aids[j]
76 |         second_source = desc_info[second_aid][1]
77 |         if first_source == 'Tox21' or second_source == 'Tox21':
78 |             continue
79 |         current_corr = corr_array[i, j]
80 |         if abs(current_corr) >= threshold:
81 |             if abs(int(first_aid.lstrip('activity_')) - int(second_aid.lstrip('activity_'))) > 5:
82 |                 if first_source == second_source:
83 |                     source_overlap = 1
84 |                 else:
85 |                     source_overlap = 0
86 |                 if first_aid not in interesting_aids:
87 |                     interesting_aids.append(first_aid)
88 |                 if second_aid not in interesting_aids:
89 |                     interesting_aids.append(second_aid)
90 |                 interesting_corr.append([first_aid, second_aid, current_corr, source_overlap])
91 | print(len(interesting_aids))
92 | interesting_corr = pd.DataFrame(data=interesting_corr, columns=['first_AID', 'second_AID', 'corr', 'same_source'])
93 | interesting_corr = interesting_corr.sort_values(by=['corr'], ascending=False)
94 | added_info = []
95 | for i in range(len(interesting_corr)):
96 |     added_info.append(desc_info[interesting_corr.iloc[i]['first_AID']] + desc_info[interesting_corr.iloc[i]['second_AID']])
97 | interesting_corr[['first_name', 'first_source', 'first_target', 'first_benchmark_tag', 'second_name', 'second_source', 'second_target', 
'second_benchmark_tag']] = np.array(added_info) 98 | interesting_corr_unique = interesting_corr.iloc[list(np.arange(0,len(interesting_corr), 2))] 99 | 100 | 101 | overlap = [] 102 | active_overlap = [] 103 | sizes = [] 104 | actives = [] 105 | for i in range(len(interesting_corr_unique)): 106 | first_aid = interesting_corr_unique.iloc[i]['first_AID'] 107 | second_aid = interesting_corr_unique.iloc[i]['second_AID'] 108 | current_df = mol_df[[first_aid, second_aid]].dropna(how='any') 109 | overlap.append(len(current_df)) 110 | current_df_1 = current_df[current_df[first_aid] == 1] 111 | current_df_2 = current_df[current_df[second_aid] == 1] 112 | actives.append([len(current_df_1), len(current_df_2)]) 113 | current_df = current_df[current_df[first_aid] == 1] 114 | current_df = current_df[current_df[second_aid] == 1] 115 | active_overlap.append(len(current_df)) 116 | sizes.append([len(mol_df[first_aid].dropna(how='any')), len(mol_df[second_aid].dropna(how='any'))]) 117 | 118 | 119 | interesting_corr_unique.insert(loc=3, column='overlap', value=overlap) 120 | interesting_corr_unique[['first_size', 'second_size']] = sizes 121 | interesting_corr_unique[['first_active', 'second_active']] = actives 122 | 123 | interesting_corr_unique.insert(loc=4, column='overlap_active', value=active_overlap) 124 | expert_df = pd.read_csv('results/correlation_interesting_expert.csv') 125 | previous = [] 126 | for i in range(len(expert_df)): 127 | first_aid = expert_df.iloc[i]['first_AID'] 128 | second_aid = expert_df.iloc[i]['second_AID'] 129 | previous.append([first_aid, second_aid]) 130 | previous.append([second_aid, first_aid]) 131 | previous_checked = [] 132 | for i in range(len(interesting_corr_unique)): 133 | first_aid = interesting_corr_unique.iloc[i]['first_AID'] 134 | second_aid = interesting_corr_unique.iloc[i]['second_AID'] 135 | if [first_aid, second_aid] in previous: 136 | previous_checked.append(1) 137 | else: 138 | previous_checked.append(0) 139 | interesting_corr_unique.insert(loc=3, column='previously_checked', value=previous_checked) 140 | 141 | interesting_corr_unique.to_csv('results/correlation_interesting.csv', header=True, index=False) 142 | # interesting_corr_unique.to_csv('results/correlation_interesting_diff_source.csv', header=True, index=False) 143 | 144 | expert_df = pd.read_csv('results/correlation_interesting_expert.csv') 145 | 146 | overlap = [] 147 | active_overlap = [] 148 | sizes = [] 149 | actives = [] 150 | for i in range(len(expert_df)): 151 | first_aid = expert_df.iloc[i]['first_AID'] 152 | second_aid = expert_df.iloc[i]['second_AID'] 153 | current_df = mol_df[[first_aid, second_aid]].dropna(how='any') 154 | overlap.append(len(current_df)) 155 | current_df_1 = current_df[current_df[first_aid] == 1] 156 | current_df_2 = current_df[current_df[second_aid] == 1] 157 | actives.append([len(current_df_1), len(current_df_2)]) 158 | current_df = current_df[current_df[first_aid] == 1] 159 | current_df = current_df[current_df[second_aid] == 1] 160 | active_overlap.append(len(current_df)) 161 | sizes.append([len(mol_df[first_aid].dropna(how='any')), len(mol_df[second_aid].dropna(how='any'))]) 162 | # a = mol_df['activity_1259404'] 163 | # len(a.dropna(how='any')) 164 | 165 | expert_df.insert(loc=3, column='overlap', value=overlap) 166 | expert_df[['first_size', 'second_size']] = sizes 167 | expert_df[['first_active', 'second_active']] = actives 168 | 169 | expert_df.insert(loc=4, column='overlap_active', value=active_overlap) 170 | 171 | # 
interesting_corr_unique.to_csv('results/correlation_interesting.csv', header=True, index=False) 172 | expert_df.to_csv('results/correlation_interesting_expert_overlap.csv', header=True, index=False) 173 | 174 | current_corr = corr[interesting_aids] 175 | current_corr = current_corr[current_corr.index.isin(interesting_aids)] 176 | current_columns = [c.lstrip('activity_') for c in list(current_corr.columns)] 177 | current_corr = current_corr.reindex(list(current_corr.columns)) 178 | fig = plt.figure(figsize=(8,4), dpi=300) 179 | ax = fig.add_subplot(111) 180 | cax = ax.matshow(current_corr,cmap='coolwarm', vmin=-1, vmax=1) 181 | fig.colorbar(cax) 182 | ticks = np.arange(0,len(current_columns),1) 183 | ax.set_xticks(ticks) 184 | plt.xticks(rotation=90) 185 | ax.set_yticks(ticks) 186 | ax.set_xticklabels(current_columns, fontsize=5) 187 | ax.set_yticklabels(current_columns, fontsize=5) 188 | plt.tight_layout() 189 | # plt.savefig('results/correlation_interesting.png', format='png', dpi=300) 190 | plt.savefig('results/correlation_interesting_diff_source.png', format='png', dpi=300) 191 | plt.show() 192 | plt.close() 193 | 194 | 195 | final_df = pd.read_csv('results/correlation_interesting_final.csv') 196 | first_aids = list(final_df['first']) 197 | second_aids = list(final_df['second']) 198 | final_aids = [] 199 | for i in range(len(first_aids)): 200 | if first_aids[i] not in final_aids: 201 | final_aids.append(first_aids[i]) 202 | if second_aids[i] not in final_aids: 203 | final_aids.append(second_aids[i]) 204 | 205 | 206 | current_corr = corr[final_aids] 207 | current_corr = current_corr[current_corr.index.isin(final_aids)] 208 | current_columns = [c.lstrip('activity_') for c in list(current_corr.columns)] 209 | current_corr = current_corr.reindex(list(current_corr.columns)) 210 | # 211 | # current_corr = np.array(current_corr) 212 | # dummy = [] 213 | # for i in range(len(current_corr)): 214 | # for j in range(len(current_corr)): 215 | # dummy.append([i,j,current_corr[i,j], current_columns[i], current_columns[j]]) 216 | # dummy = np.array(dummy) 217 | # dummy_df = pd.DataFrame(dummy, columns=['x', 'y', 'correlation', 'AID_x', 'AID_y']) 218 | # dummy_df.to_csv('interesting_correlation_matrix.csv', index=False) 219 | 220 | fig = plt.figure(figsize=(8,4), dpi=300) 221 | ax = fig.add_subplot(111) 222 | cax = ax.matshow(current_corr,cmap='coolwarm', vmin=-1, vmax=1) 223 | fig.colorbar(cax) 224 | ticks = np.arange(0,len(current_columns),1) 225 | ax.set_xticks(ticks) 226 | plt.xticks(rotation=90) 227 | ax.set_yticks(ticks) 228 | ax.set_xticklabels(current_columns, fontsize=5) 229 | ax.set_yticklabels(current_columns, fontsize=5) 230 | plt.xlabel('Bioassay AID', fontsize=5 + 7) 231 | ax.xaxis.set_label_position('top') 232 | plt.ylabel('Bioassay AID', fontsize=5 + 7) 233 | plt.tight_layout() 234 | # plt.savefig('results/correlation_interesting.png', format='png', dpi=300) 235 | plt.savefig('results/correlation_interesting_final.png', format='png', dpi=300) 236 | plt.show() 237 | plt.close() 238 | 239 | activity_columns = [c for c in list(mol_df.columns) if 'activity' in c] 240 | all_sums = mol_df[activity_columns].sum(axis=1) 241 | all_counts = mol_df[activity_columns].count(axis=1) 242 | plt.hist(all_sums, bins=range(10,141)) 243 | plt.hist(all_counts, bins='auto') 244 | count_df = pd.DataFrame(data=all_sums, columns=['active']) 245 | count_df['total'] = all_counts 246 | count_df['activity_percentage'] = 100 * count_df['active']/count_df['total'] 247 | 248 | plt.scatter(count_df['total'], 
count_df['active']) 249 | 250 | from scipy.stats import gaussian_kde 251 | x1 = np.array(count_df['total']) 252 | y1 = np.array(count_df['active']) 253 | xy1 = np.vstack([x1, y1]) 254 | z1 = gaussian_kde(xy1)(xy1) 255 | 256 | len(z1) 257 | 258 | efficacy_df = pd.DataFrame(data=x1, columns=["Number of Screens"]) 259 | efficacy_df["Number of Active Results"] = y1 260 | efficacy_df["Density"] = z1 261 | efficacy_df.to_csv("gaussian_efficacy.csv", index=False) 262 | smaller_efficacy_df = efficacy_df.sample(n=200000, random_state=42) 263 | smaller_efficacy_df.to_csv("gaussian_efficacy_small.csv", index=False) 264 | 265 | x1 = np.array(count_df['total']) 266 | y1 = np.array(count_df['activity_percentage']) 267 | xy1 = np.vstack([x1, y1]) 268 | z1 = gaussian_kde(xy1)(xy1) 269 | 270 | efficacy_label = [] 271 | for i in range(len(count_df)): 272 | current_data = count_df.iloc[i] 273 | current_total = current_data['total'] 274 | current_active = current_data['active'] 275 | current_percent = current_data['activity_percentage'] 276 | if current_percent <= 2: 277 | efficacy_label.append('Wasted') 278 | elif current_active >= 50: 279 | efficacy_label.append('Possible Toxic') 280 | elif current_total > 100: 281 | efficacy_label.append('Familiar Molecules') 282 | else: 283 | efficacy_label.append('New Molecules') 284 | 285 | toxic_line = [] 286 | for i in range(50,600,10): 287 | toxic_line.append([int(i), int(5000/i)]) 288 | import pandas as pd 289 | toxic_line_df = pd.DataFrame(data=toxic_line, columns=['x','y']) 290 | toxic_line_df.to_csv("toxic_line.csv", header=True, index=False) 291 | 292 | efficacy_df = pd.DataFrame(data=x1, columns=["Number of Screens"]) 293 | efficacy_df["Active Percentage"] = y1 294 | efficacy_df["Density"] = z1 295 | efficacy_df["Efficacy Type"] = efficacy_label 296 | efficacy_df.to_csv("gaussian_efficacy_percentage.csv", index=False) 297 | smaller_efficacy_df = efficacy_df.sample(n=300000, random_state=42) 298 | smaller_efficacy_df.to_csv("gaussian_efficacy_percentage_small.csv", index=False) 299 | 300 | all_sources = np.array(desc_df['source']) 301 | all_aids = np.array(desc_df['AID']) 302 | # aid_source = {'activity_' + str(s):[] for s in all_aids} 303 | aid_source = {} 304 | for i in range(len(all_aids)): 305 | aid_source['activity_' + str(all_aids[i])] = all_sources[i] 306 | 307 | # for s in aid_source: 308 | # print(len(aid_source[s])) 309 | 310 | source_index = {} 311 | unique_sources = np.unique(all_sources) 312 | for s in range(len(unique_sources)): 313 | source_index[unique_sources[s]] = s 314 | 315 | mol_source = [] 316 | for m in range(len(mol_df)): 317 | current_data = mol_df.iloc[m][activity_columns] 318 | current_aids = list(current_data[pd.notna(current_data)].index) 319 | current_sources = np.unique([source_index[aid_source[aid]] for aid in current_aids]) 320 | mol_source.append(current_sources) 321 | if m % 100000 == 0: 322 | print(m) 323 | 324 | source_connection = np.zeros((len(unique_sources), len(unique_sources)), dtype=int) 325 | counter = 0 326 | for s in mol_source: 327 | if len(s) == 1: 328 | source_connection[int(s[0]), int(s[0])] += 1 329 | else: 330 | for i in range(len(s)): 331 | if i == len(s) - 1: 332 | break 333 | for j in range(i + 1, len(s)): 334 | source_connection[int(s[i]), int(s[j])] += 1 335 | source_connection[int(s[j]), int(s[i])] += 1 336 | if counter % 10000 == 0: 337 | print(counter) 338 | counter += 1 339 | 340 | source_con_df = pd.DataFrame(data=source_connection, columns=unique_sources) 341 | source_con_df.insert(loc=0, 
column='source', value=unique_sources) 342 | source_con_df.to_csv('source_molecular_overlap.csv', header=True, index=False) -------------------------------------------------------------------------------- /fingerprint_extraction.py: -------------------------------------------------------------------------------- 1 | import deepchem as dc 2 | import numpy as np 3 | import pandas as pd 4 | import logging 5 | import os 6 | import matplotlib.pyplot as plt 7 | input_data = 'merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv' 8 | input_columns = list(pd.read_csv(input_data).columns) 9 | input_tasks = list(np.array(input_columns)[[True if 'activity' in c else False for c in input_columns]]) 10 | print(input_tasks) 11 | split = 'specified' 12 | featurizer = 'ECFP' 13 | 14 | 15 | data_dir = input_data 16 | 17 | # assign data and tasks 18 | dataset_file = data_dir 19 | tasks = input_tasks 20 | valid_indices, test_indices = None, None 21 | if split == 'specified': 22 | dummy_df = pd.read_csv(data_dir, low_memory=False) 23 | valid_indices = dummy_df.index[dummy_df['split'] == 'validation'].tolist() 24 | test_indices = dummy_df.index[dummy_df['split'] == 'test'].tolist() 25 | print("About to load the dataset.") 26 | 27 | # create featurizer, loader, transformers, and splitter 28 | if featurizer == 'ECFP': 29 | featurizer = dc.feat.CircularFingerprint(size=1024, chiral=True) 30 | elif featurizer == 'GraphConv': 31 | featurizer = dc.feat.ConvMolFeaturizer(use_chirality=True) 32 | loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer) 33 | splitters = { 34 | 'scaffold': dc.splits.ScaffoldSplitter(), 35 | 'specified': dc.splits.SpecifiedSplitter(valid_indices=valid_indices, test_indices=test_indices) 36 | } 37 | splitter = splitters[split] 38 | 39 | 40 | if not os.path.exists(dataset_file): 41 | print("Dataset not found") 42 | 43 | print("About to featurize the dataset.") 44 | dataset = loader.create_dataset([dataset_file], shard_size=8192) 45 | 46 | print("About to transform data") 47 | transformers = [dc.trans.BalancingTransformer(dataset=dataset)] 48 | for transformer in transformers: 49 | dataset = transformer.transform(dataset) 50 | train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset) 51 | # dc.utils.data_utils.save_dataset_to_disk(data_save_dir, train=train_dataset, valid=valid_dataset, test=test_dataset, transformers=transformers) 52 | 53 | # Extract the smiles for each split 54 | train_smiles = np.array(train_dataset.ids) 55 | valid_smiles = np.array(valid_dataset.ids) 56 | test_smiles = np.array(test_dataset.ids) 57 | 58 | train_features = np.array(train_dataset.X) 59 | valid_features = np.array(valid_dataset.X) 60 | test_features = np.array(test_dataset.X) 61 | 62 | all_features = np.concatenate((train_features, valid_features, test_features)) 63 | print(all_features.shape) 64 | # Save the smiles back into the CSV file 65 | all_smiles = np.array(list(train_smiles) + list(valid_smiles) + list(test_smiles)) 66 | labels_smiles = np.array(['train'] * len(train_smiles) + ['validation'] * len(valid_smiles) + ['test'] * len(test_smiles)) 67 | smiles_df = pd.DataFrame(data=all_features, columns=['feature_' + str(i) for i in range(1024)]) 68 | smiles_df.insert(loc=0,column='smiles', value=all_smiles) 69 | smiles_df.insert(loc=1,column='split', value=labels_smiles) 70 | smiles_df.to_csv('smiles_ecfp.csv', header=True, index=False) 71 | 72 | 73 | # Read the SMILES again for Tanimoto Coefficient calculations 74 | smiles_df = 
pd.read_csv('smiles_ecfp.csv') 75 | feature_columns = [] 76 | for c in smiles_df.columns: 77 | if 'feature' in c: 78 | feature_columns.append(c) 79 | 80 | 81 | def largest_tanimoto_similarity(f1_bool, f2_bool): 82 | # f1 is one boolean numpy array, containing the molecular fingerprint for one molecule 83 | # f2 is (N-1)*M boolean numpy matrix, containing molecular fingerprints for all molecules except f1 84 | # Returns the largest Tanimoto Coefficient between f1 and the rest of the fingerprint (most similar) 85 | f1_bool = np.tile(f1_bool, (len(f2_bool), 1)) 86 | # Overlap between "ones" from f1 and "ones" from the rest of the dataset 87 | overlap = np.sum(np.logical_and(f1_bool, f2_bool), axis=1) 88 | # Union between "ones" from f1 and "ones" from the rest of the dataset 89 | union = np.sum(np.logical_or(f1_bool, f2_bool), axis=1) 90 | return np.max(overlap/union) 91 | 92 | 93 | tanimoto_scores = [] 94 | fingerprint_array = np.array(smiles_df[feature_columns].sample(n=200000, random_state=42), dtype=bool) 95 | # fingerprint_array = np.array(smiles_df[feature_columns], dtype=bool) 96 | for i in range(len(fingerprint_array)): 97 | indices = np.arange(len(fingerprint_array)) 98 | dummy_score = largest_tanimoto_similarity(fingerprint_array[i, :], fingerprint_array[indices != i, :]) 99 | tanimoto_scores.append(dummy_score) 100 | print(i, dummy_score) 101 | 102 | np.save('results/tanimoto_scores', np.array(tanimoto_scores)) 103 | 104 | 105 | fontsize = 13 106 | fig, ax = plt.subplots(figsize=(5, 4), dpi=300) 107 | # plot the cumulative histogram 108 | n, bins, patches = ax.hist(tanimoto_scores, 200, density=True, histtype='step', 109 | cumulative=True) 110 | # Find the percentage where tanimoto score is 0.5 111 | index_tanimoto_7 = 1 - np.sum(np.array(tanimoto_scores) > 0.7) / len(tanimoto_scores) 112 | index_tanimoto_5 = 1 - np.sum(np.array(tanimoto_scores) > 0.5) / len(tanimoto_scores) 113 | 114 | # tidy up the figure 115 | ax.grid(True) 116 | plt.yticks([0,0.2,0.4,0.6,0.8,1], [0,20,40,60,80,100], fontsize=fontsize) 117 | plt.xticks(fontsize=fontsize) 118 | ax.set_ylabel('Cumulative % of Molecules', labelpad=0, fontsize=fontsize) 119 | ax.set_xlabel('Largest Tanimoto Coefficient', labelpad=0, fontsize=fontsize) 120 | 121 | # add one point in the 50 percentile 122 | plt.annotate('(0.7,'+str(round(index_tanimoto_7*100, 2))+')', # this is the text 123 | (0.70, index_tanimoto_7), # these are the coordinates to position the label 124 | textcoords="offset points", # how to position the text 125 | xytext=(0, 3), # distance from text to points (x,y) 126 | ha='right', 127 | fontsize=fontsize) # horizontal alignment can be left, right or center 128 | ax.scatter([0.7], [index_tanimoto_7], c='black', s=7, zorder=3) 129 | plt.annotate('(0.5,'+str(round(index_tanimoto_5*100, 2))+')', # this is the text 130 | (0.50, index_tanimoto_5), # these are the coordinates to position the label 131 | textcoords="offset points", # how to position the text 132 | xytext=(0, 3), # distance from text to points (x,y) 133 | ha='right', 134 | fontsize=fontsize) # horizontal alignment can be left, right or center 135 | ax.scatter([0.5], [index_tanimoto_5], c='black', s=7, zorder=3) 136 | plt.tight_layout() 137 | plt.savefig('results/cumulative_tanimoto.png', format='png', dpi=300) 138 | plt.show() 139 | -------------------------------------------------------------------------------- /manual_tagging.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | 
import torch
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | from sklearn.cluster import KMeans
7 | from sklearn.metrics import silhouette_score
8 | from sklearn.preprocessing import StandardScaler
9 | from sklearn.preprocessing import LabelEncoder
10 | from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
11 | from docx import Document  # needed below to read the expert-labeled .docx files
12 | from sklearn.decomposition import PCA
13 | import seaborn as sns
14 | from fuzzywuzzy import fuzz
15 |
16 |
17 | def intersection(lst1, lst2):
18 |     lst3 = [value for value in lst1 if value in lst2]
19 |     return lst3
20 |
21 |
22 | def clean_punctuation(sample):
23 |     sample = sample.replace('-', ' ')
24 |     sample = sample.replace('/', ' ')
25 |     sample = sample.replace('(', ' ')
26 |     sample = sample.replace(')', ' ')
27 |     sample = sample.replace('\'', ' ')
28 |     sample = sample.replace('.', ' ')
29 |     sample = sample.replace(':', ' ')
30 |     sample = sample.replace(',', ' ')
31 |     sample = sample.replace(';', ' ')
32 |     sample = sample.replace('_', ' ')
33 |     return sample
34 |
35 |
36 | def delete_punctuation(sample):
37 |     sample = sample.replace('-', '')
38 |     sample = sample.replace('/', '')
39 |     sample = sample.replace('(', '')
40 |     sample = sample.replace(')', '')
41 |     sample = sample.replace('\'', '')
42 |     sample = sample.replace('.', '')
43 |     sample = sample.replace(':', '')
44 |     sample = sample.replace(',', '')
45 |     sample = sample.replace(';', '')
46 |     sample = sample.replace('_', '')
47 |     return sample
48 |
49 |
50 | df = pd.read_csv('merged_features_clustered_ner_cleaned.csv')
51 | all_tagged_dfs = []
52 | disease_clusters = []
53 | target_clusters = []
54 | total_unhighlighted = []
55 | total_merged = []
56 | for cluster_number in range(10):
57 |     cluster_df = df[df['cluster'] == cluster_number]
58 |     cluster_df.reset_index(inplace=True)
59 |     descs = np.array(cluster_df['description'])
60 |     # print(len(descs))
61 |
62 |     document = Document('labeled/'+str(cluster_number)+'.docx')
63 |     words = document._element.xpath('//w:r')
64 |
65 |     all_words = []
66 |     all_props = []
67 |     for w in words:
68 |         dummy = w.xml
69 |         dummy = dummy.replace('<w:t xml:space="preserve">', '<w:t>')
70 |         if "<w:t>" not in dummy:
71 |             continue
72 |         current_word = dummy.split("<w:t>")[1].split("</w:t>")[0].lower().strip()
73 |         if len(current_word) == 0:
74 |             continue
75 |         all_words.append(current_word)
76 |         if "\"yellow\"" in dummy:
77 |             all_props.append('disease')
78 |         elif "FF0000" in dummy:
79 |             all_props.append('target')
80 |         else:
81 |             all_props.append('normal')
82 |     # print(len(all_props), len(all_words))
83 |
84 |     text = document._element.xml
85 |     text = text.replace('<w:t xml:space="preserve">', '<w:t>')
86 |     words = text.split('<w:t>')
87 |     lines = []
88 |     all_words2 = []
89 |     all_paragraph = []
90 |     paragraph_counter = 0
91 |     high_light_counter = {}
92 |     for w in range(len(words)):
93 |         if "</w:t>" not in words[w]:
94 |             continue
95 |         current_word = words[w].split("</w:t>")[0].lower().strip()
96 |         if len(current_word) != 0:
97 |             all_words2.append(current_word)
98 |             all_paragraph.append(paragraph_counter)
99 |             if "\"yellow\"" in words[w]:
100 |                 high_light_counter[paragraph_counter] = 1
101 |         if 'paraId' in words[w]:
102 |             paragraph_counter += 1
103 |         # if "by the brm gene" in words[w].lower():
104 |         #     print(words[w], w)
105 |     # print(len(all_words2))
106 |     word_df = pd.DataFrame(data={'text': all_words,'text2': all_words2, 'paragraph': all_paragraph, 'properties': all_props})
107 |     merged = []
108 |     current_counter = 0
109 |     dummy = ''
110 |     dummy_disease = []
111 |     dummy_target = []
112 |     all_disease = []
113 |     all_target = []
114 | 
previous_i = -1 115 | for i in range(len(all_words2)): 116 | if all_paragraph[i] != current_counter: 117 | current_counter = all_paragraph[i+1] 118 | merged.append(dummy) 119 | all_disease.append(dummy_disease) 120 | all_target.append(dummy_target) 121 | dummy = '' 122 | dummy_disease = [] 123 | dummy_target = [] 124 | if all_props[i] == 'disease': 125 | if '(' in all_words2[i] and not all_words2[i].startswith('('): 126 | current_words = all_words2[i].split('(') 127 | for c in current_words: 128 | current_word = delete_punctuation(c) 129 | dummy_disease.append(current_word.strip().lower()) 130 | else: 131 | current_word = delete_punctuation(all_words2[i]) 132 | dummy_disease.append(current_word.strip().lower()) 133 | elif all_props[i] == 'target': 134 | # if '(' in all_words2[i] and not all_words2[i].startswith('('): 135 | # current_words = all_words2[i].split('(') 136 | # for c in current_words: 137 | # current_word = delete_punctuation(c) 138 | # dummy_target.append(current_word.strip().lower()) 139 | # else: 140 | # current_word = delete_punctuation(all_words2[i]) 141 | # dummy_target.append(current_word.strip().lower()) 142 | # current_word = delete_punctuation(all_words2[i]) 143 | # dummy_target.append(current_word.strip().lower()) 144 | if previous_i == i - 1: 145 | # print(all_words2[i-1].strip().lower(), all_words2[i].strip().lower()) 146 | dummy_target[-1] = dummy_target[-1] + all_words2[i].strip().lower() 147 | previous_i = i 148 | else: 149 | dummy_target.append(all_words2[i].strip().lower()) 150 | previous_i = i 151 | dummy = dummy + all_words2[i] + ' ' 152 | merged.append(dummy) 153 | all_disease.append(dummy_disease) 154 | all_target.append(dummy_target) 155 | print(cluster_number) 156 | print(len(all_target), len(all_disease), len(merged)) 157 | print(len(cluster_df['description'].drop_duplicates())) 158 | print(len(high_light_counter)) 159 | total_unhighlighted.append(len(merged) - len(high_light_counter)) 160 | total_merged.append(len(merged)) 161 | ordered_merged = [] 162 | ordered_disease = [] 163 | ordered_target = [] 164 | match_scores = [] 165 | mapping = {} 166 | for d in range(len(descs)): 167 | dummy = '' 168 | dummy_disease = [] 169 | dummy_target = [] 170 | overlap_counter = 0 171 | for m in range(len(merged)): 172 | current_counter = fuzz.token_set_ratio(merged[m], descs[d].lower()) 173 | # current_counter = len(np.unique(intersection(np.unique(merged[m].split()), np.unique(descs[d].lower().split())))) 174 | if current_counter > overlap_counter: 175 | overlap_counter = current_counter 176 | dummy = merged[m] 177 | dummy_disease = all_disease[m] 178 | dummy_target = all_target[m] 179 | mapping[d] = m 180 | # match_scores.append(overlap_counter/len(descs[d].split())) 181 | match_scores.append(overlap_counter) 182 | # print(d, match_scores[-1]) 183 | ordered_merged.append(dummy) 184 | ordered_disease.append(dummy_disease) 185 | ordered_target.append(dummy_target) 186 | 187 | ordered_disease_str = [] 188 | ordered_target_str = [] 189 | for i in range(len(ordered_disease)): 190 | dummy = '' 191 | for j in ordered_disease[i]: 192 | dummy += j + ', ' 193 | ordered_disease_str.append(dummy.rstrip(', ')) 194 | dummy = '' 195 | for j in ordered_target[i]: 196 | dummy += j + ', ' 197 | ordered_target_str.append(dummy.rstrip(', ')) 198 | 199 | # new_df = pd.DataFrame(data={'description': descs, 'new_description': ordered_merged, 'disease':ordered_disease_str, 'target': ordered_target_str}) 200 | cluster_df['recovered_description'] = ordered_merged 201 | 
cluster_df['recovery_score'] = match_scores 202 | cluster_df['disease_tags_ground_truth'] = ordered_disease_str 203 | cluster_df['target_tags_ground_truth'] = ordered_target_str 204 | all_tagged_dfs.append(cluster_df) 205 | disease_clusters.append(ordered_disease) 206 | target_clusters.append(ordered_target) 207 | merged_df = all_tagged_dfs[0] 208 | for i in range(1, len(all_tagged_dfs)): 209 | merged_df = merged_df.merge(all_tagged_dfs[i], how='outer') 210 | 211 | merged_df.to_csv('merged_features_clustered_ner_cleaned_extracted.csv', header=True, index=False) 212 | 213 | 214 | 215 | disease_clusters_merged = [] 216 | disease_merged = [] 217 | for i in disease_clusters: 218 | dummy = [] 219 | for j in i: 220 | for k in j: 221 | dummy.append(k) 222 | disease_clusters_merged.append(dummy) 223 | disease_merged.extend(dummy) 224 | 225 | disease_counter = {} 226 | for i in disease_merged: 227 | if i in disease_counter: 228 | disease_counter[i] += 1 229 | else: 230 | disease_counter[i] = 1 231 | 232 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 233 | x = list(disease_counter.keys()) 234 | y = list(disease_counter.values()) 235 | x.reverse() 236 | y.reverse() 237 | disease_counter_df = pd.DataFrame(data={'all_disease': x, 'count_all_disease': y}) 238 | 239 | all_x = [] 240 | all_y = [] 241 | for cluster_number in range(10): 242 | disease_counter = {} 243 | for i in disease_clusters_merged[cluster_number]: 244 | if i in disease_counter: 245 | disease_counter[i] += 1 246 | else: 247 | disease_counter[i] = 1 248 | 249 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 250 | x = list(disease_counter.keys()) 251 | y = list(disease_counter.values()) 252 | x.reverse() 253 | y.reverse() 254 | disease_counter_df_dummy = pd.DataFrame(data={str(cluster_number)+'_disease': x, str(cluster_number)+'_count': y}) 255 | disease_counter_df = pd.concat([disease_counter_df,disease_counter_df_dummy], axis=1) 256 | 257 | disease_counter_df.to_csv('count_all_diseases.csv', index=False, header=True) 258 | 259 | target_clusters_merged = [] 260 | target_merged = [] 261 | for i in target_clusters: 262 | dummy = [] 263 | for j in i: 264 | for k in j: 265 | dummy.append(k) 266 | target_clusters_merged.append(dummy) 267 | target_merged.extend(dummy) 268 | 269 | target_counter = {} 270 | for i in target_merged: 271 | if i in target_counter: 272 | target_counter[i] += 1 273 | else: 274 | target_counter[i] = 1 275 | 276 | target_counter = dict(sorted(target_counter.items(), key=lambda item: item[1])) 277 | x = list(target_counter.keys()) 278 | y = list(target_counter.values()) 279 | x.reverse() 280 | y.reverse() 281 | target_counter_df = pd.DataFrame(data={'all_target': x, 'count_all_target': y}) 282 | 283 | all_x = [] 284 | all_y = [] 285 | for cluster_number in range(10): 286 | target_counter = {} 287 | for i in target_clusters_merged[cluster_number]: 288 | if i in target_counter: 289 | target_counter[i] += 1 290 | else: 291 | target_counter[i] = 1 292 | 293 | target_counter = dict(sorted(target_counter.items(), key=lambda item: item[1])) 294 | x = list(target_counter.keys()) 295 | y = list(target_counter.values()) 296 | x.reverse() 297 | y.reverse() 298 | target_counter_df_dummy = pd.DataFrame(data={str(cluster_number)+'_target': x, str(cluster_number)+'_count': y}) 299 | # target_counter_df[str(cluster_number)+'target'] = x 300 | # target_counter_df[str(cluster_number)+'count'] = y 301 | target_counter_df = 
pd.concat([target_counter_df,target_counter_df_dummy], axis=1) 302 | 303 | target_counter_df.to_csv('count_all_targets.csv', index=False, header=True) 304 | 305 | ner_results = np.array(merged_df['ner_tags']) 306 | ner_results_cleaned = [] 307 | for n in ner_results: 308 | ner_results_cleaned.append(np.unique(n.strip('[]').replace('\'', '').split(','))) 309 | ner_results_str = [] 310 | for n in ner_results_cleaned: 311 | dummy_tag = '' 312 | for t in n: 313 | dummy_tag += t.strip() + ',' 314 | ner_results_str.append(dummy_tag.rstrip(',')) 315 | merged_df.pop('ner_tags') 316 | merged_df['ner_tags'] = ner_results_str 317 | 318 | merged_df.to_csv('merged_features_clustered_ner_cleaned_extracted.csv', header=True, index=False) 319 | 320 | ner_results_str = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted.csv')['ner_tags']) 321 | cluster_nums = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted.csv')['cluster']) 322 | 323 | disease_merged = [] 324 | disease_clusters_merged = [[],[],[],[],[],[],[],[],[],[]] 325 | for i in range(len(ner_results_str)): 326 | n = ner_results_str[i] 327 | if pd.isna(n): 328 | continue 329 | dummy = n.split(',') 330 | if len(dummy) > 0: 331 | disease_merged.extend(dummy) 332 | disease_clusters_merged[cluster_nums[i]].extend(dummy) 333 | 334 | disease_counter = {} 335 | for i in disease_merged: 336 | if i in disease_counter: 337 | disease_counter[i] += 1 338 | else: 339 | disease_counter[i] = 1 340 | 341 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 342 | x = list(disease_counter.keys()) 343 | y = list(disease_counter.values()) 344 | x.reverse() 345 | y.reverse() 346 | disease_counter_df = pd.DataFrame(data={'all_disease': x, 'count_all_disease': y}) 347 | 348 | all_x = [] 349 | all_y = [] 350 | for cluster_number in range(10): 351 | disease_counter = {} 352 | for i in disease_clusters_merged[cluster_number]: 353 | if i in disease_counter: 354 | disease_counter[i] += 1 355 | else: 356 | disease_counter[i] = 1 357 | 358 | disease_counter = dict(sorted(disease_counter.items(), key=lambda item: item[1])) 359 | x = list(disease_counter.keys()) 360 | y = list(disease_counter.values()) 361 | x.reverse() 362 | y.reverse() 363 | disease_counter_df_dummy = pd.DataFrame(data={str(cluster_number)+'_disease': x, str(cluster_number)+'_count': y}) 364 | # disease_counter_df[str(cluster_number)+'disease'] = x 365 | # disease_counter_df[str(cluster_number)+'count'] = y 366 | disease_counter_df = pd.concat([disease_counter_df,disease_counter_df_dummy], axis=1) 367 | 368 | disease_counter_df.to_csv('count_all_diseases_ner.csv', index=False, header=True) 369 | 370 | import pandas as pd 371 | import numpy as np 372 | annot_df = pd.read_csv('key.csv') 373 | annot_classes = list(np.unique(np.array(annot_df['class']))) 374 | annot_map = {} 375 | for a in range(len(annot_classes)): 376 | annot_map[a] = annot_classes[a] 377 | 378 | disease_class = [] 379 | for a in annot_classes: 380 | current_df = annot_df[annot_df['class'] == a] 381 | current_diseases = list(np.unique(np.array(current_df['name']))) 382 | disease_class.append(current_diseases) 383 | annot_diseases = list(np.unique(np.array(annot_df['name']))) 384 | df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted.csv') 385 | diseases = np.array(df['disease_tags_ground_truth']) 386 | unannotated = [] 387 | benchmark_tag = [] 388 | for i in range(len(diseases)): 389 | n = diseases[i] 390 | if pd.isna(n): 391 | benchmark_tag.append('') 392 | continue 
393 | dummy = n.split(',') 394 | tag_dummy = [] 395 | if len(dummy) == 0: 396 | benchmark_tag.append('') 397 | continue 398 | for dc in dummy: 399 | disease = dc.strip() 400 | for d_class in range(len(disease_class)): 401 | if disease in disease_class[d_class]: 402 | if annot_map[d_class] not in tag_dummy: 403 | tag_dummy.append(annot_map[d_class]) 404 | if disease not in annot_diseases: 405 | if disease not in unannotated: 406 | unannotated.append(disease) 407 | print(disease) 408 | tag_dummy_str = '' 409 | for tag_d in tag_dummy: 410 | tag_dummy_str += tag_d + ',' 411 | benchmark_tag.append(tag_dummy_str.rstrip(',')) 412 | df['benchmark_tag'] = benchmark_tag 413 | df.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked.csv', header=True, index=False) 414 | 415 | dummy = df[df['benchmark_tag'] == ''] 416 | 417 | print(len(dummy), len(dummy.dropna(subset=['disease_tags_ground_truth'])),len(dummy.dropna(subset=['target_tags_ground_truth'])), len(df)) 418 | print(np.sum(total_unhighlighted)) 419 | print(np.sum(total_merged)) 420 | for i in range(10): 421 | print('Cluster', str(i), 'has', str(total_unhighlighted[i]), 'not highlighted out of the total', 422 | str(total_merged[i]), 'paragraphs and', str(len(dummy[dummy['cluster'] == i])), 423 | 'untagged descriptions out of', str(len(df[df['cluster'] == i])), 'descriptions.') 424 | print('Total', str(np.sum(total_unhighlighted)), 'not highlighted paragraphs and', str(len(dummy)), 425 | 'untagged descriptions and', str(len(df)-len(dummy)), 'tagged descriptions.') 426 | 427 | 428 | 429 | 430 | useless_tags = [ 431 | 'gpr119', 432 | 'damage', 433 | 'all', 434 | 'aml', 435 | 'syndrome', 436 | 'mbt', 437 | 'anthelmintics', 438 | 'antithrombic', 439 | 'embryonic lethality', 440 | 'transcriptional and translational regulation', 441 | 'sds', 442 | 'type iv', 443 | 'ftld', 444 | 'epigenetics', 445 | 'hnf4', 446 | 'metabolic', 447 | 'neonatal', 448 | 'hh', 449 | 'autophagy', 450 | 'colorectal', 451 | 'trust', 452 | 'ra', 453 | 'cellular proliferation', 454 | 'liver', 455 | 'mpds', 456 | 'adult bone formation', 457 | 'firefly', 458 | 'firefly luciferase', 459 | 'luminescent', 460 | 'diseases', 461 | 'disease', 462 | 'dyrk1 kinase', 463 | 'cell survival', 464 | 'mv', 465 | 'of mycobacterium tuberculosis', 466 | 'diarrhea', 467 | 'mas', 468 | 'at', 469 | 'gsdii', 470 | 'omim 230400', 471 | 'muscle', 472 | 'pca', 473 | 'proliferation stimuli', 474 | 'ad', 475 | 'cf', 476 | 'luciferase', 477 | 'gpcr', 478 | 'cjd', 479 | 'hd', 480 | 'skeletal muscle', 481 | 'frda', 482 | 'hcs', 483 | 'mm', 484 | 'cll', 485 | 'cellular senescence', 486 | 'tyrosine kinases', 487 | 'kappab', 488 | 'cml', 489 | 'drg', 490 | 'alqts', 491 | 'relapse', 492 | 'vascular smooth muscle', 493 | 'muscle diseases', 494 | 'cytotoxic', 495 | 'tgfbeta antagonists', 496 | 'pxrluc', 497 | 'liver regeneration' 498 | ] 499 | for u in unannotated: 500 | if u not in useless_tags: 501 | print(u) 502 | 503 | # unused_aids.extend(list(df[df['benchmark_tag'] == '']['AID'])) 504 | unused_aids = [ 505 | 588856, 506 | 588855, 507 | 1663, 508 | 2216, 509 | 1832, 510 | 782, 511 | 588342, 512 | 1865, 513 | 2599, 514 | 540295, 515 | 540308, 516 | 720647, 517 | 743238, 518 | 588674, 519 | 602363, 520 | 651704, 521 | 651658, 522 | 488862, 523 | 504414, 524 | 652115, 525 | 504441, 526 | 504408, 527 | 602252, 528 | 485317, 529 | 2288, 530 | 2289, 531 | 2629, 532 | 1875, 533 | 2094, 534 | 2098, 535 | 2563, 536 | 588478, 537 | 1159583, 538 | 485294, 539 | 485341, 540 | 1721, 541 | 1722, 542 | 651999, 
543 | 2805, 544 | 2806, 545 | 434973, 546 | 2524, 547 | 2540, 548 | 2544, 549 | 1016, 550 | 1006, 551 | 1020, 552 | 1027, 553 | 1136, 554 | 720516] 555 | used_aids = [aid for aid in df['AID'] if aid not in unused_aids] 556 | print(len(df)) 557 | df_cleaned = df[df['AID'].isin(used_aids)] 558 | print(len(df_cleaned)) 559 | df_cleaned.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned.csv', header=True, index=False) 560 | 561 | -------------------------------------------------------------------------------- /molecular_data_cleaning.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from rdkit.Chem import MolFromSmiles, MolToSmiles 4 | np.random.seed(123) 5 | 6 | 7 | def canon_smile(smile): 8 | return MolToSmiles(MolFromSmiles(smile), isomericSmiles=True) 9 | 10 | # get all AIDs 11 | aids = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked.csv')['AID']) 12 | 13 | # Create download links for all datasets 14 | str_aid = '' 15 | for aid in aids: 16 | str_aid+='https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?query=download&record_type=datatable&actvty=all&response_type=save&aid='+str(aid) + '\n' 17 | 18 | text_file = open('download_links.txt', "w+") 19 | n = text_file.write(str_aid) 20 | text_file.close() 21 | 22 | # After the datasets are downloaded, place them under the datasets directory (this needs to be done manually) 23 | 24 | # get all CIDs for molecules 25 | all_cids = [] 26 | counter = 0 27 | for aid in aids: 28 | all_cids.extend(np.array(pd.read_csv('datasets/AID_'+str(aid)+'_datatable_all.csv', usecols=['PUBCHEM_CID'])['PUBCHEM_CID'], dtype=int)) 29 | print(counter, aid) 30 | counter += 1 31 | 32 | print(len(all_cids)) 33 | # Find unique CIDs 34 | unique_cids = np.unique(np.array(all_cids)) 35 | print(len(unique_cids)) 36 | 37 | # Split the CIDs into 3 lists for download, since PubChem limits the number of IDs per request 38 | for chunk in range(3): 39 | str_cid = '' 40 | for cid in unique_cids[499999 * chunk: 499999 * (chunk + 1)]: 41 | str_cid += str(cid) + '\n' 42 | text_file = open('unique_cids_'+str(chunk)+'.txt', "w+") 43 | n = text_file.write(str_cid) 44 | text_file.close() 45 | 46 | # Manually download the SMILES from PubChem using the three text files and place them under the smiles directory 47 | 48 | # Read all SMILES 49 | li = [] 50 | for filename in range(3): 51 | df = pd.read_csv('smiles/'+str(filename)+'.txt', delimiter='\t') 52 | li.append(df) 53 | smiles_df = pd.concat(li, axis=0, ignore_index=True) 54 | 55 | # Make all SMILES canonical 56 | canon_list = [] 57 | counter = 0 58 | error_counter = 0 59 | all_smiles = np.array(smiles_df['smiles']) 60 | for s in all_smiles: 61 | try: 62 | canon_dummy = canon_smile(s) 63 | except Exception: # SMILES that RDKit cannot parse are stored as empty strings 64 | canon_dummy = '' 65 | error_counter += 1 66 | canon_list.append(canon_dummy) 67 | if counter % 100000 == 0: 68 | print(counter, error_counter) 69 | counter += 1 70 | smiles_df['canon_smiles'] = np.array(canon_list) 71 | # Save mapping between input SMILES and canonical SMILES 72 | smiles_df.to_csv('smiles/canon_map.csv', header=True, index=False) 73 | 74 | 75 | # Find bioactivity labels from the datasets 76 | aids = np.array(pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked.csv')['AID']) 77 | all_df = [] 78 | for aid in aids: 79 | df = pd.read_csv('datasets/AID_'+str(aid)+'_datatable_all.csv', low_memory=False, usecols=['PUBCHEM_RESULT_TAG']) 80 | # find the row where the data begins 81 | correct_column = 3
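# NOTE: PubChem assay datatables put several metadata rows (result type, description, etc.) between the CSV header and the first record; the first real data row is the one whose PUBCHEM_RESULT_TAG equals '1'. The loop below locates that row (defaulting to row 3) so the metadata rows can be skipped when each table is re-read.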
82 | for i in range(8): 83 | if df['PUBCHEM_RESULT_TAG'][i] == '1': 84 | correct_column = i 85 | print(aid, correct_column) 86 | all_df.append(pd.read_csv('datasets/AID_'+str(aid)+'_datatable_all.csv', low_memory=False, skiprows=list(range(1,correct_column+1)), 87 | usecols=lambda c: c in set(['PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CID', 'Phenotype', 'Activity Summary']))) 88 | 89 | # Make a dictionary from the mapping for canonical SMILES 90 | canon_df = pd.read_csv('smiles/canon_map.csv') 91 | canon_map = pd.Series(canon_df['canon_smiles'].values,index=canon_df['PUBCHEM_CID']).to_dict() 92 | 93 | 94 | all_out_keys = ['Active', 'Inactive', 'Inconclusive', 'Unspecified'] 95 | all_pheno_keys = ['Activator', 'Active', 'Cytotoxic', 'Fluorescent', 'Inactive', 'Inconclusive', 'Inhibitor', 96 | 'Quencher', 'Signal activator', 'Signal inhibitor', 'ikB active'] 97 | all_summary_keys = ['active agonist', 'active antagonist', 'inactive', 'inconclusive', 'inconclusive agonist', 98 | 'inconclusive agonist (cytotoxic)', 'inconclusive agonist (fluorescent)', 'inconclusive antagonist', 99 | 'inconclusive antagonist (cytotoxic)', 'inconclusive antagonist (fluorescent)'] 100 | 101 | def check_type_exist(type_, dict_): 102 | if type_ in dict_: 103 | return dict_[type_] 104 | else: 105 | return 0 106 | 107 | # go through all datasets and clean the data 108 | all_count_type = [] 109 | all_df_cleaned = [] 110 | # flag = 0 111 | for file_counter in range(len(all_df)): 112 | # file_counter = 0 113 | df = all_df[file_counter] 114 | print(aids[file_counter]) 115 | print('Initial shape: ', df.shape) 116 | # print(df.columns) 117 | 118 | # Delete empty or duplicate smiles 119 | df = df.dropna(subset=['PUBCHEM_CID']) 120 | df = df.drop_duplicates(subset='PUBCHEM_CID', keep='first') 121 | # df.reset_index(inplace=True) 122 | print('Shape after deleting empty or duplicate smiles: ', df.shape) 123 | 124 | # Delete data points with inconclusive or unspecified bioactivity labels 125 | df = df[df['PUBCHEM_ACTIVITY_OUTCOME'] != 'Inconclusive'] 126 | df = df[df['PUBCHEM_ACTIVITY_OUTCOME'] != 'Unspecified'] 127 | print('Shape after deleting inconclusive and unspecified: ', df.shape) 128 | # df = df[df[phenotype_column] != 'Cytotoxic'] 129 | 130 | # Find unique values in columns 131 | df_dropped = df.dropna(subset=['PUBCHEM_ACTIVITY_OUTCOME']) 132 | phenotype_column = None 133 | outcome_column = None 134 | summary_column = None 135 | pheno_dict = {} 136 | outcome_dict = {} 137 | summary_dict = {} 138 | for column in df.columns: 139 | if 'Phenotype' in column: 140 | phenotype_column = column 141 | # print(column, np.unique(df_dropped[column])) 142 | pheno_dict = df[phenotype_column].value_counts().to_dict() 143 | # all_pheno_keys.extend(pheno_dict.keys()) 144 | print(pheno_dict) 145 | if 'OUTCOME' in column: 146 | outcome_column = column 147 | # print(column, np.unique(df_dropped[column])) 148 | outcome_dict = df[outcome_column].value_counts().to_dict() 149 | # all_out_keys.extend(outcome_dict.keys()) 150 | print(outcome_dict) 151 | if 'Summary' in column: 152 | summary_column = column 153 | # print(column, np.unique(df_dropped[column])) 154 | summary_dict = df[summary_column].value_counts().to_dict() 155 | # all_summary_keys.extend(summary_dict.keys()) 156 | print(summary_dict) 157 | count_type = [] 158 | for k in all_out_keys: 159 | count_type.append(check_type_exist(k, outcome_dict)) 160 | for k in all_pheno_keys: 161 | count_type.append(check_type_exist(k, pheno_dict)) 162 | for k in all_summary_keys: 163 | count_type.append(check_type_exist(k, summary_dict))
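# (check_type_exist(k, d) is equivalent to d.get(k, 0); the per-assay counts of every outcome/phenotype/summary label collected here feed the commented-out type_count export below)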
164 | all_count_type.append(count_type) 165 | 166 | # convert bioactivity to binary labels 167 | df[outcome_column] = df[outcome_column].replace({'Active': 1, 'Inactive': 0}) 168 | # print(df.columns) 169 | # Save the bioactivity labels under a column with name "activity_" + AID number 170 | df.rename({outcome_column: 'activity_'+str(aids[file_counter])}, axis=1, inplace=True) 171 | 172 | # Insert canonical SMILES 173 | df['smiles'] = df['PUBCHEM_CID'].map(canon_map) 174 | # Delete duplicate SMILES 175 | df = df.drop_duplicates(subset='smiles', keep='first') 176 | # df.reset_index(inplace=True) 177 | print('Shape after deleting duplicate canon smiles: ', df.shape) 178 | 179 | # Shuffle and save 180 | df = df.sample(frac=1, random_state=42).reset_index(drop=True) 181 | all_df_cleaned.append(df) 182 | df.to_csv('cleaned_datasets/'+ str(aids[file_counter])+'_cleaned.csv', header=True, index=False) 183 | # all_count_type = np.array(all_count_type) 184 | # count_df = pd.DataFrame(data=all_count_type, columns=all_out_keys+all_pheno_keys+all_summary_keys) 185 | # count_df.insert(0, 'AID', aids) 186 | # count_df.to_csv('type_count_wo_inconclusive.csv', index=False, header=True) 187 | 188 | # Merge all datasets together 189 | all_df = [pd.read_csv('cleaned_datasets/'+ str(aid)+'_cleaned.csv', low_memory=False) for aid in aids] 190 | for file_counter in range(len(all_df)): 191 | df = all_df[file_counter] 192 | outcome_column = None 193 | for column in df.columns: 194 | if 'activity' in column: 195 | outcome_column = column 196 | # print(column, np.unique(df[column])) 197 | # if len(np.unique(df[outcome_column])) == 0: 198 | # continue 199 | df = df[['smiles', outcome_column]] 200 | if file_counter == 0: 201 | merged = df 202 | else: 203 | merged = merged.merge(df, on=['smiles'], how='outer') 204 | print(file_counter) 205 | print(merged.head()) 206 | print(merged.shape) 207 | print(merged.iloc[0]) 208 | smiles_canon = merged['smiles'] 209 | # merged.nunique(axis=0) 210 | # print('smiles and unique smiles: ', len(smiles_canon), len(np.unique(smiles_canon))) 211 | print('number of rows with missing data: ', merged.shape[0] - merged.dropna().shape[0]) 212 | 213 | # Insert CIDs into the data 214 | canon_map_reversed = {} 215 | for k in canon_map: 216 | if canon_map[k] not in canon_map_reversed: 217 | canon_map_reversed[canon_map[k]] = str(k) 218 | else: 219 | canon_map_reversed[canon_map[k]] += ','+str(k) 220 | cleaned_ids = [] 221 | for s in smiles_canon: 222 | cleaned_ids.append(canon_map_reversed[s]) 223 | merged.insert(1,'PUBCHEM_CID', np.array(cleaned_ids)) 224 | # Save the cleaned dataset 225 | merged.to_csv('merged_cleaned.csv', header=True, index=False) 226 | 227 | merged = pd.read_csv('merged_cleaned.csv') 228 | # These AIDs were found during manual labeling not to belong to any disease category, which is why the list is hardcoded 229 | unused_aids = [ 230 | 588856, 231 | 588855, 232 | 1663, 233 | 2216, 234 | 1832, 235 | 782, 236 | 588342, 237 | 1865, 238 | 2599, 239 | 540295, 240 | 540308, 241 | 720647, 242 | 743238, 243 | 588674, 244 | 602363, 245 | 651704, 246 | 651658, 247 | 488862, 248 | 504414, 249 | 652115, 250 | 504441, 251 | 504408, 252 | 602252, 253 | 485317, 254 | 2629, 255 | 1875, 256 | 2094, 257 | 2098, 258 | 2288, 259 | 2289, 260 | 2563, 261 | 588478, 262 | 1159583, 263 | 485294, 264 | 485341, 265 | 1721, 266 | 1722, 267 | 651999, 268 | 2805, 269 | 2806, 270 | 434973, 271 | 2524, 272 | 2540, 273 | 2544, 274 | 1016, 275 | 1006, 276 | 1020, 277 | 1027, 278 | 1136, 279 | 720516]
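# Dropping these assay columns can leave molecules with no remaining activity labels; such all-NaN rows are removed right after the drop below.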
280 | print(len(merged.columns)) 281 | # drop unused datasets 282 | merged.drop(['activity_'+str(a) for a in unused_aids], axis=1, inplace=True) 283 | print(len(merged.columns)) 284 | print(len(merged)) 285 | activity_columns = [c for c in merged.columns if 'activity' in c] 286 | # drop empty rows 287 | merged.dropna(subset=activity_columns, how='all', inplace=True) 288 | print(merged.shape) 289 | # Save the new merged datasets with unused bioassays deleted 290 | merged.to_csv('merged_cleaned_benchmarked.csv', header=True, index=False) 291 | 292 | # Read the description file for bioassays information 293 | df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned.csv') 294 | # Compare the molecule counts recovered from the merged dataset against the counts reported in the descriptions; they should be close to each other 295 | active_num = [] 296 | total_num = [] 297 | for aid in activity_columns: 298 | aid_number = int(aid.replace('activity_', '')) # replace() avoids the character-set pitfall of str.lstrip 299 | df_dummy = df[df['AID'] == aid_number] 300 | active_num.append(len(merged[merged[aid] == 1])) 301 | total_num.append(len(merged[aid].dropna())) 302 | if not pd.isna(df_dummy['active_num']).bool(): 303 | print(aid_number, int(df_dummy['substance_num']), int(df_dummy['active_num']), len(merged[aid].dropna()), len(merged[merged[aid] == 1])) 304 | else: 305 | print(aid_number, int(df_dummy['substance_num']), len(merged[aid].dropna()), len(merged[merged[aid] == 1])) 306 | 307 | 308 | print(len(active_num) - np.sum(np.array(active_num)>=50)) 309 | 310 | # Number of molecules after cleaning 311 | df['recovered_substance_num'] = np.array(total_num) 312 | df['recovered_active_num'] = np.array(active_num) 313 | 314 | # Save molecular data with a column for count 315 | df.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted.csv', header=True, index=False) 316 | 317 | # Delete bioassays that have fewer than 15 active data points 318 | df = pd.read_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted.csv') 319 | print(len(df)) 320 | df = df[df['recovered_active_num'] >= 15] 321 | print(len(df)) 322 | 323 | # Save the molecular data after cleaning, verifying the count, and deleting unused bioassays or bioassays with fewer than 15 active molecules 324 | df.to_csv('merged_features_clustered_ner_cleaned_extracted_benchmarked_cleaned_counted_threshold.csv', header=True, index=False) 325 | 326 | -------------------------------------------------------------------------------- /ner.py: -------------------------------------------------------------------------------- 1 | from transformers.pipelines import pipeline 2 | import transformers 3 | import torch 4 | import pandas as pd 5 | import numpy as np 6 | print(transformers.__version__) 7 | print(torch.__version__) 8 | 9 | classifier = pipeline("ner", model='alvaroalon2/biobert_diseases_ner') 10 | df = pd.read_csv('merged.csv') 11 | df = df[df['substance_num'] >= 100] 12 | df_tox = df[df['source'] == 'Tox21'] 13 | df = df[df['substance_num'] >= 100000] 14 | df = df.merge(df_tox, how='outer') 15 | # biobert = BiobertEmbedding() 16 | 17 | sources = np.array(df['source']) 18 | print(np.unique(sources)) 19 | def clean_punctuation(sample): 20 | sample = sample.replace('-', ' ') 21 | sample = sample.replace('/', ' ') 22 | sample = sample.replace('(', ' ') 23 | sample = sample.replace(')', ' ') 24 | sample = sample.replace('\'', ' ') 25 | sample = sample.replace('.', ' ') 26 | sample = sample.replace(':', ' ') 27 | sample = sample.replace(',', ' ')
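# (this whole replace chain is equivalent to a single regex pass, assuming re is imported: sample = re.sub(r"[-/()'.:,;_]", ' ', sample))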
28 | sample = sample.replace(';', ' ') 29 | sample = sample.replace('_', ' ') 30 | return sample 31 | descs = np.array(df['description']) 32 | titles = np.array(df['name']) 33 | descs = np.array([clean_punctuation(d) for d in descs]) 34 | titles = np.array([clean_punctuation(d) for d in titles]) 35 | 36 | 37 | disease_tags = [] 38 | for i in range(len(descs)): 39 | dummy = titles[i].lower() + ' . ' + descs[i].lower() 40 | # dummy = dummy.split() 41 | ner = classifier(dummy) 42 | disease_dummy = [] 43 | for j in ner: 44 | if 'DISEASE' in j['entity']: 45 | w = j['word'] 46 | if '##' in w and len(disease_dummy) != 0: # '##' marks a WordPiece continuation token; append it to the previous word 47 | disease_dummy[-1] = disease_dummy[-1] + w.lstrip('##') 48 | else: 49 | disease_dummy.append(w) 50 | disease_tags.append(disease_dummy) 51 | df = pd.read_csv('merged_features_clustered.csv') 52 | print(len(df), len(disease_tags)) 53 | 54 | 55 | df.insert(10, 'ner_tags', disease_tags) 56 | df.to_csv('merged_features_clustered_ner.csv') 57 | 58 | df = pd.read_csv('merged_features_clustered_ner.csv') 59 | for column in df.columns: 60 | if 'feature' in column or 'Unnamed' in column: 61 | df.drop(column, axis=1, inplace=True) 62 | df.to_csv('merged_features_clustered_ner_cleaned.csv', header=True, index=False) 63 | 64 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | from os import listdir 2 | import numpy as np 3 | import pandas as pd 4 | 5 | # Data from multiple sources is downloaded from PubChem and placed in the data directory in separate folders 6 | data_dir = 'data/' 7 | folder_names = sorted(listdir(data_dir)) 8 | summs = [data_dir+f+'/summary.txt' for f in folder_names] 9 | descs = [data_dir+f+'/description.txt' for f in folder_names] 10 | print(len(summs)) 11 | data = [] 12 | for file_counter in range(len(summs)): 13 | # file_counter = 0 14 | summ = summs[file_counter] 15 | with open(summ) as file: 16 | lines = [] 17 | for line in file: 18 | lines.append(line.rstrip('\n')) 19 | block_idx = [] 20 | for i in range(len(lines)): 21 | if lines[i] == '': 22 | block_idx.append(i) 23 | for block_counter in range(len(block_idx)): 24 | # block_counter = 0 25 | if block_counter != len(block_idx) - 1: 26 | current_lines = lines[block_idx[block_counter]+1:block_idx[block_counter+1]] 27 | else: 28 | current_lines = lines[block_idx[block_counter]+1:] 29 | block_number = current_lines[0].split('.')[0] 30 | if int(block_number) != block_counter + 1: 31 | print('Error in parsing block '+str(block_number)+' in '+summ) 32 | name = current_lines[0].lstrip(block_number+'.').lstrip() # NOTE: lstrip strips a character set, not a prefix; safe only while no name begins with these digits or '.' 33 | source = np.NaN 34 | aid = np.NaN 35 | sub_num = np.NaN 36 | active_num = np.NaN 37 | target = np.NaN 38 | for line in current_lines: 39 | if line == current_lines[0]: 40 | continue 41 | # extract data source 42 | if line.startswith('Source'): 43 | source = line.split(':')[1].lstrip() 44 | # extract bioassay AID 45 | elif line.startswith('AID'): 46 | aid = line.split(':')[1].lstrip() 47 | # extract number of molecules and active molecules 48 | elif line.startswith('Substance BioActivity'): 49 | dummy = line.split(':')[1].lstrip() 50 | dummy_num = dummy.split() 51 | for num_counter in range(len(dummy_num)): 52 | if 'Active' in dummy_num[num_counter]: 53 | active_num = int(dummy_num[num_counter-1]) 54 | if 'Tested' in dummy_num[num_counter]: 55 | sub_num = int(dummy_num[num_counter-1]) 56 | # extract target 57 | elif line.startswith('Protein
Targets') or line.startswith('Protein Target'): 58 | target = line.split(':')[1].lstrip() 59 | else: 60 | print('UNUSED DATA:', line) 61 | data.append([aid, name, source, block_number, target, sub_num, active_num]) 62 | 63 | # Parse descriptions 64 | data_desc = [] 65 | for file_counter in range(len(descs)): 66 | desc = descs[file_counter] 67 | with open(desc) as file: 68 | lines = [] 69 | for line in file: 70 | lines.append(line.rstrip('\n')) 71 | block_idx = [] 72 | for i in range(len(lines)): 73 | if lines[i] == '': 74 | block_idx.append(i) 75 | for block_counter in range(len(block_idx)): 76 | if block_counter != len(block_idx) - 1: 77 | current_lines = lines[block_idx[block_counter] + 1:block_idx[block_counter + 1]] 78 | else: 79 | current_lines = lines[block_idx[block_counter] + 1:] 80 | if '.' not in current_lines[0]: 81 | continue 82 | block_number = current_lines[0].split('.')[0] 83 | if int(block_number) != block_counter + 1: 84 | print('Error in parsing block ' + str(block_number) + ' in ' + desc) 85 | name = current_lines[0].lstrip(block_number + '.').lstrip() 86 | source = np.NaN 87 | aid = np.NaN 88 | description = np.NaN 89 | for line in current_lines: 90 | # line=current_lines[1] 91 | if line == current_lines[0]: 92 | continue 93 | if line.startswith('Source:') and '_||_' not in line: 94 | source = line.split(':')[1].lstrip() 95 | elif line.startswith('AID:') and '_||_' not in line: 96 | aid = line.split(':')[1].lstrip() 97 | else: 98 | # Rules for parsing descriptions from different data sources 99 | dummy_lines = line.split('_||_') 100 | if folder_names[file_counter] == 'Broad_Ins': 101 | description = line.replace('_||_', ' ') 102 | elif folder_names[file_counter] == 'Emory': 103 | description = '' 104 | useless = ['Assay Overview', 'NIH Molecular Libraries Screening Centers Network [MLSCN]',\ 105 | 'Emory Chemical Biology Discovery Center in MLSCN','Assay provider','MLSCN Grant'] 106 | for dummy_line in dummy_lines: 107 | if not any(x in dummy_line for x in useless): 108 | description = description + ' ' + dummy_line 109 | else: 110 | print('UNUSED DATA:', dummy_line) 111 | elif folder_names[file_counter] == 'ICCB': 112 | description = '' 113 | useless = ['This screen was conducted by'] 114 | for dummy_line in dummy_lines: 115 | if not any(x in dummy_line for x in useless): 116 | description = description + ' ' + dummy_line 117 | else: 118 | print('UNUSED DATA:', dummy_line) 119 | elif folder_names[file_counter] == 'John_Hopkins': 120 | description = '' 121 | useless = ['Data Source', 'BioAssay Type', 'Source (MLPCN Center Name)', 'Screening Center PI',\ 122 | 'Center Affiliation', 'Network:', 'Assay provider:','Assay Provider:','Grant Proposal Number', 123 | 'Grant Proposal PI', 'Assay Implementation', 'Name:', 'External Assay ID:'] 124 | reference_flag = 0 125 | for dummy_line in dummy_lines: 126 | if 'References' in dummy_line or 'Reference' in dummy_line: 127 | reference_flag = 1 128 | if not reference_flag and not any(x in dummy_line for x in useless): 129 | description = description + ' ' + dummy_line 130 | else: 131 | print('UNUSED DATA:', dummy_line) 132 | if 'Keywords' in dummy_line: 133 | reference_flag = 0 134 | elif folder_names[file_counter] == 'Ncats': 135 | description = '' 136 | reference_flag = 0 137 | useless = ['NIH Molecular Libraries Probe Centers Network [MLPCN]', 'MLPCN Grant', 138 | 'Assay Provider', 'Assay Submitter (PI)', 'NIH Chemical Genomics Center [NCGC]'] 139 | for dummy_line in dummy_lines: 140 | if 'References' in dummy_line or 
'Reference' in dummy_line: 141 | reference_flag = 1 142 | if not reference_flag and not any(x in dummy_line for x in useless): 143 | description = description + ' ' + dummy_line 144 | else: 145 | print('UNUSED DATA:', dummy_line) 146 | if 'Keywords' in dummy_line: 147 | reference_flag = 0 148 | elif folder_names[file_counter] == 'NMMLSC': 149 | description = '' 150 | reference_flag = 0 151 | useless = ['University of New Mexico Assay Overview:', 'Assay Support:', 152 | 'PI:', 'PI Affiliation:', 'Screening Center PI:', 'Screening Lead:', 153 | 'Assay Implementation:', 'UNM Cheminformatics:', 'Chemistry:', 154 | 'Vanderbilt Specialized Chemistry Center PI:', 'Vanderbilt Chemistry Lead:', 155 | 'Assay Background and Significance:', 'Project Title:', 'Screening Center Manager:', 156 | 'Screening Center/PI:', 'Lead Biologist:', 'Screening Operations Team:', 157 | 'Chemistry Lead:', 'Specialized Chemistry Center:', 'Assay Support:', 158 | 'University of New Mexico Center for Molecular Discovery PI:', 'Center PI:', 159 | 'Target Team Leader for the Center:', 'KU SCC Project Manager:', 160 | 'KU SCC Chemists on this project:', 'Assay provider:','Assay Provider:', 'Co-PI:', 'KU Specialized Chemistry Center PI:'] 161 | for dummy_line in dummy_lines: 162 | if not any(x in dummy_line for x in useless): 163 | description = description + ' ' + dummy_line 164 | else: 165 | print('UNUSED DATA:', dummy_line) 166 | elif folder_names[file_counter] == 'Sanford_Burnam': 167 | description = '' 168 | reference_flag = 0 169 | useless = ['Data Source:', 'Source Affiliation:', 'Network:', 170 | 'NIH Molecular Libraries Probe Production Centers Network (MLPCN)', 171 | 'Grant Number:', 'Assay Provider:', 'Grant Proposal Number:'] 172 | for dummy_line in dummy_lines: 173 | if 'REFERENCES' in dummy_line or 'References' in dummy_line: 174 | reference_flag = 1 175 | if not reference_flag and not any(x in dummy_line for x in useless): 176 | description = description + ' ' + dummy_line 177 | else: 178 | print('UNUSED DATA:', dummy_line) 179 | elif folder_names[file_counter] == 'Scripps': 180 | description = '' 181 | reference_flag = 0 182 | useless = ['Source (MLPCN Center Name):','Source (MLSCN Center Name):', 'Center Affiliation:', 'Assay Provider:', 183 | 'Network:', 'Grant Proposal Number', 'Grant Proposal PI:', 'External Assay ID:', 184 | 'Name:', 'Source:', 185 | 'Center Affiliation:', 'Affiliation:'] 186 | for dummy_line in dummy_lines: 187 | # if block_counter == 128: 188 | # print(128, description) 189 | if 'References' in dummy_line or 'Reference' in dummy_line: 190 | reference_flag = 1 191 | if not reference_flag and not any(x in dummy_line for x in useless): 192 | description = description + ' ' + dummy_line 193 | # else: 194 | # print('UNUSED DATA:', dummy_line) 195 | if 'Keywords' in dummy_line: 196 | reference_flag = 0 197 | elif folder_names[file_counter] == 'Tox21': 198 | description = dummy_lines[-1] 199 | else: 200 | print('ERROR! 
Parsing rules have not been defined for ' + folder_names[file_counter]) 201 | data_desc.append([aid, name, source, block_number, description]) 202 | # print(name) 203 | # print(source) 204 | # print(aid) 205 | # print(sub_num) 206 | # print(target) 207 | data = np.array(data) 208 | column_names = ['AID', 'name', 'source', 'block_number', 'target', 'substance_num', 'active_num'] 209 | data_dict = {} 210 | for i in range(len(column_names)): 211 | data_dict[column_names[i]] = data[:,i] 212 | df = pd.DataFrame(data=data_dict) 213 | 214 | data_desc = np.array(data_desc) 215 | column_names = ['AID', 'name', 'source', 'block_number', 'description'] 216 | data_dict = {} 217 | for i in range(len(column_names)): 218 | data_dict[column_names[i]] = data_desc[:,i] 219 | df_desc = pd.DataFrame(data=data_dict) 220 | 221 | 222 | merged = df.merge(df_desc, how='outer') 223 | print(len(df), len(df_desc), len(merged)) 224 | print(len(df_desc.dropna())) 225 | 226 | # Save bioassays' information and descriptions 227 | merged.to_csv('merged.csv', header=True, index=False) 228 | 229 | -------------------------------------------------------------------------------- /splitting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from typing import Any, Dict, List, Iterator, Optional, Sequence, Tuple 5 | 6 | 7 | 8 | from rdkit import Chem 9 | from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles 10 | 11 | 12 | 13 | 14 | def split(smiles, 15 | frac_train: float = 0.8, 16 | frac_valid: float = 0.1, 17 | frac_test: float = 0.1, 18 | seed: Optional[int] = None, 19 | log_every_n: Optional[int] = 1000 20 | ) -> Tuple[List[int], List[int], List[int]]: 21 | """ 22 | Splits internal compounds into train/validation/test by scaffold. 23 | 24 | Parameters 25 | ---------- 26 | smiles: Sequence[str] 27 | SMILES strings to be split. 28 | frac_train: float, optional (default 0.8) 29 | The fraction of data to be used for the training split. 30 | frac_valid: float, optional (default 0.1) 31 | The fraction of data to be used for the validation split. 32 | frac_test: float, optional (default 0.1) 33 | The fraction of data to be used for the test split. 34 | seed: int, optional (default None) 35 | Random seed to use (currently unused). 36 | log_every_n: int, optional (default 1000) 37 | Controls the logger by dictating how often logger outputs 38 | will be produced (currently unused). 39 | 40 | Returns 41 | ------- 42 | Tuple[List[int], List[int], List[int]] 43 | A tuple of train indices, valid indices, and test indices. 44 | Each element is a list of integer indices. 45 | """ 46 | np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.) 47 | scaffold_sets = generate_scaffolds(smiles) 48 | 49 | train_cutoff = frac_train * len(smiles) 50 | valid_cutoff = (frac_train + frac_valid) * len(smiles) 51 | train_inds: List[int] = [] 52 | valid_inds: List[int] = [] 53 | test_inds: List[int] = [] 54 | 55 | # logger.info("About to sort in scaffold sets") 56 | for scaffold_set in scaffold_sets: 57 | if len(train_inds) + len(scaffold_set) > train_cutoff: 58 | if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff: 59 | test_inds += scaffold_set 60 | else: 61 | valid_inds += scaffold_set 62 | else: 63 | train_inds += scaffold_set 64 | return train_inds, valid_inds, test_inds 65 | 66 | 67 | def generate_scaffolds(smiles_list, 68 | log_every_n: int = 1000) -> List[List[int]]: 69 | """Returns all scaffolds from the dataset. 70 | 71 | Parameters 72 | ---------- 73 | smiles_list: Sequence[str] 74 | SMILES strings to be split. 75 | log_every_n: int, optional (default 1000) 76 | Controls the logger by dictating how often logger outputs 77 | will be produced. 78 | 79 | Returns 80 | ------- 81 | scaffold_sets: List[List[int]] 82 | List of indices of each scaffold in the dataset. 83 | """
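# For intuition (an illustrative example, not part of the original script): MurckoScaffoldSmiles(mol=Chem.MolFromSmiles('c1ccccc1CCN'), includeChirality=True) returns 'c1ccccc1'; side chains are stripped, so molecules are grouped by their ring framework.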
84 | scaffolds = {} 85 | data_len = len(smiles_list) 86 | 87 | # logger.info("About to generate scaffolds") 88 | for ind, smiles in enumerate(smiles_list): 89 | # if ind % log_every_n == 0: 90 | # logger.info("Generating scaffold %d/%d" % (ind, data_len)) 91 | scaffold = MurckoScaffoldSmiles(mol=Chem.MolFromSmiles(smiles), includeChirality=True) 92 | if scaffold not in scaffolds: 93 | scaffolds[scaffold] = [ind] 94 | else: 95 | scaffolds[scaffold].append(ind) 96 | 97 | # Sort from largest to smallest scaffold sets 98 | scaffolds = {key: sorted(value) for key, value in scaffolds.items()} 99 | scaffold_sets = [ 100 | scaffold_set for (scaffold, scaffold_set) in sorted( 101 | scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True) 102 | ] 103 | return scaffold_sets 104 | 105 | input_data = 'data/all_molecular_data.csv' 106 | smiles = np.array(pd.read_csv(input_data)['smiles']) 107 | scaffold_list = [] 108 | counter = 0 109 | for s in smiles: 110 | try: 111 | scaffold_list.append(MurckoScaffoldSmiles(mol=Chem.MolFromSmiles(s), includeChirality=True)) 112 | except Exception: # fall back to the raw SMILES when RDKit cannot parse it 113 | scaffold_list.append(s) 114 | print(s, counter) 115 | counter += 1 116 | df = pd.read_csv(input_data) 117 | df['scaffold'] = np.array(scaffold_list) 118 | 119 | df.dropna(subset=['smiles'], inplace=True) 120 | print(len(df)) 121 | df.to_csv('merged_cleaned_benchmarked_threshold_scaffold.csv', header=True, index=False) 122 | 123 | scaffold_list = np.array(df['scaffold']) 124 | smiles = np.array(df['smiles']) 125 | 126 | scaffolds = {} 127 | for ind, scaffold in enumerate(scaffold_list): 128 | if scaffold not in scaffolds: 129 | scaffolds[scaffold] = [ind] 130 | else: 131 | scaffolds[scaffold].append(ind) 132 | 133 | scaffolds = {key: sorted(value) for key, value in scaffolds.items()} 134 | scaffold_sets = [ 135 | scaffold_set for (scaffold, scaffold_set) in sorted( 136 | scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True) 137 | ] 138 | 139 | frac_train = 0.8 140 | frac_valid = 0.1 141 | frac_test = 0.1 142 | train_cutoff = frac_train * len(smiles) 143 | valid_cutoff = (frac_train + frac_valid) * len(smiles) 144 | train_inds = [] 145 | valid_inds = [] 146 | test_inds = [] 147 | 148 | for scaffold_set in scaffold_sets: 149 | if len(train_inds) + len(scaffold_set) > train_cutoff: 150 | if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff: 151 | test_inds.extend(scaffold_set) 152 | else: 153 | valid_inds.extend(scaffold_set) 154 | else: 155 | train_inds.extend(scaffold_set) 156 | 157 | split_list = np.empty(len(smiles), dtype=object) 158 | for i in train_inds: 159 | split_list[i] = 'train' 160 | for i in valid_inds: 161 | split_list[i] = 'validation' 162 | for i in test_inds: 163 | split_list[i] = 'test' 164 | 165 | print(len(split_list)) 166 | df['split'] = split_list 167 | df.to_csv('merged_cleaned_benchmarked_threshold_scaffold_split.csv', header=True, index=False) 168 | 169 | # df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split.csv') 170 | df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv') # NOTE: the stratified file is only written further below; on a first run, start from the '_split.csv' file on the commented line above 171 | 172 | activity_columns = [c for c in df.columns if 'activity' in c] 173 | train_dummy =
df[df['split'] == 'train'] 174 | valid_dummy = df[df['split'] == 'validation'] 175 | test_dummy = df[df['split'] == 'test'] 176 | 177 | problem_aids = [] 178 | for a in activity_columns: 179 | train_sum = int(np.sum(train_dummy[a])) 180 | valid_sum = int(np.sum(valid_dummy[a])) 181 | test_sum = int(np.sum(test_dummy[a])) 182 | if train_sum < 1 or valid_sum < 1 or test_sum <1: 183 | print(a, int(np.sum(train_dummy[a])), int(np.sum(valid_dummy[a])), int(np.sum(test_dummy[a]))) 184 | problem_aids.append(a) 185 | 186 | for a in activity_columns: 187 | train_sum = int(np.sum(list(train_dummy[a]==0))) 188 | valid_sum = int(np.sum(list(valid_dummy[a]==0))) 189 | test_sum = int(np.sum(list(test_dummy[a]==0))) 190 | if train_sum < 1 or valid_sum < 1 or test_sum <1: 191 | print(a, train_sum, valid_sum, test_sum) 192 | problem_aids.append(a) 193 | 194 | for a in activity_columns: 195 | train_sum = int(np.sum(list(train_dummy[a]==0))) 196 | valid_sum = int(np.sum(list(valid_dummy[a]==0))) 197 | test_sum = int(np.sum(list(test_dummy[a]==0))) 198 | print(a, train_sum, int(np.sum(train_dummy[a])), valid_sum, int(np.sum(valid_dummy[a])), test_sum, int(np.sum(test_dummy[a]))) 199 | 200 | import random 201 | extra_valid_ind = [] 202 | extra_test_ind = [] 203 | for a in problem_aids: 204 | dummy_df = df[df[a] == 1] 205 | aid_inds = dummy_df.index[dummy_df['split'] == 'train'].tolist() 206 | d1,d2 = random.sample(aid_inds, 2) 207 | extra_valid_ind.append(d1) 208 | extra_test_ind.append(d2) 209 | 210 | for i in extra_valid_ind: 211 | df.at[i, 'split'] = 'validation' 212 | 213 | for i in extra_test_ind: 214 | df.at[i, 'split'] = 'test' 215 | 216 | train_dummy = df[df['split'] == 'train'] 217 | valid_dummy = df[df['split'] == 'validation'] 218 | test_dummy = df[df['split'] == 'test'] 219 | 220 | for a in activity_columns: 221 | train_sum = int(np.sum(train_dummy[a])) 222 | valid_sum = int(np.sum(valid_dummy[a])) 223 | test_sum = int(np.sum(test_dummy[a])) 224 | if train_sum < 1 or valid_sum < 1 or test_sum <1: 225 | print(a, int(np.sum(train_dummy[a])), int(np.sum(valid_dummy[a])), int(np.sum(test_dummy[a]))) 226 | df.to_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv', header=True, index=False) 227 | 228 | df = pd.read_csv('merged_cleaned_benchmarked_threshold_scaffold_split_stratified.csv') 229 | mini = df[:20000] 230 | mini.to_csv('mini.csv', header=True, index=False) 231 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import tensorflow as tf 4 | import deepchem as dc 5 | from deepchem.models import GraphConvModel 6 | import matplotlib.pyplot as plt 7 | from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, recall_score, average_precision_score, precision_score 8 | from deepchem.metrics.score_function import bedroc_score 9 | import time 10 | import os 11 | from rdkit.Chem import MolFromSmiles, MolToSmiles 12 | import shutil 13 | import logging 14 | import itertools 15 | from typing import Any, Dict, List, Iterator, Optional, Sequence, Tuple 16 | from deepchem.splits import Splitter 17 | from deepchem.data import Dataset, DiskDataset 18 | 19 | 20 | class MolDataSpecifiedSplitter(Splitter): 21 | """Split data in the fashion specified by user. Uses DeepChem's specified 22 | splitter but also takes as input the training splits. 
23 | 24 | For some applications, you will already know how you'd like to split the 25 | dataset. In this splitter, you simply specify `train_indices`, `valid_indices` and 26 | `test_indices` and the datapoints at those indices are pulled out of the 27 | dataset. Note that this is different from `IndexSplitter` which only splits 28 | based on the existing dataset ordering, while this `SpecifiedSplitter` can 29 | split on any specified ordering. 30 | """ 31 | 32 | def __init__(self, 33 | train_indices: Optional[List[int]] = None, 34 | valid_indices: Optional[List[int]] = None, 35 | test_indices: Optional[List[int]] = None 36 | ): 37 | """ 38 | Parameters 39 | ----------- 40 | train_indices, valid_indices, test_indices: List[int] 41 | Lists of indices of the samples in the train, valid and test sets 42 | (the stored index lists fully determine the split; the frac_* 43 | arguments of split() are ignored) 44 | """ 45 | self.train_indices = train_indices 46 | self.valid_indices = valid_indices 47 | self.test_indices = test_indices 48 | 49 | def split(self, 50 | dataset: Dataset, 51 | frac_train: float = 0.8, 52 | frac_valid: float = 0.1, 53 | frac_test: float = 0.1, 54 | seed: Optional[int] = None, 55 | log_every_n: Optional[int] = None 56 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: 57 | """ 58 | Splits internal compounds into train/validation/test in the designated order. 59 | 60 | Parameters 61 | ---------- 62 | dataset: Dataset 63 | Dataset to be split. 64 | frac_train: float, optional (default 0.8) 65 | Fraction of dataset put into training data (ignored by this splitter). 66 | frac_valid: float, optional (default 0.1) 67 | Fraction of dataset put into validation data (ignored by this splitter). 68 | frac_test: float, optional (default 0.1) 69 | Fraction of dataset put into test data (ignored by this splitter). 70 | seed: int, optional (default None) 71 | Random seed to use. 72 | log_every_n: int, optional (default None) 73 | Log every n examples (not currently used). 74 | 75 | Returns 76 | ------- 77 | Tuple[np.ndarray, np.ndarray, np.ndarray] 78 | A tuple of train indices, valid indices, and test indices. 79 | Each element is a numpy array of indices.
80 | """ 81 | if self.train_indices is None: 82 | self.train_indices = [] 83 | if self.valid_indices is None: 84 | self.valid_indices = [] 85 | if self.test_indices is None: 86 | self.test_indices = [] 87 | 88 | return (np.array(self.train_indices), np.array(self.valid_indices), 89 | np.array(self.test_indices)) 90 | 91 | 92 | def MolDataLoadData(data_dir, tasks, featurizer): 93 | dummy_df = pd.read_csv(data_dir, low_memory=False) 94 | train_indices = dummy_df.index[dummy_df['split'] == 'train'].tolist() 95 | valid_indices = dummy_df.index[dummy_df['split'] == 'validation'].tolist() 96 | test_indices = dummy_df.index[dummy_df['split'] == 'test'].tolist() 97 | print(len(dummy_df), len(train_indices) + len(valid_indices) + len(test_indices)) 98 | print("About to load the dataset.") 99 | 100 | # create featurizer, loader, transformers, and splitter 101 | if featurizer == 'ECFP': 102 | featurizer = dc.feat.CircularFingerprint(size=1024, chiral=True) 103 | elif featurizer == 'GraphConv': 104 | featurizer = dc.feat.ConvMolFeaturizer(use_chirality=True) 105 | loader = dc.data.CSVLoader(tasks=tasks, feature_field="smiles", featurizer=featurizer) 106 | splitters = { 107 | 'specified': MolDataSpecifiedSplitter(train_indices=train_indices, valid_indices=valid_indices, test_indices=test_indices) 108 | } 109 | splitter = splitters['specified'] 110 | 111 | if not os.path.exists(data_dir): 112 | print("Dataset not found") 113 | print("About to featurize the dataset.") 114 | dataset = loader.create_dataset([data_dir], shard_size=8192, data_dir='tmp/loader/') 115 | 116 | # Initialize transformers 117 | print("About to split data") 118 | untransformed_train_dataset, untransformed_valid_dataset, untransformed_test_dataset = \ 119 | splitter.train_valid_test_split(dataset, train_dir='tmp/train_un/', 120 | valid_dir='tmp/valid_un/', 121 | test_dir='tmp/test_un/') 122 | print("About to transform data") 123 | transformers = [dc.trans.BalancingTransformer(dataset=dataset)] 124 | # Only transform the train dataset 125 | for transformer in transformers: 126 | train_dataset = transformer.transform(untransformed_train_dataset, out_dir='tmp/train/') 127 | 128 | shutil.rmtree('tmp/loader/', ignore_errors=True) 129 | return train_dataset, untransformed_valid_dataset, untransformed_test_dataset, transformers 130 | 131 | 132 | ####################################################################### 133 | # Select operation mode, disease benchmarks or target benchmarks 134 | data_type = 'disease' 135 | # data_type = 'target' 136 | 137 | # Select model and featurizer type 138 | featurizer = 'GraphConv' 139 | # featurizer = 'ECFP' 140 | 141 | # Specify data directory 142 | data_dir = 'data/all_molecular_data.csv' 143 | map_df = pd.read_csv('data/aid_'+data_type+'_mapping.csv') 144 | print(map_df.columns) 145 | 146 | epochnb = 50 147 | graph_conv_layers = [512, 512, 512] 148 | dropout = 0.1 149 | learning_rate = 0.0001 150 | batch_size = 128 151 | dense_layer_size = 1024 152 | if data_type == 'disease': 153 | all_categories = ['all', 'cancer', 'nervous System', 'immune system', 'cardiovascular', 154 | 'toxicity', 'obesity', 'virus', 'diabetes', 'metabolic disorders', 'bacteria', 155 | 'parasite', 'epigenetics_genetics', 'pulmonary', 'infection', 'aging', 'fungal'] 156 | if data_type == 'target': 157 | all_categories = ['all_target', 'Membrane receptor', 'Enzyme (other)', 'Nuclear receptor', 158 | 'Hydrolase', 'Protease', 'Transcription factor', 'Kinase', 159 | 'Epigenetic regulator', 'Ion channel', 'Transferase', 
'Oxidoreductase', 160 | 'Transporter', 'NTPase', 'Phosphatase'] 161 | 162 | logging.basicConfig(level=logging.INFO) 163 | 164 | for run_type in all_categories: 165 | np.random.seed(42) 166 | tf.compat.v1.set_random_seed(42) 167 | if run_type == 'all' or run_type == 'all_target': 168 | tasks = list(np.array(map_df['AID'])[[True if t > 0 else False for t in np.sum(np.array(map_df[all_categories[1:]]), axis=1)]]) 169 | else: 170 | tasks = list(map_df[map_df[run_type] == 1]['AID']) 171 | # Select tasks based on the operation mode 172 | print(run_type, tasks) 173 | print(len(tasks)) 174 | 175 | timestr = time.strftime("%m%d-%H%M") 176 | model_dir = 'built_models/moldata/'+featurizer+'/' + timestr + '/' 177 | if os.path.isdir(model_dir): 178 | timestr = timestr.split('-')[0] + '-' + timestr.split('-')[1][:2] + str(int(timestr.split('-')[1][2:])+60) 179 | os.makedirs(model_dir, exist_ok=True) 180 | 181 | # Load the data from the splits, transform only the train split 182 | train_dataset, untransformed_valid_dataset, untransformed_test_dataset, transformers = MolDataLoadData(data_dir=data_dir, tasks=tasks,featurizer=featurizer) 183 | training_data_len = len(train_dataset.y) 184 | 185 | 186 | metric = [ 187 | dc.metrics.Metric(dc.metrics.accuracy_score, mode="classification", classification_handling_mode='threshold', threshold_value=0.5, n_tasks=len(tasks)), 188 | dc.metrics.Metric(dc.metrics.recall_score, mode="classification", classification_handling_mode='threshold', threshold_value=0.5, n_tasks=len(tasks)), 189 | dc.metrics.Metric(dc.metrics.precision_score, mode="classification", classification_handling_mode='threshold', 190 | threshold_value=0.5, n_tasks=len(tasks)), 191 | dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification", n_tasks=len(tasks))] 192 | if featurizer == 'GraphConv': 193 | model = None 194 | model = GraphConvModel( 195 | len(tasks), 196 | batch_size=batch_size, 197 | mode='classification', 198 | number_atom_features=78, 199 | tensorboard=False, 200 | use_queue=True, 201 | graph_conv_layers=graph_conv_layers, 202 | dense_layer_size=dense_layer_size, 203 | dropout=dropout, 204 | learning_rate=learning_rate, 205 | model_dir=model_dir) 206 | 207 | for epoch_num in range(epochnb): 208 | loss = model.fit(train_dataset, nb_epoch=1, checkpoint_interval=2*(training_data_len // batch_size), 209 | max_checkpoints_to_keep=1000) 210 | print(epoch_num) 211 | elif featurizer == 'ECFP': 212 | model = None 213 | model = dc.models.MultitaskClassifier( 214 | len(tasks), 215 | n_features=1024, 216 | layer_sizes=[dense_layer_size], 217 | dropouts=[dropout], 218 | learning_rate=learning_rate, 219 | batch_size=batch_size, 220 | use_queue=False, 221 | model_dir=model_dir) 222 | loss = model.fit(train_dataset, nb_epoch=epochnb) 223 | results = model.evaluate(untransformed_test_dataset, metrics=metric, transformers=[], use_sample_weights=True, per_task_metrics=True) 224 | r = [list(results[0].values())] 225 | keys = list(results[0].keys()) 226 | for i in range(len(results[1][keys[0]])): 227 | dummy = [results[1][k][i] for k in keys] 228 | r.append(dummy) 229 | r = np.array(r) 230 | print(r.shape) 231 | keys = [k + '_test' for k in keys] 232 | results_df = pd.DataFrame(data=np.array(r), columns=keys) 233 | results_df.insert(loc=0, column='AID', value=['all'] + tasks) 234 | results_df.insert(loc=len(results_df.columns), column='model_dir', value=[model_dir] * len(results_df)) 235 | results_valid = model.evaluate(untransformed_valid_dataset, metrics=metric, transformers=[], 236 | 
use_sample_weights=True, per_task_metrics=True) 237 | r_valid = [list(results_valid[0].values())] 238 | keys_valid = list(results_valid[0].keys()) 239 | for i in range(len(results_valid[1][keys_valid[0]])): 240 | dummy = [results_valid[1][k][i] for k in keys_valid] 241 | r_valid.append(dummy) 242 | r_valid = np.array(r_valid) 243 | print(r_valid.shape) 244 | keys_valid = [k + '_valid' for k in keys_valid] 245 | # results_df_valid = pd.DataFrame(data=np.array(r_valid), columns=keys_valid) 246 | for col in range(len(keys_valid)): 247 | results_df[keys_valid[col]] = np.array(r_valid)[:, col] 248 | # results_df_valid.insert(loc=0, column='AID', value=['all'] + input_tasks) 249 | 250 | results_df.to_csv('results/'+run_type+'_results.csv', header=True, index=False) 251 | shutil.rmtree('tmp/train_un/', ignore_errors=True) 252 | shutil.rmtree('tmp/valid_un/', ignore_errors=True) 253 | shutil.rmtree('tmp/test_un/', ignore_errors=True) 254 | shutil.rmtree('tmp/train/', ignore_errors=True) 255 | shutil.rmtree('tmp/valid/', ignore_errors=True) 256 | shutil.rmtree('tmp/test/', ignore_errors=True) 257 | 258 | 259 | all_results = [] 260 | for c in all_categories: 261 | results_df = pd.read_csv('results/'+c+'_results.csv') 262 | results_df = results_df[results_df['AID'] == 'all'] 263 | all_results.append(results_df[['accuracy_score_test', 'recall_score_test', 'precision_score_test', 264 | 'roc_auc_score_test', 'accuracy_score_valid', 'recall_score_valid', 265 | 'precision_score_valid', 'roc_auc_score_valid']].iloc[0]) 266 | 267 | all_results_df = pd.DataFrame(data=np.array(all_results), columns=['accuracy_score_test', 'recall_score_test', 'precision_score_test', 268 | 'roc_auc_score_test', 'accuracy_score_valid', 'recall_score_valid', 269 | 'precision_score_valid', 'roc_auc_score_valid']) 270 | all_results_df.insert(loc=0, column='benchmark', value=all_categories) 271 | all_results_df["accuracy_score_test"] = 100 * all_results_df["accuracy_score_test"] 272 | all_results_df["recall_score_test"] = 100 * all_results_df["recall_score_test"] 273 | all_results_df["precision_score_test"] = 100 * all_results_df["precision_score_test"] 274 | all_results_df["accuracy_score_valid"] = 100 * all_results_df["accuracy_score_valid"] 275 | all_results_df["recall_score_valid"] = 100 * all_results_df["recall_score_valid"] 276 | all_results_df["precision_score_valid"] = 100 * all_results_df["precision_score_valid"] 277 | all_results_df = all_results_df.round({'accuracy_score_test': 2, 'recall_score_test': 2, 'precision_score_test': 2, 'roc_auc_score_test': 4}) 278 | all_results_df = all_results_df.round({'accuracy_score_valid': 2, 'recall_score_valid': 2, 'precision_score_valid': 2, 'roc_auc_score_valid': 4}) 279 | all_results_df.to_csv('results/final_results_'+data_type+'.csv', header=True, index=False) 280 | 281 | 282 | import pandas as pd 283 | import numpy as np 284 | all_disease_categories = ['all', 'cancer', 'nervous System', 'immune system', 'cardiovascular', 285 | 'toxicity', 'obesity', 'virus', 'diabetes', 'metabolic disorders', 'bacteria', 286 | 'parasite', 'epigenetics_genetics', 'pulmonary', 'infection', 'aging', 'fungal'] +\ 287 | ['all_target', 'Membrane receptor', 'Enzyme (other)', 'Nuclear receptor', 288 | 'Hydrolase', 'Protease', 'Transcription factor', 'Kinase', 289 | 'Epigenetic regulator', 'Ion channel', 'Transferase', 'Oxidoreductase', 290 | 'Transporter', 'NTPase', 'Phosphatase'] 291 | 292 | writer = pd.ExcelWriter('results/detailed_results_combined.xlsx', engine = 'xlsxwriter') 293 | for c in 
['final_results_disease', 'final_results_target']: 294 | current_df = pd.read_csv('results/'+c+'.csv') 295 | if c == 'final_results_disease': 296 | c = 'benchmark_results_disease' 297 | if c == 'final_results_target': 298 | c = 'benchmark_results_target' 299 | current_df.insert(loc=1, column='run_type', value=c) 300 | current_df.to_excel(writer, sheet_name=c, index=False) 301 | 302 | for c in all_disease_categories: 303 | current_df = pd.read_csv('results/'+c+'_results.csv') 304 | if c == 'all': 305 | c = 'all_disease' 306 | current_df.at[0, 'AID'] = c 307 | current_df.insert(loc=1, column='run_type', value=c) 308 | current_df.to_excel(writer, sheet_name=c, index=False) 309 | if c == 'all_disease': 310 | all_df = current_df 311 | else: 312 | all_df = all_df.merge(current_df, how='outer') 313 | 314 | all_df = all_df.sort_values(by='AID', ignore_index=True) 315 | all_df.to_csv('results/sorted_detailed_results_combined.csv', header=True, index=False) 316 | all_df.to_excel(writer, sheet_name='everything', index=False) 317 | 318 | # ExcelWriter.close() saves the workbook, so a separate save() call is redundant (and deprecated in newer pandas) 319 | writer.close() 320 | --------------------------------------------------------------------------------
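A minimal usage sketch of the split helper defined in splitting.py above (the SMILES below are illustrative, and note that splitting.py also executes script code at module level, so in practice the function would be copied out or the script body guarded before importing):

smiles = ['c1ccccc1O', 'c1ccccc1N', 'C1CCCCC1', 'CCO']
train_inds, valid_inds, test_inds = split(smiles, frac_train=0.5, frac_valid=0.25, frac_test=0.25)
# Molecules sharing a Murcko scaffold are never separated across partitions:
# the two benzene-scaffold molecules land in train together.
print(train_inds, valid_inds, test_inds)  # [0, 1] [2] [3]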