├── 10_lab_ablation_ml.py ├── 1_gen_swdd_xk.py ├── 2_extract_embedding.py ├── 3_extract_time_series_feature.py ├── 4_make_ts_dataset.py ├── 5_lab_length.py ├── 6_lab_prop.py ├── 7_lab_ablation.py ├── 8_lab_length_ml.py ├── 9_lab_prop_ml.py ├── LICENSE ├── README.md ├── config.py ├── envs ├── sktime-0.5.3.zip └── sktime-dl-0.2.0-modify.zip ├── img └── Method-DSTS.png ├── preprocess.py ├── requirements.txt └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc ├── __init__.cpython-38.pyc ├── analysis.cpython-37.pyc ├── analysis.cpython-38.pyc ├── clog.cpython-37.pyc ├── clog.cpython-38.pyc ├── data.cpython-37.pyc ├── data.cpython-38.pyc ├── extractor.cpython-37.pyc ├── informer.cpython-37.pyc ├── parser.cpython-37.pyc ├── summarizor.cpython-37.pyc ├── symptom.cpython-37.pyc ├── system_monitor.cpython-37.pyc └── system_monitor.cpython-38.pyc ├── analysis.py ├── clog.py ├── data.py ├── extractor.py └── symptom.py /10_lab_ablation_ml.py: -------------------------------------------------------------------------------- 1 | def calc_metrics_binary(model, X_test, y_test): 2 | from sklearn.metrics import ( 3 | classification_report, 4 | accuracy_score, 5 | f1_score, 6 | roc_auc_score, 7 | recall_score, 8 | precision_score, 9 | ) 10 | 11 | y_pred = model.predict(X_test) 12 | report = classification_report( 13 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 14 | ) 15 | return report 16 | 17 | 18 | from datetime import datetime 19 | 20 | 21 | def train_TimeSeriesForest(X_train, y_train, X_test, y_test): 22 | start_time = datetime.now() 23 | # base 24 | print("***TimeSeriesForestClassifier***") 25 | from sklearn.pipeline import Pipeline 26 | from sktime.classification.interval_based import TimeSeriesForestClassifier 27 | from sktime.classification.compose import ColumnEnsembleClassifier 28 | from sktime.transformations.panel.compose import ColumnConcatenator 29 | 30 | steps = [ 31 | ("concatenate", ColumnConcatenator()), 32 | ("classify", TimeSeriesForestClassifier(n_estimators=100)), 33 | ] 34 | clf = Pipeline(steps) 35 | clf.fit(X_train, y_train) 36 | 37 | report = calc_metrics_binary(clf, X_test, y_test) 38 | 39 | print(report) 40 | 41 | print(str(datetime.now() - start_time)) 42 | 43 | return clf 44 | 45 | 46 | def train_ROCKETClassifier(X_train, y_train, X_test, y_test): 47 | start_time = datetime.now() 48 | # 2020 49 | print("***ROCKETClassifier***") 50 | from sktime.classification.kernel_based import ROCKETClassifier 51 | 52 | clf = ROCKETClassifier(num_kernels=500) 53 | clf.fit(X_train, y_train) 54 | 55 | report = calc_metrics_binary(clf, X_test, y_test) 56 | 57 | print(report) 58 | 59 | print(str(datetime.now() - start_time)) 60 | 61 | return clf 62 | 63 | 64 | def train_Signature(X_train, y_train, X_test, y_test): 65 | start_time = datetime.now() 66 | 67 | # 2020 68 | print("***SignatureClassifier***") 69 | from sktime.classification.feature_based import SignatureClassifier 70 | 71 | clf = SignatureClassifier() 72 | clf.fit(X_train, y_train) 73 | 74 | report = calc_metrics_binary(clf, X_test, y_test) 75 | 76 | print(report) 77 | 78 | print(str(datetime.now() - start_time)) 79 | 80 | return clf 81 | 82 | 83 | def train_Arsenal(X_train, y_train, X_test, y_test): 84 | start_time = datetime.now() 85 | # uni 86 | print("***Arsenal***") 87 | from sktime.classification.kernel_based import Arsenal 88 | 89 | clf = Arsenal(num_kernels=200, n_estimators=5) 90 | clf.fit(X_train, y_train) 91 | 92 | report = calc_metrics_binary(clf, X_test, y_test) 93 | 94 | print(report) 95 
| print(str(datetime.now() - start_time)) 96 | start_time = datetime.now() 97 | 98 | return clf 99 | 100 | 101 | def train_TSFresh(X_train, y_train, X_test, y_test): 102 | start_time = datetime.now() 103 | # 2018 104 | print("***TSFreshClassifier***") 105 | from sktime.classification.feature_based import TSFreshClassifier 106 | 107 | clf = TSFreshClassifier() 108 | clf.fit(X_train, y_train) 109 | 110 | report = calc_metrics_binary(clf, X_test, y_test) 111 | 112 | print(report) 113 | 114 | print(str(datetime.now() - start_time)) 115 | 116 | return clf 117 | 118 | 119 | def train_HIVECOTEV2(X_train, y_train, X_test, y_test): 120 | start_time = datetime.now() 121 | # No module named sktime.classificastion.shapelet_based.mrseql.mrseql 122 | print("***HIVECOTEV2***") 123 | from sktime.classification.hybrid import HIVECOTEV2 124 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 125 | 126 | clf = HIVECOTEV2( 127 | stc_params={ 128 | "estimator": RotationForest(n_estimators=3), 129 | "n_shapelet_samples": 500, 130 | "max_shapelets": 20, 131 | "batch_size": 100, 132 | }, 133 | drcif_params={"n_estimators": 10}, 134 | arsenal_params={"num_kernels": 100, "n_estimators": 5}, 135 | tde_params={ 136 | "n_parameter_samples": 25, 137 | "max_ensemble_size": 5, 138 | "randomly_selected_params": 10, 139 | }, 140 | ) 141 | clf.fit(X_train, y_train) 142 | 143 | report = calc_metrics_binary(clf, X_test, y_test) 144 | 145 | print(report) 146 | print(str(datetime.now() - start_time)) 147 | start_time = datetime.now() 148 | 149 | return clf 150 | 151 | 152 | def train_ShapeletTransform(X_train, y_train, X_test, y_test): 153 | start_time = datetime.now() 154 | # shapelet 155 | print("***ShapeletTransformClassifier***") 156 | from sktime.classification.shapelet_based import ShapeletTransformClassifier 157 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 158 | 159 | clf = ShapeletTransformClassifier( 160 | estimator=RotationForest(n_estimators=3), 161 | n_shapelet_samples=500, 162 | max_shapelets=20, 163 | batch_size=100, 164 | ) 165 | clf.fit(X_train, y_train) 166 | 167 | report = calc_metrics_binary(clf, X_test, y_test) 168 | 169 | print(report) 170 | print(str(datetime.now() - start_time)) 171 | start_time = datetime.now() 172 | 173 | return clf 174 | 175 | 176 | model_list = ["bst", "rocket", "gs", "hc2", "tsf"] 177 | 178 | if __name__ == "__main__": 179 | import os 180 | import numpy as np 181 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 182 | from utils.symptom import symptoms_dsm_5 as symptoms 183 | 184 | # 情绪 185 | feat_emo = [ 186 | "depressive_mood", 187 | "retardation_or_agitation", 188 | "panic_and_anxious", 189 | ] # sad, agi, pan 190 | # 认知 191 | feat_cog = [ 192 | "interest_pleasure_loss", 193 | "self_blame", 194 | "suicidal_ideation", 195 | "concentration_problem", 196 | ] # int, sel(low-esteem), sui, con 197 | # 躯体 198 | feat_bod = [ 199 | "appetite_weight_problem", 200 | "insomnia_or_hypersomnia", 201 | "energy_loss", 202 | "sympathetic_arousal", 203 | ] # app, ins, ene, sym 204 | # 行为? 
205 | 206 | feat_emo_dim = [] 207 | feat_cog_dim = [] 208 | feat_bod_dim = [] 209 | 210 | for i, k in enumerate(symptoms): 211 | print(i, k) 212 | feat_id = "dim_{}".format(i) 213 | if k in feat_emo: 214 | feat_emo_dim.append(feat_id) 215 | elif k in feat_cog: 216 | feat_cog_dim.append(feat_id) 217 | elif k in feat_bod: 218 | feat_bod_dim.append(feat_id) 219 | 220 | print(feat_emo_dim, feat_cog_dim, feat_bod_dim) 221 | 222 | feat_group = [feat_emo_dim, feat_cog_dim, feat_bod_dim] 223 | 224 | 225 | flag = 1 226 | for r in range(10): 227 | 228 | # load data 229 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 230 | 231 | X_train_all, y_train = load_from_tsfile_to_dataframe( 232 | os.path.join(data_dir, "train.ts") 233 | ) 234 | X_test_all, y_test = load_from_tsfile_to_dataframe( 235 | os.path.join(data_dir, "test.ts") 236 | ) 237 | 238 | print(X_train_all.shape, y_train.shape, X_test_all.shape, y_test.shape) 239 | print(np.unique(y_train)) 240 | 241 | for i in range(len(feat_group)): 242 | if flag and i < 1: 243 | continue 244 | 245 | feat_group_train = [] 246 | for j in range(len(feat_group)): 247 | if i != j: 248 | feat_group_train += feat_group[j] 249 | print(feat_group_train) 250 | 251 | save_dir = "results/swdd-7k_model_500_0_remove_feat_group_{}".format(i) 252 | 253 | print(data_dir, save_dir) 254 | 255 | # ablation 256 | X_train = X_train_all[feat_group_train] 257 | X_test = X_test_all[feat_group_train] 258 | 259 | for cls in model_list: 260 | if flag and cls != "hc2": 261 | continue 262 | if flag: 263 | flag = 0 264 | print(cls) 265 | if cls == "bst": 266 | try: 267 | clf = train_ShapeletTransform( 268 | X_train, y_train, X_test, y_test 269 | ) # need at least one array to concatenate 270 | except Exception as e: 271 | print(e) 272 | elif cls == "rocket": 273 | clf = train_ROCKETClassifier(X_train, y_train, X_test, y_test) 274 | elif cls == "gs": 275 | # try: 276 | # clf = train_TSFresh( 277 | # X_train, y_train, X_test, y_test 278 | # ) 279 | # except Exception as e: 280 | # print(e) 281 | try: 282 | # clf = train_Signature(X_train, y_train, X_test, y_test) 283 | clf = train_Arsenal(X_train, y_train, X_test, y_test) 284 | except Exception as e: 285 | print(e) 286 | elif cls == "hc2": 287 | try: 288 | clf = train_HIVECOTEV2( 289 | X_train, y_train, X_test, y_test 290 | ) # not work 291 | except Exception as e: 292 | print(e) 293 | elif cls == "tsf": 294 | clf = train_TimeSeriesForest(X_train, y_train, X_test, y_test) 295 | 296 | # # analyze model 297 | report = calc_metrics_binary(clf, X_test, y_test) 298 | 299 | res_save_path = os.path.join(save_dir, cls + ".txt") 300 | with open(res_save_path, "a+") as f: 301 | f.write(report) 302 | f.write("\n" + "*" * 15 + "\n") 303 | 304 | -------------------------------------------------------------------------------- /1_gen_swdd_xk.py: -------------------------------------------------------------------------------- 1 | from utils.data import gen_swdd_4k, gen_swdd_7k 2 | 3 | if __name__ == "__main__": 4 | data_dir = "dataset/swdd" 5 | # gen_swdd_7k(data_dir=data_dir) 6 | gen_swdd_4k(data_dir=data_dir) 7 | -------------------------------------------------------------------------------- /2_extract_embedding.py: -------------------------------------------------------------------------------- 1 | from preprocess import extract_embedding 2 | 3 | import os 4 | 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 6 | 7 | if __name__ == "__main__": 8 | # data_dir = "dataset/swdd-7k" 9 | # emb_dir = data_dir + "_embedding" 10 | # extract_embedding( 11 | # 
data_dir=data_dir, 12 | # save_dir=emb_dir, 13 | # modelname="paraphrase-xlm-r-multilingual-v1", 14 | # ) 15 | 16 | for i in range(10, 100, 10): 17 | data_dir = "dataset/swdd-4k_{}".format(i) 18 | emb_dir = data_dir + "_embedding" 19 | extract_embedding( 20 | data_dir=data_dir, 21 | save_dir=emb_dir, 22 | modelname="paraphrase-xlm-r-multilingual-v1", 23 | ) 24 | -------------------------------------------------------------------------------- /3_extract_time_series_feature.py: -------------------------------------------------------------------------------- 1 | from preprocess import extract_time_series_feature 2 | import os 3 | 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 5 | 6 | if __name__ == "__main__": 7 | # extract_time_series_feature(data_dir="dataset/swdd-7k_embedding", origin_only=True) 8 | for i in range(10, 100, 10): 9 | extract_time_series_feature( 10 | data_dir="dataset/swdd-4k_{}_embedding".format(i), origin_only=True 11 | ) 12 | -------------------------------------------------------------------------------- /4_make_ts_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import jsonlines 3 | import numpy as np 4 | from sklearn import preprocessing 5 | from sklearn.model_selection import train_test_split 6 | from collections import namedtuple 7 | 8 | 9 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 10 | 11 | Dataset = namedtuple("Dataset", ["X", "y"]) 12 | 13 | 14 | def preprocess(dataset, normalize=True, unify_dims=False, to_categorical=True): 15 | """ 16 | ## Prepare the data 17 | """ 18 | X, y = dataset.X, dataset.y 19 | 20 | if normalize: 21 | X_mean = X.mean() 22 | X_std = X.std() 23 | X = (X - X_mean) / (X_std + 1e-8) 24 | 25 | if unify_dims: 26 | # cutoff or expand timestamps 27 | pass 28 | 29 | if to_categorical: 30 | # 将标签独热编码 31 | lb = preprocessing.LabelBinarizer() 32 | y = lb.fit_transform(y) 33 | 34 | return Dataset(X, y) 35 | 36 | 37 | def make_dataset_ts( 38 | data_dir="swdd-7k", 39 | feat_dir="swdd-7k_embedding_500_50", 40 | save_dir="swdd-7k_ts_500_50", 41 | samp_cnt=7000, 42 | ): 43 | """ 44 | Note: 1. 在x中继续嵌套元组,将索引编进去,这样最后导出来就是带编号的sample,可以进行错误案例分析 45 | 2. 
时间序列Z-Score预处理不在此处进行 46 | """ 47 | if not os.path.exists(save_dir): 48 | os.mkdir(save_dir) 49 | 50 | X = [] 51 | y = [] 52 | 53 | for i in range(samp_cnt): 54 | file_id = i 55 | time_series = np.load(os.path.join(feat_dir, "%04d.npy" % file_id)) 56 | X.append(time_series) 57 | with open( 58 | os.path.join(data_dir, "%04d.jsonl" % file_id), "r", encoding="utf8" 59 | ) as f: 60 | for item in jsonlines.Reader(f): 61 | datum = item 62 | y.append(datum["label"]) 63 | X = np.array(X) 64 | y = np.array(y) 65 | Dataset.X = X 66 | Dataset.y = y 67 | # Dataset_pre = preprocess(dataset=Dataset) 68 | 69 | X_train, X_test, y_train, y_test = train_test_split( 70 | Dataset.X, 71 | Dataset.y, 72 | test_size=0.4, 73 | random_state=2022, 74 | stratify=Dataset.y, 75 | ) 76 | # val_x, X_test, val_y, y_test = train_test_split( 77 | # X_test, y_test, test_size=0.5, random_state=2022, stratify=y_test 78 | # ) 79 | # print(len(y_train), len(y_test), len(val_y), np.sum(y_test == 1)) 80 | print(len(y_train), len(y_test), np.sum(y_test == 1)) 81 | 82 | with open(os.path.join(save_dir, "train.ts"), "w") as f: 83 | for idx, v in enumerate(X_train): 84 | for i in range(v.shape[0]): 85 | for j in range(v.shape[1]): 86 | f.write(str(v[i][j])) 87 | if j < v.shape[1] - 1: 88 | f.write(",") 89 | f.write(":") 90 | f.write(str(y_train[idx]) + "\n") 91 | 92 | with open(os.path.join(save_dir, "test.ts"), "w") as f: 93 | for idx, v in enumerate(X_test): 94 | for i in range(v.shape[0]): 95 | for j in range(v.shape[1]): 96 | f.write(str(v[i][j])) 97 | if j < v.shape[1] - 1: 98 | f.write(",") 99 | f.write(":") 100 | f.write(str(y_test[idx]) + "\n") 101 | 102 | 103 | def dataset_7k_origin_main(): 104 | 105 | make_dataset_ts( 106 | data_dir="dataset/swdd-7k", 107 | feat_dir="dataset/swdd-7k_embedding_origin_500_0", 108 | save_dir="dataset/swdd-7k_ts_origin_500_0", 109 | ) 110 | 111 | 112 | def dataset_4k_prop_origin_main(): 113 | for i in range(10, 100, 10): 114 | data_dir = "dataset/swdd-4k_{}".format(i) 115 | 116 | make_dataset_ts( 117 | data_dir=data_dir, 118 | feat_dir="{}_embedding_origin_500_0".format(data_dir), 119 | save_dir="{}_ts_origin_500_0".format(data_dir), 120 | samp_cnt=4000, 121 | ) 122 | 123 | 124 | if __name__ == "__main__": 125 | # dataset_7k_origin_main() 126 | dataset_4k_prop_origin_main() 127 | -------------------------------------------------------------------------------- /5_lab_length.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from utils.analysis import calc_metrics_binary 3 | 4 | # from types import new_class 5 | import tensorflow as tf 6 | import keras 7 | import gc 8 | from config import lab_config 9 | import os 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 12 | 13 | config = lab_config.sktime_dl 14 | 15 | 16 | def setNetwork(cls, network_config=None, train_config=None): 17 | """ 18 | Basic way of determining the classifier to build. To differentiate settings just and another elif. 
19 | :param cls: String indicating which classifier you want 20 | :return: A classifier 21 | """ 22 | from sktime_dl.deeplearning import FCNClassifier 23 | from sktime_dl.deeplearning import MCDCNNClassifier 24 | from sktime_dl.deeplearning import CNNClassifier 25 | from sktime_dl.deeplearning import TWIESNClassifier 26 | from sktime_dl.deeplearning import InceptionTimeClassifier 27 | from pathlib import Path 28 | import os 29 | 30 | # fold = train_config["random_state"] 31 | model_save_dir = train_config.get("model_save_directory", "") 32 | # model_name = cls + "_" + str(fold) 33 | # train_config["model_name"] = model_name 34 | 35 | if model_save_dir: 36 | try: 37 | os.makedirs(model_save_dir) 38 | except os.error: 39 | pass 40 | 41 | # fold = int(fold) 42 | cls = cls.lower() 43 | if cls == "mcnn": 44 | return CNNClassifier(**network_config, **train_config) 45 | elif cls == "fcn": 46 | return FCNClassifier(**network_config, **train_config) 47 | elif cls == "mcdcnn": 48 | return MCDCNNClassifier(**network_config, **train_config) 49 | elif cls == "twiesn": 50 | train_cfg_copy = { 51 | k: v for k, v in train_config.items() if k != "model_save_directory" 52 | } 53 | return TWIESNClassifier(**network_config, **train_cfg_copy) 54 | elif cls == "inception": 55 | return InceptionTimeClassifier(**network_config, **train_config) 56 | else: 57 | raise Exception("UNKNOWN CLASSIFIER: " + cls) 58 | 59 | 60 | def read_dataset(data_dir): 61 | import os 62 | import numpy as np 63 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 64 | 65 | X_train, y_train = load_from_tsfile_to_dataframe(os.path.join(data_dir, "train.ts")) 66 | X_test, y_test = load_from_tsfile_to_dataframe(os.path.join(data_dir, "test.ts")) 67 | 68 | from sklearn.model_selection import train_test_split 69 | 70 | X_val, X_test, y_val, y_test = train_test_split( 71 | X_test, 72 | y_test, 73 | test_size=0.5, 74 | random_state=2022, 75 | stratify=y_test, 76 | ) 77 | print( 78 | X_train.shape, 79 | y_train.shape, 80 | X_test.shape, 81 | y_test.shape, 82 | X_val.shape, 83 | y_val.shape, 84 | ) 85 | print(np.unique(y_train)) 86 | return (X_train, y_train), (X_val, y_val), (X_test, y_test) 87 | 88 | 89 | model_list = [ 90 | "fcn", 91 | "mcnn", 92 | "mcdcnn", 93 | "twiesn", # sklearn Ridge object has no attribute save. 明明是个sklearn,却要用keras.save。。 94 | "inception", # 作者居然说这是strongest?? 
95 | ] 96 | 97 | 98 | if __name__ == "__main__": 99 | flag = 1 100 | for r in range(10): 101 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 102 | # load data 103 | (X_train, y_train), (X_val, y_val), (X_test, y_test) = read_dataset( 104 | data_dir=data_dir 105 | ) 106 | for i in range(50, 501, 50): 107 | # TODO: 108 | # if flag and i < 500: 109 | # continue 110 | # data_dir = "swdd-7k_ts_500_{}".format(i) 111 | save_dir = "results/swdd-7k_model_500_{}_simple".format(i) 112 | 113 | print(data_dir, save_dir) 114 | 115 | for cls in model_list: 116 | # TODO: 117 | # if flag and cls != "tlenet": 118 | # continue 119 | if flag: 120 | flag = 0 121 | import os 122 | 123 | train_cfg = config["train_config"][cls] 124 | network_cfg = config["network_config"][cls] 125 | train_cfg["model_save_directory"] = save_dir 126 | 127 | # build network 128 | network = setNetwork( 129 | cls, network_config=network_cfg, train_config=train_cfg 130 | ) 131 | 132 | # train 133 | network.fit( 134 | X_train[:i], 135 | y_train[:i], 136 | validation_X=X_val[:i], 137 | validation_y=y_val[:i], 138 | ) 139 | 140 | # # analyze model 141 | report = calc_metrics_binary(network, X_test[:i], y_test[:i]) 142 | 143 | res_save_path = os.path.join(save_dir, cls + ".txt") 144 | with open(res_save_path, "a+") as f: 145 | f.write(report) 146 | f.write("\n" + "*" * 15 + "\n") 147 | del network 148 | gc.collect() 149 | keras.backend.clear_session() 150 | tf.keras.backend.clear_session() 151 | -------------------------------------------------------------------------------- /6_lab_prop.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from utils.analysis import calc_metrics_binary 3 | 4 | # from types import new_class 5 | import tensorflow as tf 6 | import keras 7 | import gc 8 | from config import lab_config 9 | import os 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 12 | 13 | config = lab_config.sktime_dl 14 | 15 | 16 | def setNetwork(cls, network_config=None, train_config=None): 17 | """ 18 | Basic way of determining the classifier to build. To differentiate settings just and another elif. 
19 | :param cls: String indicating which classifier you want 20 | :return: A classifier 21 | """ 22 | from sktime_dl.deeplearning import FCNClassifier 23 | from sktime_dl.deeplearning import MCDCNNClassifier 24 | from sktime_dl.deeplearning import CNNClassifier 25 | from sktime_dl.deeplearning import TWIESNClassifier 26 | from sktime_dl.deeplearning import InceptionTimeClassifier 27 | from pathlib import Path 28 | import os 29 | 30 | # fold = train_config["random_state"] 31 | model_save_dir = train_config.get("model_save_directory", "") 32 | # model_name = cls + "_" + str(fold) 33 | # train_config["model_name"] = model_name 34 | 35 | if model_save_dir: 36 | try: 37 | os.makedirs(model_save_dir) 38 | except os.error: 39 | pass 40 | 41 | # fold = int(fold) 42 | cls = cls.lower() 43 | if cls == "mcnn": 44 | return CNNClassifier(**network_config, **train_config) 45 | elif cls == "fcn": 46 | return FCNClassifier(**network_config, **train_config) 47 | elif cls == "mcdcnn": 48 | return MCDCNNClassifier(**network_config, **train_config) 49 | elif cls == "twiesn": 50 | train_cfg_copy = { 51 | k: v for k, v in train_config.items() if k != "model_save_directory" 52 | } 53 | return TWIESNClassifier(**network_config, **train_cfg_copy) 54 | elif cls == "inception": 55 | return InceptionTimeClassifier(**network_config, **train_config) 56 | else: 57 | raise Exception("UNKNOWN CLASSIFIER: " + cls) 58 | 59 | def read_dataset(data_dir): 60 | import os 61 | import numpy as np 62 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 63 | 64 | X_train, y_train = load_from_tsfile_to_dataframe(os.path.join(data_dir, "train.ts")) 65 | X_test, y_test = load_from_tsfile_to_dataframe(os.path.join(data_dir, "test.ts")) 66 | 67 | from sklearn.model_selection import train_test_split 68 | 69 | X_val, X_test, y_val, y_test = train_test_split( 70 | X_test, y_test, test_size=0.5, random_state=2022, stratify=y_test, 71 | ) 72 | print( 73 | X_train.shape, 74 | y_train.shape, 75 | X_test.shape, 76 | y_test.shape, 77 | X_val.shape, 78 | y_val.shape, 79 | ) 80 | print(np.unique(y_train)) 81 | return (X_train, y_train), (X_val, y_val), (X_test, y_test) 82 | 83 | 84 | model_list = [ 85 | "fcn", 86 | "mcnn", 87 | "mcdcnn", 88 | "twiesn", # sklearn Ridge object has no attribute save. 明明是个sklearn,却要用keras.save。。 89 | "inception", # 作者居然说这是strongest?? 
90 | ] 91 | 92 | if __name__ == "__main__": 93 | flag = 1 94 | for r in range(10): 95 | for i in range(10, 100, 10): 96 | # TODO: 97 | # if flag and i < 90: 98 | # continue 99 | 100 | data_dir = "dataset/swdd-4k_{}_ts_origin_500_0".format(i) 101 | save_dir = "results/swdd-4k_{}_model_500_0".format(i) 102 | 103 | print(data_dir, save_dir) 104 | 105 | # load data 106 | (X_train, y_train), (X_val, y_val), (X_test, y_test) = read_dataset( 107 | data_dir=data_dir 108 | ) 109 | for cls in model_list: 110 | # if flag and cls != "tlenet": 111 | # continue 112 | if flag: 113 | flag = 0 114 | print(cls) 115 | import os 116 | 117 | train_cfg = config["train_config"][cls] 118 | network_cfg = config["network_config"][cls] 119 | train_cfg["model_save_directory"] = save_dir 120 | 121 | # build network 122 | network = setNetwork( 123 | cls, network_config=network_cfg, train_config=train_cfg 124 | ) 125 | 126 | # train 127 | network.fit(X_train, y_train, validation_X=X_val, validation_y=y_val) 128 | 129 | # # analyze model 130 | report = calc_metrics_binary(network, X_test, y_test) 131 | 132 | res_save_path = os.path.join(save_dir, cls + ".txt") 133 | with open(res_save_path, "a+") as f: 134 | f.write(report) 135 | f.write("\n" + "*" * 15 + "\n") 136 | del network 137 | gc.collect() 138 | keras.backend.clear_session() 139 | tf.keras.backend.clear_session() 140 | -------------------------------------------------------------------------------- /7_lab_ablation.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from utils.analysis import calc_metrics_binary 3 | 4 | # from types import new_class 5 | import tensorflow as tf 6 | import keras 7 | import gc 8 | from config import lab_config 9 | import os 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 12 | 13 | config = lab_config.sktime_dl 14 | 15 | 16 | def setNetwork(cls, network_config=None, train_config=None): 17 | """ 18 | Basic way of determining the classifier to build. To differentiate settings just and another elif. 
19 | :param cls: String indicating which classifier you want 20 | :return: A classifier 21 | """ 22 | 23 | from sktime_dl.deeplearning import FCNClassifier 24 | from sktime_dl.deeplearning import MCDCNNClassifier 25 | from sktime_dl.deeplearning import CNNClassifier 26 | from sktime_dl.deeplearning import TWIESNClassifier 27 | from sktime_dl.deeplearning import InceptionTimeClassifier 28 | from pathlib import Path 29 | import os 30 | 31 | # fold = train_config["random_state"] 32 | model_save_dir = train_config.get("model_save_directory", "") 33 | # model_name = cls + "_" + str(fold) 34 | # train_config["model_name"] = model_name 35 | 36 | if model_save_dir: 37 | try: 38 | os.makedirs(model_save_dir) 39 | except os.error: 40 | pass 41 | 42 | # fold = int(fold) 43 | cls = cls.lower() 44 | if cls == "mcnn": 45 | return CNNClassifier(**network_config, **train_config) 46 | elif cls == "fcn": 47 | return FCNClassifier(**network_config, **train_config) 48 | elif cls == "mcdcnn": 49 | return MCDCNNClassifier(**network_config, **train_config) 50 | elif cls == "twiesn": 51 | train_cfg_copy = { 52 | k: v for k, v in train_config.items() if k != "model_save_directory" 53 | } 54 | return TWIESNClassifier(**network_config, **train_cfg_copy) 55 | elif cls == "inception": 56 | return InceptionTimeClassifier(**network_config, **train_config) 57 | else: 58 | raise Exception("UNKNOWN CLASSIFIER: " + cls) 59 | 60 | 61 | def read_dataset(data_dir): 62 | import os 63 | import numpy as np 64 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 65 | 66 | X_train, y_train = load_from_tsfile_to_dataframe(os.path.join(data_dir, "train.ts")) 67 | X_test, y_test = load_from_tsfile_to_dataframe(os.path.join(data_dir, "test.ts")) 68 | 69 | from sklearn.model_selection import train_test_split 70 | 71 | X_val, X_test, y_val, y_test = train_test_split( 72 | X_test, 73 | y_test, 74 | test_size=0.5, 75 | random_state=2022, 76 | stratify=y_test, 77 | ) 78 | print( 79 | X_train.shape, 80 | y_train.shape, 81 | X_test.shape, 82 | y_test.shape, 83 | X_val.shape, 84 | y_val.shape, 85 | ) 86 | print(np.unique(y_train)) 87 | return (X_train, y_train), (X_val, y_val), (X_test, y_test) 88 | 89 | 90 | model_list = [ 91 | "fcn", 92 | "mcnn", 93 | "mcdcnn", 94 | "twiesn", # sklearn Ridge object has no attribute save; it is an sklearn model, yet keras.save is used on it 95 | "inception", # the authors claim this is the strongest model 96 | ] 97 | 98 | 99 | if __name__ == "__main__": 100 | from utils.symptom import symptoms_dsm_5 as symptoms 101 | 102 | # emotional symptoms 103 | feat_emo = [ 104 | "depressive_mood", 105 | "retardation_or_agitation", 106 | "panic_and_anxious", 107 | ] # sad, agi, pan 108 | # cognitive symptoms 109 | feat_cog = [ 110 | "interest_pleasure_loss", 111 | "self_blame", 112 | "suicidal_ideation", 113 | "concentration_problem", 114 | ] # int, sel(low-esteem), sui, con 115 | # somatic symptoms 116 | feat_bod = [ 117 | "appetite_weight_problem", 118 | "insomnia_or_hypersomnia", 119 | "energy_loss", 120 | "sympathetic_arousal", 121 | ] # app, ins, ene, sym 122 | # behavioral symptoms?
123 | for r in range(10): 124 | feat_emo_dim = [] 125 | feat_cog_dim = [] 126 | feat_bod_dim = [] 127 | 128 | for i, k in enumerate(symptoms): 129 | print(i, k) 130 | feat_id = "dim_{}".format(i) 131 | if k in feat_emo: 132 | feat_emo_dim.append(feat_id) 133 | elif k in feat_cog: 134 | feat_cog_dim.append(feat_id) 135 | elif k in feat_bod: 136 | feat_bod_dim.append(feat_id) 137 | 138 | print(feat_emo_dim, feat_cog_dim, feat_bod_dim) 139 | 140 | feat_group = [feat_emo_dim, feat_cog_dim, feat_bod_dim] 141 | 142 | # load data 143 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 144 | 145 | (X_train_all, y_train), (X_val_all, y_val), (X_test_all, y_test) = read_dataset( 146 | data_dir=data_dir 147 | ) 148 | 149 | for i in range(len(feat_group)): 150 | feat_group_train = [] 151 | for j in range(len(feat_group)): 152 | if i != j: 153 | feat_group_train += feat_group[j] 154 | print(feat_group_train) 155 | 156 | save_dir = "results/swdd-7k_model_500_0_remove_feat_group_{}".format(i) 157 | 158 | print(data_dir, save_dir) 159 | 160 | # ablation 161 | X_train = X_train_all[feat_group_train] 162 | X_val = X_val_all[feat_group_train] 163 | X_test = X_test_all[feat_group_train] 164 | 165 | for cls in model_list: 166 | import os 167 | 168 | train_cfg = config["train_config"][cls] 169 | network_cfg = config["network_config"][cls] 170 | train_cfg["model_save_directory"] = save_dir 171 | 172 | # build network 173 | network = setNetwork( 174 | cls, network_config=network_cfg, train_config=train_cfg 175 | ) 176 | 177 | # train 178 | network.fit(X_train, y_train, validation_X=X_val, validation_y=y_val) 179 | 180 | # # analyze model 181 | report = calc_metrics_binary(network, X_test, y_test) 182 | 183 | res_save_path = os.path.join(save_dir, cls + ".txt") 184 | with open(res_save_path, "a+") as f: 185 | f.write(report) 186 | f.write("\n" + "*" * 15 + "\n") 187 | del network 188 | gc.collect() 189 | keras.backend.clear_session() 190 | tf.keras.backend.clear_session() 191 | 192 | with open(os.path.join(save_dir, "feat_group_train.txt"), "w+") as f: 193 | f.write(str(feat_group_train)) 194 | -------------------------------------------------------------------------------- /8_lab_length_ml.py: -------------------------------------------------------------------------------- 1 | def calc_metrics_binary(model, X_test, y_test): 2 | from sklearn.metrics import ( 3 | classification_report, 4 | accuracy_score, 5 | f1_score, 6 | roc_auc_score, 7 | recall_score, 8 | precision_score, 9 | ) 10 | 11 | y_pred = model.predict(X_test) 12 | report = classification_report( 13 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 14 | ) 15 | return report 16 | 17 | 18 | from datetime import datetime 19 | 20 | 21 | def train_TimeSeriesForest(X_train, y_train, X_test, y_test): 22 | start_time = datetime.now() 23 | # base 24 | print("***TimeSeriesForestClassifier***") 25 | from sklearn.pipeline import Pipeline 26 | from sktime.classification.interval_based import TimeSeriesForestClassifier 27 | from sktime.classification.compose import ColumnEnsembleClassifier 28 | from sktime.transformations.panel.compose import ColumnConcatenator 29 | 30 | steps = [ 31 | ("concatenate", ColumnConcatenator()), 32 | ("classify", TimeSeriesForestClassifier(n_estimators=100)), 33 | ] 34 | clf = Pipeline(steps) 35 | clf.fit(X_train, y_train) 36 | 37 | report = calc_metrics_binary(clf, X_test, y_test) 38 | 39 | print(report) 40 | 41 | print(str(datetime.now() - start_time)) 42 | 43 | return clf 44 | 45 | 46 | def train_ROCKETClassifier(X_train, 
y_train, X_test, y_test): 47 | start_time = datetime.now() 48 | # 2020 49 | print("***ROCKETClassifier***") 50 | from sktime.classification.kernel_based import ROCKETClassifier 51 | 52 | clf = ROCKETClassifier(num_kernels=500) 53 | clf.fit(X_train, y_train) 54 | 55 | report = calc_metrics_binary(clf, X_test, y_test) 56 | 57 | print(report) 58 | 59 | print(str(datetime.now() - start_time)) 60 | 61 | return clf 62 | 63 | 64 | def train_Signature(X_train, y_train, X_test, y_test): 65 | start_time = datetime.now() 66 | 67 | # 2020 68 | print("***SignatureClassifier***") 69 | from sktime.classification.feature_based import SignatureClassifier 70 | 71 | clf = SignatureClassifier() 72 | clf.fit(X_train, y_train) 73 | 74 | report = calc_metrics_binary(clf, X_test, y_test) 75 | 76 | print(report) 77 | 78 | print(str(datetime.now() - start_time)) 79 | 80 | return clf 81 | 82 | 83 | def train_Arsenal(X_train, y_train, X_test, y_test): 84 | start_time = datetime.now() 85 | # uni 86 | print("***Arsenal***") 87 | from sktime.classification.kernel_based import Arsenal 88 | 89 | clf = Arsenal(num_kernels=200, n_estimators=5) 90 | clf.fit(X_train, y_train) 91 | 92 | report = calc_metrics_binary(clf, X_test, y_test) 93 | 94 | print(report) 95 | print(str(datetime.now() - start_time)) 96 | start_time = datetime.now() 97 | 98 | return clf 99 | 100 | 101 | def train_TSFresh(X_train, y_train, X_test, y_test): 102 | start_time = datetime.now() 103 | # 2018 104 | print("***TSFreshClassifier***") 105 | from sktime.classification.feature_based import TSFreshClassifier 106 | 107 | clf = TSFreshClassifier() 108 | clf.fit(X_train, y_train) 109 | 110 | report = calc_metrics_binary(clf, X_test, y_test) 111 | 112 | print(report) 113 | 114 | print(str(datetime.now() - start_time)) 115 | 116 | return clf 117 | 118 | 119 | def train_HIVECOTEV2(X_train, y_train, X_test, y_test): 120 | start_time = datetime.now() 121 | # No module named sktime.classificastion.shapelet_based.mrseql.mrseql 122 | print("***HIVECOTEV2***") 123 | from sktime.classification.hybrid import HIVECOTEV2 124 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 125 | 126 | clf = HIVECOTEV2( 127 | stc_params={ 128 | "estimator": RotationForest(n_estimators=3), 129 | "n_shapelet_samples": 500, 130 | "max_shapelets": 20, 131 | "batch_size": 100, 132 | }, 133 | drcif_params={"n_estimators": 10}, 134 | arsenal_params={"num_kernels": 100, "n_estimators": 5}, 135 | tde_params={ 136 | "n_parameter_samples": 25, 137 | "max_ensemble_size": 5, 138 | "randomly_selected_params": 10, 139 | }, 140 | ) 141 | clf.fit(X_train, y_train) 142 | 143 | report = calc_metrics_binary(clf, X_test, y_test) 144 | 145 | print(report) 146 | print(str(datetime.now() - start_time)) 147 | start_time = datetime.now() 148 | 149 | return clf 150 | 151 | 152 | def train_ShapeletTransform(X_train, y_train, X_test, y_test): 153 | start_time = datetime.now() 154 | # shapelet 155 | print("***ShapeletTransformClassifier***") 156 | from sktime.classification.shapelet_based import ShapeletTransformClassifier 157 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 158 | 159 | clf = ShapeletTransformClassifier( 160 | estimator=RotationForest(n_estimators=3), 161 | n_shapelet_samples=500, 162 | max_shapelets=20, 163 | batch_size=100, 164 | ) 165 | clf.fit(X_train, y_train) 166 | 167 | report = calc_metrics_binary(clf, X_test, y_test) 168 | 169 | print(report) 170 | print(str(datetime.now() - start_time)) 171 | start_time = datetime.now() 172 | 173 
| return clf 174 | 175 | 176 | model_list = [ 177 | "bst", 178 | "rocket", 179 | "gs", 180 | "hc2", 181 | "tsf" 182 | ] 183 | 184 | 185 | if __name__ == "__main__": 186 | import os 187 | import numpy as np 188 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 189 | 190 | for r in range(10): 191 | # data_dir = "swdd-7k_ts_500_500" 192 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 193 | 194 | X_train, y_train = load_from_tsfile_to_dataframe( 195 | os.path.join(data_dir, "train.ts") 196 | ) 197 | X_test, y_test = load_from_tsfile_to_dataframe( 198 | os.path.join(data_dir, "test.ts") 199 | ) 200 | 201 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape) 202 | print(np.unique(y_train)) 203 | 204 | for i in range(50, 501, 50): 205 | # if flag and i < 500: 206 | # continue 207 | # data_dir = "swdd-7k_ts_500_{}".format(i) 208 | save_dir = "results/swdd-7k_model_500_{}_simple".format(i) 209 | 210 | print(data_dir, save_dir) 211 | 212 | for cls in model_list: 213 | if cls == "bst": 214 | try: 215 | clf = train_ShapeletTransform( 216 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 217 | ) # need at least one array to concatenate 218 | except Exception as e: 219 | print(e) 220 | elif cls == "rocket": 221 | clf = train_ROCKETClassifier( 222 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 223 | ) 224 | elif cls == "gs": 225 | # try: 226 | # clf = train_TSFresh( 227 | # X_train[:i], y_train[:i], X_test[:i], y_test[:i] 228 | # ) 229 | # except Exception as e: 230 | # print(e) 231 | try: 232 | # clf = train_Signature(X_train[:i], y_train[:i], X_test[:i], y_test[:i]) 233 | clf = train_Arsenal( 234 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 235 | ) 236 | except Exception as e: 237 | print(e) 238 | elif cls == "hc2": 239 | try: 240 | clf = train_HIVECOTEV2( 241 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 242 | ) # not work 243 | except Exception as e: 244 | print(e) 245 | elif cls == "tsf": 246 | clf = train_TimeSeriesForest( 247 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 248 | ) 249 | 250 | # # analyze model 251 | report = calc_metrics_binary(clf, X_test[:i], y_test[:i]) 252 | 253 | res_save_path = os.path.join(save_dir, cls + ".txt") 254 | with open(res_save_path, "a+") as f: 255 | f.write(report) 256 | f.write("\n" + "*" * 15 + "\n") 257 | -------------------------------------------------------------------------------- /9_lab_prop_ml.py: -------------------------------------------------------------------------------- 1 | def calc_metrics_binary(model, X_test, y_test): 2 | from sklearn.metrics import ( 3 | classification_report, 4 | accuracy_score, 5 | f1_score, 6 | roc_auc_score, 7 | recall_score, 8 | precision_score, 9 | ) 10 | 11 | y_pred = model.predict(X_test) 12 | report = classification_report( 13 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 14 | ) 15 | return report 16 | 17 | 18 | from datetime import datetime 19 | 20 | 21 | def train_TimeSeriesForest(X_train, y_train, X_test, y_test): 22 | start_time = datetime.now() 23 | # base 24 | print("***TimeSeriesForestClassifier***") 25 | from sklearn.pipeline import Pipeline 26 | from sktime.classification.interval_based import TimeSeriesForestClassifier 27 | from sktime.classification.compose import ColumnEnsembleClassifier 28 | from sktime.transformations.panel.compose import ColumnConcatenator 29 | 30 | steps = [ 31 | ("concatenate", ColumnConcatenator()), 32 | ("classify", TimeSeriesForestClassifier(n_estimators=100)), 33 | ] 34 | clf = Pipeline(steps) 35 | clf.fit(X_train, 
y_train) 36 | 37 | report = calc_metrics_binary(clf, X_test, y_test) 38 | 39 | print(report) 40 | 41 | print(str(datetime.now() - start_time)) 42 | 43 | return clf 44 | 45 | 46 | def train_ROCKETClassifier(X_train, y_train, X_test, y_test): 47 | start_time = datetime.now() 48 | # 2020 49 | print("***ROCKETClassifier***") 50 | from sktime.classification.kernel_based import ROCKETClassifier 51 | 52 | clf = ROCKETClassifier(num_kernels=500) 53 | clf.fit(X_train, y_train) 54 | 55 | report = calc_metrics_binary(clf, X_test, y_test) 56 | 57 | print(report) 58 | 59 | print(str(datetime.now() - start_time)) 60 | 61 | return clf 62 | 63 | 64 | def train_Signature(X_train, y_train, X_test, y_test): 65 | start_time = datetime.now() 66 | 67 | # 2020 68 | print("***SignatureClassifier***") 69 | from sktime.classification.feature_based import SignatureClassifier 70 | 71 | clf = SignatureClassifier() 72 | clf.fit(X_train, y_train) 73 | 74 | report = calc_metrics_binary(clf, X_test, y_test) 75 | 76 | print(report) 77 | 78 | print(str(datetime.now() - start_time)) 79 | 80 | return clf 81 | 82 | 83 | def train_Arsenal(X_train, y_train, X_test, y_test): 84 | start_time = datetime.now() 85 | # uni 86 | print("***Arsenal***") 87 | from sktime.classification.kernel_based import Arsenal 88 | 89 | clf = Arsenal(num_kernels=200, n_estimators=5) 90 | clf.fit(X_train, y_train) 91 | 92 | report = calc_metrics_binary(clf, X_test, y_test) 93 | 94 | print(report) 95 | print(str(datetime.now() - start_time)) 96 | start_time = datetime.now() 97 | 98 | return clf 99 | 100 | 101 | def train_TSFresh(X_train, y_train, X_test, y_test): 102 | start_time = datetime.now() 103 | # 2018 104 | print("***TSFreshClassifier***") 105 | from sktime.classification.feature_based import TSFreshClassifier 106 | 107 | clf = TSFreshClassifier() 108 | clf.fit(X_train, y_train) 109 | 110 | report = calc_metrics_binary(clf, X_test, y_test) 111 | 112 | print(report) 113 | 114 | print(str(datetime.now() - start_time)) 115 | 116 | return clf 117 | 118 | 119 | def train_HIVECOTEV2(X_train, y_train, X_test, y_test): 120 | start_time = datetime.now() 121 | # No module named sktime.classificastion.shapelet_based.mrseql.mrseql 122 | print("***HIVECOTEV2***") 123 | from sktime.classification.hybrid import HIVECOTEV2 124 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 125 | 126 | clf = HIVECOTEV2( 127 | stc_params={ 128 | "estimator": RotationForest(n_estimators=3), 129 | "n_shapelet_samples": 500, 130 | "max_shapelets": 20, 131 | "batch_size": 100, 132 | }, 133 | drcif_params={"n_estimators": 10}, 134 | arsenal_params={"num_kernels": 100, "n_estimators": 5}, 135 | tde_params={ 136 | "n_parameter_samples": 25, 137 | "max_ensemble_size": 5, 138 | "randomly_selected_params": 10, 139 | }, 140 | ) 141 | clf.fit(X_train, y_train) 142 | 143 | report = calc_metrics_binary(clf, X_test, y_test) 144 | 145 | print(report) 146 | print(str(datetime.now() - start_time)) 147 | start_time = datetime.now() 148 | 149 | return clf 150 | 151 | 152 | def train_ShapeletTransform(X_train, y_train, X_test, y_test): 153 | start_time = datetime.now() 154 | # shapelet 155 | print("***ShapeletTransformClassifier***") 156 | from sktime.classification.shapelet_based import ShapeletTransformClassifier 157 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 158 | 159 | clf = ShapeletTransformClassifier( 160 | estimator=RotationForest(n_estimators=3), 161 | n_shapelet_samples=500, 162 | max_shapelets=20, 163 | batch_size=100, 164 
| ) 165 | clf.fit(X_train, y_train) 166 | 167 | report = calc_metrics_binary(clf, X_test, y_test) 168 | 169 | print(report) 170 | print(str(datetime.now() - start_time)) 171 | start_time = datetime.now() 172 | 173 | return clf 174 | 175 | 176 | model_list = ["bst", "rocket", "gs", "hc2", "tsf"] 177 | 178 | if __name__ == "__main__": 179 | import os 180 | import numpy as np 181 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 182 | 183 | flag = 1 184 | for r in range(9): 185 | for i in range(10, 100, 10): 186 | # TODO: 187 | if flag and i < 10: 188 | continue 189 | 190 | data_dir = "dataset/swdd-4k_{}_ts_origin_500_0".format(i) 191 | save_dir = "results/swdd-4k_{}_model_500_0".format(i) 192 | 193 | if not os.path.exists(save_dir): 194 | os.makedirs(save_dir) 195 | 196 | print(data_dir, save_dir) 197 | 198 | # load data 199 | X_train, y_train = load_from_tsfile_to_dataframe( 200 | os.path.join(data_dir, "train.ts") 201 | ) 202 | X_test, y_test = load_from_tsfile_to_dataframe( 203 | os.path.join(data_dir, "test.ts") 204 | ) 205 | 206 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape) 207 | print(np.unique(y_train)) 208 | 209 | for cls in model_list: 210 | if flag and cls != "hc2": 211 | continue 212 | if flag: 213 | flag = 0 214 | print(cls) 215 | if cls == "bst": 216 | try: 217 | clf = train_ShapeletTransform( 218 | X_train, y_train, X_test, y_test 219 | ) # need at least one array to concatenate 220 | except Exception as e: 221 | print(e) 222 | elif cls == "rocket": 223 | clf = train_ROCKETClassifier(X_train, y_train, X_test, y_test) 224 | elif cls == "gs": 225 | # try: 226 | # clf = train_TSFresh( 227 | # X_train, y_train, X_test, y_test 228 | # ) 229 | # except Exception as e: 230 | # print(e) 231 | try: 232 | # clf = train_Signature(X_train, y_train, X_test, y_test) 233 | clf = train_Arsenal(X_train, y_train, X_test, y_test) 234 | except Exception as e: 235 | print(e) 236 | elif cls == "hc2": 237 | try: 238 | clf = train_HIVECOTEV2( 239 | X_train, y_train, X_test, y_test 240 | ) # not work 241 | except Exception as e: 242 | print(e) 243 | elif cls == "tsf": 244 | clf = train_TimeSeriesForest(X_train, y_train, X_test, y_test) 245 | 246 | # # analyze model 247 | report = calc_metrics_binary(clf, X_test, y_test) 248 | 249 | res_save_path = os.path.join(save_dir, cls + ".txt") 250 | with open(res_save_path, "a+") as f: 251 | f.write(report) 252 | f.write("\n" + "*" * 15 + "\n") 253 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 YiChengCai1999 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Depression Detection on Online Social Network with Multivariate Time Series Feature of User Depressive Symptoms 2 | 3 | The source code for the paper [**Depression Detection on Online Social Network with Multivariate Time Series Feature of User Depressive Symptoms**](https://www.sciencedirect.com/science/article/pii/S0957417423000398), accepted at ESWA 2023. 4 | 5 | ## Abstract 6 | 7 | In recent years, depression has attracted worldwide attention because of its prevalence and the great risk of suicide. Existing studies have confirmed the feasibility of depression detection on online social networks. Most existing studies extract the overall features of users during a specific period, which cannot reflect the dynamic variation of depression. Besides, the methods proposed in these studies often lack interpretability and fail to establish the correlation between features and clinical depressive symptoms. To address these problems, we propose a novel framework for depression detection based on the multivariate time series feature of user depressive symptoms. Firstly, we construct and publish a well-labeled dataset collected from the most popular Chinese social network platform, Sina Weibo. To the best of our knowledge, it is the first large-scale depression dataset with a complete collection of user tweeting histories, which includes 3,711 depressed users and 19,526 non-depressed users. Then, we propose a feature extraction method that reveals the variation of user depressive symptoms in the form of a multivariate time series. Moreover, we explore the various factors influencing the performance of our proposed framework. In addition, we explore the contributions of the features to classification, as well as their interpretability, and conduct feature ablations on them. The experimental results show that our proposed method is effective and that the extracted multivariate time series feature characterizes the variation of users' depressive states well. Finally, we analyze the shortcomings and challenges of this study. Our research work also provides methods and ideas for tracking and visualizing the development of depression among online social network users. 8 | 9 | ![Illustration of our feature extraction method - DSTS](https://github.com/cyc21csri/DepressionDetection/blob/main/img/Method-DSTS.png) 10 | 11 | ## Dataset 12 | 13 | You can download the datasets and acquire more information about them from the following link: [SWDD](https://github.com/cyc21csri/SWDD). 14 | 15 | ### Requirements 16 | 17 | Alongside the packages listed in "requirements.txt", you should install the following packages to run the **deep learning classifiers** in the paper. 18 | 19 | - sktime-dl-0.2.0 (a modification of the original sktime-dl-0.1.0 package by the author of this work) 20 | 21 | - sktime-0.5.3 22 | 23 | We have packaged the two packages as zip files; see the `envs` folder.
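To install them into your environment, one possible approach (a minimal sketch, assuming the archives are standard pip-installable source distributions; adjust the paths to your checkout) is:

```
pip install envs/sktime-0.5.3.zip
pip install envs/sktime-dl-0.2.0-modify.zip
```

If pip cannot install the archives directly, unzip them first and run `pip install .` inside each extracted package directory.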
24 | 25 | For the **machine learning classifiers** in this paper (when running the files 8_lab_length_ml.py, 9_lab_prop_ml.py, and 10_lab_ablation_ml.py), however, **you should update the sktime package to version 0.8.1**. 26 | 27 | ## How to Run 28 | 29 | We have uploaded all the source code needed to reproduce the results of the experiments in the paper. Follow the instructions below to run the code: 30 | 31 | ``` 32 | git clone https://github.com/cyc21csri/DepressionDetection.git 33 | cd DepressionDetection 34 | mkdir -p data/swdd dataset results 35 | ``` 36 | 37 | - Download `SWDD` data from [here](https://drive.google.com/file/d/1fNKtoo4SP98OAhalMjNRZfFqmQZsQ0fh/view?usp=sharing) and unzip it to the `data/swdd` folder 38 | 39 | - The source code has been reorganized and the files renamed in the form "[No.]\_[FileName]", where [No.] indicates the execution order of the scripts. 40 | 41 | Notice that after you have executed "4_make_ts_dataset.py", you should manually add the following header to the generated dataset files "train.ts" and "test.ts"; otherwise they cannot be recognized as a time-series dataset. 42 | 43 | ``` 44 | @problemName MDDWeibo 45 | @timeStamps false 46 | @missing false 47 | @univariate false 48 | @dimensions 11 49 | @equalLength true 50 | @seriesLength 500 51 | @classLabel true 0 1 52 | @data 53 | ``` 54 | 55 | For more information on the time-series dataset format adopted in this paper, see [here](https://timeseriesclassification.com/) and download one of the datasets in the [UCR Archive](https://www.cs.ucr.edu/~eamonn/time_series_data_2018/) to get a complete picture of the format. 56 | 57 | ## Cite (BibTeX) 58 | 59 | Please cite the following paper if you find our work useful in your research: 60 | 61 | ``` 62 | @article{cai2023depression, 63 | title={Depression Detection on Online Social Network with Multivariate Time Series Feature of User Depressive Symptoms}, 64 | author={Cai, Yicheng and Wang, Haizhou and Ye, Huali and Jin, Yanwen and Gao, Wei}, 65 | journal={Expert Systems with Applications}, 66 | pages={119538}, 67 | volume = {217}, 68 | year={2023}, 69 | doi={10.1016/j.eswa.2023.119538} 70 | } 71 | ``` 72 | 73 | ## Supplementary Information to the Paper 74 | 75 | ### Depression Symptom Descriptions 76 | 77 | > in social media language 78 | 79 | | # | 症状名 (Symptom) | 症状描述 (Description) | 80 | |:---:|:--------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| 81 | | 1 | 悲伤情绪 (Sadness) | 长时间不开心、不高兴、不快乐,心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅 (Feeling unhappy, down, sad, or depressed for a long time, with a low, gloomy, distressed, discouraged or hopeless mood. Often feeling like crying, shedding tears of sorrow, agony and grief, feeling empty, unbearable melancholy and desolation.) | 82 | | 2 | 兴致下降 (Loss of interest/pleasure) | 对几乎所有活动没兴趣、没意思没动力,乐趣明显减少、没有愉悦感,厌世、成天无精打采 (Having little interest or pleasure in almost all activities, feeling that things are meaningless or worthless, lacking motivation and drive. Experiencing a noticeable decrease in enjoyment, inability to feel joy, disenchantment with life, feeling lifeless and apathetic all day.)
| 83 | | 3 | 食欲问题 (Appetite problem) | 食欲减退、经常饱、没胃口、想吐 (Loss of appetite, feeling full frequently, no desire to eat, feeling like vomiting.) | 84 | | 4 | 睡眠障碍 (Insomnia) | 经常失眠睡不着、服用安眠药、熬夜到凌晨 (Frequently unable to sleep, insomnia, taking sleeping pills, staying up late into the early morning.) | 85 | | 5 | 急躁 (Agitation) | 精神性躁动、易感烦躁、坐立难安,言行冲动、易怒、易抓狂 (Mental agitation, easily irritated, restless, impulsive in words and actions, prone to anger, easily driven crazy.) | 86 | | 6 | 精力不足 (Energy Loss) | 经常感到累、困、昏晕乏力、疲惫没力气、没有精神 (Often feeling tired, sleepy, dizzy, weak, fatigued, lacking energy and vitality.) | 87 | | 7 | 自责 (Self-blame) | 经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错 (Frequently self-negating, feeling useless, worthless, incompetent, a failure who lets myself or family down. Often feeling guilty, blaming and being hard on myself, thinking everything is my fault.) | 88 | | 8 | 注意力下降 (Concentration Problem) | 注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚 (Decreased attention, inability to focus, difficulty concentrating, reduced thinking ability, indecisiveness, mental confusion.) | 89 | | 9 | 自杀倾向 (Suicidal Ideation) | 反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀 (Repeated thoughts of death, wanting to die, suicide, ending one's life. Self-harming with razor blades, thinking of jumping off a building to commit suicide, making suicide plans.) | 90 | | 10 | 交感神经唤醒 (Sympathetic Arousal) | 心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗 (Feeling panic, heart palpitations, chest tightness, shortness of breath, trembling, blurred vision, breaking out in a cold sweat.) | 91 | | 11 | 恐慌 (Panic) | 经常好怕、害怕、恐惧、恐慌,想逃避 (Often feeling scared, afraid, terrified, panicked, wanting to escape.) | 92 | 93 | ### Depressive search words 94 | 95 | > To find depression indicative tweets in Sina Weibo. 
96 | > 97 | > The crawler of Sina Weibo see https://github.com/cyc21csri/SinaWeiboCrawler 98 | 99 | | # | Chinese Search Words | English Meaning | 100 | |:---:|:--------------------:|:--------------------------------------------------------:| 101 | | 1 | \#抑郁症\# | Sina Weibo super topic of "Depression" in English | 102 | | 2 | 文拉法辛 | "Venlafaxine" in English | 103 | | 3 | 舍曲林 | "Sertraline" in English | 104 | | 4 | 度洛西汀 | "Duloxetine" in English | 105 | | 5 | 抑郁 一无是处 | "Depression" and "Good for nothing" in English | 106 | | 6 | 抑郁 生无可恋 | "Depression" and "Don't want to live anymore" in English | 107 | | 7 | 抑郁 没意思 | "Depression" and "Boring" in English | 108 | | 8 | 抑郁 难熬 | "Depression" and "Suffering" in English | 109 | | 9 | 抑郁 自残 | "Depression" and "Self-harm" in English | 110 | | 10 | 抑郁 吃药 | "Depression" and "Take medicine" in English | 111 | | 11 | 抑郁 想哭 | "Depression" and "Want to cry" in English | 112 | | 12 | 抑郁 想死 | "Depression" and "Want to die" in English | 113 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | from munch import Munch 2 | import numpy as np 3 | 4 | NB_VARIABLES = 11 5 | TIMESTEPS = 500 6 | NUM_CLASSES = 2 7 | MALSTM_DATA_DIR = "dataset/swdd-7k_npz_500_{}".format(TIMESTEPS) 8 | SKTIME_DATA_DIR = "dataset/swdd-7k_ts_origin_500_{}".format(TIMESTEPS) 9 | SAVE_DIR = "results/swdd-7k_model_500_{}".format(TIMESTEPS) 10 | 11 | lab_config = Munch( 12 | { 13 | "model_save_directory": SAVE_DIR, 14 | "malstm": { 15 | "data_dir": MALSTM_DATA_DIR, 16 | "model_config": { 17 | "alstm_units": 8, 18 | "dropout": 0.8, 19 | "filters": [128, 256, 128], 20 | "kernel_sizes": [8, 5, 3], 21 | "padding": "same", 22 | "kernel_initializer": "he_uniform", 23 | "activation": "relu", 24 | "num_classes": NUM_CLASSES, 25 | "input_shape": (NB_VARIABLES, TIMESTEPS), # TIMESTEPS 26 | }, 27 | "train_config": { 28 | "batch_size": 128, 29 | "epochs": 100, 30 | "learning_rate": 1e-3, 31 | "callback_config": { 32 | "reduce_lr": { 33 | "monitor": "val_loss", # "loss", 34 | "patience": 100, 35 | "mode": "auto", 36 | "factor": 1.0 / np.cbrt(2), 37 | "min_lr": 1e-4, 38 | }, 39 | }, 40 | }, 41 | "model_file": "malstm_fcn_7k.keras", 42 | }, 43 | "sktime_dl": { 44 | "data_dir": SKTIME_DATA_DIR, 45 | "network_config": { 46 | "mcnn": { 47 | "kernel_size": 7, 48 | "avg_pool_size": 3, 49 | "nb_conv_layers": 2, 50 | "filter_sizes": [6, 12], 51 | }, 52 | "fcn": {}, 53 | "mcdcnn": { 54 | "kernel_size": 5, 55 | "pool_size": 2, 56 | "filter_sizes": [8, 8], 57 | "dense_units": 732, 58 | }, 59 | "twiesn": {}, 60 | "inception": { 61 | "nb_filters": 32, 62 | "use_residual": True, 63 | "use_bottleneck": True, 64 | "bottleneck_size": 32, 65 | "depth": 6, 66 | "kernel_size": 41 - 1, 67 | }, 68 | }, 69 | "train_config": { 70 | "mcnn": { 71 | "batch_size": 32, # 16, 72 | "nb_epochs": 200, # 200 73 | "verbose": True, 74 | "random_state": 0, # 在comparison实验中需统一 75 | "model_save_directory": SAVE_DIR, 76 | "model_name": "cnn-7k", 77 | }, 78 | "fcn": { 79 | "nb_epochs": 200, # 200, # 2000, 80 | "batch_size": 64, # 16, 81 | "verbose": True, 82 | "random_state": 0, 83 | "model_name": "fcn-7k", 84 | "model_save_directory": SAVE_DIR, 85 | }, 86 | "inception": { 87 | "nb_epochs": 500, # 500 # 1500, 88 | "batch_size": 64, 89 | "verbose": True, 90 | "random_state": 0, 91 | "model_name": "inception-7k", 92 | "model_save_directory": SAVE_DIR, 93 | }, 94 | "mcdcnn": { 95 | "nb_epochs": 120, # 120 96 | 
"batch_size": 16, 97 | "verbose": True, 98 | "random_state": 0, 99 | "model_name": "mcdcnn-7k", 100 | "model_save_directory": SAVE_DIR, 101 | }, 102 | "twiesn": { 103 | "rho_s": [0.55, 0.9, 2.0, 5.0], 104 | "alpha": 0.1, # leaky rate 105 | "verbose": True, 106 | "random_state": 0, 107 | "model_name": "twiesn-7k", 108 | "model_save_directory": SAVE_DIR, 109 | }, 110 | }, 111 | }, 112 | } 113 | ) 114 | -------------------------------------------------------------------------------- /envs/sktime-0.5.3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/envs/sktime-0.5.3.zip -------------------------------------------------------------------------------- /envs/sktime-dl-0.2.0-modify.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/envs/sktime-dl-0.2.0-modify.zip -------------------------------------------------------------------------------- /img/Method-DSTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/img/Method-DSTS.png -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | from utils.data import load_swdd_xk, load_swdd_xk_emb 2 | from utils.extractor import WeiboText, get_post_time 3 | from utils.symptom import symptoms_dsm_5 as symptoms 4 | from sentence_transformers import SentenceTransformer, util 5 | import torch 6 | from torch.nn import ZeroPad2d 7 | import pandas as pd 8 | import numpy as np 9 | import os 10 | 11 | 12 | def extract_embedding( 13 | data_dir="swdd-7k", 14 | save_dir="swdd-7k_embedding", 15 | modelname="paraphrase-xlm-r-multilingual-v1" 16 | ): 17 | """提取推文向量 18 | model_list = [ 19 | "distiluse-base-multilingual-cased-v1", # 512 20 | "paraphrase-xlm-r-multilingual-v1", # 768, best 21 | "stsb-xlm-r-multilingual", # 768 22 | ] 23 | """ 24 | data = load_swdd_xk(data_dir=data_dir) 25 | model = SentenceTransformer(modelname) 26 | weibo_cleaner = WeiboText() 27 | 28 | if not os.path.exists(save_dir): 29 | os.mkdir(save_dir) 30 | 31 | cnt = 0 32 | for i in range(len(data)): 33 | cnt += 1 34 | if not cnt % 100: 35 | print(cnt, end=" ", flush=True) 36 | if not cnt % 1000: 37 | print() 38 | # extract embedding 39 | tweets = data[i]["tweets"] 40 | df_tweets = pd.DataFrame( 41 | [ 42 | { 43 | "is_origin": tweet["is_origin"], 44 | "time": get_post_time(tweet["post_time"]), 45 | "text": weibo_cleaner.get_cleaned_text(tweet["text"]), 46 | } 47 | for tweet in tweets 48 | ] 49 | ) 50 | 51 | tweets_emb = model.encode(df_tweets["text"].tolist()) 52 | df_emb = pd.DataFrame({"embedding": list(tweets_emb)}) 53 | df = pd.concat([df_tweets, df_emb], axis=1) 54 | 55 | np.savez( 56 | os.path.join(save_dir, "%04d.npz" % (i)), 57 | is_origin=df["is_origin"].tolist(), 58 | time=df["time"].tolist(), 59 | text=df["text"].tolist(), 60 | embedding=df["embedding"].tolist(), 61 | ) # 不能用to_numpy!! 
62 | 63 | 64 | def extract_time_series_feature( 65 | data_dir="swdd-7k_embedding", 66 | modelname="paraphrase-xlm-r-multilingual-v1", 67 | pad_len=500, 68 | interval_spans=0, 69 | origin_only=False 70 | ): 71 | if origin_only: 72 | save_dir = "{}_origin_{}_{}".format(data_dir, pad_len, interval_spans) 73 | else: 74 | save_dir = "{}_{}_{}".format(data_dir, pad_len, interval_spans) 75 | if not os.path.exists(save_dir): 76 | os.mkdir(save_dir) 77 | 78 | model = SentenceTransformer(modelname) 79 | symp_text = [v for k, v in symptoms.items()] 80 | symp_emb = model.encode(symp_text) 81 | symp_emb = torch.from_numpy(symp_emb) 82 | print("Symptom Embed: ({}, {})".format(symp_emb.shape[0], symp_emb.shape[1])) 83 | 84 | # load per-user tweet embeddings 85 | data = load_swdd_xk_emb(data_dir=data_dir) 86 | for i in range(len(data)): 87 | # Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. 88 | # tweet_emb = torch.Tensor(data[i]["embedding"].tolist()) 89 | df = data[i] 90 | if origin_only: # NOTE: a user may have zero original tweets, so filtering alone is not enough; enough original tweets must be guaranteed 91 | # print('origin only') 92 | df = df[df['is_origin']] 93 | df = df.reset_index(drop=True) 94 | # extract embedding 95 | # tweet_emb = torch.Tensor(np.array(data[i]["embedding"].tolist())) # NOTE: 96 | tweet_emb = torch.Tensor(np.array(df["embedding"].tolist())) 97 | # Compute cosine similarities between symptom and tweet embeddings 98 | cosine_scores = util.pytorch_cos_sim(symp_emb, tweet_emb) 99 | # print(cosine_scores.shape) 100 | pad = ZeroPad2d(padding=(0, pad_len - tweet_emb.shape[0], 0, 0)) 101 | cosine_scores = pad(cosine_scores) 102 | # print(cosine_scores.shape) 103 | # np.save(os.path.join(save_dir, "%04d" % i), cosine_scores) 104 | # yield cosine_scores 105 | # How should the padding be combined with interval_day resampling?
106 | # df = data[i] # NOTE: 107 | time_series = cosine_scores 108 | if interval_spans: 109 | num = 0 110 | for k, v in symptoms.items(): 111 | df_symp = pd.DataFrame({k[:3]: time_series[num]}) 112 | df = pd.concat([df, df_symp], axis=1) 113 | num += 1 114 | 115 | # index by post time 116 | df['time'] = df['time'].astype(np.string_) # np.str_-> np.string_ 117 | df['time'] = df['time'].apply(lambda x: str(x, encoding='utf-8')) # bytes -> str 118 | df['time'] = pd.to_datetime(df['time']) 119 | df.set_index('time', inplace=True) 120 | # time_series = df.resample('{}D'.format(interval_days)).mean().fillna(0).values.T 121 | idx = df.index 122 | time_series = df.resample((np.max(idx)-np.min(idx))/(interval_spans-1)).mean().fillna(0).values.T 123 | # print(time_series.shape) 124 | np.save(os.path.join(save_dir, "%04d" % i), time_series) 125 | # yield time_series 126 | return save_dir -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | icecream 2 | jsonlines 3 | pandas 4 | harvesttext 5 | scikit-learn 6 | pyhanlp 7 | matplotlib 8 | lxml 9 | bs4 10 | weibo-preprocess-toolkit 11 | sentence-transformers 12 | keras==2.3.1 13 | esig 14 | tsfresh -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__init__.py -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/analysis.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/analysis.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/analysis.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/analysis.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/clog.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/clog.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/clog.cpython-38.pyc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/clog.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/data.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/data.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/extractor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/extractor.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/informer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/informer.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parser.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/parser.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/summarizor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/summarizor.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/symptom.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/symptom.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/system_monitor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/system_monitor.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/system_monitor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/system_monitor.cpython-38.pyc 
-------------------------------------------------------------------------------- /utils/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2021/3/12 12:10 4 | # @Author : cendeavor 5 | # @Site : 6 | # @File : analysis_utils.py 7 | # @Software: PyCharm 8 | 9 | import tensorflow as tf 10 | import keras 11 | import numpy as np 12 | 13 | 14 | def cal_basic_metrics(y_true, y_pred): 15 | from sklearn.metrics import ( 16 | confusion_matrix, 17 | precision_score, 18 | accuracy_score, 19 | recall_score, 20 | f1_score, 21 | roc_auc_score, 22 | precision_recall_fscore_support, 23 | roc_curve, 24 | classification_report, 25 | ) 26 | 27 | # print('accuracy:{}'.format(accuracy_score(y_true, y_pred))) # accuracy_score has no average argument 28 | # print('precision:{}'.format(precision_score(y_true, y_pred,average='micro'))) 29 | # print('recall:{}'.format(recall_score(y_true, y_pred,average='micro'))) 30 | # print('f1-score:{}'.format(f1_score(y_true, y_pred,average='micro'))) 31 | # print('f1-score-for-each-class:{}'.format(precision_recall_fscore_support(y_true, y_pred))) # for macro 32 | ans = classification_report(y_true, y_pred, digits=5) # keep 5 decimal places 33 | print(ans) 34 | return ans 35 | 36 | 37 | def cal_auc(y_true_one_hot, y_pred_prob): 38 | from sklearn.metrics import roc_auc_score 39 | 40 | # AUC value 41 | # with micro averaging, one ROC curve is computed per class and then averaged 42 | auc = roc_auc_score(y_true_one_hot, y_pred_prob, average="micro") 43 | print("AUC y_pred = proba:", auc) 44 | return auc 45 | 46 | 47 | def plot_roc(y_true, y_pred_prob): 48 | # The magic happens here 49 | import matplotlib.pyplot as plt 50 | import scikitplot as skplt 51 | 52 | fig, ax = plt.subplots() # figsize=(16, 12) can be passed to set the figure size 53 | skplt.metrics.plot_roc(y_true, y_pred_prob, ax=ax) 54 | return fig 55 | # skplt.metrics.plot_roc(y_true, y_pred_prob) 56 | # plt.show() # not necessary 57 | # return plt 58 | 59 | 60 | def plot_prc(y_true, y_pred_prob): 61 | import matplotlib.pyplot as plt 62 | import scikitplot as skplt 63 | 64 | fig, ax = plt.subplots() # figsize=(16, 12) can be passed to set the figure size 65 | skplt.metrics.plot_precision_recall_curve(y_true, y_pred_prob, ax=ax) 66 | return fig 67 | 68 | 69 | def plot_confusion_matrix(y_true, y_pred): 70 | import matplotlib.pyplot as plt 71 | import scikitplot as skplt 72 | 73 | fig, ax = plt.subplots() 74 | skplt.metrics.plot_confusion_matrix(y_true, y_pred, normalize=True, ax=ax) 75 | return fig 76 | # plot = skplt.metrics.plot_confusion_matrix(y_true, y_pred, normalize=True) 77 | # plt.show() # not necessary 78 | # return plt 79 | 80 | 81 | def report_model_performance(y_true_one_hot, y_pred_prob): 82 | import numpy as np 83 | 84 | y_pred = np.argmax(y_pred_prob, axis=1) 85 | y_true = np.argmax(y_true_one_hot, axis=1) 86 | 87 | cal_basic_metrics(y_true, y_pred) 88 | cal_auc(y_true_one_hot, y_pred_prob) 89 | plt1 = plot_roc(y_true, y_pred_prob) 90 | plt2 = plot_confusion_matrix(y_true, y_pred) 91 | return plt1, plt2 92 | 93 | 94 | # Precision metric 95 | def metric_precision(y_true, y_pred): 96 | TP = tf.reduce_sum(y_true * tf.round(y_pred)) 97 | TN = tf.reduce_sum((1 - y_true) * (1 - tf.round(y_pred))) 98 | FP = tf.reduce_sum((1 - y_true) * tf.round(y_pred)) 99 | FN = tf.reduce_sum(y_true * (1 - tf.round(y_pred))) 100 | precision = TP / (TP + FP) 101 | return precision 102 | 103 | 104 | # Recall metric 105 | def metric_recall(y_true, y_pred): 106 | TP = tf.reduce_sum(y_true * tf.round(y_pred)) 107 | TN = tf.reduce_sum((1 - y_true) * (1 - tf.round(y_pred))) 108 | FP = tf.reduce_sum((1
- y_true) * tf.round(y_pred)) 109 | FN = tf.reduce_sum(y_true * (1 - tf.round(y_pred))) 110 | recall = TP / (TP + FN) 111 | return recall 112 | 113 | 114 | # F1-score metric 115 | def metric_F1score(y_true, y_pred): 116 | TP = tf.reduce_sum(y_true * tf.round(y_pred)) 117 | TN = tf.reduce_sum((1 - y_true) * (1 - tf.round(y_pred))) 118 | FP = tf.reduce_sum((1 - y_true) * tf.round(y_pred)) 119 | FN = tf.reduce_sum(y_true * (1 - tf.round(y_pred))) 120 | precision = TP / (TP + FP) 121 | recall = TP / (TP + FN) 122 | F1score = 2 * precision * recall / (precision + recall) 123 | return F1score 124 | 125 | 126 | # Example of referencing the custom metrics at compile time 127 | # model.compile(optimizer='adam', 128 | # loss='binary_crossentropy', 129 | # metrics=['accuracy', 130 | # metric_precision, 131 | # metric_recall, 132 | # metric_F1score]) 133 | 134 | 135 | def get_hardest_k_examples(test_dataset, model, k=32): 136 | class_probs = model(test_dataset.x) 137 | predictions = np.argmax(class_probs, axis=1) 138 | losses = keras.losses.categorical_crossentropy(test_dataset.y, class_probs) 139 | argsort_loss = np.argsort(losses) 140 | 141 | highest_k_losses = np.array(losses)[argsort_loss[-k:]] 142 | hardest_k_examples = test_dataset.x[argsort_loss[-k:]] 143 | true_labels = np.argmax(test_dataset.y[argsort_loss[-k:]], axis=1) 144 | 145 | return highest_k_losses, hardest_k_examples, true_labels, predictions 146 | 147 | 148 | def calc_metrics_binary(model, X_test, y_test): 149 | from sklearn.metrics import ( 150 | classification_report, 151 | accuracy_score, 152 | f1_score, 153 | roc_auc_score, 154 | recall_score, 155 | precision_score, 156 | ) 157 | 158 | y_pred = model.predict(X_test) 159 | report = classification_report( 160 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 161 | ) 162 | # # accuracy 163 | # acc = accuracy_score(y_test, y_pred) 164 | # # F1-score 165 | # f1 = f1_score(y_test, y_pred, pos_label='1') 166 | # # precision 167 | # prec = precision_score(y_test, y_pred) 168 | # # recall 169 | # rec = recall_score(y_test, y_pred) 170 | # # AUC (binary classification) 171 | # AUC = roc_auc_score(y_test, y_pred) 172 | return report # , acc, f1, prec, rec, AUC 173 | -------------------------------------------------------------------------------- /utils/clog.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | 4 | def count_time(func): 5 | """Decorator: measure a function's run time.""" 6 | from datetime import datetime 7 | 8 | @functools.wraps(func) # preserve the wrapped function's metadata 9 | def wrapper(*args, **kw): 10 | start_time = datetime.now() 11 | res = func(*args, **kw) 12 | print("[%s] RUN TIME: %s" % (func.__name__, str(datetime.now() - start_time))) 13 | return res # return the wrapped function's return value 14 | 15 | return wrapper 16 | -------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | from utils.clog import count_time 2 | import os 3 | import jsonlines 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | data_dir = "swdd" 8 | dep_file = "depressed.jsonl" 9 | con_file = "control.jsonl" 10 | dep_path = os.path.join(data_dir, dep_file) 11 | con_path = os.path.join(data_dir, con_file) 12 | 13 | 14 | @count_time 15 | def load_swdd_all(data_dir): 16 | """ """ 17 | 18 | dep_file = "depressed.jsonl" 19 | con_file = "control.jsonl" 20 | dep_path = os.path.join(data_dir, dep_file) 21 | con_path = os.path.join(data_dir, con_file) 22 | 23 | data = [] 24 | for filename in [dep_path, con_path]: 25 | print("Loading
{}".format(filename)) 26 | cnt = 0 27 | with open(filename, "r", encoding="utf8") as f: 28 | for item in jsonlines.Reader(f): 29 | datum = { 30 | "label": 1 if item["label"]["depressed"] else 0, 31 | **item["user"], 32 | "tweets": pd.DataFrame(item["tweets"]), 33 | } 34 | data.append(datum) 35 | cnt += 1 36 | print("Sample Num: {}".format(cnt)) 37 | df = pd.DataFrame(data) 38 | return df 39 | 40 | 41 | def get_quantile_upper_outliers(df, column_name, quantile=0.75): 42 | s = df[column_name] 43 | 44 | df_ = df.copy() 45 | # 这里将大于上四分位数(Q3)的设定为异常值 46 | # df_['isOutlier'] = s > s.quantile(0.75) 47 | df_.loc[:, "isOutlier"] = s > s.quantile(quantile) 48 | df_rst = df_[df_["isOutlier"] == True] 49 | return df_rst 50 | 51 | 52 | def get_quantile_lower_outliers(df, column_name, quantile=0.25): 53 | s = df[column_name] 54 | 55 | df_ = df.copy() 56 | # 这里将小于下四分位数(Q1)的设定为异常值 57 | df_.loc[:, "isOutlier"] = s < s.quantile(quantile) 58 | df_rst = df_[df_["isOutlier"] == True] 59 | return df_rst 60 | 61 | 62 | def get_box_plot_outliers(df, column_name): 63 | s = df[column_name] 64 | 65 | df_ = df.copy() 66 | q1, q3 = s.quantile(0.25), s.quantile(0.75) 67 | iqr = q3 - q1 68 | low, up = q1 - 1.5 * iqr, q3 + 1.5 * iqr 69 | df_.loc[:, "isOutlier"] = s.mask((s < low) | (s > up)) 70 | df_rst = df_[df_["isOutlier"] == True] 71 | return df_rst 72 | 73 | 74 | @count_time 75 | def gen_swdd_7k(data_dir): 76 | """ """ 77 | 78 | import os 79 | import numpy as np 80 | import json 81 | import jsonlines 82 | 83 | # 加载数据集 84 | df = load_swdd_all(data_dir=data_dir) 85 | 86 | # 删除不要字段 87 | cols = [ 88 | i 89 | for i in df.columns 90 | if i 91 | not in ["avatar_url", "cover_image_url", "verified_reason", "verified_type"] 92 | ] 93 | df = df[cols] 94 | 95 | # 删除离异点 96 | dep_follow_outliers = get_quantile_upper_outliers( 97 | df[df["label"] == 1], column_name="follow_count", quantile=0.999 98 | ) 99 | dep_follower_outliers = get_quantile_upper_outliers( 100 | df[df["label"] == 1], column_name="followers_count", quantile=0.99 101 | ) 102 | dep_outliers = df.iloc[ 103 | np.union1d(dep_follow_outliers.index.values, dep_follower_outliers.index.values) 104 | ] 105 | con_follow_outliers = get_box_plot_outliers( 106 | df[df["label"] == 0], column_name="follow_count" 107 | ) 108 | con_follower_outliers = get_box_plot_outliers( 109 | df[df["label"] == 0], column_name="followers_count" 110 | ) 111 | con_outliers = df.iloc[ 112 | np.union1d(con_follow_outliers.index.values, con_follower_outliers.index.values) 113 | ] 114 | 115 | df_ = df.copy() 116 | df_ = df_.drop(dep_outliers.index.values) 117 | df_ = df_.drop(con_outliers.index.values).reset_index(drop=True) 118 | print(df_.describe()) 119 | 120 | 121 | # 删除原创推文少于30的 122 | for i in range(len(df_)): 123 | if len(df_['tweets'][i][df_['tweets'][i]['is_origin']]) < 30: 124 | df_ = df_.drop(i) 125 | 126 | 127 | # 采样7k 128 | samp_cnt = 3500 129 | df_7k = ( 130 | ( 131 | pd.concat( 132 | [ 133 | df_[df_["label"] == 1].sample(n=samp_cnt), 134 | df_[df_["label"] == 0].sample(n=samp_cnt), 135 | ] 136 | ) 137 | ) 138 | .sample(n=samp_cnt * 2) 139 | .reset_index(drop=True) 140 | ) 141 | 142 | # 删除推文字段(剔除转发推文) 143 | cols = [ 144 | i 145 | for i in df_7k.iloc[0]["tweets"].columns 146 | if i 147 | not in [ 148 | "edit_at", 149 | "pics_url", 150 | "publish_place", 151 | "publish_tool", 152 | "video_url", 153 | "article_url", 154 | "topics", 155 | "at_users", 156 | "retweet", 157 | ] 158 | ] 159 | df_ = df_7k.copy() 160 | for i in range(len(df_)): 161 | df_["tweets"][i] = df_["tweets"][i][cols] 162 | 
163 | df_7k = df_ 164 | print(df_7k.describe()) 165 | 166 | # return df_7k 167 | swdd_7k_dir = data_dir + "-7k" 168 | if not os.path.exists(swdd_7k_dir): 169 | os.mkdir(swdd_7k_dir) 170 | 171 | print("Writing to {}".format(swdd_7k_dir)) 172 | 173 | for i in range(len(df_7k)): 174 | samp = json.loads(df_7k.iloc[i].to_json(orient="columns")) 175 | samp["tweets"] = json.loads( 176 | df_7k.iloc[i]["tweets"].to_json(orient="records") 177 | ) # fuck it !!!! 178 | 179 | with jsonlines.open( 180 | os.path.join(swdd_7k_dir, "%04d.jsonl" % (i)), mode="w" 181 | ) as writer: 182 | writer.write(samp) 183 | 184 | print("Done") 185 | 186 | return df_7k 187 | 188 | 189 | @count_time 190 | def gen_swdd_4k(data_dir): 191 | import os 192 | import numpy as np 193 | import json 194 | import jsonlines 195 | 196 | # data_dir = "swdd" 197 | 198 | # 加载数据集 199 | df = load_swdd_all(data_dir=data_dir) 200 | 201 | # 删除不要字段 202 | cols = [ 203 | i 204 | for i in df.columns 205 | if i 206 | not in ["avatar_url", "cover_image_url", "verified_reason", "verified_type"] 207 | ] 208 | df = df[cols] 209 | 210 | # 删除离异点 211 | dep_follow_outliers = get_quantile_upper_outliers( 212 | df[df["label"] == 1], column_name="follow_count", quantile=0.999 213 | ) 214 | dep_follower_outliers = get_quantile_upper_outliers( 215 | df[df["label"] == 1], column_name="followers_count", quantile=0.99 216 | ) 217 | dep_outliers = df.iloc[ 218 | np.union1d(dep_follow_outliers.index.values, dep_follower_outliers.index.values) 219 | ] 220 | con_follow_outliers = get_box_plot_outliers( 221 | df[df["label"] == 0], column_name="follow_count" 222 | ) 223 | con_follower_outliers = get_box_plot_outliers( 224 | df[df["label"] == 0], column_name="followers_count" 225 | ) 226 | con_outliers = df.iloc[ 227 | np.union1d(con_follow_outliers.index.values, con_follower_outliers.index.values) 228 | ] 229 | 230 | import pandas as pd 231 | 232 | # 采样4k 233 | samp_cnt = 4000 234 | 235 | for dep_prop in range(10, 100, 10): 236 | df_ = df.copy() 237 | df_ = df_.drop(dep_outliers.index.values) 238 | df_ = df_.drop(con_outliers.index.values).reset_index(drop=True) 239 | print(df_.describe()) 240 | 241 | # 删除原创推文少于20的 242 | for i in range(len(df_)): 243 | if len(df_['tweets'][i][df_['tweets'][i]['is_origin']]) < 20: 244 | df_ = df_.drop(i) 245 | 246 | dep_prop = dep_prop / 100 247 | dep_cnt = int(samp_cnt * dep_prop) 248 | # con_cnt = int(samp_cnt * (1 - dep_prop)) # 浮点数问题。。 249 | con_cnt = samp_cnt - dep_cnt 250 | if dep_cnt % 100: 251 | con_cnt = int(samp_cnt * (1 - dep_prop)) 252 | dep_cnt = samp_cnt - con_cnt 253 | 254 | print(dep_cnt, con_cnt) 255 | 256 | df_4k = ( 257 | ( 258 | pd.concat( 259 | [ 260 | df_[df_["label"] == 1].sample(n=dep_cnt), 261 | df_[df_["label"] == 0].sample(n=con_cnt), 262 | ] 263 | ) 264 | ) 265 | .sample(n=samp_cnt) 266 | .reset_index(drop=True) 267 | ) 268 | 269 | # 删除推文字段(剔除转发推文) 270 | cols = [ 271 | i 272 | for i in df_4k.iloc[0]["tweets"].columns 273 | if i 274 | not in [ 275 | "edit_at", 276 | "pics_url", 277 | "publish_place", 278 | "publish_tool", 279 | "video_url", 280 | "article_url", 281 | "topics", 282 | "at_users", 283 | "retweet", 284 | ] 285 | ] 286 | df_ = df_4k.copy() 287 | for i in range(len(df_)): 288 | df_["tweets"][i] = df_["tweets"][i][cols] 289 | 290 | df_4k = df_ 291 | print(df_4k.describe()) 292 | 293 | swdd_4k_dir = data_dir + "-4k_{}".format(int(dep_prop * 100)) 294 | if not os.path.exists(swdd_4k_dir): 295 | os.mkdir(swdd_4k_dir) 296 | 297 | print("Writing to {}".format(swdd_4k_dir)) 298 | 299 | for i in 
range(len(df_4k)): 300 | samp = json.loads(df_4k.iloc[i].to_json(orient="columns")) 301 | samp["tweets"] = json.loads( 302 | df_4k.iloc[i]["tweets"].to_json(orient="records") 303 | ) # fuck it !!!! 304 | 305 | with jsonlines.open( 306 | os.path.join(swdd_4k_dir, "%04d.jsonl" % (i)), mode="w" 307 | ) as writer: 308 | writer.write(samp) 309 | 310 | print("Done") 311 | 312 | 313 | @count_time 314 | def load_swdd_xk(data_dir): 315 | data = [] 316 | # 乱序。。 317 | # for _, _, files in os.walk(data_dir): 318 | # for file in files: 319 | # with open(os.path.join(data_dir, file), "r", encoding="utf8") as f: 320 | # for item in jsonlines.Reader(f): 321 | # data.append(item) 322 | files = os.listdir(data_dir) 323 | files.sort() 324 | for file in files: 325 | with open(os.path.join(data_dir, file), "r", encoding="utf8") as f: 326 | for item in jsonlines.Reader(f): 327 | data.append(item) 328 | return data 329 | 330 | 331 | @count_time 332 | def load_swdd_xk_emb(data_dir): 333 | import numpy as np 334 | 335 | data = [] 336 | 337 | files = os.listdir(data_dir) 338 | files.sort() 339 | for file in files: 340 | datum = np.load(os.path.join(data_dir, file), allow_pickle=True) 341 | df = pd.DataFrame({k: list(datum[k]) for k in datum.files}) 342 | data.append(df) 343 | return data 344 | 345 | 346 | @count_time 347 | def load_swdd_xk_npz(data_dir): 348 | import os 349 | import numpy as np 350 | 351 | train_data = np.load(os.path.join(data_dir, "train.npz"), allow_pickle=True) 352 | test_data = np.load(os.path.join(data_dir, "test.npz"), allow_pickle=True) 353 | X_train = [] 354 | id_train = [] 355 | X_test = [] 356 | id_test = [] 357 | 358 | for datum in train_data["X"]: 359 | X_train.append(datum[0]) 360 | id_train.append(datum[1]) 361 | 362 | for datum in test_data["X"]: 363 | X_test.append(datum[0]) 364 | id_test.append(datum[1]) 365 | 366 | X_train = np.array(X_train) 367 | X_test = np.array(X_test) 368 | id_train = np.array(id_train) 369 | id_test = np.array(id_test) 370 | y_train = train_data["y"] 371 | y_test = test_data["y"] 372 | 373 | return X_train, X_test, y_train, y_test, id_train, id_test 374 | 375 | 376 | def inspect_time_series(data_dir, dir_suffix, file_id): 377 | """查看某个用户的时间序列特征,封装成DataFrame返回 378 | # basic usage 379 | df.iloc[:144].plot(subplots=True, figsize=(10,12)) 380 | df.loc['2020-05'].plot() 381 | df.resample('Q')['sui'].mean() 382 | df.resample('2W').mean().fillna(0).values.T[0] 383 | # plot month average bar 384 | df_month = df.resample("M").mean() 385 | fig, ax = plt.subplots(figsize=(10, 6)) 386 | ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m')) 387 | ax.bar(df_month['2020':].index, df_month.loc['2020':, 388 | "ene"], width=25, align='center') 389 | """ 390 | import numpy as np 391 | import os 392 | from utils.symptom import symptoms_dsm_5 as symptoms 393 | 394 | time_series = np.load( 395 | os.path.join("{}_{}".format(data_dir, dir_suffix), "%04d.npy" % file_id) 396 | ) 397 | tweet_meta = np.load( 398 | os.path.join(data_dir, "%04d.npz" % file_id), allow_pickle=True 399 | ) 400 | df = pd.DataFrame({k: list(tweet_meta[k]) for k in tweet_meta.files}) 401 | 402 | num = 0 403 | for k, v in symptoms.items(): 404 | df_symp = pd.DataFrame({k[:3]: time_series[num]}) 405 | df = pd.concat([df, df_symp], axis=1) 406 | num += 1 407 | 408 | # 按照时间索引 409 | df["time"] = df["time"].astype(np.string_) # np.str_-> np.string_ 410 | df["time"] = df["time"].apply(lambda x: str(x, encoding="utf-8")) # bytes -> str 411 | df["time"] = pd.to_datetime(df["time"]) 412 | df.set_index("time", 
inplace=True) 413 | 414 | return df 415 | 416 | 417 | def plot_ex_width(x, y, x_maxsize): 418 | 419 | plt.plot(x, y) 420 | # plt.ylim((0, 1000)) 421 | # plt.title("Demo") 422 | plt.xlabel("x") 423 | plt.ylabel("y") 424 | 425 | # change x internal size 426 | plt.gca().margins(x=0) 427 | plt.gcf().canvas.draw() 428 | 429 | # set size 430 | maxsize = x_maxsize 431 | m = 0.2 432 | N = len(x) 433 | s = maxsize / plt.gcf().dpi * N + 2 * m 434 | margin = m / plt.gcf().get_size_inches()[0] 435 | 436 | plt.gcf().subplots_adjust(left=margin, right=1.0 - margin) 437 | plt.gcf().set_size_inches(s, plt.gcf().get_size_inches()[1]) 438 | 439 | 440 | def plot_time_series(time_series, expand_width=2): 441 | from utils.symptom import symptoms_dsm_5 as symptoms 442 | 443 | label_list = [k[:3] for k, v in symptoms.items()] 444 | 445 | x_values = list(range(1, time_series.shape[1] + 1)) 446 | for i in range(time_series.shape[0]): 447 | # plt.plot(x_values, time_series[i]) 448 | plot_ex_width(x_values, time_series[i], expand_width) 449 | plt.legend(labels=label_list) 450 | -------------------------------------------------------------------------------- /utils/extractor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from lxml import etree 4 | from bs4 import BeautifulSoup 5 | from weibo_preprocess_toolkit import WeiboPreprocess 6 | from harvesttext import HarvestText 7 | import pyhanlp 8 | 9 | 10 | def get_post_time(timestr="Mon Dec 14 11:26:56 +0800 2020"): 11 | import calendar 12 | import locale 13 | 14 | # locale.setlocale(locale.LC_ALL, "C.UTF-8") 15 | 16 | if not timestr: 17 | return "" 18 | temp = timestr.split(" ") 19 | time_area = temp[-2] 20 | if time_area != "+0800": 21 | print(time_area) 22 | # day_time = ':'.join(temp[3].split(':')[:-1]) 23 | day_time = temp[3] 24 | return ( 25 | temp[-1] 26 | + "-" 27 | + "{:0=2}".format(list(calendar.month_abbr).index(temp[1])) 28 | + "-" 29 | + temp[2] 30 | + " " 31 | + day_time 32 | ) 33 | 34 | 35 | class WeiboText: 36 | def __init__(self): 37 | 38 | self.preprocess = WeiboPreprocess() 39 | self.ht = HarvestText() 40 | self.CharTable = pyhanlp.JClass("com.hankcs.hanlp.dictionary.other.CharTable") 41 | self.d1 = re.compile(r"(.*)") 42 | self.d2 = re.compile(r"点击播放") # 点击播放>> 43 | self.d3 = re.compile(r"在.*获取更多信息") 44 | self.d4 = re.compile(r"速围观") 45 | self.d5 = re.compile(r"我获得了.*的红包") 46 | self.d6 = re.compile(r"#.*#") 47 | 48 | def get_cleaned_text(self, html): 49 | # TODO:
-> 空格 50 | # TODO: 多空格压缩(HarvestText) 51 | # TODO: 去奇怪符号 52 | # TODO: 表情字符icon翻译(翻译表) + 冗余字符icon去除(HarvestText) 53 | # TODO: 去数字(?) 54 | # TODO: 繁简体转化 55 | # TODO: 固定噪音去除(weibo_preprocess_toolkit) + HarvestText中自定义 56 | # 1. (分享自)、(通过 录制) 57 | # 2. 点击播放>> 58 | # 3. 在XXX获取更多信息 59 | # 4. 速围观 60 | # 5. ...全文 61 | # 6. 我获得了XXX的红包 62 | # 7. 打卡第X天 63 | soup = BeautifulSoup(html, features="lxml") 64 | tmp_a = [i.extract() for i in soup.find_all("a")] 65 | # 保留图片表情文本(但是一些表情比如微笑可能有反讽意味) 66 | # for i in soup.find_all('span', class_='url-icon'): 67 | # i.append(i.img.attrs['alt']) 68 | # return soup.get_text().lower().strip() 69 | text = soup.get_text() 70 | text = self.d1.sub("", text) 71 | text = self.d2.sub("", text) 72 | text = self.d3.sub("", text) 73 | text = self.d4.sub("", text) 74 | text = self.d5.sub("", text) 75 | text = self.d6.sub("", text) 76 | # 使用HarvestText清洗文本(空格压缩,去字符表情) 77 | content = self.CharTable.convert(text) 78 | cleaned_text = self.ht.clean_text(content, weibo_topic=True) 79 | # 使用weibo_preprocess_toolkit清洗文本(繁简体转化,去固定噪音,去数字,) 80 | cleaned_text = self.preprocess.clean(cleaned_text) 81 | return cleaned_text.strip() 82 | 83 | @staticmethod 84 | def get_raw_text(text_body): 85 | return etree.HTML(text_body).xpath("string(.)") 86 | 87 | @staticmethod 88 | def get_weibo_selector(text_body): 89 | return etree.HTML(text_body) 90 | 91 | @staticmethod 92 | def string_to_int(string): 93 | """字符串转换为整数""" 94 | if isinstance(string, int): 95 | return string 96 | elif string.endswith("万+"): 97 | string = int(string[:-2] + "0000") 98 | elif string.endswith("万"): 99 | string = int(string[:-1] + "0000") 100 | return int(string) 101 | 102 | @staticmethod 103 | def standardize_info(weibo): 104 | """标准化信息,去除乱码""" 105 | for k, v in weibo.items(): 106 | if ( 107 | "bool" not in str(type(v)) 108 | and "int" not in str(type(v)) 109 | and "list" not in str(type(v)) 110 | and "long" not in str(type(v)) 111 | ): 112 | weibo[k] = ( 113 | v.replace("\u200b", "") 114 | .encode(sys.stdout.encoding, "ignore") 115 | .decode(sys.stdout.encoding) 116 | ) 117 | return weibo -------------------------------------------------------------------------------- /utils/symptom.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2021/2/9 10:51 4 | # @Author : cendeavor 5 | # @Site : 6 | # @File : symptom.py 7 | # @Software: PyCharm 8 | 9 | # DSM-5 10 | symptoms_dsm_5 = { 11 | # 1. 抑郁心境:感到哀伤、空洞、绝望等 12 | "depressive_mood": "长时间不开心、不高兴、不快乐,心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅", 13 | # 2. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 14 | "interest_pleasure_loss": "对几乎所有活动没兴趣、没意思没动力,乐趣明显减少、没有愉悦感,厌世、成天无精打采", 15 | # 3. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 16 | "appetite_weight_problem": "食欲减退、经常饱、没胃口、想吐或食欲增加、不明原因暴食,体重明显减轻或体重增加", 17 | # 4. 睡眠困扰:入睡困难、睡不安稳,或睡眠过多 18 | "insomnia_or_hypersomnia": "经常失眠睡不着、服用安眠药,凌晨发帖时间多,嗜睡、起床困难、睡不醒", 19 | # 5. 日常行动迟缓或多动:动作或说话速度缓慢到别人已经觉察; 烦躁或坐立不安、动来动去的情况更胜于平常 20 | "retardation_or_agitation": "反应麻木迟钝、动作迟缓,对疼痛不敏感;精神性躁动、易感烦躁、坐立难安,言行冲动、易怒、易抓狂", 21 | # 6. 感到劳累: 感觉疲倦或精力不足 22 | "energy_loss": "经常感到累、困、昏晕乏力、疲惫没力气、没有精神", 23 | # 7. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 24 | "self_blame": "经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错", 25 | # 8. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 26 | "concentration_problem": "注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚", 27 | # 9. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 28 | "suicidal_ideation": "反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀", 29 | # 10. 
交感神经唤醒 30 | "sympathetic_arousal": "心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗", 31 | # 11. 恐慌&焦虑:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现; 32 | "panic_and_anxious": "经常好怕、害怕、恐惧、恐慌,想逃避;浑身躁动、经常烦躁、焦躁不安、焦虑、过度紧张、过度担心、忐忑、精神紧绷" 33 | } 34 | 35 | # DSM-5结合论文 36 | symptoms_combined = { 37 | # 1. 抑郁心境:感到哀伤、空洞、绝望等 38 | "depressive_mood": "长时间不开心、不高兴、不快乐,心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅", 39 | # 2. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 40 | "interest_pleasure_loss": "对几乎所有活动没兴趣、没意思没动力,乐趣明显减少、没有愉悦感,厌世、成天无精打采", 41 | # 3. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 42 | "appetite_weight_problem": "食欲减退、经常饱、没胃口、想吐或食欲增加、不明原因暴食,体重明显减轻或体重增加", 43 | # 4. 睡眠困扰:入睡困难、睡不安稳,或睡眠过多 44 | "insomnia_or_hypersomnia": "经常失眠睡不着、服用安眠药,凌晨发帖时间多,嗜睡、起床困难、睡不醒", 45 | # 5. 日常行动迟缓:动作或说话速度缓慢到别人已经觉察; 46 | "retardation": "反应麻木迟钝、动作迟缓,对疼痛不敏感", 47 | # 6. 感到劳累: 感觉疲倦或精力不足 48 | "energy_loss": "经常感到累、困、昏晕乏力、疲惫没力气、没有精神", 49 | # 7. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 50 | "self_blame": "经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错", 51 | # 8. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 52 | "concentration_problem": "注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚", 53 | # 9. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 54 | "suicidal_ideation": "反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀", 55 | # 10. 交感神经唤醒 56 | "sympathetic_arousal": "心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗", 57 | # 11. 恐慌&焦虑&狂躁:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现;烦躁或坐立不安、动来动去的情况更胜于平常 58 | "panic_and_anxious_or_agitation": "经常好怕、害怕、恐惧、恐慌,想逃避;浑身躁动、经常烦躁、焦躁不安、坐立难安、焦虑、过度担心、紧张、忐忑、精神紧绷;言行冲动、易怒、易抓狂" 59 | } 60 | 61 | # 纯粹按照论文 62 | symptoms_disaggregate = { 63 | # 1. 感到劳累: 感觉疲倦或精力不足 64 | "energy_loss": "经常感到累、困、昏晕乏力、疲惫没力气、没有精神", 65 | # 2. 抑郁心境:感到哀伤、空洞、绝望等 66 | "sadness": "心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅", 67 | # 3. 交感神经唤醒 68 | "sympathetic_arousal": "心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗", 69 | # 4. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 70 | "interest_loss": "对几乎所有活动没兴趣、没意思没动力、厌世、成天无精打采", 71 | # 5. 感受不到快乐,愉悦感消失 72 | "pleasure_loss": "乐趣明显减少、没有愉悦感,感受不到快乐、不开心、不高兴", 73 | # 6. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 74 | "concentration_problem": "注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚", 75 | # 7. 恐慌:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现 76 | "panic": "经常好怕、害怕、恐惧、恐慌,想逃避", 77 | # 8. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 78 | "appetite_problem": "食欲减退、经常饱、没胃口、想吐,食欲增加、不明原因暴食", 79 | # 9. 失眠:入睡困难、睡不安稳 80 | "insomnia": "经常失眠睡不着、服用安眠药,凌晨发帖时间多", 81 | # 10. 焦虑:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现 82 | "anxious": "浑身躁动、经常烦躁、焦躁不安、焦虑、过度紧张、过度担心、忐忑、精神紧绷", 83 | # 11. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 84 | "self_blame": "经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错", 85 | # 12. 日常行动迟缓或多动:动作或说话速度缓慢到别人已经觉察; 烦躁或坐立不安、动来动去的情况更胜于平常 86 | "retardation": "反应麻木迟钝、动作迟缓,对疼痛不敏感", 87 | # 9. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 88 | "suicidal_ideation": "反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀", 89 | # 8. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 90 | "weight_problem": "体重明显减轻或体重增加", 91 | # 15. 狂躁:烦躁或坐立不安、动来动去的情况更胜于平常 92 | "agitation": "精神性躁动、易感烦躁、坐立难安,言行冲动、易怒、易抓狂", 93 | # 16. 嗜睡:睡眠过度 94 | "hypersomnia": "嗜睡、起床困难、睡不醒", 95 | } 96 | 97 | a = { 98 | # 1. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 99 | "interest_loss": "对几乎所有活动兴趣减少", 100 | # "keywords": ["兴趣", "没意思", "无精打采", "厌世", "动力"] # "没兴趣" 101 | # 2. 快感消失:感到心情低落、沮丧或绝望 102 | "pleasure_loss": "长时间开心不起来,没有愉悦感", 103 | # "keywords": ["不开心", "不高兴", "不快乐", "郁闷", "压抑", "难熬", "消沉", "低落", '丧'] 104 | # 3. 感到劳累: 感觉疲倦或没有活力 105 | "energy_loss": "乏力,经常性疲惫,没有精神", 106 | # "keywords": ["累", "没力气", "发软", "躺", "困", "昏", "晕"] 107 | # 4. 抑郁心境:感到哀伤,空洞,绝望等 108 | "sadness": "悲伤、空虚、无望", 109 | # "keywords": ["哭", "伤心", "难受", "痛苦", "惆怅", "难过"] 110 | # 5. 
交感神经唤醒 111 | "sympathetic_arousal": "心悸、颤抖、视力模糊、冒冷汗", 112 | # "keywords": ["心悸", "颤抖", "模糊", "冒汗", "冷汗", "胸闷", "心慌"] 113 | # 6. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 114 | "concentration_problem": "思考或注意力集中的能力减退或犹豫不决、出现精神恍惚", 115 | # "keywords": ["注意力", "不集中"] 116 | # 7. 恐慌:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现 117 | "panic": "经常无故感到害怕", 118 | # "keywords": ["好怕", "恐惧", "恐慌", "害怕"] # 怕 119 | # 8. 胃口的改变:食欲不振或吃太多 120 | "appetite_problem": "食欲减退、经常没胃口、呕吐或食欲增加、不明原因暴饮暴食", 121 | # "keywords": ["吐", "没胃口", "饱", "暴食"] 122 | # 9. 睡眠困扰(失眠或轻度睡眠困扰):入睡困难、睡不安稳 123 | "insomnia": "自述经常失眠、凌晨0-6点发帖时间较多", 124 | # "keywords": ["安眠药", "失眠", "睡不着"] 125 | # 10. 焦虑:浑身躁动,时刻感到十分烦恼 126 | "anxious": "经常感到焦躁不安,包含了紧张、过度担心", 127 | # "keywords": ["焦虑", "紧张", "担心", "紧绷", "喘不过气", "忐忑"] 128 | # 11. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 129 | "self_blame": "感到愧疚或没有价值", 130 | # "keywords": ["对不起", "没用", "一无是处", "不中用", "自我否定", "一事无成", "孤独"] # "错", 131 | # 12. 日常行动迟缓:动作或说话速度缓慢到别人已经觉察 132 | "retardation": "反应不敏捷,对疼痛不敏感", 133 | # "keywords": ["麻木", "笨"] # "慢", 134 | # 13. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 135 | "suicidal_ideation": "重复地想到死亡,重复地想到自杀但是没有详细计划,自杀尝试或者明确的自杀计划", 136 | # "keywords": ["自杀", "结束", "死", "自残"] 137 | # 14. 体重的明显改变: 在未节食的情况下体重明显减轻,或体重增加 138 | "weight_problem": "", 139 | # "keywords": ["变重", "变轻"] # "肥", "瘦" 140 | # 15. 日常行为的多动:烦躁或坐立不安、动来动去的情况更胜于平常 141 | "agitation": "精神性躁动,言语偏激、易怒", 142 | # "keywords": ["烦", "抓狂", "冲动", "易怒"] 143 | # 16. 嗜睡:睡眠过多 144 | "hypersomnia": "", 145 | # "keywords": ["懒", "起床", "嗜睡", "睡不醒"] 146 | } 147 | --------------------------------------------------------------------------------