├── 10_lab_ablation_ml.py ├── 1_gen_swdd_xk.py ├── 2_extract_embedding.py ├── 3_extract_time_series_feature.py ├── 4_make_ts_dataset.py ├── 5_lab_length.py ├── 6_lab_prop.py ├── 7_lab_ablation.py ├── 8_lab_length_ml.py ├── 9_lab_prop_ml.py ├── LICENSE ├── README.md ├── config.py ├── envs ├── sktime-0.5.3.zip └── sktime-dl-0.2.0-modify.zip ├── img └── Method-DSTS.png ├── preprocess.py ├── requirements.txt └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc ├── __init__.cpython-38.pyc ├── analysis.cpython-37.pyc ├── analysis.cpython-38.pyc ├── clog.cpython-37.pyc ├── clog.cpython-38.pyc ├── data.cpython-37.pyc ├── data.cpython-38.pyc ├── extractor.cpython-37.pyc ├── informer.cpython-37.pyc ├── parser.cpython-37.pyc ├── summarizor.cpython-37.pyc ├── symptom.cpython-37.pyc ├── system_monitor.cpython-37.pyc └── system_monitor.cpython-38.pyc ├── analysis.py ├── clog.py ├── data.py ├── extractor.py └── symptom.py /10_lab_ablation_ml.py: -------------------------------------------------------------------------------- 1 | def calc_metrics_binary(model, X_test, y_test): 2 | from sklearn.metrics import ( 3 | classification_report, 4 | accuracy_score, 5 | f1_score, 6 | roc_auc_score, 7 | recall_score, 8 | precision_score, 9 | ) 10 | 11 | y_pred = model.predict(X_test) 12 | report = classification_report( 13 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 14 | ) 15 | return report 16 | 17 | 18 | from datetime import datetime 19 | 20 | 21 | def train_TimeSeriesForest(X_train, y_train, X_test, y_test): 22 | start_time = datetime.now() 23 | # base 24 | print("***TimeSeriesForestClassifier***") 25 | from sklearn.pipeline import Pipeline 26 | from sktime.classification.interval_based import TimeSeriesForestClassifier 27 | from sktime.classification.compose import ColumnEnsembleClassifier 28 | from sktime.transformations.panel.compose import ColumnConcatenator 29 | 30 | steps = [ 31 | ("concatenate", ColumnConcatenator()), 32 | ("classify", TimeSeriesForestClassifier(n_estimators=100)), 33 | ] 34 | clf = Pipeline(steps) 35 | clf.fit(X_train, y_train) 36 | 37 | report = calc_metrics_binary(clf, X_test, y_test) 38 | 39 | print(report) 40 | 41 | print(str(datetime.now() - start_time)) 42 | 43 | return clf 44 | 45 | 46 | def train_ROCKETClassifier(X_train, y_train, X_test, y_test): 47 | start_time = datetime.now() 48 | # 2020 49 | print("***ROCKETClassifier***") 50 | from sktime.classification.kernel_based import ROCKETClassifier 51 | 52 | clf = ROCKETClassifier(num_kernels=500) 53 | clf.fit(X_train, y_train) 54 | 55 | report = calc_metrics_binary(clf, X_test, y_test) 56 | 57 | print(report) 58 | 59 | print(str(datetime.now() - start_time)) 60 | 61 | return clf 62 | 63 | 64 | def train_Signature(X_train, y_train, X_test, y_test): 65 | start_time = datetime.now() 66 | 67 | # 2020 68 | print("***SignatureClassifier***") 69 | from sktime.classification.feature_based import SignatureClassifier 70 | 71 | clf = SignatureClassifier() 72 | clf.fit(X_train, y_train) 73 | 74 | report = calc_metrics_binary(clf, X_test, y_test) 75 | 76 | print(report) 77 | 78 | print(str(datetime.now() - start_time)) 79 | 80 | return clf 81 | 82 | 83 | def train_Arsenal(X_train, y_train, X_test, y_test): 84 | start_time = datetime.now() 85 | # uni 86 | print("***Arsenal***") 87 | from sktime.classification.kernel_based import Arsenal 88 | 89 | clf = Arsenal(num_kernels=200, n_estimators=5) 90 | clf.fit(X_train, y_train) 91 | 92 | report = calc_metrics_binary(clf, X_test, y_test) 93 | 94 | print(report) 95 
| print(str(datetime.now() - start_time)) 96 | start_time = datetime.now() 97 | 98 | return clf 99 | 100 | 101 | def train_TSFresh(X_train, y_train, X_test, y_test): 102 | start_time = datetime.now() 103 | # 2018 104 | print("***TSFreshClassifier***") 105 | from sktime.classification.feature_based import TSFreshClassifier 106 | 107 | clf = TSFreshClassifier() 108 | clf.fit(X_train, y_train) 109 | 110 | report = calc_metrics_binary(clf, X_test, y_test) 111 | 112 | print(report) 113 | 114 | print(str(datetime.now() - start_time)) 115 | 116 | return clf 117 | 118 | 119 | def train_HIVECOTEV2(X_train, y_train, X_test, y_test): 120 | start_time = datetime.now() 121 | # No module named sktime.classificastion.shapelet_based.mrseql.mrseql 122 | print("***HIVECOTEV2***") 123 | from sktime.classification.hybrid import HIVECOTEV2 124 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 125 | 126 | clf = HIVECOTEV2( 127 | stc_params={ 128 | "estimator": RotationForest(n_estimators=3), 129 | "n_shapelet_samples": 500, 130 | "max_shapelets": 20, 131 | "batch_size": 100, 132 | }, 133 | drcif_params={"n_estimators": 10}, 134 | arsenal_params={"num_kernels": 100, "n_estimators": 5}, 135 | tde_params={ 136 | "n_parameter_samples": 25, 137 | "max_ensemble_size": 5, 138 | "randomly_selected_params": 10, 139 | }, 140 | ) 141 | clf.fit(X_train, y_train) 142 | 143 | report = calc_metrics_binary(clf, X_test, y_test) 144 | 145 | print(report) 146 | print(str(datetime.now() - start_time)) 147 | start_time = datetime.now() 148 | 149 | return clf 150 | 151 | 152 | def train_ShapeletTransform(X_train, y_train, X_test, y_test): 153 | start_time = datetime.now() 154 | # shapelet 155 | print("***ShapeletTransformClassifier***") 156 | from sktime.classification.shapelet_based import ShapeletTransformClassifier 157 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 158 | 159 | clf = ShapeletTransformClassifier( 160 | estimator=RotationForest(n_estimators=3), 161 | n_shapelet_samples=500, 162 | max_shapelets=20, 163 | batch_size=100, 164 | ) 165 | clf.fit(X_train, y_train) 166 | 167 | report = calc_metrics_binary(clf, X_test, y_test) 168 | 169 | print(report) 170 | print(str(datetime.now() - start_time)) 171 | start_time = datetime.now() 172 | 173 | return clf 174 | 175 | 176 | model_list = ["bst", "rocket", "gs", "hc2", "tsf"] 177 | 178 | if __name__ == "__main__": 179 | import os 180 | import numpy as np 181 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 182 | from utils.symptom import symptoms_dsm_5 as symptoms 183 | 184 | # 情绪 185 | feat_emo = [ 186 | "depressive_mood", 187 | "retardation_or_agitation", 188 | "panic_and_anxious", 189 | ] # sad, agi, pan 190 | # 认知 191 | feat_cog = [ 192 | "interest_pleasure_loss", 193 | "self_blame", 194 | "suicidal_ideation", 195 | "concentration_problem", 196 | ] # int, sel(low-esteem), sui, con 197 | # 躯体 198 | feat_bod = [ 199 | "appetite_weight_problem", 200 | "insomnia_or_hypersomnia", 201 | "energy_loss", 202 | "sympathetic_arousal", 203 | ] # app, ins, ene, sym 204 | # 行为? 
205 | 206 | feat_emo_dim = [] 207 | feat_cog_dim = [] 208 | feat_bod_dim = [] 209 | 210 | for i, k in enumerate(symptoms): 211 | print(i, k) 212 | feat_id = "dim_{}".format(i) 213 | if k in feat_emo: 214 | feat_emo_dim.append(feat_id) 215 | elif k in feat_cog: 216 | feat_cog_dim.append(feat_id) 217 | elif k in feat_bod: 218 | feat_bod_dim.append(feat_id) 219 | 220 | print(feat_emo_dim, feat_cog_dim, feat_bod_dim) 221 | 222 | feat_group = [feat_emo_dim, feat_cog_dim, feat_bod_dim] 223 | 224 | 225 | flag = 1 226 | for r in range(10): 227 | 228 | # load data 229 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 230 | 231 | X_train_all, y_train = load_from_tsfile_to_dataframe( 232 | os.path.join(data_dir, "train.ts") 233 | ) 234 | X_test_all, y_test = load_from_tsfile_to_dataframe( 235 | os.path.join(data_dir, "test.ts") 236 | ) 237 | 238 | print(X_train_all.shape, y_train.shape, X_test_all.shape, y_test.shape) 239 | print(np.unique(y_train)) 240 | 241 | for i in range(len(feat_group)): 242 | if flag and i < 1: 243 | continue 244 | 245 | feat_group_train = [] 246 | for j in range(len(feat_group)): 247 | if i != j: 248 | feat_group_train += feat_group[j] 249 | print(feat_group_train) 250 | 251 | save_dir = "results/swdd-7k_model_500_0_remove_feat_group_{}".format(i) 252 | 253 | print(data_dir, save_dir) 254 | 255 | # ablation 256 | X_train = X_train_all[feat_group_train] 257 | X_test = X_test_all[feat_group_train] 258 | 259 | for cls in model_list: 260 | if flag and cls != "hc2": 261 | continue 262 | if flag: 263 | flag = 0 264 | print(cls) 265 | if cls == "bst": 266 | try: 267 | clf = train_ShapeletTransform( 268 | X_train, y_train, X_test, y_test 269 | ) # need at least one array to concatenate 270 | except Exception as e: 271 | print(e) 272 | elif cls == "rocket": 273 | clf = train_ROCKETClassifier(X_train, y_train, X_test, y_test) 274 | elif cls == "gs": 275 | # try: 276 | # clf = train_TSFresh( 277 | # X_train, y_train, X_test, y_test 278 | # ) 279 | # except Exception as e: 280 | # print(e) 281 | try: 282 | # clf = train_Signature(X_train, y_train, X_test, y_test) 283 | clf = train_Arsenal(X_train, y_train, X_test, y_test) 284 | except Exception as e: 285 | print(e) 286 | elif cls == "hc2": 287 | try: 288 | clf = train_HIVECOTEV2( 289 | X_train, y_train, X_test, y_test 290 | ) # not work 291 | except Exception as e: 292 | print(e) 293 | elif cls == "tsf": 294 | clf = train_TimeSeriesForest(X_train, y_train, X_test, y_test) 295 | 296 | # # analyze model 297 | report = calc_metrics_binary(clf, X_test, y_test) 298 | 299 | res_save_path = os.path.join(save_dir, cls + ".txt") 300 | with open(res_save_path, "a+") as f: 301 | f.write(report) 302 | f.write("\n" + "*" * 15 + "\n") 303 | 304 | -------------------------------------------------------------------------------- /1_gen_swdd_xk.py: -------------------------------------------------------------------------------- 1 | from utils.data import gen_swdd_4k, gen_swdd_7k 2 | 3 | if __name__ == "__main__": 4 | data_dir = "dataset/swdd" 5 | # gen_swdd_7k(data_dir=data_dir) 6 | gen_swdd_4k(data_dir=data_dir) 7 | -------------------------------------------------------------------------------- /2_extract_embedding.py: -------------------------------------------------------------------------------- 1 | from preprocess import extract_embedding 2 | 3 | import os 4 | 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 6 | 7 | if __name__ == "__main__": 8 | # data_dir = "dataset/swdd-7k" 9 | # emb_dir = data_dir + "_embedding" 10 | # extract_embedding( 11 | # 
data_dir=data_dir, 12 | # save_dir=emb_dir, 13 | # modelname="paraphrase-xlm-r-multilingual-v1", 14 | # ) 15 | 16 | for i in range(10, 100, 10): 17 | data_dir = "dataset/swdd-4k_{}".format(i) 18 | emb_dir = data_dir + "_embedding" 19 | extract_embedding( 20 | data_dir=data_dir, 21 | save_dir=emb_dir, 22 | modelname="paraphrase-xlm-r-multilingual-v1", 23 | ) 24 | -------------------------------------------------------------------------------- /3_extract_time_series_feature.py: -------------------------------------------------------------------------------- 1 | from preprocess import extract_time_series_feature 2 | import os 3 | 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 5 | 6 | if __name__ == "__main__": 7 | # extract_time_series_feature(data_dir="dataset/swdd-7k_embedding", origin_only=True) 8 | for i in range(10, 100, 10): 9 | extract_time_series_feature( 10 | data_dir="dataset/swdd-4k_{}_embedding".format(i), origin_only=True 11 | ) 12 | -------------------------------------------------------------------------------- /4_make_ts_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import jsonlines 3 | import numpy as np 4 | from sklearn import preprocessing 5 | from sklearn.model_selection import train_test_split 6 | from collections import namedtuple 7 | 8 | 9 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 10 | 11 | Dataset = namedtuple("Dataset", ["X", "y"]) 12 | 13 | 14 | def preprocess(dataset, normalize=True, unify_dims=False, to_categorical=True): 15 | """ 16 | ## Prepare the data 17 | """ 18 | X, y = dataset.X, dataset.y 19 | 20 | if normalize: 21 | X_mean = X.mean() 22 | X_std = X.std() 23 | X = (X - X_mean) / (X_std + 1e-8) 24 | 25 | if unify_dims: 26 | # cutoff or expand timestamps 27 | pass 28 | 29 | if to_categorical: 30 | # 将标签独热编码 31 | lb = preprocessing.LabelBinarizer() 32 | y = lb.fit_transform(y) 33 | 34 | return Dataset(X, y) 35 | 36 | 37 | def make_dataset_ts( 38 | data_dir="swdd-7k", 39 | feat_dir="swdd-7k_embedding_500_50", 40 | save_dir="swdd-7k_ts_500_50", 41 | samp_cnt=7000, 42 | ): 43 | """ 44 | Note: 1. 在x中继续嵌套元组,将索引编进去,这样最后导出来就是带编号的sample,可以进行错误案例分析 45 | 2. 
时间序列Z-Score预处理不在此处进行 46 | """ 47 | if not os.path.exists(save_dir): 48 | os.mkdir(save_dir) 49 | 50 | X = [] 51 | y = [] 52 | 53 | for i in range(samp_cnt): 54 | file_id = i 55 | time_series = np.load(os.path.join(feat_dir, "%04d.npy" % file_id)) 56 | X.append(time_series) 57 | with open( 58 | os.path.join(data_dir, "%04d.jsonl" % file_id), "r", encoding="utf8" 59 | ) as f: 60 | for item in jsonlines.Reader(f): 61 | datum = item 62 | y.append(datum["label"]) 63 | X = np.array(X) 64 | y = np.array(y) 65 | Dataset.X = X 66 | Dataset.y = y 67 | # Dataset_pre = preprocess(dataset=Dataset) 68 | 69 | X_train, X_test, y_train, y_test = train_test_split( 70 | Dataset.X, 71 | Dataset.y, 72 | test_size=0.4, 73 | random_state=2022, 74 | stratify=Dataset.y, 75 | ) 76 | # val_x, X_test, val_y, y_test = train_test_split( 77 | # X_test, y_test, test_size=0.5, random_state=2022, stratify=y_test 78 | # ) 79 | # print(len(y_train), len(y_test), len(val_y), np.sum(y_test == 1)) 80 | print(len(y_train), len(y_test), np.sum(y_test == 1)) 81 | 82 | with open(os.path.join(save_dir, "train.ts"), "w") as f: 83 | for idx, v in enumerate(X_train): 84 | for i in range(v.shape[0]): 85 | for j in range(v.shape[1]): 86 | f.write(str(v[i][j])) 87 | if j < v.shape[1] - 1: 88 | f.write(",") 89 | f.write(":") 90 | f.write(str(y_train[idx]) + "\n") 91 | 92 | with open(os.path.join(save_dir, "test.ts"), "w") as f: 93 | for idx, v in enumerate(X_test): 94 | for i in range(v.shape[0]): 95 | for j in range(v.shape[1]): 96 | f.write(str(v[i][j])) 97 | if j < v.shape[1] - 1: 98 | f.write(",") 99 | f.write(":") 100 | f.write(str(y_test[idx]) + "\n") 101 | 102 | 103 | def dataset_7k_origin_main(): 104 | 105 | make_dataset_ts( 106 | data_dir="dataset/swdd-7k", 107 | feat_dir="dataset/swdd-7k_embedding_origin_500_0", 108 | save_dir="dataset/swdd-7k_ts_origin_500_0", 109 | ) 110 | 111 | 112 | def dataset_4k_prop_origin_main(): 113 | for i in range(10, 100, 10): 114 | data_dir = "dataset/swdd-4k_{}".format(i) 115 | 116 | make_dataset_ts( 117 | data_dir=data_dir, 118 | feat_dir="{}_embedding_origin_500_0".format(data_dir), 119 | save_dir="{}_ts_origin_500_0".format(data_dir), 120 | samp_cnt=4000, 121 | ) 122 | 123 | 124 | if __name__ == "__main__": 125 | # dataset_7k_origin_main() 126 | dataset_4k_prop_origin_main() 127 | -------------------------------------------------------------------------------- /5_lab_length.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from utils.analysis import calc_metrics_binary 3 | 4 | # from types import new_class 5 | import tensorflow as tf 6 | import keras 7 | import gc 8 | from config import lab_config 9 | import os 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 12 | 13 | config = lab_config.sktime_dl 14 | 15 | 16 | def setNetwork(cls, network_config=None, train_config=None): 17 | """ 18 | Basic way of determining the classifier to build. To differentiate settings just and another elif. 
19 | :param cls: String indicating which classifier you want 20 | :return: A classifier 21 | """ 22 | from sktime_dl.deeplearning import FCNClassifier 23 | from sktime_dl.deeplearning import MCDCNNClassifier 24 | from sktime_dl.deeplearning import CNNClassifier 25 | from sktime_dl.deeplearning import TWIESNClassifier 26 | from sktime_dl.deeplearning import InceptionTimeClassifier 27 | from pathlib import Path 28 | import os 29 | 30 | # fold = train_config["random_state"] 31 | model_save_dir = train_config.get("model_save_directory", "") 32 | # model_name = cls + "_" + str(fold) 33 | # train_config["model_name"] = model_name 34 | 35 | if model_save_dir: 36 | try: 37 | os.makedirs(model_save_dir) 38 | except os.error: 39 | pass 40 | 41 | # fold = int(fold) 42 | cls = cls.lower() 43 | if cls == "mcnn": 44 | return CNNClassifier(**network_config, **train_config) 45 | elif cls == "fcn": 46 | return FCNClassifier(**network_config, **train_config) 47 | elif cls == "mcdcnn": 48 | return MCDCNNClassifier(**network_config, **train_config) 49 | elif cls == "twiesn": 50 | train_cfg_copy = { 51 | k: v for k, v in train_config.items() if k != "model_save_directory" 52 | } 53 | return TWIESNClassifier(**network_config, **train_cfg_copy) 54 | elif cls == "inception": 55 | return InceptionTimeClassifier(**network_config, **train_config) 56 | else: 57 | raise Exception("UNKNOWN CLASSIFIER: " + cls) 58 | 59 | 60 | def read_dataset(data_dir): 61 | import os 62 | import numpy as np 63 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 64 | 65 | X_train, y_train = load_from_tsfile_to_dataframe(os.path.join(data_dir, "train.ts")) 66 | X_test, y_test = load_from_tsfile_to_dataframe(os.path.join(data_dir, "test.ts")) 67 | 68 | from sklearn.model_selection import train_test_split 69 | 70 | X_val, X_test, y_val, y_test = train_test_split( 71 | X_test, 72 | y_test, 73 | test_size=0.5, 74 | random_state=2022, 75 | stratify=y_test, 76 | ) 77 | print( 78 | X_train.shape, 79 | y_train.shape, 80 | X_test.shape, 81 | y_test.shape, 82 | X_val.shape, 83 | y_val.shape, 84 | ) 85 | print(np.unique(y_train)) 86 | return (X_train, y_train), (X_val, y_val), (X_test, y_test) 87 | 88 | 89 | model_list = [ 90 | "fcn", 91 | "mcnn", 92 | "mcdcnn", 93 | "twiesn", # sklearn Ridge object has no attribute save. 明明是个sklearn,却要用keras.save。。 94 | "inception", # 作者居然说这是strongest?? 
95 | ] 96 | 97 | 98 | if __name__ == "__main__": 99 | flag = 1 100 | for r in range(10): 101 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 102 | # load data 103 | (X_train, y_train), (X_val, y_val), (X_test, y_test) = read_dataset( 104 | data_dir=data_dir 105 | ) 106 | for i in range(50, 501, 50): 107 | # TODO: 108 | # if flag and i < 500: 109 | # continue 110 | # data_dir = "swdd-7k_ts_500_{}".format(i) 111 | save_dir = "results/swdd-7k_model_500_{}_simple".format(i) 112 | 113 | print(data_dir, save_dir) 114 | 115 | for cls in model_list: 116 | # TODO: 117 | # if flag and cls != "tlenet": 118 | # continue 119 | if flag: 120 | flag = 0 121 | import os 122 | 123 | train_cfg = config["train_config"][cls] 124 | network_cfg = config["network_config"][cls] 125 | train_cfg["model_save_directory"] = save_dir 126 | 127 | # build network 128 | network = setNetwork( 129 | cls, network_config=network_cfg, train_config=train_cfg 130 | ) 131 | 132 | # train 133 | network.fit( 134 | X_train[:i], 135 | y_train[:i], 136 | validation_X=X_val[:i], 137 | validation_y=y_val[:i], 138 | ) 139 | 140 | # # analyze model 141 | report = calc_metrics_binary(network, X_test[:i], y_test[:i]) 142 | 143 | res_save_path = os.path.join(save_dir, cls + ".txt") 144 | with open(res_save_path, "a+") as f: 145 | f.write(report) 146 | f.write("\n" + "*" * 15 + "\n") 147 | del network 148 | gc.collect() 149 | keras.backend.clear_session() 150 | tf.keras.backend.clear_session() 151 | -------------------------------------------------------------------------------- /6_lab_prop.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from utils.analysis import calc_metrics_binary 3 | 4 | # from types import new_class 5 | import tensorflow as tf 6 | import keras 7 | import gc 8 | from config import lab_config 9 | import os 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "1" 12 | 13 | config = lab_config.sktime_dl 14 | 15 | 16 | def setNetwork(cls, network_config=None, train_config=None): 17 | """ 18 | Basic way of determining the classifier to build. To differentiate settings just and another elif. 
19 | :param cls: String indicating which classifier you want 20 | :return: A classifier 21 | """ 22 | from sktime_dl.deeplearning import FCNClassifier 23 | from sktime_dl.deeplearning import MCDCNNClassifier 24 | from sktime_dl.deeplearning import CNNClassifier 25 | from sktime_dl.deeplearning import TWIESNClassifier 26 | from sktime_dl.deeplearning import InceptionTimeClassifier 27 | from pathlib import Path 28 | import os 29 | 30 | # fold = train_config["random_state"] 31 | model_save_dir = train_config.get("model_save_directory", "") 32 | # model_name = cls + "_" + str(fold) 33 | # train_config["model_name"] = model_name 34 | 35 | if model_save_dir: 36 | try: 37 | os.makedirs(model_save_dir) 38 | except os.error: 39 | pass 40 | 41 | # fold = int(fold) 42 | cls = cls.lower() 43 | if cls == "mcnn": 44 | return CNNClassifier(**network_config, **train_config) 45 | elif cls == "fcn": 46 | return FCNClassifier(**network_config, **train_config) 47 | elif cls == "mcdcnn": 48 | return MCDCNNClassifier(**network_config, **train_config) 49 | elif cls == "twiesn": 50 | train_cfg_copy = { 51 | k: v for k, v in train_config.items() if k != "model_save_directory" 52 | } 53 | return TWIESNClassifier(**network_config, **train_cfg_copy) 54 | elif cls == "inception": 55 | return InceptionTimeClassifier(**network_config, **train_config) 56 | else: 57 | raise Exception("UNKNOWN CLASSIFIER: " + cls) 58 | 59 | def read_dataset(data_dir): 60 | import os 61 | import numpy as np 62 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 63 | 64 | X_train, y_train = load_from_tsfile_to_dataframe(os.path.join(data_dir, "train.ts")) 65 | X_test, y_test = load_from_tsfile_to_dataframe(os.path.join(data_dir, "test.ts")) 66 | 67 | from sklearn.model_selection import train_test_split 68 | 69 | X_val, X_test, y_val, y_test = train_test_split( 70 | X_test, y_test, test_size=0.5, random_state=2022, stratify=y_test, 71 | ) 72 | print( 73 | X_train.shape, 74 | y_train.shape, 75 | X_test.shape, 76 | y_test.shape, 77 | X_val.shape, 78 | y_val.shape, 79 | ) 80 | print(np.unique(y_train)) 81 | return (X_train, y_train), (X_val, y_val), (X_test, y_test) 82 | 83 | 84 | model_list = [ 85 | "fcn", 86 | "mcnn", 87 | "mcdcnn", 88 | "twiesn", # sklearn Ridge object has no attribute save. 明明是个sklearn,却要用keras.save。。 89 | "inception", # 作者居然说这是strongest?? 
90 | ] 91 | 92 | if __name__ == "__main__": 93 | flag = 1 94 | for r in range(10): 95 | for i in range(10, 100, 10): 96 | # TODO: 97 | # if flag and i < 90: 98 | # continue 99 | 100 | data_dir = "dataset/swdd-4k_{}_ts_origin_500_0".format(i) 101 | save_dir = "results/swdd-4k_{}_model_500_0".format(i) 102 | 103 | print(data_dir, save_dir) 104 | 105 | # load data 106 | (X_train, y_train), (X_val, y_val), (X_test, y_test) = read_dataset( 107 | data_dir=data_dir 108 | ) 109 | for cls in model_list: 110 | # if flag and cls != "tlenet": 111 | # continue 112 | if flag: 113 | flag = 0 114 | print(cls) 115 | import os 116 | 117 | train_cfg = config["train_config"][cls] 118 | network_cfg = config["network_config"][cls] 119 | train_cfg["model_save_directory"] = save_dir 120 | 121 | # build network 122 | network = setNetwork( 123 | cls, network_config=network_cfg, train_config=train_cfg 124 | ) 125 | 126 | # train 127 | network.fit(X_train, y_train, validation_X=X_val, validation_y=y_val) 128 | 129 | # # analyze model 130 | report = calc_metrics_binary(network, X_test, y_test) 131 | 132 | res_save_path = os.path.join(save_dir, cls + ".txt") 133 | with open(res_save_path, "a+") as f: 134 | f.write(report) 135 | f.write("\n" + "*" * 15 + "\n") 136 | del network 137 | gc.collect() 138 | keras.backend.clear_session() 139 | tf.keras.backend.clear_session() 140 | -------------------------------------------------------------------------------- /7_lab_ablation.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from utils.analysis import calc_metrics_binary 3 | 4 | # from types import new_class 5 | import tensorflow as tf 6 | import keras 7 | import gc 8 | from config import lab_config 9 | import os 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 12 | 13 | config = lab_config.sktime_dl 14 | 15 | 16 | def setNetwork(cls, network_config=None, train_config=None): 17 | """ 18 | Basic way of determining the classifier to build. To differentiate settings just and another elif. 
19 | :param cls: String indicating which classifier you want 20 | :return: A classifier 21 | """ 22 | 23 | from sktime_dl.deeplearning import FCNClassifier 24 | from sktime_dl.deeplearning import MCDCNNClassifier 25 | from sktime_dl.deeplearning import CNNClassifier 26 | from sktime_dl.deeplearning import TWIESNClassifier 27 | from sktime_dl.deeplearning import InceptionTimeClassifier 28 | from pathlib import Path 29 | import os 30 | 31 | # fold = train_config["random_state"] 32 | model_save_dir = train_config.get("model_save_directory", "") 33 | # model_name = cls + "_" + str(fold) 34 | # train_config["model_name"] = model_name 35 | 36 | if model_save_dir: 37 | try: 38 | os.makedirs(model_save_dir) 39 | except os.error: 40 | pass 41 | 42 | # fold = int(fold) 43 | cls = cls.lower() 44 | if cls == "mcnn": 45 | return CNNClassifier(**network_config, **train_config) 46 | elif cls == "fcn": 47 | return FCNClassifier(**network_config, **train_config) 48 | elif cls == "mcdcnn": 49 | return MCDCNNClassifier(**network_config, **train_config) 50 | elif cls == "twiesn": 51 | train_cfg_copy = { 52 | k: v for k, v in train_config.items() if k != "model_save_directory" 53 | } 54 | return TWIESNClassifier(**network_config, **train_cfg_copy) 55 | elif cls == "inception": 56 | return InceptionTimeClassifier(**network_config, **train_config) 57 | else: 58 | raise Exception("UNKNOWN CLASSIFIER: " + cls) 59 | 60 | 61 | def read_dataset(data_dir): 62 | import os 63 | import numpy as np 64 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 65 | 66 | X_train, y_train = load_from_tsfile_to_dataframe(os.path.join(data_dir, "train.ts")) 67 | X_test, y_test = load_from_tsfile_to_dataframe(os.path.join(data_dir, "test.ts")) 68 | 69 | from sklearn.model_selection import train_test_split 70 | 71 | X_val, X_test, y_val, y_test = train_test_split( 72 | X_test, 73 | y_test, 74 | test_size=0.5, 75 | random_state=2022, 76 | stratify=y_test, 77 | ) 78 | print( 79 | X_train.shape, 80 | y_train.shape, 81 | X_test.shape, 82 | y_test.shape, 83 | X_val.shape, 84 | y_val.shape, 85 | ) 86 | print(np.unique(y_train)) 87 | return (X_train, y_train), (X_val, y_val), (X_test, y_test) 88 | 89 | 90 | model_list = [ 91 | "fcn", 92 | "mcnn", 93 | "mcdcnn", 94 | "twiesn", # sklearn Ridge object has no attribute save; it is an sklearn model, yet keras.save is used on it 95 | "inception", # the authors claim this is the strongest model 96 | ] 97 | 98 | 99 | if __name__ == "__main__": 100 | from utils.symptom import symptoms_dsm_5 as symptoms 101 | 102 | # emotional symptoms 103 | feat_emo = [ 104 | "depressive_mood", 105 | "retardation_or_agitation", 106 | "panic_and_anxious", 107 | ] # sad, agi, pan 108 | # cognitive symptoms 109 | feat_cog = [ 110 | "interest_pleasure_loss", 111 | "self_blame", 112 | "suicidal_ideation", 113 | "concentration_problem", 114 | ] # int, sel(low-esteem), sui, con 115 | # somatic symptoms 116 | feat_bod = [ 117 | "appetite_weight_problem", 118 | "insomnia_or_hypersomnia", 119 | "energy_loss", 120 | "sympathetic_arousal", 121 | ] # app, ins, ene, sym 122 | # behavioral symptoms?
123 | for r in range(10): 124 | feat_emo_dim = [] 125 | feat_cog_dim = [] 126 | feat_bod_dim = [] 127 | 128 | for i, k in enumerate(symptoms): 129 | print(i, k) 130 | feat_id = "dim_{}".format(i) 131 | if k in feat_emo: 132 | feat_emo_dim.append(feat_id) 133 | elif k in feat_cog: 134 | feat_cog_dim.append(feat_id) 135 | elif k in feat_bod: 136 | feat_bod_dim.append(feat_id) 137 | 138 | print(feat_emo_dim, feat_cog_dim, feat_bod_dim) 139 | 140 | feat_group = [feat_emo_dim, feat_cog_dim, feat_bod_dim] 141 | 142 | # load data 143 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 144 | 145 | (X_train_all, y_train), (X_val_all, y_val), (X_test_all, y_test) = read_dataset( 146 | data_dir=data_dir 147 | ) 148 | 149 | for i in range(len(feat_group)): 150 | feat_group_train = [] 151 | for j in range(len(feat_group)): 152 | if i != j: 153 | feat_group_train += feat_group[j] 154 | print(feat_group_train) 155 | 156 | save_dir = "results/swdd-7k_model_500_0_remove_feat_group_{}".format(i) 157 | 158 | print(data_dir, save_dir) 159 | 160 | # ablation 161 | X_train = X_train_all[feat_group_train] 162 | X_val = X_val_all[feat_group_train] 163 | X_test = X_test_all[feat_group_train] 164 | 165 | for cls in model_list: 166 | import os 167 | 168 | train_cfg = config["train_config"][cls] 169 | network_cfg = config["network_config"][cls] 170 | train_cfg["model_save_directory"] = save_dir 171 | 172 | # build network 173 | network = setNetwork( 174 | cls, network_config=network_cfg, train_config=train_cfg 175 | ) 176 | 177 | # train 178 | network.fit(X_train, y_train, validation_X=X_val, validation_y=y_val) 179 | 180 | # # analyze model 181 | report = calc_metrics_binary(network, X_test, y_test) 182 | 183 | res_save_path = os.path.join(save_dir, cls + ".txt") 184 | with open(res_save_path, "a+") as f: 185 | f.write(report) 186 | f.write("\n" + "*" * 15 + "\n") 187 | del network 188 | gc.collect() 189 | keras.backend.clear_session() 190 | tf.keras.backend.clear_session() 191 | 192 | with open(os.path.join(save_dir, "feat_group_train.txt"), "w+") as f: 193 | f.write(str(feat_group_train)) 194 | -------------------------------------------------------------------------------- /8_lab_length_ml.py: -------------------------------------------------------------------------------- 1 | def calc_metrics_binary(model, X_test, y_test): 2 | from sklearn.metrics import ( 3 | classification_report, 4 | accuracy_score, 5 | f1_score, 6 | roc_auc_score, 7 | recall_score, 8 | precision_score, 9 | ) 10 | 11 | y_pred = model.predict(X_test) 12 | report = classification_report( 13 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 14 | ) 15 | return report 16 | 17 | 18 | from datetime import datetime 19 | 20 | 21 | def train_TimeSeriesForest(X_train, y_train, X_test, y_test): 22 | start_time = datetime.now() 23 | # base 24 | print("***TimeSeriesForestClassifier***") 25 | from sklearn.pipeline import Pipeline 26 | from sktime.classification.interval_based import TimeSeriesForestClassifier 27 | from sktime.classification.compose import ColumnEnsembleClassifier 28 | from sktime.transformations.panel.compose import ColumnConcatenator 29 | 30 | steps = [ 31 | ("concatenate", ColumnConcatenator()), 32 | ("classify", TimeSeriesForestClassifier(n_estimators=100)), 33 | ] 34 | clf = Pipeline(steps) 35 | clf.fit(X_train, y_train) 36 | 37 | report = calc_metrics_binary(clf, X_test, y_test) 38 | 39 | print(report) 40 | 41 | print(str(datetime.now() - start_time)) 42 | 43 | return clf 44 | 45 | 46 | def train_ROCKETClassifier(X_train, 
y_train, X_test, y_test): 47 | start_time = datetime.now() 48 | # 2020 49 | print("***ROCKETClassifier***") 50 | from sktime.classification.kernel_based import ROCKETClassifier 51 | 52 | clf = ROCKETClassifier(num_kernels=500) 53 | clf.fit(X_train, y_train) 54 | 55 | report = calc_metrics_binary(clf, X_test, y_test) 56 | 57 | print(report) 58 | 59 | print(str(datetime.now() - start_time)) 60 | 61 | return clf 62 | 63 | 64 | def train_Signature(X_train, y_train, X_test, y_test): 65 | start_time = datetime.now() 66 | 67 | # 2020 68 | print("***SignatureClassifier***") 69 | from sktime.classification.feature_based import SignatureClassifier 70 | 71 | clf = SignatureClassifier() 72 | clf.fit(X_train, y_train) 73 | 74 | report = calc_metrics_binary(clf, X_test, y_test) 75 | 76 | print(report) 77 | 78 | print(str(datetime.now() - start_time)) 79 | 80 | return clf 81 | 82 | 83 | def train_Arsenal(X_train, y_train, X_test, y_test): 84 | start_time = datetime.now() 85 | # uni 86 | print("***Arsenal***") 87 | from sktime.classification.kernel_based import Arsenal 88 | 89 | clf = Arsenal(num_kernels=200, n_estimators=5) 90 | clf.fit(X_train, y_train) 91 | 92 | report = calc_metrics_binary(clf, X_test, y_test) 93 | 94 | print(report) 95 | print(str(datetime.now() - start_time)) 96 | start_time = datetime.now() 97 | 98 | return clf 99 | 100 | 101 | def train_TSFresh(X_train, y_train, X_test, y_test): 102 | start_time = datetime.now() 103 | # 2018 104 | print("***TSFreshClassifier***") 105 | from sktime.classification.feature_based import TSFreshClassifier 106 | 107 | clf = TSFreshClassifier() 108 | clf.fit(X_train, y_train) 109 | 110 | report = calc_metrics_binary(clf, X_test, y_test) 111 | 112 | print(report) 113 | 114 | print(str(datetime.now() - start_time)) 115 | 116 | return clf 117 | 118 | 119 | def train_HIVECOTEV2(X_train, y_train, X_test, y_test): 120 | start_time = datetime.now() 121 | # No module named sktime.classificastion.shapelet_based.mrseql.mrseql 122 | print("***HIVECOTEV2***") 123 | from sktime.classification.hybrid import HIVECOTEV2 124 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 125 | 126 | clf = HIVECOTEV2( 127 | stc_params={ 128 | "estimator": RotationForest(n_estimators=3), 129 | "n_shapelet_samples": 500, 130 | "max_shapelets": 20, 131 | "batch_size": 100, 132 | }, 133 | drcif_params={"n_estimators": 10}, 134 | arsenal_params={"num_kernels": 100, "n_estimators": 5}, 135 | tde_params={ 136 | "n_parameter_samples": 25, 137 | "max_ensemble_size": 5, 138 | "randomly_selected_params": 10, 139 | }, 140 | ) 141 | clf.fit(X_train, y_train) 142 | 143 | report = calc_metrics_binary(clf, X_test, y_test) 144 | 145 | print(report) 146 | print(str(datetime.now() - start_time)) 147 | start_time = datetime.now() 148 | 149 | return clf 150 | 151 | 152 | def train_ShapeletTransform(X_train, y_train, X_test, y_test): 153 | start_time = datetime.now() 154 | # shapelet 155 | print("***ShapeletTransformClassifier***") 156 | from sktime.classification.shapelet_based import ShapeletTransformClassifier 157 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 158 | 159 | clf = ShapeletTransformClassifier( 160 | estimator=RotationForest(n_estimators=3), 161 | n_shapelet_samples=500, 162 | max_shapelets=20, 163 | batch_size=100, 164 | ) 165 | clf.fit(X_train, y_train) 166 | 167 | report = calc_metrics_binary(clf, X_test, y_test) 168 | 169 | print(report) 170 | print(str(datetime.now() - start_time)) 171 | start_time = datetime.now() 172 | 173 
| return clf 174 | 175 | 176 | model_list = [ 177 | "bst", 178 | "rocket", 179 | "gs", 180 | "hc2", 181 | "tsf" 182 | ] 183 | 184 | 185 | if __name__ == "__main__": 186 | import os 187 | import numpy as np 188 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 189 | 190 | for r in range(10): 191 | # data_dir = "swdd-7k_ts_500_500" 192 | data_dir = "dataset/swdd-7k_ts_origin_500_0" 193 | 194 | X_train, y_train = load_from_tsfile_to_dataframe( 195 | os.path.join(data_dir, "train.ts") 196 | ) 197 | X_test, y_test = load_from_tsfile_to_dataframe( 198 | os.path.join(data_dir, "test.ts") 199 | ) 200 | 201 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape) 202 | print(np.unique(y_train)) 203 | 204 | for i in range(50, 501, 50): 205 | # if flag and i < 500: 206 | # continue 207 | # data_dir = "swdd-7k_ts_500_{}".format(i) 208 | save_dir = "results/swdd-7k_model_500_{}_simple".format(i) 209 | 210 | print(data_dir, save_dir) 211 | 212 | for cls in model_list: 213 | if cls == "bst": 214 | try: 215 | clf = train_ShapeletTransform( 216 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 217 | ) # need at least one array to concatenate 218 | except Exception as e: 219 | print(e) 220 | elif cls == "rocket": 221 | clf = train_ROCKETClassifier( 222 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 223 | ) 224 | elif cls == "gs": 225 | # try: 226 | # clf = train_TSFresh( 227 | # X_train[:i], y_train[:i], X_test[:i], y_test[:i] 228 | # ) 229 | # except Exception as e: 230 | # print(e) 231 | try: 232 | # clf = train_Signature(X_train[:i], y_train[:i], X_test[:i], y_test[:i]) 233 | clf = train_Arsenal( 234 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 235 | ) 236 | except Exception as e: 237 | print(e) 238 | elif cls == "hc2": 239 | try: 240 | clf = train_HIVECOTEV2( 241 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 242 | ) # not work 243 | except Exception as e: 244 | print(e) 245 | elif cls == "tsf": 246 | clf = train_TimeSeriesForest( 247 | X_train[:i], y_train[:i], X_test[:i], y_test[:i] 248 | ) 249 | 250 | # # analyze model 251 | report = calc_metrics_binary(clf, X_test[:i], y_test[:i]) 252 | 253 | res_save_path = os.path.join(save_dir, cls + ".txt") 254 | with open(res_save_path, "a+") as f: 255 | f.write(report) 256 | f.write("\n" + "*" * 15 + "\n") 257 | -------------------------------------------------------------------------------- /9_lab_prop_ml.py: -------------------------------------------------------------------------------- 1 | def calc_metrics_binary(model, X_test, y_test): 2 | from sklearn.metrics import ( 3 | classification_report, 4 | accuracy_score, 5 | f1_score, 6 | roc_auc_score, 7 | recall_score, 8 | precision_score, 9 | ) 10 | 11 | y_pred = model.predict(X_test) 12 | report = classification_report( 13 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 14 | ) 15 | return report 16 | 17 | 18 | from datetime import datetime 19 | 20 | 21 | def train_TimeSeriesForest(X_train, y_train, X_test, y_test): 22 | start_time = datetime.now() 23 | # base 24 | print("***TimeSeriesForestClassifier***") 25 | from sklearn.pipeline import Pipeline 26 | from sktime.classification.interval_based import TimeSeriesForestClassifier 27 | from sktime.classification.compose import ColumnEnsembleClassifier 28 | from sktime.transformations.panel.compose import ColumnConcatenator 29 | 30 | steps = [ 31 | ("concatenate", ColumnConcatenator()), 32 | ("classify", TimeSeriesForestClassifier(n_estimators=100)), 33 | ] 34 | clf = Pipeline(steps) 35 | clf.fit(X_train, 
y_train) 36 | 37 | report = calc_metrics_binary(clf, X_test, y_test) 38 | 39 | print(report) 40 | 41 | print(str(datetime.now() - start_time)) 42 | 43 | return clf 44 | 45 | 46 | def train_ROCKETClassifier(X_train, y_train, X_test, y_test): 47 | start_time = datetime.now() 48 | # 2020 49 | print("***ROCKETClassifier***") 50 | from sktime.classification.kernel_based import ROCKETClassifier 51 | 52 | clf = ROCKETClassifier(num_kernels=500) 53 | clf.fit(X_train, y_train) 54 | 55 | report = calc_metrics_binary(clf, X_test, y_test) 56 | 57 | print(report) 58 | 59 | print(str(datetime.now() - start_time)) 60 | 61 | return clf 62 | 63 | 64 | def train_Signature(X_train, y_train, X_test, y_test): 65 | start_time = datetime.now() 66 | 67 | # 2020 68 | print("***SignatureClassifier***") 69 | from sktime.classification.feature_based import SignatureClassifier 70 | 71 | clf = SignatureClassifier() 72 | clf.fit(X_train, y_train) 73 | 74 | report = calc_metrics_binary(clf, X_test, y_test) 75 | 76 | print(report) 77 | 78 | print(str(datetime.now() - start_time)) 79 | 80 | return clf 81 | 82 | 83 | def train_Arsenal(X_train, y_train, X_test, y_test): 84 | start_time = datetime.now() 85 | # uni 86 | print("***Arsenal***") 87 | from sktime.classification.kernel_based import Arsenal 88 | 89 | clf = Arsenal(num_kernels=200, n_estimators=5) 90 | clf.fit(X_train, y_train) 91 | 92 | report = calc_metrics_binary(clf, X_test, y_test) 93 | 94 | print(report) 95 | print(str(datetime.now() - start_time)) 96 | start_time = datetime.now() 97 | 98 | return clf 99 | 100 | 101 | def train_TSFresh(X_train, y_train, X_test, y_test): 102 | start_time = datetime.now() 103 | # 2018 104 | print("***TSFreshClassifier***") 105 | from sktime.classification.feature_based import TSFreshClassifier 106 | 107 | clf = TSFreshClassifier() 108 | clf.fit(X_train, y_train) 109 | 110 | report = calc_metrics_binary(clf, X_test, y_test) 111 | 112 | print(report) 113 | 114 | print(str(datetime.now() - start_time)) 115 | 116 | return clf 117 | 118 | 119 | def train_HIVECOTEV2(X_train, y_train, X_test, y_test): 120 | start_time = datetime.now() 121 | # No module named sktime.classificastion.shapelet_based.mrseql.mrseql 122 | print("***HIVECOTEV2***") 123 | from sktime.classification.hybrid import HIVECOTEV2 124 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 125 | 126 | clf = HIVECOTEV2( 127 | stc_params={ 128 | "estimator": RotationForest(n_estimators=3), 129 | "n_shapelet_samples": 500, 130 | "max_shapelets": 20, 131 | "batch_size": 100, 132 | }, 133 | drcif_params={"n_estimators": 10}, 134 | arsenal_params={"num_kernels": 100, "n_estimators": 5}, 135 | tde_params={ 136 | "n_parameter_samples": 25, 137 | "max_ensemble_size": 5, 138 | "randomly_selected_params": 10, 139 | }, 140 | ) 141 | clf.fit(X_train, y_train) 142 | 143 | report = calc_metrics_binary(clf, X_test, y_test) 144 | 145 | print(report) 146 | print(str(datetime.now() - start_time)) 147 | start_time = datetime.now() 148 | 149 | return clf 150 | 151 | 152 | def train_ShapeletTransform(X_train, y_train, X_test, y_test): 153 | start_time = datetime.now() 154 | # shapelet 155 | print("***ShapeletTransformClassifier***") 156 | from sktime.classification.shapelet_based import ShapeletTransformClassifier 157 | from sktime.contrib.vector_classifiers._rotation_forest import RotationForest 158 | 159 | clf = ShapeletTransformClassifier( 160 | estimator=RotationForest(n_estimators=3), 161 | n_shapelet_samples=500, 162 | max_shapelets=20, 163 | batch_size=100, 164 
| ) 165 | clf.fit(X_train, y_train) 166 | 167 | report = calc_metrics_binary(clf, X_test, y_test) 168 | 169 | print(report) 170 | print(str(datetime.now() - start_time)) 171 | start_time = datetime.now() 172 | 173 | return clf 174 | 175 | 176 | model_list = ["bst", "rocket", "gs", "hc2", "tsf"] 177 | 178 | if __name__ == "__main__": 179 | import os 180 | import numpy as np 181 | from sktime.utils.data_io import load_from_tsfile_to_dataframe 182 | 183 | flag = 1 184 | for r in range(9): 185 | for i in range(10, 100, 10): 186 | # TODO: 187 | if flag and i < 10: 188 | continue 189 | 190 | data_dir = "dataset/swdd-4k_{}_ts_origin_500_0".format(i) 191 | save_dir = "results/swdd-4k_{}_model_500_0".format(i) 192 | 193 | if not os.path.exists(save_dir): 194 | os.makedirs(save_dir) 195 | 196 | print(data_dir, save_dir) 197 | 198 | # load data 199 | X_train, y_train = load_from_tsfile_to_dataframe( 200 | os.path.join(data_dir, "train.ts") 201 | ) 202 | X_test, y_test = load_from_tsfile_to_dataframe( 203 | os.path.join(data_dir, "test.ts") 204 | ) 205 | 206 | print(X_train.shape, y_train.shape, X_test.shape, y_test.shape) 207 | print(np.unique(y_train)) 208 | 209 | for cls in model_list: 210 | if flag and cls != "hc2": 211 | continue 212 | if flag: 213 | flag = 0 214 | print(cls) 215 | if cls == "bst": 216 | try: 217 | clf = train_ShapeletTransform( 218 | X_train, y_train, X_test, y_test 219 | ) # need at least one array to concatenate 220 | except Exception as e: 221 | print(e) 222 | elif cls == "rocket": 223 | clf = train_ROCKETClassifier(X_train, y_train, X_test, y_test) 224 | elif cls == "gs": 225 | # try: 226 | # clf = train_TSFresh( 227 | # X_train, y_train, X_test, y_test 228 | # ) 229 | # except Exception as e: 230 | # print(e) 231 | try: 232 | # clf = train_Signature(X_train, y_train, X_test, y_test) 233 | clf = train_Arsenal(X_train, y_train, X_test, y_test) 234 | except Exception as e: 235 | print(e) 236 | elif cls == "hc2": 237 | try: 238 | clf = train_HIVECOTEV2( 239 | X_train, y_train, X_test, y_test 240 | ) # not work 241 | except Exception as e: 242 | print(e) 243 | elif cls == "tsf": 244 | clf = train_TimeSeriesForest(X_train, y_train, X_test, y_test) 245 | 246 | # # analyze model 247 | report = calc_metrics_binary(clf, X_test, y_test) 248 | 249 | res_save_path = os.path.join(save_dir, cls + ".txt") 250 | with open(res_save_path, "a+") as f: 251 | f.write(report) 252 | f.write("\n" + "*" * 15 + "\n") 253 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 YiChengCai1999 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Depression Detection on Online Social Network with Multivariate Time Series Feature of User Depressive Symptoms 2 | 3 | The source code for the paper [**Depression Detection on Online Social Network with Multivariate Time Series Feature of User Depressive Symptoms**](https://www.sciencedirect.com/science/article/pii/S0957417423000398), accepted at ESWA 2023. 4 | 5 | ## Abstract 6 | 7 | In recent years, depression has attracted worldwide attention because of its prevalence and the great risk of suicide. Existing studies have confirmed the feasibility of depression detection on online social networks. Most existing studies extract the overall features of users during a specific period, which cannot reflect the dynamic variation of depression. Besides, the methods proposed in these studies often lack interpretability and fail to establish the correlation between features and clinical depressive symptoms. To address these problems, we propose a novel framework for depression detection based on the multivariate time series feature of user depressive symptoms. Firstly, we construct and publish a well-labeled dataset collected from the most popular Chinese social network platform, Sina Weibo. To the best of our knowledge, it is the first large-scale depression dataset with a complete collection of user tweeting histories, which includes 3,711 depressed users and 19,526 non-depressed users. Then, we propose a feature extraction method that reveals the variation of user depressive symptoms in the form of a multivariate time series. Moreover, we explore the various factors influencing the performance of our proposed framework. In addition, we explore the contributions of the features to classification, as well as their interpretability, and conduct feature ablations on them. The experimental results show that our proposed method is effective and that the extracted multivariate time series feature characterizes the variation of users' depressive states well. Finally, we analyze the shortcomings and challenges of this study. Our research work also provides methods and ideas for tracking and visualizing the development of depression among online social network users. 8 | 9 | ![Illustration of our feature extraction method - DSTS](https://github.com/cyc21csri/DepressionDetection/blob/main/img/Method-DSTS.png) 10 | 11 | ## Dataset 12 | 13 | You can download the datasets and acquire more information about them from the following link: [SWDD](https://github.com/cyc21csri/SWDD). 14 | 15 | ### Requirements 16 | 17 | Alongside the packages listed in "requirements.txt", you should install the following packages to run the **deep learning classifiers** in the paper. 18 | 19 | - sktime-dl-0.2.0 (a modification of the original sktime-dl-0.1.0 package by the author of this work) 20 | 21 | - sktime-0.5.3 22 | 23 | We have packaged the two packages as zip files; see the `envs` folder.
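To install them into your environment, one possible approach (a minimal sketch, assuming the archives are standard pip-installable source distributions; adjust the paths to your checkout) is:

```
pip install envs/sktime-0.5.3.zip
pip install envs/sktime-dl-0.2.0-modify.zip
```

If pip cannot install the archives directly, unzip them first and run `pip install .` inside each extracted package directory.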
24 | 25 | For the **machine learning classifiers** in this paper (when running the files 8_lab_length_ml.py, 9_lab_prop_ml.py, and 10_lab_ablation_ml.py), however, **you should update the sktime package to version 0.8.1**. 26 | 27 | ## How to Run 28 | 29 | We have uploaded all the source code needed to reproduce the results of the experiments in the paper. Follow the instructions below to run the code: 30 | 31 | ``` 32 | git clone https://github.com/cyc21csri/DepressionDetection.git 33 | cd DepressionDetection 34 | mkdir -p data/swdd dataset results 35 | ``` 36 | 37 | - Download `SWDD` data from [here](https://drive.google.com/file/d/1fNKtoo4SP98OAhalMjNRZfFqmQZsQ0fh/view?usp=sharing) and unzip it to the `data/swdd` folder 38 | 39 | - The source code has been reorganized and the files renamed in the form "[No.]\_[FileName]", where [No.] indicates the execution order of the scripts. 40 | 41 | Notice that after you have executed "4_make_ts_dataset.py", you should manually add the following header to the generated dataset files "train.ts" and "test.ts"; otherwise they cannot be recognized as a time-series dataset. 42 | 43 | ``` 44 | @problemName MDDWeibo 45 | @timeStamps false 46 | @missing false 47 | @univariate false 48 | @dimensions 11 49 | @equalLength true 50 | @seriesLength 500 51 | @classLabel true 0 1 52 | @data 53 | ``` 54 | 55 | For more information on the time-series dataset format adopted in this paper, see [here](https://timeseriesclassification.com/) and download one of the datasets in the [UCR Archive](https://www.cs.ucr.edu/~eamonn/time_series_data_2018/) to get a complete picture of the format. 56 | 57 | ## Cite (BibTeX) 58 | 59 | Please cite the following paper if you find our work useful in your research: 60 | 61 | ``` 62 | @article{cai2023depression, 63 | title={Depression Detection on Online Social Network with Multivariate Time Series Feature of User Depressive Symptoms}, 64 | author={Cai, Yicheng and Wang, Haizhou and Ye, Huali and Jin, Yanwen and Gao, Wei}, 65 | journal={Expert Systems with Applications}, 66 | pages={119538}, 67 | volume = {217}, 68 | year={2023}, 69 | doi={10.1016/j.eswa.2023.119538} 70 | } 71 | ``` 72 | 73 | ## Supplementary Information to the Paper 74 | 75 | ### Depression Symptom Descriptions 76 | 77 | > in social media language 78 | 79 | | # | 症状名 (Symptom) | 症状描述 (Description) | 80 | |:---:|:--------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| 81 | | 1 | 悲伤情绪 (Sadness) | 长时间不开心、不高兴、不快乐,心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅 (Feeling unhappy, down, sad, or depressed for a long time, with a low, gloomy, distressed, discouraged or hopeless mood. Often feeling like crying, shedding tears of sorrow, agony and grief, feeling empty, unbearable melancholy and desolation.) | 82 | | 2 | 兴致下降 (Loss of interest/pleasure) | 对几乎所有活动没兴趣、没意思没动力,乐趣明显减少、没有愉悦感,厌世、成天无精打采 (Having little interest or pleasure in almost all activities, feeling that things are meaningless or worthless, lacking motivation and drive. Experiencing a noticeable decrease in enjoyment, inability to feel joy, disenchantment with life, feeling lifeless and apathetic all day.)
| 83 | | 3 | 食欲问题 (Appetite problem) | 食欲减退、经常饱、没胃口、想吐 (Loss of appetite, feeling full frequently, no desire to eat, feeling like vomiting.) | 84 | | 4 | 睡眠障碍 (Insomnia) | 经常失眠睡不着、服用安眠药、熬夜到凌晨 (Frequently unable to sleep, insomnia, taking sleeping pills, staying up late into the early morning.) | 85 | | 5 | 急躁 (Agitation) | 精神性躁动、易感烦躁、坐立难安,言行冲动、易怒、易抓狂 (Mental agitation, easily irritated, restless, impulsive in words and actions, prone to anger, easily driven crazy.) | 86 | | 6 | 精力不足 (Energy Loss) | 经常感到累、困、昏晕乏力、疲惫没力气、没有精神 (Often feeling tired, sleepy, dizzy, weak, fatigued, lacking energy and vitality.) | 87 | | 7 | 自责 (Self-blame) | 经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错 (Frequently self-negating, feeling useless, worthless, incompetent, a failure who lets myself or family down. Often feeling guilty, blaming and being hard on myself, thinking everything is my fault.) | 88 | | 8 | 注意力下降 (Concentration Problem) | 注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚 (Decreased attention, inability to focus, difficulty concentrating, reduced thinking ability, indecisiveness, mental confusion.) | 89 | | 9 | 自杀倾向 (Suicidal Ideation) | 反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀 (Repeated thoughts of death, wanting to die, suicide, ending one's life. Self-harming with razor blades, thinking of jumping off a building to commit suicide, making suicide plans.) | 90 | | 10 | 交感神经唤醒 (Sympathetic Arousal) | 心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗 (Feeling panic, heart palpitations, chest tightness, shortness of breath, trembling, blurred vision, breaking out in a cold sweat.) | 91 | | 11 | 恐慌 (Panic) | 经常好怕、害怕、恐惧、恐慌,想逃避 (Often feeling scared, afraid, terrified, panicked, wanting to escape.) | 92 | 93 | ### Depressive search words 94 | 95 | > To find depression indicative tweets in Sina Weibo. 
96 | > 97 | > The crawler of Sina Weibo see https://github.com/cyc21csri/SinaWeiboCrawler 98 | 99 | | # | Chinese Search Words | English Meaning | 100 | |:---:|:--------------------:|:--------------------------------------------------------:| 101 | | 1 | \#抑郁症\# | Sina Weibo super topic of "Depression" in English | 102 | | 2 | 文拉法辛 | "Venlafaxine" in English | 103 | | 3 | 舍曲林 | "Sertraline" in English | 104 | | 4 | 度洛西汀 | "Duloxetine" in English | 105 | | 5 | 抑郁 一无是处 | "Depression" and "Good for nothing" in English | 106 | | 6 | 抑郁 生无可恋 | "Depression" and "Don't want to live anymore" in English | 107 | | 7 | 抑郁 没意思 | "Depression" and "Boring" in English | 108 | | 8 | 抑郁 难熬 | "Depression" and "Suffering" in English | 109 | | 9 | 抑郁 自残 | "Depression" and "Self-harm" in English | 110 | | 10 | 抑郁 吃药 | "Depression" and "Take medicine" in English | 111 | | 11 | 抑郁 想哭 | "Depression" and "Want to cry" in English | 112 | | 12 | 抑郁 想死 | "Depression" and "Want to die" in English | 113 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | from munch import Munch 2 | import numpy as np 3 | 4 | NB_VARIABLES = 11 5 | TIMESTEPS = 500 6 | NUM_CLASSES = 2 7 | MALSTM_DATA_DIR = "dataset/swdd-7k_npz_500_{}".format(TIMESTEPS) 8 | SKTIME_DATA_DIR = "dataset/swdd-7k_ts_origin_500_{}".format(TIMESTEPS) 9 | SAVE_DIR = "results/swdd-7k_model_500_{}".format(TIMESTEPS) 10 | 11 | lab_config = Munch( 12 | { 13 | "model_save_directory": SAVE_DIR, 14 | "malstm": { 15 | "data_dir": MALSTM_DATA_DIR, 16 | "model_config": { 17 | "alstm_units": 8, 18 | "dropout": 0.8, 19 | "filters": [128, 256, 128], 20 | "kernel_sizes": [8, 5, 3], 21 | "padding": "same", 22 | "kernel_initializer": "he_uniform", 23 | "activation": "relu", 24 | "num_classes": NUM_CLASSES, 25 | "input_shape": (NB_VARIABLES, TIMESTEPS), # TIMESTEPS 26 | }, 27 | "train_config": { 28 | "batch_size": 128, 29 | "epochs": 100, 30 | "learning_rate": 1e-3, 31 | "callback_config": { 32 | "reduce_lr": { 33 | "monitor": "val_loss", # "loss", 34 | "patience": 100, 35 | "mode": "auto", 36 | "factor": 1.0 / np.cbrt(2), 37 | "min_lr": 1e-4, 38 | }, 39 | }, 40 | }, 41 | "model_file": "malstm_fcn_7k.keras", 42 | }, 43 | "sktime_dl": { 44 | "data_dir": SKTIME_DATA_DIR, 45 | "network_config": { 46 | "mcnn": { 47 | "kernel_size": 7, 48 | "avg_pool_size": 3, 49 | "nb_conv_layers": 2, 50 | "filter_sizes": [6, 12], 51 | }, 52 | "fcn": {}, 53 | "mcdcnn": { 54 | "kernel_size": 5, 55 | "pool_size": 2, 56 | "filter_sizes": [8, 8], 57 | "dense_units": 732, 58 | }, 59 | "twiesn": {}, 60 | "inception": { 61 | "nb_filters": 32, 62 | "use_residual": True, 63 | "use_bottleneck": True, 64 | "bottleneck_size": 32, 65 | "depth": 6, 66 | "kernel_size": 41 - 1, 67 | }, 68 | }, 69 | "train_config": { 70 | "mcnn": { 71 | "batch_size": 32, # 16, 72 | "nb_epochs": 200, # 200 73 | "verbose": True, 74 | "random_state": 0, # 在comparison实验中需统一 75 | "model_save_directory": SAVE_DIR, 76 | "model_name": "cnn-7k", 77 | }, 78 | "fcn": { 79 | "nb_epochs": 200, # 200, # 2000, 80 | "batch_size": 64, # 16, 81 | "verbose": True, 82 | "random_state": 0, 83 | "model_name": "fcn-7k", 84 | "model_save_directory": SAVE_DIR, 85 | }, 86 | "inception": { 87 | "nb_epochs": 500, # 500 # 1500, 88 | "batch_size": 64, 89 | "verbose": True, 90 | "random_state": 0, 91 | "model_name": "inception-7k", 92 | "model_save_directory": SAVE_DIR, 93 | }, 94 | "mcdcnn": { 95 | "nb_epochs": 120, # 120 96 | 
"batch_size": 16, 97 | "verbose": True, 98 | "random_state": 0, 99 | "model_name": "mcdcnn-7k", 100 | "model_save_directory": SAVE_DIR, 101 | }, 102 | "twiesn": { 103 | "rho_s": [0.55, 0.9, 2.0, 5.0], 104 | "alpha": 0.1, # leaky rate 105 | "verbose": True, 106 | "random_state": 0, 107 | "model_name": "twiesn-7k", 108 | "model_save_directory": SAVE_DIR, 109 | }, 110 | }, 111 | }, 112 | } 113 | ) 114 | -------------------------------------------------------------------------------- /envs/sktime-0.5.3.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/envs/sktime-0.5.3.zip -------------------------------------------------------------------------------- /envs/sktime-dl-0.2.0-modify.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/envs/sktime-dl-0.2.0-modify.zip -------------------------------------------------------------------------------- /img/Method-DSTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/img/Method-DSTS.png -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | from utils.data import load_swdd_xk, load_swdd_xk_emb 2 | from utils.extractor import WeiboText, get_post_time 3 | from utils.symptom import symptoms_dsm_5 as symptoms 4 | from sentence_transformers import SentenceTransformer, util 5 | import torch 6 | from torch.nn import ZeroPad2d 7 | import pandas as pd 8 | import numpy as np 9 | import os 10 | 11 | 12 | def extract_embedding( 13 | data_dir="swdd-7k", 14 | save_dir="swdd-7k_embedding", 15 | modelname="paraphrase-xlm-r-multilingual-v1" 16 | ): 17 | """提取推文向量 18 | model_list = [ 19 | "distiluse-base-multilingual-cased-v1", # 512 20 | "paraphrase-xlm-r-multilingual-v1", # 768, best 21 | "stsb-xlm-r-multilingual", # 768 22 | ] 23 | """ 24 | data = load_swdd_xk(data_dir=data_dir) 25 | model = SentenceTransformer(modelname) 26 | weibo_cleaner = WeiboText() 27 | 28 | if not os.path.exists(save_dir): 29 | os.mkdir(save_dir) 30 | 31 | cnt = 0 32 | for i in range(len(data)): 33 | cnt += 1 34 | if not cnt % 100: 35 | print(cnt, end=" ", flush=True) 36 | if not cnt % 1000: 37 | print() 38 | # extract embedding 39 | tweets = data[i]["tweets"] 40 | df_tweets = pd.DataFrame( 41 | [ 42 | { 43 | "is_origin": tweet["is_origin"], 44 | "time": get_post_time(tweet["post_time"]), 45 | "text": weibo_cleaner.get_cleaned_text(tweet["text"]), 46 | } 47 | for tweet in tweets 48 | ] 49 | ) 50 | 51 | tweets_emb = model.encode(df_tweets["text"].tolist()) 52 | df_emb = pd.DataFrame({"embedding": list(tweets_emb)}) 53 | df = pd.concat([df_tweets, df_emb], axis=1) 54 | 55 | np.savez( 56 | os.path.join(save_dir, "%04d.npz" % (i)), 57 | is_origin=df["is_origin"].tolist(), 58 | time=df["time"].tolist(), 59 | text=df["text"].tolist(), 60 | embedding=df["embedding"].tolist(), 61 | ) # 不能用to_numpy!! 
62 | 63 | 64 | def extract_time_series_feature( 65 | data_dir="swdd-7k_embedding", 66 | modelname="paraphrase-xlm-r-multilingual-v1", 67 | pad_len=500, 68 | interval_spans=0, 69 | origin_only=False 70 | ): 71 | if origin_only: 72 | save_dir = "{}_origin_{}_{}".format(data_dir, pad_len, interval_spans) 73 | else: 74 | save_dir = "{}_{}_{}".format(data_dir, pad_len, interval_spans) 75 | if not os.path.exists(save_dir): 76 | os.mkdir(save_dir) 77 | 78 | model = SentenceTransformer(modelname) 79 | symp_text = [v for k, v in symptoms.items()] 80 | symp_emb = model.encode(symp_text) 81 | symp_emb = torch.from_numpy(symp_emb) 82 | print("Symptom Embed: ({}, {})".format(symp_emb.shape[0], symp_emb.shape[1])) 83 | 84 | # load per-user tweet embeddings 85 | data = load_swdd_xk_emb(data_dir=data_dir) 86 | for i in range(len(data)): 87 | # Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. 88 | # tweet_emb = torch.Tensor(data[i]["embedding"].tolist()) 89 | df = data[i] 90 | if origin_only: # NOTE: a user may have zero original tweets, so filtering alone is not enough; enough original tweets must be guaranteed 91 | # print('origin only') 92 | df = df[df['is_origin']] 93 | df = df.reset_index(drop=True) 94 | # extract embedding 95 | # tweet_emb = torch.Tensor(np.array(data[i]["embedding"].tolist())) # NOTE: 96 | tweet_emb = torch.Tensor(np.array(df["embedding"].tolist())) 97 | # Compute cosine similarities between symptom and tweet embeddings 98 | cosine_scores = util.pytorch_cos_sim(symp_emb, tweet_emb) 99 | # print(cosine_scores.shape) 100 | pad = ZeroPad2d(padding=(0, pad_len - tweet_emb.shape[0], 0, 0)) 101 | cosine_scores = pad(cosine_scores) 102 | # print(cosine_scores.shape) 103 | # np.save(os.path.join(save_dir, "%04d" % i), cosine_scores) 104 | # yield cosine_scores 105 | # How should the padding be combined with interval_day resampling?
106 | # df = data[i] # NOTE: 107 | time_series = cosine_scores 108 | if interval_spans: 109 | num = 0 110 | for k, v in symptoms.items(): 111 | df_symp = pd.DataFrame({k[:3]: time_series[num]}) 112 | df = pd.concat([df, df_symp], axis=1) 113 | num += 1 114 | 115 | # index by post time 116 | df['time'] = df['time'].astype(np.string_) # np.str_-> np.string_ 117 | df['time'] = df['time'].apply(lambda x: str(x, encoding='utf-8')) # bytes -> str 118 | df['time'] = pd.to_datetime(df['time']) 119 | df.set_index('time', inplace=True) 120 | # time_series = df.resample('{}D'.format(interval_days)).mean().fillna(0).values.T 121 | idx = df.index 122 | time_series = df.resample((np.max(idx)-np.min(idx))/(interval_spans-1)).mean().fillna(0).values.T 123 | # print(time_series.shape) 124 | np.save(os.path.join(save_dir, "%04d" % i), time_series) 125 | # yield time_series 126 | return save_dir -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | icecream 2 | jsonlines 3 | pandas 4 | harvesttext 5 | scikit-learn 6 | pyhanlp 7 | matplotlib 8 | lxml 9 | bs4 10 | weibo-preprocess-toolkit 11 | sentence-transformers 12 | keras==2.3.1 13 | esig 14 | tsfresh -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__init__.py -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/analysis.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/analysis.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/analysis.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/analysis.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/clog.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/clog.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/clog.cpython-38.pyc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/clog.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/data.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/data.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/extractor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/extractor.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/informer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/informer.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parser.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/parser.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/summarizor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/summarizor.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/symptom.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/symptom.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/system_monitor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/system_monitor.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/system_monitor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ethan-nicholas-tsai/DepressionDetection/fb75bb15976acb00843d851d328ba129593fec55/utils/__pycache__/system_monitor.cpython-38.pyc 
-------------------------------------------------------------------------------- /utils/analysis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2021/3/12 12:10 4 | # @Author : cendeavor 5 | # @Site : 6 | # @File : analysis_utils.py 7 | # @Software: PyCharm 8 | 9 | import tensorflow as tf 10 | import keras 11 | import numpy as np 12 | 13 | 14 | def cal_basic_metrics(y_true, y_pred): 15 | from sklearn.metrics import ( 16 | confusion_matrix, 17 | precision_score, 18 | accuracy_score, 19 | recall_score, 20 | f1_score, 21 | roc_auc_score, 22 | precision_recall_fscore_support, 23 | roc_curve, 24 | classification_report, 25 | ) 26 | 27 | # print('accuracy:{}'.format(accuracy_score(y_true, y_pred))) # accuracy_score has no average argument 28 | # print('precision:{}'.format(precision_score(y_true, y_pred,average='micro'))) 29 | # print('recall:{}'.format(recall_score(y_true, y_pred,average='micro'))) 30 | # print('f1-score:{}'.format(f1_score(y_true, y_pred,average='micro'))) 31 | # print('f1-score-for-each-class:{}'.format(precision_recall_fscore_support(y_true, y_pred))) # for macro 32 | ans = classification_report(y_true, y_pred, digits=5) # keep 5 decimal places 33 | print(ans) 34 | return ans 35 | 36 | 37 | def cal_auc(y_true_one_hot, y_pred_prob): 38 | from sklearn.metrics import roc_auc_score 39 | 40 | # AUC value 41 | # with micro averaging, one ROC curve is computed per class and then averaged 42 | auc = roc_auc_score(y_true_one_hot, y_pred_prob, average="micro") 43 | print("AUC y_pred = proba:", auc) 44 | return auc 45 | 46 | 47 | def plot_roc(y_true, y_pred_prob): 48 | # The magic happens here 49 | import matplotlib.pyplot as plt 50 | import scikitplot as skplt 51 | 52 | fig, ax = plt.subplots() # figsize=(16, 12) can be passed to set the figure size 53 | skplt.metrics.plot_roc(y_true, y_pred_prob, ax=ax) 54 | return fig 55 | # skplt.metrics.plot_roc(y_true, y_pred_prob) 56 | # plt.show() # not necessary 57 | # return plt 58 | 59 | 60 | def plot_prc(y_true, y_pred_prob): 61 | import matplotlib.pyplot as plt 62 | import scikitplot as skplt 63 | 64 | fig, ax = plt.subplots() # figsize=(16, 12) can be passed to set the figure size 65 | skplt.metrics.plot_precision_recall_curve(y_true, y_pred_prob, ax=ax) 66 | return fig 67 | 68 | 69 | def plot_confusion_matrix(y_true, y_pred): 70 | import matplotlib.pyplot as plt 71 | import scikitplot as skplt 72 | 73 | fig, ax = plt.subplots() 74 | skplt.metrics.plot_confusion_matrix(y_true, y_pred, normalize=True, ax=ax) 75 | return fig 76 | # plot = skplt.metrics.plot_confusion_matrix(y_true, y_pred, normalize=True) 77 | # plt.show() # not necessary 78 | # return plt 79 | 80 | 81 | def report_model_performance(y_true_one_hot, y_pred_prob): 82 | import numpy as np 83 | 84 | y_pred = np.argmax(y_pred_prob, axis=1) 85 | y_true = np.argmax(y_true_one_hot, axis=1) 86 | 87 | cal_basic_metrics(y_true, y_pred) 88 | cal_auc(y_true_one_hot, y_pred_prob) 89 | plt1 = plot_roc(y_true, y_pred_prob) 90 | plt2 = plot_confusion_matrix(y_true, y_pred) 91 | return plt1, plt2 92 | 93 | 94 | # Precision metric 95 | def metric_precision(y_true, y_pred): 96 | TP = tf.reduce_sum(y_true * tf.round(y_pred)) 97 | TN = tf.reduce_sum((1 - y_true) * (1 - tf.round(y_pred))) 98 | FP = tf.reduce_sum((1 - y_true) * tf.round(y_pred)) 99 | FN = tf.reduce_sum(y_true * (1 - tf.round(y_pred))) 100 | precision = TP / (TP + FP) 101 | return precision 102 | 103 | 104 | # Recall metric 105 | def metric_recall(y_true, y_pred): 106 | TP = tf.reduce_sum(y_true * tf.round(y_pred)) 107 | TN = tf.reduce_sum((1 - y_true) * (1 - tf.round(y_pred))) 108 | FP = tf.reduce_sum((1
- y_true) * tf.round(y_pred)) 109 | FN = tf.reduce_sum(y_true * (1 - tf.round(y_pred))) 110 | recall = TP / (TP + FN) 111 | return recall 112 | 113 | 114 | # F1-score metric 115 | def metric_F1score(y_true, y_pred): 116 | TP = tf.reduce_sum(y_true * tf.round(y_pred)) 117 | TN = tf.reduce_sum((1 - y_true) * (1 - tf.round(y_pred))) 118 | FP = tf.reduce_sum((1 - y_true) * tf.round(y_pred)) 119 | FN = tf.reduce_sum(y_true * (1 - tf.round(y_pred))) 120 | precision = TP / (TP + FP) 121 | recall = TP / (TP + FN) 122 | F1score = 2 * precision * recall / (precision + recall) 123 | return F1score 124 | 125 | 126 | # Example of referencing the custom metrics at compile time 127 | # model.compile(optimizer='adam', 128 | # loss='binary_crossentropy', 129 | # metrics=['accuracy', 130 | # metric_precision, 131 | # metric_recall, 132 | # metric_F1score]) 133 | 134 | 135 | def get_hardest_k_examples(test_dataset, model, k=32): 136 | class_probs = model(test_dataset.x) 137 | predictions = np.argmax(class_probs, axis=1) 138 | losses = keras.losses.categorical_crossentropy(test_dataset.y, class_probs) 139 | argsort_loss = np.argsort(losses) 140 | 141 | highest_k_losses = np.array(losses)[argsort_loss[-k:]] 142 | hardest_k_examples = test_dataset.x[argsort_loss[-k:]] 143 | true_labels = np.argmax(test_dataset.y[argsort_loss[-k:]], axis=1) 144 | 145 | return highest_k_losses, hardest_k_examples, true_labels, predictions 146 | 147 | 148 | def calc_metrics_binary(model, X_test, y_test): 149 | from sklearn.metrics import ( 150 | classification_report, 151 | accuracy_score, 152 | f1_score, 153 | roc_auc_score, 154 | recall_score, 155 | precision_score, 156 | ) 157 | 158 | y_pred = model.predict(X_test) 159 | report = classification_report( 160 | y_test, y_pred, target_names=["Normal", "Depressed"], digits=4 161 | ) 162 | # # accuracy 163 | # acc = accuracy_score(y_test, y_pred) 164 | # # F1-score 165 | # f1 = f1_score(y_test, y_pred, pos_label='1') 166 | # # precision 167 | # prec = precision_score(y_test, y_pred) 168 | # # recall 169 | # rec = recall_score(y_test, y_pred) 170 | # # AUC (binary classification) 171 | # AUC = roc_auc_score(y_test, y_pred) 172 | return report # , acc, f1, prec, rec, AUC 173 | -------------------------------------------------------------------------------- /utils/clog.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | 4 | def count_time(func): 5 | """Decorator: measure a function's run time.""" 6 | from datetime import datetime 7 | 8 | @functools.wraps(func) # preserve the wrapped function's metadata 9 | def wrapper(*args, **kw): 10 | start_time = datetime.now() 11 | res = func(*args, **kw) 12 | print("[%s] RUN TIME: %s" % (func.__name__, str(datetime.now() - start_time))) 13 | return res # return the wrapped function's return value 14 | 15 | return wrapper 16 | -------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | from utils.clog import count_time 2 | import os 3 | import jsonlines 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | data_dir = "swdd" 8 | dep_file = "depressed.jsonl" 9 | con_file = "control.jsonl" 10 | dep_path = os.path.join(data_dir, dep_file) 11 | con_path = os.path.join(data_dir, con_file) 12 | 13 | 14 | @count_time 15 | def load_swdd_all(data_dir): 16 | """ """ 17 | 18 | dep_file = "depressed.jsonl" 19 | con_file = "control.jsonl" 20 | dep_path = os.path.join(data_dir, dep_file) 21 | con_path = os.path.join(data_dir, con_file) 22 | 23 | data = [] 24 | for filename in [dep_path, con_path]: 25 | print("Loading
{}".format(filename)) 26 | cnt = 0 27 | with open(filename, "r", encoding="utf8") as f: 28 | for item in jsonlines.Reader(f): 29 | datum = { 30 | "label": 1 if item["label"]["depressed"] else 0, 31 | **item["user"], 32 | "tweets": pd.DataFrame(item["tweets"]), 33 | } 34 | data.append(datum) 35 | cnt += 1 36 | print("Sample Num: {}".format(cnt)) 37 | df = pd.DataFrame(data) 38 | return df 39 | 40 | 41 | def get_quantile_upper_outliers(df, column_name, quantile=0.75): 42 | s = df[column_name] 43 | 44 | df_ = df.copy() 45 | # 这里将大于上四分位数(Q3)的设定为异常值 46 | # df_['isOutlier'] = s > s.quantile(0.75) 47 | df_.loc[:, "isOutlier"] = s > s.quantile(quantile) 48 | df_rst = df_[df_["isOutlier"] == True] 49 | return df_rst 50 | 51 | 52 | def get_quantile_lower_outliers(df, column_name, quantile=0.25): 53 | s = df[column_name] 54 | 55 | df_ = df.copy() 56 | # 这里将小于下四分位数(Q1)的设定为异常值 57 | df_.loc[:, "isOutlier"] = s < s.quantile(quantile) 58 | df_rst = df_[df_["isOutlier"] == True] 59 | return df_rst 60 | 61 | 62 | def get_box_plot_outliers(df, column_name): 63 | s = df[column_name] 64 | 65 | df_ = df.copy() 66 | q1, q3 = s.quantile(0.25), s.quantile(0.75) 67 | iqr = q3 - q1 68 | low, up = q1 - 1.5 * iqr, q3 + 1.5 * iqr 69 | df_.loc[:, "isOutlier"] = s.mask((s < low) | (s > up)) 70 | df_rst = df_[df_["isOutlier"] == True] 71 | return df_rst 72 | 73 | 74 | @count_time 75 | def gen_swdd_7k(data_dir): 76 | """ """ 77 | 78 | import os 79 | import numpy as np 80 | import json 81 | import jsonlines 82 | 83 | # 加载数据集 84 | df = load_swdd_all(data_dir=data_dir) 85 | 86 | # 删除不要字段 87 | cols = [ 88 | i 89 | for i in df.columns 90 | if i 91 | not in ["avatar_url", "cover_image_url", "verified_reason", "verified_type"] 92 | ] 93 | df = df[cols] 94 | 95 | # 删除离异点 96 | dep_follow_outliers = get_quantile_upper_outliers( 97 | df[df["label"] == 1], column_name="follow_count", quantile=0.999 98 | ) 99 | dep_follower_outliers = get_quantile_upper_outliers( 100 | df[df["label"] == 1], column_name="followers_count", quantile=0.99 101 | ) 102 | dep_outliers = df.iloc[ 103 | np.union1d(dep_follow_outliers.index.values, dep_follower_outliers.index.values) 104 | ] 105 | con_follow_outliers = get_box_plot_outliers( 106 | df[df["label"] == 0], column_name="follow_count" 107 | ) 108 | con_follower_outliers = get_box_plot_outliers( 109 | df[df["label"] == 0], column_name="followers_count" 110 | ) 111 | con_outliers = df.iloc[ 112 | np.union1d(con_follow_outliers.index.values, con_follower_outliers.index.values) 113 | ] 114 | 115 | df_ = df.copy() 116 | df_ = df_.drop(dep_outliers.index.values) 117 | df_ = df_.drop(con_outliers.index.values).reset_index(drop=True) 118 | print(df_.describe()) 119 | 120 | 121 | # 删除原创推文少于30的 122 | for i in range(len(df_)): 123 | if len(df_['tweets'][i][df_['tweets'][i]['is_origin']]) < 30: 124 | df_ = df_.drop(i) 125 | 126 | 127 | # 采样7k 128 | samp_cnt = 3500 129 | df_7k = ( 130 | ( 131 | pd.concat( 132 | [ 133 | df_[df_["label"] == 1].sample(n=samp_cnt), 134 | df_[df_["label"] == 0].sample(n=samp_cnt), 135 | ] 136 | ) 137 | ) 138 | .sample(n=samp_cnt * 2) 139 | .reset_index(drop=True) 140 | ) 141 | 142 | # 删除推文字段(剔除转发推文) 143 | cols = [ 144 | i 145 | for i in df_7k.iloc[0]["tweets"].columns 146 | if i 147 | not in [ 148 | "edit_at", 149 | "pics_url", 150 | "publish_place", 151 | "publish_tool", 152 | "video_url", 153 | "article_url", 154 | "topics", 155 | "at_users", 156 | "retweet", 157 | ] 158 | ] 159 | df_ = df_7k.copy() 160 | for i in range(len(df_)): 161 | df_["tweets"][i] = df_["tweets"][i][cols] 162 | 
163 | df_7k = df_ 164 | print(df_7k.describe()) 165 | 166 | # return df_7k 167 | swdd_7k_dir = data_dir + "-7k" 168 | if not os.path.exists(swdd_7k_dir): 169 | os.mkdir(swdd_7k_dir) 170 | 171 | print("Writing to {}".format(swdd_7k_dir)) 172 | 173 | for i in range(len(df_7k)): 174 | samp = json.loads(df_7k.iloc[i].to_json(orient="columns")) 175 | samp["tweets"] = json.loads( 176 | df_7k.iloc[i]["tweets"].to_json(orient="records") 177 | ) # fuck it !!!! 178 | 179 | with jsonlines.open( 180 | os.path.join(swdd_7k_dir, "%04d.jsonl" % (i)), mode="w" 181 | ) as writer: 182 | writer.write(samp) 183 | 184 | print("Done") 185 | 186 | return df_7k 187 | 188 | 189 | @count_time 190 | def gen_swdd_4k(data_dir): 191 | import os 192 | import numpy as np 193 | import json 194 | import jsonlines 195 | 196 | # data_dir = "swdd" 197 | 198 | # 加载数据集 199 | df = load_swdd_all(data_dir=data_dir) 200 | 201 | # 删除不要字段 202 | cols = [ 203 | i 204 | for i in df.columns 205 | if i 206 | not in ["avatar_url", "cover_image_url", "verified_reason", "verified_type"] 207 | ] 208 | df = df[cols] 209 | 210 | # 删除离异点 211 | dep_follow_outliers = get_quantile_upper_outliers( 212 | df[df["label"] == 1], column_name="follow_count", quantile=0.999 213 | ) 214 | dep_follower_outliers = get_quantile_upper_outliers( 215 | df[df["label"] == 1], column_name="followers_count", quantile=0.99 216 | ) 217 | dep_outliers = df.iloc[ 218 | np.union1d(dep_follow_outliers.index.values, dep_follower_outliers.index.values) 219 | ] 220 | con_follow_outliers = get_box_plot_outliers( 221 | df[df["label"] == 0], column_name="follow_count" 222 | ) 223 | con_follower_outliers = get_box_plot_outliers( 224 | df[df["label"] == 0], column_name="followers_count" 225 | ) 226 | con_outliers = df.iloc[ 227 | np.union1d(con_follow_outliers.index.values, con_follower_outliers.index.values) 228 | ] 229 | 230 | import pandas as pd 231 | 232 | # 采样4k 233 | samp_cnt = 4000 234 | 235 | for dep_prop in range(10, 100, 10): 236 | df_ = df.copy() 237 | df_ = df_.drop(dep_outliers.index.values) 238 | df_ = df_.drop(con_outliers.index.values).reset_index(drop=True) 239 | print(df_.describe()) 240 | 241 | # 删除原创推文少于20的 242 | for i in range(len(df_)): 243 | if len(df_['tweets'][i][df_['tweets'][i]['is_origin']]) < 20: 244 | df_ = df_.drop(i) 245 | 246 | dep_prop = dep_prop / 100 247 | dep_cnt = int(samp_cnt * dep_prop) 248 | # con_cnt = int(samp_cnt * (1 - dep_prop)) # 浮点数问题。。 249 | con_cnt = samp_cnt - dep_cnt 250 | if dep_cnt % 100: 251 | con_cnt = int(samp_cnt * (1 - dep_prop)) 252 | dep_cnt = samp_cnt - con_cnt 253 | 254 | print(dep_cnt, con_cnt) 255 | 256 | df_4k = ( 257 | ( 258 | pd.concat( 259 | [ 260 | df_[df_["label"] == 1].sample(n=dep_cnt), 261 | df_[df_["label"] == 0].sample(n=con_cnt), 262 | ] 263 | ) 264 | ) 265 | .sample(n=samp_cnt) 266 | .reset_index(drop=True) 267 | ) 268 | 269 | # 删除推文字段(剔除转发推文) 270 | cols = [ 271 | i 272 | for i in df_4k.iloc[0]["tweets"].columns 273 | if i 274 | not in [ 275 | "edit_at", 276 | "pics_url", 277 | "publish_place", 278 | "publish_tool", 279 | "video_url", 280 | "article_url", 281 | "topics", 282 | "at_users", 283 | "retweet", 284 | ] 285 | ] 286 | df_ = df_4k.copy() 287 | for i in range(len(df_)): 288 | df_["tweets"][i] = df_["tweets"][i][cols] 289 | 290 | df_4k = df_ 291 | print(df_4k.describe()) 292 | 293 | swdd_4k_dir = data_dir + "-4k_{}".format(int(dep_prop * 100)) 294 | if not os.path.exists(swdd_4k_dir): 295 | os.mkdir(swdd_4k_dir) 296 | 297 | print("Writing to {}".format(swdd_4k_dir)) 298 | 299 | for i in 
range(len(df_4k)): 300 | samp = json.loads(df_4k.iloc[i].to_json(orient="columns")) 301 | samp["tweets"] = json.loads( 302 | df_4k.iloc[i]["tweets"].to_json(orient="records") 303 | ) # fuck it !!!! 304 | 305 | with jsonlines.open( 306 | os.path.join(swdd_4k_dir, "%04d.jsonl" % (i)), mode="w" 307 | ) as writer: 308 | writer.write(samp) 309 | 310 | print("Done") 311 | 312 | 313 | @count_time 314 | def load_swdd_xk(data_dir): 315 | data = [] 316 | # 乱序。。 317 | # for _, _, files in os.walk(data_dir): 318 | # for file in files: 319 | # with open(os.path.join(data_dir, file), "r", encoding="utf8") as f: 320 | # for item in jsonlines.Reader(f): 321 | # data.append(item) 322 | files = os.listdir(data_dir) 323 | files.sort() 324 | for file in files: 325 | with open(os.path.join(data_dir, file), "r", encoding="utf8") as f: 326 | for item in jsonlines.Reader(f): 327 | data.append(item) 328 | return data 329 | 330 | 331 | @count_time 332 | def load_swdd_xk_emb(data_dir): 333 | import numpy as np 334 | 335 | data = [] 336 | 337 | files = os.listdir(data_dir) 338 | files.sort() 339 | for file in files: 340 | datum = np.load(os.path.join(data_dir, file), allow_pickle=True) 341 | df = pd.DataFrame({k: list(datum[k]) for k in datum.files}) 342 | data.append(df) 343 | return data 344 | 345 | 346 | @count_time 347 | def load_swdd_xk_npz(data_dir): 348 | import os 349 | import numpy as np 350 | 351 | train_data = np.load(os.path.join(data_dir, "train.npz"), allow_pickle=True) 352 | test_data = np.load(os.path.join(data_dir, "test.npz"), allow_pickle=True) 353 | X_train = [] 354 | id_train = [] 355 | X_test = [] 356 | id_test = [] 357 | 358 | for datum in train_data["X"]: 359 | X_train.append(datum[0]) 360 | id_train.append(datum[1]) 361 | 362 | for datum in test_data["X"]: 363 | X_test.append(datum[0]) 364 | id_test.append(datum[1]) 365 | 366 | X_train = np.array(X_train) 367 | X_test = np.array(X_test) 368 | id_train = np.array(id_train) 369 | id_test = np.array(id_test) 370 | y_train = train_data["y"] 371 | y_test = test_data["y"] 372 | 373 | return X_train, X_test, y_train, y_test, id_train, id_test 374 | 375 | 376 | def inspect_time_series(data_dir, dir_suffix, file_id): 377 | """查看某个用户的时间序列特征,封装成DataFrame返回 378 | # basic usage 379 | df.iloc[:144].plot(subplots=True, figsize=(10,12)) 380 | df.loc['2020-05'].plot() 381 | df.resample('Q')['sui'].mean() 382 | df.resample('2W').mean().fillna(0).values.T[0] 383 | # plot month average bar 384 | df_month = df.resample("M").mean() 385 | fig, ax = plt.subplots(figsize=(10, 6)) 386 | ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m')) 387 | ax.bar(df_month['2020':].index, df_month.loc['2020':, 388 | "ene"], width=25, align='center') 389 | """ 390 | import numpy as np 391 | import os 392 | from utils.symptom import symptoms_dsm_5 as symptoms 393 | 394 | time_series = np.load( 395 | os.path.join("{}_{}".format(data_dir, dir_suffix), "%04d.npy" % file_id) 396 | ) 397 | tweet_meta = np.load( 398 | os.path.join(data_dir, "%04d.npz" % file_id), allow_pickle=True 399 | ) 400 | df = pd.DataFrame({k: list(tweet_meta[k]) for k in tweet_meta.files}) 401 | 402 | num = 0 403 | for k, v in symptoms.items(): 404 | df_symp = pd.DataFrame({k[:3]: time_series[num]}) 405 | df = pd.concat([df, df_symp], axis=1) 406 | num += 1 407 | 408 | # 按照时间索引 409 | df["time"] = df["time"].astype(np.string_) # np.str_-> np.string_ 410 | df["time"] = df["time"].apply(lambda x: str(x, encoding="utf-8")) # bytes -> str 411 | df["time"] = pd.to_datetime(df["time"]) 412 | df.set_index("time", 
inplace=True) 413 | 414 | return df 415 | 416 | 417 | def plot_ex_width(x, y, x_maxsize): 418 | 419 | plt.plot(x, y) 420 | # plt.ylim((0, 1000)) 421 | # plt.title("Demo") 422 | plt.xlabel("x") 423 | plt.ylabel("y") 424 | 425 | # change x internal size 426 | plt.gca().margins(x=0) 427 | plt.gcf().canvas.draw() 428 | 429 | # set size 430 | maxsize = x_maxsize 431 | m = 0.2 432 | N = len(x) 433 | s = maxsize / plt.gcf().dpi * N + 2 * m 434 | margin = m / plt.gcf().get_size_inches()[0] 435 | 436 | plt.gcf().subplots_adjust(left=margin, right=1.0 - margin) 437 | plt.gcf().set_size_inches(s, plt.gcf().get_size_inches()[1]) 438 | 439 | 440 | def plot_time_series(time_series, expand_width=2): 441 | from utils.symptom import symptoms_dsm_5 as symptoms 442 | 443 | label_list = [k[:3] for k, v in symptoms.items()] 444 | 445 | x_values = list(range(1, time_series.shape[1] + 1)) 446 | for i in range(time_series.shape[0]): 447 | # plt.plot(x_values, time_series[i]) 448 | plot_ex_width(x_values, time_series[i], expand_width) 449 | plt.legend(labels=label_list) 450 | -------------------------------------------------------------------------------- /utils/extractor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from lxml import etree 4 | from bs4 import BeautifulSoup 5 | from weibo_preprocess_toolkit import WeiboPreprocess 6 | from harvesttext import HarvestText 7 | import pyhanlp 8 | 9 | 10 | def get_post_time(timestr="Mon Dec 14 11:26:56 +0800 2020"): 11 | import calendar 12 | import locale 13 | 14 | # locale.setlocale(locale.LC_ALL, "C.UTF-8") 15 | 16 | if not timestr: 17 | return "" 18 | temp = timestr.split(" ") 19 | time_area = temp[-2] 20 | if time_area != "+0800": 21 | print(time_area) 22 | # day_time = ':'.join(temp[3].split(':')[:-1]) 23 | day_time = temp[3] 24 | return ( 25 | temp[-1] 26 | + "-" 27 | + "{:0=2}".format(list(calendar.month_abbr).index(temp[1])) 28 | + "-" 29 | + temp[2] 30 | + " " 31 | + day_time 32 | ) 33 | 34 | 35 | class WeiboText: 36 | def __init__(self): 37 | 38 | self.preprocess = WeiboPreprocess() 39 | self.ht = HarvestText() 40 | self.CharTable = pyhanlp.JClass("com.hankcs.hanlp.dictionary.other.CharTable") 41 | self.d1 = re.compile(r"(.*)") 42 | self.d2 = re.compile(r"点击播放") # 点击播放>> 43 | self.d3 = re.compile(r"在.*获取更多信息") 44 | self.d4 = re.compile(r"速围观") 45 | self.d5 = re.compile(r"我获得了.*的红包") 46 | self.d6 = re.compile(r"#.*#") 47 | 48 | def get_cleaned_text(self, html): 49 | # TODO:
-> 空格 50 | # TODO: 多空格压缩(HarvestText) 51 | # TODO: 去奇怪符号 52 | # TODO: 表情字符icon翻译(翻译表) + 冗余字符icon去除(HarvestText) 53 | # TODO: 去数字(?) 54 | # TODO: 繁简体转化 55 | # TODO: 固定噪音去除(weibo_preprocess_toolkit) + HarvestText中自定义 56 | # 1. (分享自)、(通过 录制) 57 | # 2. 点击播放>> 58 | # 3. 在XXX获取更多信息 59 | # 4. 速围观 60 | # 5. ...全文 61 | # 6. 我获得了XXX的红包 62 | # 7. 打卡第X天 63 | soup = BeautifulSoup(html, features="lxml") 64 | tmp_a = [i.extract() for i in soup.find_all("a")] 65 | # 保留图片表情文本(但是一些表情比如微笑可能有反讽意味) 66 | # for i in soup.find_all('span', class_='url-icon'): 67 | # i.append(i.img.attrs['alt']) 68 | # return soup.get_text().lower().strip() 69 | text = soup.get_text() 70 | text = self.d1.sub("", text) 71 | text = self.d2.sub("", text) 72 | text = self.d3.sub("", text) 73 | text = self.d4.sub("", text) 74 | text = self.d5.sub("", text) 75 | text = self.d6.sub("", text) 76 | # 使用HarvestText清洗文本(空格压缩,去字符表情) 77 | content = self.CharTable.convert(text) 78 | cleaned_text = self.ht.clean_text(content, weibo_topic=True) 79 | # 使用weibo_preprocess_toolkit清洗文本(繁简体转化,去固定噪音,去数字,) 80 | cleaned_text = self.preprocess.clean(cleaned_text) 81 | return cleaned_text.strip() 82 | 83 | @staticmethod 84 | def get_raw_text(text_body): 85 | return etree.HTML(text_body).xpath("string(.)") 86 | 87 | @staticmethod 88 | def get_weibo_selector(text_body): 89 | return etree.HTML(text_body) 90 | 91 | @staticmethod 92 | def string_to_int(string): 93 | """字符串转换为整数""" 94 | if isinstance(string, int): 95 | return string 96 | elif string.endswith("万+"): 97 | string = int(string[:-2] + "0000") 98 | elif string.endswith("万"): 99 | string = int(string[:-1] + "0000") 100 | return int(string) 101 | 102 | @staticmethod 103 | def standardize_info(weibo): 104 | """标准化信息,去除乱码""" 105 | for k, v in weibo.items(): 106 | if ( 107 | "bool" not in str(type(v)) 108 | and "int" not in str(type(v)) 109 | and "list" not in str(type(v)) 110 | and "long" not in str(type(v)) 111 | ): 112 | weibo[k] = ( 113 | v.replace("\u200b", "") 114 | .encode(sys.stdout.encoding, "ignore") 115 | .decode(sys.stdout.encoding) 116 | ) 117 | return weibo -------------------------------------------------------------------------------- /utils/symptom.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2021/2/9 10:51 4 | # @Author : cendeavor 5 | # @Site : 6 | # @File : symptom.py 7 | # @Software: PyCharm 8 | 9 | # DSM-5 10 | symptoms_dsm_5 = { 11 | # 1. 抑郁心境:感到哀伤、空洞、绝望等 12 | "depressive_mood": "长时间不开心、不高兴、不快乐,心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅", 13 | # 2. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 14 | "interest_pleasure_loss": "对几乎所有活动没兴趣、没意思没动力,乐趣明显减少、没有愉悦感,厌世、成天无精打采", 15 | # 3. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 16 | "appetite_weight_problem": "食欲减退、经常饱、没胃口、想吐或食欲增加、不明原因暴食,体重明显减轻或体重增加", 17 | # 4. 睡眠困扰:入睡困难、睡不安稳,或睡眠过多 18 | "insomnia_or_hypersomnia": "经常失眠睡不着、服用安眠药,凌晨发帖时间多,嗜睡、起床困难、睡不醒", 19 | # 5. 日常行动迟缓或多动:动作或说话速度缓慢到别人已经觉察; 烦躁或坐立不安、动来动去的情况更胜于平常 20 | "retardation_or_agitation": "反应麻木迟钝、动作迟缓,对疼痛不敏感;精神性躁动、易感烦躁、坐立难安,言行冲动、易怒、易抓狂", 21 | # 6. 感到劳累: 感觉疲倦或精力不足 22 | "energy_loss": "经常感到累、困、昏晕乏力、疲惫没力气、没有精神", 23 | # 7. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 24 | "self_blame": "经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错", 25 | # 8. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 26 | "concentration_problem": "注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚", 27 | # 9. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 28 | "suicidal_ideation": "反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀", 29 | # 10. 
交感神经唤醒 30 | "sympathetic_arousal": "心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗", 31 | # 11. 恐慌&焦虑:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现; 32 | "panic_and_anxious": "经常好怕、害怕、恐惧、恐慌,想逃避;浑身躁动、经常烦躁、焦躁不安、焦虑、过度紧张、过度担心、忐忑、精神紧绷" 33 | } 34 | 35 | # DSM-5结合论文 36 | symptoms_combined = { 37 | # 1. 抑郁心境:感到哀伤、空洞、绝望等 38 | "depressive_mood": "长时间不开心、不高兴、不快乐,心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅", 39 | # 2. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 40 | "interest_pleasure_loss": "对几乎所有活动没兴趣、没意思没动力,乐趣明显减少、没有愉悦感,厌世、成天无精打采", 41 | # 3. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 42 | "appetite_weight_problem": "食欲减退、经常饱、没胃口、想吐或食欲增加、不明原因暴食,体重明显减轻或体重增加", 43 | # 4. 睡眠困扰:入睡困难、睡不安稳,或睡眠过多 44 | "insomnia_or_hypersomnia": "经常失眠睡不着、服用安眠药,凌晨发帖时间多,嗜睡、起床困难、睡不醒", 45 | # 5. 日常行动迟缓:动作或说话速度缓慢到别人已经觉察; 46 | "retardation": "反应麻木迟钝、动作迟缓,对疼痛不敏感", 47 | # 6. 感到劳累: 感觉疲倦或精力不足 48 | "energy_loss": "经常感到累、困、昏晕乏力、疲惫没力气、没有精神", 49 | # 7. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 50 | "self_blame": "经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错", 51 | # 8. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 52 | "concentration_problem": "注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚", 53 | # 9. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 54 | "suicidal_ideation": "反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀", 55 | # 10. 交感神经唤醒 56 | "sympathetic_arousal": "心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗", 57 | # 11. 恐慌&焦虑&狂躁:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现;烦躁或坐立不安、动来动去的情况更胜于平常 58 | "panic_and_anxious_or_agitation": "经常好怕、害怕、恐惧、恐慌,想逃避;浑身躁动、经常烦躁、焦躁不安、坐立难安、焦虑、过度担心、紧张、忐忑、精神紧绷;言行冲动、易怒、易抓狂" 59 | } 60 | 61 | # 纯粹按照论文 62 | symptoms_disaggregate = { 63 | # 1. 感到劳累: 感觉疲倦或精力不足 64 | "energy_loss": "经常感到累、困、昏晕乏力、疲惫没力气、没有精神", 65 | # 2. 抑郁心境:感到哀伤、空洞、绝望等 66 | "sadness": "心情低落消沉、郁闷压抑、沮丧或绝望,经常总是悲伤想哭、伤心流泪、痛苦难过、感到空虚难熬惆怅", 67 | # 3. 交感神经唤醒 68 | "sympathetic_arousal": "心慌、心悸、胸闷、喘不过气、颤抖、视力模糊、冒冷汗", 69 | # 4. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 70 | "interest_loss": "对几乎所有活动没兴趣、没意思没动力、厌世、成天无精打采", 71 | # 5. 感受不到快乐,愉悦感消失 72 | "pleasure_loss": "乐趣明显减少、没有愉悦感,感受不到快乐、不开心、不高兴", 73 | # 6. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 74 | "concentration_problem": "注意力下降、无法专注、感到集中注意力困难、思考能力减退、犹豫不决、精神恍惚", 75 | # 7. 恐慌:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现 76 | "panic": "经常好怕、害怕、恐惧、恐慌,想逃避", 77 | # 8. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 78 | "appetite_problem": "食欲减退、经常饱、没胃口、想吐,食欲增加、不明原因暴食", 79 | # 9. 失眠:入睡困难、睡不安稳 80 | "insomnia": "经常失眠睡不着、服用安眠药,凌晨发帖时间多", 81 | # 10. 焦虑:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现 82 | "anxious": "浑身躁动、经常烦躁、焦躁不安、焦虑、过度紧张、过度担心、忐忑、精神紧绷", 83 | # 11. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 84 | "self_blame": "经常自我否定,我好没用、没有价值、一无是处、一事无成、好失败,让自己或家人失望,经常对不起、内疚自责、都是我的错", 85 | # 12. 日常行动迟缓或多动:动作或说话速度缓慢到别人已经觉察; 烦躁或坐立不安、动来动去的情况更胜于平常 86 | "retardation": "反应麻木迟钝、动作迟缓,对疼痛不敏感", 87 | # 9. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 88 | "suicidal_ideation": "反复想到死亡、想死、自杀、结束生命,用刀片割腕自残、想跳楼自杀、计划自杀", 89 | # 8. 胃口或体重明显改变:食欲不振或吃太多,在未节食的情况下体重明显减轻,或体重增加 90 | "weight_problem": "体重明显减轻或体重增加", 91 | # 15. 狂躁:烦躁或坐立不安、动来动去的情况更胜于平常 92 | "agitation": "精神性躁动、易感烦躁、坐立难安,言行冲动、易怒、易抓狂", 93 | # 16. 嗜睡:睡眠过度 94 | "hypersomnia": "嗜睡、起床困难、睡不醒", 95 | } 96 | 97 | a = { 98 | # 1. 显著的日常(娱乐)活动的减少: 做事时提不起劲或没有兴趣 99 | "interest_loss": "对几乎所有活动兴趣减少", 100 | # "keywords": ["兴趣", "没意思", "无精打采", "厌世", "动力"] # "没兴趣" 101 | # 2. 快感消失:感到心情低落、沮丧或绝望 102 | "pleasure_loss": "长时间开心不起来,没有愉悦感", 103 | # "keywords": ["不开心", "不高兴", "不快乐", "郁闷", "压抑", "难熬", "消沉", "低落", '丧'] 104 | # 3. 感到劳累: 感觉疲倦或没有活力 105 | "energy_loss": "乏力,经常性疲惫,没有精神", 106 | # "keywords": ["累", "没力气", "发软", "躺", "困", "昏", "晕"] 107 | # 4. 抑郁心境:感到哀伤,空洞,绝望等 108 | "sadness": "悲伤、空虚、无望", 109 | # "keywords": ["哭", "伤心", "难受", "痛苦", "惆怅", "难过"] 110 | # 5. 
交感神经唤醒 111 | "sympathetic_arousal": "心悸、颤抖、视力模糊、冒冷汗", 112 | # "keywords": ["心悸", "颤抖", "模糊", "冒汗", "冷汗", "胸闷", "心慌"] 113 | # 6. 无法集中注意力,决策力下降:对事物专注有困难,例如阅读报纸或看电视时不能集中注意力 114 | "concentration_problem": "思考或注意力集中的能力减退或犹豫不决、出现精神恍惚", 115 | # "keywords": ["注意力", "不集中"] 116 | # 7. 恐慌:对于特定的物体有显著的恐惧和焦虑或者逃避,且持续呈现 117 | "panic": "经常无故感到害怕", 118 | # "keywords": ["好怕", "恐惧", "恐慌", "害怕"] # 怕 119 | # 8. 胃口的改变:食欲不振或吃太多 120 | "appetite_problem": "食欲减退、经常没胃口、呕吐或食欲增加、不明原因暴饮暴食", 121 | # "keywords": ["吐", "没胃口", "饱", "暴食"] 122 | # 9. 睡眠困扰(失眠或轻度睡眠困扰):入睡困难、睡不安稳 123 | "insomnia": "自述经常失眠、凌晨0-6点发帖时间较多", 124 | # "keywords": ["安眠药", "失眠", "睡不着"] 125 | # 10. 焦虑:浑身躁动,时刻感到十分烦恼 126 | "anxious": "经常感到焦躁不安,包含了紧张、过度担心", 127 | # "keywords": ["焦虑", "紧张", "担心", "紧绷", "喘不过气", "忐忑"] 128 | # 11. 感到无用,自责:觉得自己很糟,或觉得自己很失败,或让自己或家人失望 129 | "self_blame": "感到愧疚或没有价值", 130 | # "keywords": ["对不起", "没用", "一无是处", "不中用", "自我否定", "一事无成", "孤独"] # "错", 131 | # 12. 日常行动迟缓:动作或说话速度缓慢到别人已经觉察 132 | "retardation": "反应不敏捷,对疼痛不敏感", 133 | # "keywords": ["麻木", "笨"] # "慢", 134 | # 13. 不断地想到死亡,自杀念头但是没有计划:有不如死掉或用某种方式伤害自己的念头 135 | "suicidal_ideation": "重复地想到死亡,重复地想到自杀但是没有详细计划,自杀尝试或者明确的自杀计划", 136 | # "keywords": ["自杀", "结束", "死", "自残"] 137 | # 14. 体重的明显改变: 在未节食的情况下体重明显减轻,或体重增加 138 | "weight_problem": "", 139 | # "keywords": ["变重", "变轻"] # "肥", "瘦" 140 | # 15. 日常行为的多动:烦躁或坐立不安、动来动去的情况更胜于平常 141 | "agitation": "精神性躁动,言语偏激、易怒", 142 | # "keywords": ["烦", "抓狂", "冲动", "易怒"] 143 | # 16. 嗜睡:睡眠过多 144 | "hypersomnia": "", 145 | # "keywords": ["懒", "起床", "嗜睡", "睡不醒"] 146 | } 147 | --------------------------------------------------------------------------------