├── Analyse
│   ├── keyWordsGeneration
│   │   ├── Date.py
│   │   ├── curveFit.py
│   │   ├── drawPicLab.py
│   │   ├── drawRadarMap.py
│   │   ├── getCOVKeywords.py
│   │   ├── getMachineLearningRes.py
│   │   ├── hotAndCOV.py
│   │   ├── redme.md
│   │   └── wordFromJSTV.py
│   └── sentimentDictionary
│       ├── emotionWord_count.py
│       ├── polarSocre.py
│       ├── redme.md
│       └── snownlp-01.py
├── Data
│   ├── README.md
│   ├── 四阶段新浪新闻标题_title_肺炎
│   │   ├── README.md
│   │   ├── stage1News2019_sina.csv
│   │   ├── stage1News2019_sina_title.csv
│   │   ├── stage1News2020_sina.csv
│   │   ├── stage1News2020_sina_title.csv
│   │   ├── stage2News_sina_title.csv
│   │   ├── stage3News_sina_title.csv
│   │   └── stage4News_sina_title.csv
│   └── 相关内容在NJUBOX上.md
├── README.md
├── Report
│   ├── README.md
│   ├── 数据获取.md
│   └── 研究背景.md
└── Spyder
    ├── Weibo.py
    ├── jstvSpyder.py
    ├── newsSpyder-.py
    ├── newsSpyder.py
    └── tryloadjson.py

/Analyse/keyWordsGeneration/Date.py:
--------------------------------------------------------------------------------
1 | """
2 | Date helpers.
3 | """
4 | 
5 | 
6 | import re
7 | 
8 | hanziS = u"[\u4e00-\u9fa5]+"
9 | hanzi = re.compile(hanziS)
10 | 
11 | 
12 | # Turn "YYYY年MM月DD日" into the "YYYY-MM-DD-" format used by the rest of the pipeline
13 | def getDate(rdate):
14 |     date = re.sub(pattern=hanzi, repl="-", string=rdate, count=3)
15 |     return date
16 | 
17 | 
18 | # Build a date string from its numeric parts
19 | def generateDate(year, month, day):
20 |     return str(fillAZero(year) + '-' + fillAZero(month) + '-' + fillAZero(day) + '-')
21 | 
22 | 
23 | # Generate the next date string
24 | # (frankly, this one is a real chore)
25 | def getNextDate(date):
26 |     numbers = str(date).split('-')
27 |     year = int(numbers[0])
28 |     month = int(numbers[1])
29 |     day = int(numbers[2])
30 |     day += 1
31 |     bigMonth = [1, 3, 5, 7, 8, 10, 12]
32 |     smallMonth = [4, 6, 9, 11]
33 |     if month in bigMonth:
34 |         if day == 32:
35 |             month += 1
36 |             day = 1
37 |     elif month in smallMonth:
38 |         if (day == 31):
39 |             month += 1
40 |             day = 1
41 |     elif month == 2:
42 |         if (year % 4 == 0 and year % 100 != 0) or year % 400 == 0:  # leap year (years divisible by 400 count too)
43 |             if day == 30:
44 |                 month += 1
45 |                 day = 1
46 |         else:  # all other years
47 |             if day == 29:
48 |                 month += 1
49 |                 day = 1
50 |     else:
51 |         print("ERROR!")
52 |     if month == 13:
53 |         year += 1
54 |         month = 1
55 |     return generateDate(year, month, day)
56 | 
57 | 
58 | # Pad a single digit with a leading '0'
59 | def fillAZero(num):
60 |     if num < 10:
61 |         return '0' + str(num)
62 |     else:
63 |         return str(num)
64 | 
65 | 
66 | # Compare two dates: return 1 if the first is later than the second, 0 if equal, -1 otherwise
67 | def dateCmp(date1, date2):
68 |     numbers1 = int(str(date1).replace('-', ''))
69 |     numbers2 = int(str(date2).replace('-', ''))
70 |     if numbers1 > numbers2:
71 |         return 1
72 |     elif numbers1 == numbers2:
73 |         return 0
74 |     else:
75 |         return -1
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/curveFit.py:
--------------------------------------------------------------------------------
1 | """
2 | Curve-fitting example.
3 | """
4 | 
5 | 
6 | import numpy as np
7 | import pylab as plt
8 | # import matplotlib.pyplot as plt
9 | from scipy.optimize import curve_fit
10 | 
11 | x = np.arange(0, 10, 1)  # must be an ndarray: gaussian() below does vectorized arithmetic on x
12 | y = [0, 1, 2, 3, 4, 5, 4, 3, 2, 1]
13 | 
14 | # Fit a sum of two Gaussians
15 | def gaussian(x, *param):
16 |     return param[0] * np.exp(-np.power(x - param[2], 2.) / (2 * np.power(param[4], 2.))) + \
17 |            param[1] * np.exp(-np.power(x - param[3], 2.) / (2 * np.power(param[5], 2.)))
18 | # fitting two Gaussians at once -- not bad
19 | 
20 | 
21 | popt, pcov = curve_fit(gaussian, x, y, p0=[3, 4, 3, 6, 1, 1])
22 | print('popt')
23 | print(popt)
24 | print('pcov')
25 | print(pcov)
26 | 
27 | plt.plot(x, y, 'b+:', label='data')
28 | plt.plot(x, gaussian(x, *popt), 'ro:', label='fit')
29 | plt.legend()
30 | plt.show()
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/drawPicLab.py:
--------------------------------------------------------------------------------
1 | """
2 | Scatter-plot example.
3 | """
4 | 
5 | import matplotlib.pyplot as plt
6 | import numpy as np
7 | import json
8 | 
9 | path = 'jstv'
10 | beginDate = '2019-12-08-'
11 | endDate = '2019-12-26-'
12 | date = '1970-01-01-'
13 | 
14 | 
15 | def run():
16 |     filePath = path + '/' + 'keywords-Stage1.json'
17 |     fp = open(filePath, mode='r', encoding='utf-8')
18 |     a = json.load(fp)
19 |     plt.rcParams['font.sans-serif'] = ['SimHei']  # for Chinese characters
20 |     plt.rcParams['figure.dpi'] = 300
21 |     plt.xlabel('横坐标')  # xlabel is a function; "plt.xlabel = ..." would overwrite it instead of setting the label
22 | 
23 |     x = [i[0] for i in a]
24 |     y = [i[1] for i in a]
25 |     maxN = max(y)
26 |     y = [i / maxN for i in y]
27 |     x = x[0:15]
28 |     y = y[0:15]
29 |     plt.plot(x, y, color="r", linestyle="-", marker="^", linewidth=1)
30 |     plt.show()
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     run()
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/drawRadarMap.py:
--------------------------------------------------------------------------------
1 | # Import third-party modules
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import math
5 | import Date
6 | import stage
7 | import json
8 | 
9 | stageNo = 5
10 | path = stage.stage[stageNo]['path']
11 | beginDate = stage.stage[stageNo]['beginDate']
12 | print(beginDate)
13 | endDate = stage.stage[stageNo]['endDate']
14 | print(endDate)
15 | date = '1970-01-01-'
16 | 
17 | 
18 | def run():
19 |     date = beginDate
20 |     while Date.dateCmp(date, endDate) != 1:
21 |         try:
22 |             drawRadar(date)
23 |             print(date + ' Finished!')
24 |         except Exception as e:
25 |             print(e)
26 |         date = Date.getNextDate(date)
27 |         # break
28 |     # drawRadar(endDate)
29 | 
30 | 
31 | def drawRadar(date):
32 |     # Render Chinese characters and minus signs correctly
33 |     plt.clf()
34 |     plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
35 |     plt.rcParams['axes.unicode_minus'] = False
36 | 
37 |     # Use the ggplot style
38 |     plt.style.use('ggplot')
39 | 
40 |     # Build the data
41 |     # values = [3.2, 2.1, 3.5, 2.8, 3]
42 |     fp = open('weibo' + 'Stage2' + 'Score/' + date[0:10] + 'weiboScore.csv', mode='r', encoding='utf-8')  # NOTE: path is hard-coded to Stage2 even though stageNo selects the date range
43 |     values = getValuefromFile(fp)
44 |     # print(values)
45 |     feature = ['乐', '好', '怒', '哀', '惧', '恶', '惊']
46 |     N = len(values[0])
47 |     # print(values[0])
48 |     print('N ' + str(N))
49 |     Num = len(values)
50 |     average = list(values[0])  # copy; aliasing values[0] would overwrite the first sample with the running average
51 |     for i in range(0, N):
52 |         for j in range(1, Num):
53 |             average[i] += values[j][i]
54 |         average[i] /= Num
55 | 
56 |     # Angles that split the circle evenly, one spoke per feature
57 |     angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
58 |     # print(angles)
59 | 
60 |     # Close the radar polygon by repeating the first point of every series
61 |     values_ = []
62 |     for item in values:
63 |         item1 = np.concatenate((item, [item[0]]))
64 |         # print(item)
65 |         values_.append(item1)
66 |     values = values_
67 |     angles = np.concatenate((angles, [angles[0]]))
68 |     average = np.concatenate((average, [average[0]]))
69 |     # print(angles)
70 | 
71 |     # Plot
72 |     fig = plt.figure()
73 |     ax = fig.add_subplot(111, polar=True)
74 |     # One faint polyline per sample
75 |     for item in values:
76 |         # print(len(angles), len(item))
77 |         # print(item)
78 |         ax.plot(angles, item, '', linewidth=0.1, color='y', label='e', alpha=0)
79 |         # print('filled Item' + str(item))
80 |         # Fill
81 |         ax.fill(angles, item, color='y', alpha=0.05)
82 |     # The average as a second, highlighted polyline
83 |     ax.plot(angles, average, '', linewidth=2, color='r', alpha=0.9, label='')
84 |     ax.fill(angles, average, color='r', alpha=0.25)
85 |     for a, b in zip(angles, average):
86 |         plt.text(a, b, (round(b, 2)), ha='center', va='bottom', color='black', fontsize=10)
87 |     # Label each feature axis
88 |     ax.set_thetagrids(angles * 180 / np.pi, feature)
89 |     # Radar range
90 |     maxN = max([max(x) for x in values])
91 |     ax.set_ylim(-1, maxN)
92 |     # Title
93 |     plt.title(date + '疫情相关重点微博评论心态分析')
94 | 
95 |     # Grid
96 |     ax.grid(True)
97 |     # Legend
98 |     # plt.legend(loc='best')
99 | 
100 |     # fp = open('jstvscore/' + date[0:10] + 'jstvScore.csv', mode='r', encoding='utf-8')
101 |     # Save the figure
102 |     plt.savefig(str(stage.stage[stageNo]['path'] + '/' + date + '/' + date + 'weiboEmotion.png'), dpi=200)
103 | 
104 | 
105 | def drawAllPic():
106 |     # Render Chinese characters and minus signs correctly
107 |     plt.clf()
108 |     plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
109 |     plt.rcParams['axes.unicode_minus'] = False
110 | 
111 |     # Use the ggplot style
112 |     plt.style.use('ggplot')
113 | 
114 |     values = []
115 |     date = beginDate
116 |     while Date.dateCmp(date, endDate) != 1:
117 |         try:
118 |             fp = open('weibo' + 'Stage2' + 'Score/' + date[0:10] + 'weiboScore.csv', mode='r',
119 |                       encoding='utf-8')
120 |             tmpDateValues = getValuefromFile(fp)
121 |             for item in tmpDateValues:
122 |                 values.append(item)
123 |             print(date + ' Finished!')
124 |         except Exception as e:
125 |             print(e)
126 |         date = Date.getNextDate(date)
127 | 
128 |     # print(values)
129 |     feature = ['乐', '好', '怒', '哀', '惧', '恶', '惊']
130 |     N = len(values[0])
131 |     # print(values[0])
132 |     print('N ' + str(N))
133 |     Num = len(values)
134 |     average = list(values[0])  # copy; aliasing values[0] would overwrite the first sample with the running average
135 |     for i in range(0, N):
136 |         for j in range(1, Num):
137 |             average[i] += values[j][i]
138 |         average[i] /= Num
139 | 
140 |     # Angles that split the circle evenly, one spoke per feature
141 |     angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
142 |     # print(angles)
143 | 
144 |     # Close the radar polygon by repeating the first point of every series
145 |     values_ = []
146 |     for item in values:
147 |         item1 = np.concatenate((item, [item[0]]))
148 |         # print(item)
149 |         values_.append(item1)
150 |     values = values_
151 |     angles = np.concatenate((angles, [angles[0]]))
152 |     average = np.concatenate((average, [average[0]]))
153 |     # print(angles)
154 | 
155 |     # Plot
156 |     fig = plt.figure()
157 |     ax = fig.add_subplot(111, polar=True)
158 |     # One faint polyline per sample
159 |     for item in values:
160 |         # print(len(angles), len(item))
161 |         # print(item)
162 |         ax.plot(angles, item, '', linewidth=0.1, color='y', label='e', alpha=0.0)
163 |         # print('filled Item' + str(item))
164 |         # Fill
165 |         ax.fill(angles, item, color='y', alpha=0.01)
166 |     # The average as a second, highlighted polyline
167 |     ax.plot(angles, average, '', linewidth=2, color='r', alpha=0.9, label='')
168 |     ax.fill(angles, average, color='r', alpha=0.25)
169 | 
170 |     # Label each feature axis
171 |     ax.set_thetagrids(angles * 180 / np.pi, feature)
172 |     # Radar range
173 |     maxN = max([max(x) for x in values])
174 |     ax.set_ylim(-1, maxN)
175 |     # Title
176 |     plt.title('阶段 ' + str(stageNo) + ' 疫情相关重点微博评论心态分析')
177 |     for a, b in zip(angles, average):
178 |         plt.text(a, b, (round(b, 2)), ha='center', va='bottom', color='black', fontsize=10)
179 |     # Grid
180 |     ax.grid(True)
181 |     # Legend
182 |     # plt.legend(loc='best')
183 | 
184 |     # fp = open('jstvscore/' + date[0:10] + 'jstvScore.csv', mode='r', encoding='utf-8')
185 |     # Save the figure
186 |     plt.savefig(str(stage.stage[stageNo]['path'] + '/weiboEmotion.png'), dpi=200)
187 |     plt.show()
188 | 
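# --- A minimal sketch isolating the close-the-loop idiom used by drawRadar()
# and drawAllPic() above. plotClosedRadar is a hypothetical helper (it is not
# called anywhere in this module): a polar plot only joins the last point back
# to the first when the first angle/value pair is appended to both arrays,
# which is exactly what the np.concatenate calls above do.
def plotClosedRadar(ax, labels, values, **style):
    n = len(values)
    angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
    closedValues = np.concatenate((values, [values[0]]))  # repeat the first value
    closedAngles = np.concatenate((angles, [angles[0]]))  # repeat the first angle
    ax.plot(closedAngles, closedValues, **style)
    ax.fill(closedAngles, closedValues, alpha=0.25)
    ax.set_thetagrids(angles * 180 / np.pi, labels)  # one label per spoke
# Usage (kept commented so importing this module stays side-effect free):
# fig = plt.figure()
# ax = fig.add_subplot(111, polar=True)
# plotClosedRadar(ax, ['乐', '好', '怒', '哀', '惧', '恶', '惊'], [3, 2, 1, 2, 1, 2, 1], color='r')
# plt.show()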
189 | 
190 | def getValuefromFile(fp):
191 |     tmpLine = fp.readline()
192 |     desList = tmpLine.split(',')[1:8]  # header row (unused)
193 |     valuesList = []
194 |     tmpLine = fp.readline()
195 |     while tmpLine != '':
196 |         rawValue = [math.log2(float(x) + 1) for x in tmpLine.split(',')[1:8]]
197 |         valuesList.append(rawValue)
198 |         tmpLine = fp.readline()
199 |         # print(rawValue)
200 |     return valuesList
201 | 
202 | 
203 | def getPolarfromFile(fp):
204 |     tmpLine = fp.readline()
205 |     des = tmpLine.split(',')[0]  # header cell (unused)
206 |     valuesList = []
207 |     tmpLine = fp.readline()
208 |     while tmpLine != '':
209 |         rawValue = float(tmpLine.split(',')[0])
210 |         valuesList.append(rawValue)
211 |         tmpLine = fp.readline()
212 |         # print(rawValue)
213 |     return valuesList
214 | 
215 | 
216 | def drawPolarByDate():
217 |     # Render Chinese characters and minus signs correctly
218 |     plt.clf()
219 |     plt.rcParams['font.sans-serif'] = 'Microsoft YaHei'
220 |     plt.rcParams['axes.unicode_minus'] = False
221 | 
222 |     averagePolarList = []
223 | 
224 |     date = beginDate
225 |     while Date.dateCmp(date, endDate) != 1:
226 |         try:
227 |             fp = open('weibo' + 'Stage2' + 'Score/' + date[0:10] + 'weiboScore.csv', mode='r', encoding='utf-8')
228 |             curPolarList = getPolarfromFile(fp)
229 |             curPolar = np.average(curPolarList)
230 |             averagePolarList.append(curPolar)
231 |             print(date + ' Finished!')
232 |         except Exception as e:
233 |             print(e)
234 |         date = Date.getNextDate(date)
235 |     print('Average' + str(averagePolarList))
236 |     xplot = [x for x in range(1, len(averagePolarList) + 1)]
237 |     avgList = []
238 |     for item in averagePolarList:
239 |         if item > 0:
240 |             avgList.append(math.log2(1 + item))   # sign-preserving log compression;
241 |         else:
242 |             avgList.append(-math.log2(1 - item))  # the bare log2(item) / -log2(-item) flipped the sign for |item| < 1 and blew up at 0
243 |     plt.xticks(np.arange(0, len(avgList) + 2, 2))
244 |     plt.plot(xplot, avgList, color="b", marker=".", linewidth=1, alpha=0.9, label='极性')
245 |     plt.title('阶段 ' + str(stageNo) + ' 疫情相关重点微博评论心态分析')
246 |     plt.axhline(y=0, color="purple")
247 |     plt.legend()
248 |     plt.savefig(str(stage.stage[stageNo]['path'] + '/weiboPolar.png'), dpi=200)
249 |     plt.show()
250 | 
251 | 
252 | 
253 | if __name__ == '__main__':
254 |     # run()
255 |     # drawAllPic()
256 |     drawPolarByDate()
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/getCOVKeywords.py:
--------------------------------------------------------------------------------
1 | """
2 | Use TextRank to pick keywords from the full text of Lizhi News (jstv) search results about the COVID-19 epidemic.
3 | For stage0 and stage1 some dates have no Lizhi News results, so keywords extracted from that day's Sina news headlines are used instead.
4 | """
5 | 
6 | 
7 | from jieba import analyse
8 | from pyecharts import options as opts
9 | from pyecharts.charts import WordCloud
10 | import math
11 | import os
12 | import re
13 | import Date
14 | import time
15 | import json
16 | from scipy.optimize import curve_fit
17 | import matplotlib.pyplot as plt
18 | import numpy as np
19 | import stage
20 | 
21 | 
22 | stageNo = 6
23 | path = stage.stage[stageNo]['path']
24 | beginDate = stage.stage[stageNo]['beginDate']
25 | print(beginDate)
26 | endDate = stage.stage[stageNo]['endDate']
27 | print(endDate)
28 | date = '1970-01-01-'
29 | textrank = analyse.textrank
30 | 
31 | words = dict()
32 | 
33 | def run():
34 |     date = beginDate
35 |     i = 0
36 |     fp = open(path + '/' + date + '/' + date + 'keywords.json', mode='r', encoding='utf-8')
37 |     while True:
38 |         try:
39 |             curWords = json.load(fp)
40 |         except Exception as e:
41 |             print(e)
42 |             date = Date.getNextDate(date)
43 |             if Date.dateCmp(date, endDate) == 1:
44 |                 break
45 |             try:
46 |                 fp = open(path + '/' + date + '/' + date + 'keywords.json', mode='r', encoding='utf-8')
47 |             except Exception as e:
48 |                 print(e)
49 |                 continue
50 |             continue
51 |         print(curWords)
52 |         for item in curWords:
53 |             words[item[0]] = words.get(item[0], 0) + item[1]
54 |         print(words)
55 |         print(date + ' Finished')
56 |         date = Date.getNextDate(date)
57 |         print(date)
58 |         if Date.dateCmp(date, endDate) == 1:
59 |             print('Finished')
60 |             break
61 |         try:
62 |             fp = open(path + '/' + date + '/' + date + 'keywords.json', mode='r', encoding='utf-8')
63 |         except Exception as e:
64 |             print(e)
65 |             continue
66 |         i += 1
67 |         # if i > 30:
68 |         #     break
69 |     fp = open((path + '/' + 'keywords-Stage' + str(stageNo) + '.json'), mode='w', encoding='utf-8')
70 |     wordl = sorted(words.items(), key=lambda items: items[1], reverse=True)
71 |     # wordl = wordl[0:math.floor(i / 3)]  # keep only the top third of the words
72 |     wordsS = json.dumps(wordl, indent=4, separators=(',', ':'), ensure_ascii=False)
73 |     fp.write(wordsS)
74 |     fp.flush()
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     run()
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/getMachineLearningRes.py:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import math
5 | import os
6 | import re
7 | import Date
8 | import time
9 | import json
10 | import stage
11 | import splitWeibo
12 | 
13 | 
14 | path = 'stage3/2020-02-21-/2020-02-21-blog.json'
15 | score_path = 'stage3/2020-02-21-/test_result_oneday.txt'
16 | out_path = 'stage3/2020-02-21-/valuedBlog.json'
17 | 
18 | allWeibo = []
19 | 
20 | def run():
21 |     fp = open(path, mode='r', encoding='utf-8')
22 |     fp2 = open(score_path, mode='r', encoding='utf-8')
23 |     allWeibo = json.load(fp)
24 |     for weibo in allWeibo:
25 |         # Reset per post: these fields are per-post counts, not running totals
26 |         curPositive = 0
27 |         curNegative = 0
28 |         curMid = 0
29 |         num = len(weibo['评论'])
30 |         for i in range(0, num):
31 |             line = fp2.readline()
32 |             possibility = line.split(' ')
33 |             pos = float(possibility[0])
34 |             neg = float(possibility[1])
35 |             if (pos - neg) > 0.1:
36 |                 curPositive += 1
37 |             elif (neg - pos) > 0.1:  # was "(neg - pos) < 0.1", which sent neutral comments to the negative bucket and negative ones to neutral
38 |                 curNegative += 1
39 |             else:
40 |                 curMid += 1
41 |         weibo['积极评论数'] = curPositive
42 |         weibo['消极评论数'] = curNegative
43 |         weibo['中性评论数'] = curMid
44 |     fp3 = open(out_path, mode='w', encoding='utf-8')
45 |     json.dump(allWeibo, fp3, indent=4, separators=(',', ':'), ensure_ascii=False)
46 | 
47 | 
48 | if __name__ == '__main__':
49 |     run()
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/hotAndCOV.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import math
4 | import os
5 | import re
6 | import Date
7 | import time
8 | import json
9 | import jieba
10 | import stage
11 | import splitWeibo
12 | from scipy.optimize import curve_fit
13 | from scipy.stats import chi2_contingency
14 | 
15 | stageNo = 4
16 | path = stage.stage[stageNo]['path']
17 | beginDate = stage.stage[stageNo]['beginDate']
18 | print(beginDate)
19 | endDate = stage.stage[stageNo]['endDate']
20 | print(endDate)
21 | date = '1970-01-01-'
22 | 
23 | keywords = []
24 | scoreList = []
25 | starList = []
26 | forwardList = []
27 | commentList = []
28 | allWeibo = []
29 | weiboNumByDate = []
30 | COVWeiboNumList = []
31 | minimenCOV = 0
32 | 
33 | starRatio = []
34 | forwardRatio = []
35 | commentRatio = []
36 | COVIndexByDate = []
37 | 
38 | def run():
39 |     plt.rcParams['font.sans-serif'] = ['SimHei']
40 |     loadALlWeibo()
41 | 
42 | 
43 | def loadALlWeibo():
44 |     global allWeibo
45 |     global commentList
46 |     global forwardList
47 |     global starList
48 |     global scoreList
49 |     minimenCOV = getLimit()  # local; shadows the module-level minimenCOV, and the threshold is currently unused
50 |     date = beginDate
51 |     while Date.dateCmp(date, endDate) != 1:
52 |         try:
53 |             loadSingleDateWeibo(date)
54 |             print(date + ' Loading Finished!')
55 |         except Exception as e:
56 |             print(e)
57 |         date = Date.getNextDate(date)
58 |         # break
59 |     allWeibo = sorted(allWeibo, key=lambda x: x['疫情相关度'])
60 |     scoreList = [x['疫情相关度'] for x in allWeibo]
61 |     commentList = [x['评论数'] for x in allWeibo]
62 |     starList = [x['点赞数'] for x in allWeibo]
63 |     forwardList = [x['转发数'] for x in allWeibo]
64 |     drawPic()
65 |     # hotDegree = [getHot(x) for x in allWeibo]
66 | 
67 | 
68 | def loadSingleDateWeibo(date):
69 |     with open((path + '/' + date + '/' + date + 'blog-Scored.json'), mode='r', encoding='utf-8') as fp:
70 |         curDateWeiboList = json.load(fp)
71 |     for item in curDateWeiboList:
72 |         allWeibo.append(item)
73 |     return curDateWeiboList
74 | 
75 | def loadSingleDateCOVWeibo(date):
76 |     with open((path + '/' + date + '/' + date + 'blog-COV.json'), mode='r', encoding='utf-8') as fp:
77 |         curDateWeiboList = json.load(fp)
78 |     for item in curDateWeiboList:
79 |         allWeibo.append(item)
80 |     return curDateWeiboList
81 | 
82 | 
83 | def getLimit():
84 |     with open((path + '/' + 'stageCOVWeiboByImportance.json'), mode='r', encoding='utf-8') as fp:
85 |         curDateWeiboList = json.load(fp)
86 |     length = len(curDateWeiboList)
87 |     return curDateWeiboList[length - 1]['疫情相关度']
88 | 
89 | 
90 | def drawPic():
91 |     xplot = [x for x in range(1, len(allWeibo) + 1)]
92 |     starList1 = [math.log10(x + 1) * 40 for x in starList]
93 |     commentList1 = [math.log10(x + 1) * 40 for x in commentList]
94 |     forwardList1 = [math.log10(x + 1) * 40 for x in forwardList]
95 |     plt.scatter(xplot, starList1, color="g", s=5, marker=".", linewidth=1, label='点赞数')
96 |     plt.scatter(xplot, commentList1, color="y", s=5, marker=".", linewidth=1, alpha=0.9, label='评论数')
97 |     plt.scatter(xplot, forwardList1, color="b", s=5, marker=".", linewidth=1, alpha=0.9, label='转发数')
98 |     plt.scatter(xplot, scoreList, s=2, color="0.5", marker=".", linewidth=1, alpha=0.9, label='疫情相关度')  # grayscale string; "5" is not a valid matplotlib color
99 |     plt.rcParams['figure.dpi'] = 300
100 |     plt.xlabel('微博数量(从左到右递增)')  # xlabel is a function, not an attribute to assign
101 |     plt.legend()
102 |     plt.savefig(path + '/SaveTest-拟合.png', dpi=300)
103 |     # plt.show()
104 | 
105 | 
106 | # Compute a hotness metric for a single post
107 | # def getHot(x):
108 | #     comment = x['评论数']
109 | #     return x
110 | 
111 | 
112 | def getHotRatioByDate():
113 |     date = beginDate
114 |     while Date.dateCmp(date, endDate) != 1:
115 |         try:
116 |             allCurDateWeibo = loadSingleDateWeibo(date)
117 |             allCurDateCOVWeibo = loadSingleDateCOVWeibo(date)
118 | 
119 |             sumStar = 0
120 |             sumComment = 0
121 |             sumForward = 0
122 | 
123 |             COVStar = 0
124 |             COVComment = 0
125 |             COVForward = 0
126 |             sumCOV = 0
127 |             for item in allCurDateWeibo:
128 |                 sumStar += item['点赞数']
129 |                 sumComment += item['评论数']
130 |                 sumForward += item['转发数']
131 |             for item in allCurDateCOVWeibo:
132 |                 sumCOV += item['疫情相关度']
133 |                 COVStar += item['点赞数']
134 |                 COVComment += item['评论数']
135 |                 COVForward += item['转发数']
136 |             print(date + ' Loading Finished!')
137 |             starRatio.append(COVStar / sumStar)
138 |             commentRatio.append(COVComment / sumComment)
139 |             forwardRatio.append(COVForward / sumForward)
140 |             COVIndexByDate.append(sumCOV)
141 |         except Exception as e:
142 |             print(e)
143 |         date = Date.getNextDate(date)
144 |         # break
145 |     print(forwardRatio)
146 |     print(commentRatio)
147 |     print(starRatio)
148 |     drawHotRatioByDate()
149 | 
150 | 
151 | def drawHotRatioByDate():
152 |     global COVIndexByDate
153 |     plt.rcParams['font.sans-serif'] = ['SimHei']
154 |     xplot = [x for x in range(1, len(COVIndexByDate) + 1)]
155 |     COVIndexByDate = [x / 1000 for x in COVIndexByDate]
156 |     # commentList1 = [math.log10(x + 1) * 40 for x in commentList]
157 |     # forwardList1 = [math.log10(x + 1) * 40 for x in forwardList]
158 |     print(len(xplot))
159 |     print(len(starRatio))
160 |     print(len(commentRatio))
161 |     print(len(forwardRatio))
162 |     print(len(COVIndexByDate))
163 |     plt.scatter(xplot, starRatio, color="g", s=8, marker=".", linewidth=1, label='点赞比例')
164 |     plt.scatter(xplot, forwardRatio, color="y", s=8, marker=".", linewidth=1, alpha=0.9, label='转发比例')
165 |     plt.scatter(xplot, commentRatio, color="b", s=8, marker=".", linewidth=1, alpha=0.9, label='评论比例')
166 |     plt.scatter(xplot, COVIndexByDate, s=8, color="r", marker=".", linewidth=1, alpha=0.9, label='疫情相关度 / 1000')  # label now matches the x / 1000 scaling above (it used to say / 200)
167 |     plt.rcParams['figure.dpi'] = 300
168 |     plt.xlabel('微博数量(从左到右递增)')  # xlabel is a function, not an attribute to assign
169 |     plt.legend()
170 |     plt.savefig(path + '/SaveTest-热度.png', dpi=300)
171 |     # plt.show()
172 | #
173 | 
174 | if __name__ == '__main__':
175 |     # run()
176 |     getHotRatioByDate()
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/redme.md:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/Analyse/keyWordsGeneration/wordFromJSTV.py:
--------------------------------------------------------------------------------
1 | """
2 | Extract keywords from the body text of Lizhi News (jstv) and save them per day into the matching folders.
3 | Since there was almost no epidemic-related news in the early period (before 2020-01-01), the December 2019 keywords are generated from Sina news headlines instead.
4 | """
5 | 
6 | from jieba import analyse
7 | from pyecharts import options as opts
8 | from pyecharts.charts import WordCloud
9 | import math
10 | import os
11 | import re
12 | import Date
13 | import time
14 | 
15 | # Chinese-character pattern
16 | hanziS = u"[\u4e00-\u9fa5]+"
17 | hanzi = re.compile(hanziS)
18 | textrank = analyse.textrank  # TextRank from jieba
19 | beginDate = '2019-12-04-'
20 | endDate = '2020-01-01-'  # zero-padded: dateCmp() compares int("YYYYMMDD"), so '2020-1-1-' would compare as smaller than any December 2019 date
21 | 
22 | # Extra stopwords
23 | ExtraStopWords = ['荔枝', '新闻', 'trog']
24 | 
25 | # TODO
26 | def run():
27 |     date = beginDate
28 |     i = 0
29 |     while True:
30 |         filePath = "jstv/" + date + "/" + "jstvRAW.csv"
31 |         print(filePath)
32 |         text = readNews(filePath)
33 |         if text == '':
34 |             date = Date.getNextDate(date)
35 |             continue
36 |         keywords = textrank(text, topK=36, withWeight=True, withFlag=True)
37 |         print(keywords)
38 |         words = []
39 |         for item in keywords:
40 |             a = list()
41 |             items = str()  # unused
42 |             curWord = str(item[0]).split('/')[0]
43 |             if curWord in ExtraStopWords:
44 |                 continue
45 |             a.append(curWord)
46 |             a.append(math.floor((item[1] * 100)))
47 |             words.append(tuple(a))
48 |         # print(words)
49 |         wordsS = str(words)
50 |         wordsS = wordsS.replace("'", '"')
51 |         wordsS = wordsS.replace("(", '[')
52 |         wordsS = wordsS.replace(")", ']')
53 |         print(wordsS)
54 |         outPath = "jstv/" + date + "/" + date + "keywords.json"
55 |         with open(outPath, mode='w', encoding='utf-8') as f3:
56 |             f3.write(wordsS)
57 |         fig = wordcloud_base(words)
58 |         figRute = 'jstv/' + date + '/' + date + 'keywords.html'
59 |         fig.render(figRute)
60 |         date = Date.getNextDate(date)
61 |         print(date)
62 |         time.sleep(3)
63 |         if Date.dateCmp(date, endDate) == 1 or i > 1000:
64 |             break
65 |         i += 1
66 | 
67 | 
68 | def print_hi(name):
69 |     # Use a breakpoint in the code line below to debug your script.
70 |     print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
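# --- A minimal alternative to the str()/replace() round-trip in run() above.
# saveKeywordsAsJson is a hypothetical name, not used elsewhere in this file.
# json.dump already writes double quotes and turns tuples into JSON arrays,
# and it stays well-formed even when a keyword itself contains a quote,
# bracket or comma -- cases the three chained replace() calls would corrupt.
def saveKeywordsAsJson(words, outPath):
    import json
    # words is a list of (keyword, weight) tuples, e.g. [('疫情', 87), ...]
    with open(outPath, mode='w', encoding='utf-8') as f:
        json.dump(words, f, ensure_ascii=False)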
71 | 72 | 73 | # 直接返回单个新闻的全部内容 74 | def readNews(filepath): 75 | outContent = "" 76 | try: 77 | with open(filepath, mode='r' , encoding='utf-8') as f: 78 | # for i in range(0, 100): 79 | # outContent += f.readline() 80 | outContent = f.read() 81 | except Exception as e: 82 | print("文件不存在") 83 | return '' 84 | return outContent 85 | 86 | 87 | def wordcloud_base(words) -> WordCloud: 88 | c = ( 89 | WordCloud() 90 | .add("", words, word_size_range=[20, 100], shape='roundRect') # SymbolType.ROUND_RECT 91 | .set_global_opts(title_opts=opts.TitleOpts(title='WordCloud词云')) 92 | ) 93 | return c 94 | 95 | 96 | # 生成按日区分的荔枝新闻原文,并建立好后续处理所需要用到的文件夹 97 | def splitJSTVByDate(filepath): 98 | with open(filepath, mode='r', encoding='utf-8') as f: 99 | date = '1970-01-01' 100 | f2 = open('jstv/Begin.data', mode='w', encoding='utf-8') 101 | curLine = 1 102 | while True: 103 | tmpLine = f.readline() 104 | if (tmpLine == ""): 105 | break 106 | print(curLine) 107 | curLine += 1 108 | tmpItems = tmpLine.split(',') 109 | tmpDate = Date.getDate(tmpItems[0]) 110 | if (tmpDate != date): 111 | f2.write(tmpLine) 112 | try: 113 | os.mkdir(('jstv/' + tmpDate)) 114 | except Exception as e: 115 | print(e) 116 | continue 117 | date = tmpDate 118 | f2 = open(file=('jstv/' + tmpDate + '/jstvRAW.csv'), mode='w+', encoding='utf-8') 119 | else: 120 | f2.write(tmpLine) 121 | 122 | 123 | 124 | # Press the green button in the gutter to run the script. 125 | if __name__ == '__main__': 126 | run() 127 | # splitJSTVByDate("jstv/jstv_2001-2800.csv") 对所有源文件操作一次 128 | 129 | 130 | # See PyCharm help at https://www.jetbrains.com/help/pycharm/ 131 | -------------------------------------------------------------------------------- /Analyse/sentimentDictionary/emotionWord_count.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import pandas as pd 3 | import jieba 4 | import time 5 | import json 6 | 7 | 8 | stopWord_path=r"../source/哈工大停用词表.txt" 9 | # -------------------------------------情感词典读取------------------------------- 10 | # 注意: 11 | # 1.词典中怒的标记(NA)识别不出被当作空值,情感分类列中的NA都给替换成NAU 12 | # 2.大连理工词典中有情感分类的辅助标注(有NA),故把情感分类列改好再替换原词典中 13 | 14 | # 扩展前的词典 15 | df = pd.read_excel(r"../source/大连理工大学中文情感词汇本体.xlsx") 16 | # print(df.head(10)) #输出10行看下格式 17 | 18 | df = df[['词语', '词性种类', '词义数', '词义序号', '情感分类', '强度', '极性']] 19 | # df.head() 20 | 21 | # -------------------------------------七种情绪的运用------------------------------- 22 | Happy = [] 23 | Good = [] 24 | Surprise = [] 25 | Anger = [] 26 | Sad = [] 27 | Fear = [] 28 | Disgust = [] 29 | 30 | # df.iterrows()功能是迭代遍历每一行 31 | for idx, row in df.iterrows(): 32 | if row['情感分类'] in ['PA', 'PE']: 33 | Happy.append(row['词语']) 34 | if row['情感分类'] in ['PD', 'PH', 'PG', 'PB', 'PK']: 35 | Good.append(row['词语']) 36 | if row['情感分类'] in ['PC']: 37 | Surprise.append(row['词语']) 38 | if row['情感分类'] in ['NB', 'NJ', 'NH', 'PF']: 39 | Sad.append(row['词语']) 40 | if row['情感分类'] in ['NI', 'NC', 'NG']: 41 | Fear.append(row['词语']) 42 | if row['情感分类'] in ['NE', 'ND', 'NN', 'NK', 'NL']: 43 | Disgust.append(row['词语']) 44 | if row['情感分类'] in ['NAU']: # 修改: 原NA算出来没结果 45 | Anger.append(row['词语']) 46 | 47 | # 正负计算不是很准 自己可以制定规则 48 | Positive = Happy + Good + Surprise 49 | Negative = Anger + Sad + Fear + Disgust 50 | print('情绪词语列表整理完成') 51 | # print(Anger) #输出看下anger里有哪些词 52 | 53 | # ---------------------------------------中文分词--------------------------------- 54 | 55 | # 添加自定义词典和停用词 56 | # jieba.load_userdict("user_dict.txt") 57 | def stopwordslist(filepath): 58 | stopwords = 
[line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()] 59 | return stopwords 60 | 61 | stop_list=stopwordslist(stopWord_path) 62 | 63 | 64 | def txt_cut(sentence): 65 | return [w for w in jieba.lcut(sentence) if w not in stop_list] # 可增加len(w)>1,lcut直接返回列表 66 | 67 | 68 | # ---------------------------------------情感计算--------------------------------- 69 | def emotion_caculate(text): 70 | text = text.replace('\n', '') # 停用词里加\n会转义成\\n,手动剔除一下 71 | 72 | positive = 0 73 | negative = 0 74 | 75 | anger = 0 76 | disgust = 0 77 | fear = 0 78 | sad = 0 79 | surprise = 0 80 | good = 0 81 | happy = 0 82 | 83 | anger_list = [] 84 | disgust_list = [] 85 | fear_list = [] 86 | sad_list = [] 87 | surprise_list = [] 88 | good_list = [] 89 | happy_list = [] 90 | 91 | wordlist = txt_cut(text) # ['武汉', '好想你', '热干面', '致敬', '白衣天使', '致敬', '英雄', '加油', '加油', '加油', '一定', '好好', '勇敢', '站', '起来', '行'] 92 | # wordlist = jieba.lcut(text) 93 | wordset = set(wordlist) 94 | wordfreq = [] 95 | for word in wordset: 96 | freq = wordlist.count(word) 97 | if word in Positive: 98 | positive += freq 99 | if word in Negative: 100 | negative += freq 101 | if word in Anger: 102 | anger += freq 103 | anger_list.append(word) 104 | if word in Disgust: 105 | disgust += freq 106 | disgust_list.append(word) 107 | if word in Fear: 108 | fear += freq 109 | fear_list.append(word) 110 | if word in Sad: 111 | sad += freq 112 | sad_list.append(word) 113 | if word in Surprise: 114 | surprise += freq 115 | surprise_list.append(word) 116 | if word in Good: 117 | good += freq 118 | good_list.append(word) 119 | if word in Happy: 120 | happy += freq 121 | happy_list.append(word) 122 | 123 | emotion_info = { 124 | 'length': len(wordlist), 125 | 'positive': positive, 126 | 'negative': negative, 127 | 'anger': anger, 128 | 'disgust': disgust, 129 | 'fear': fear, 130 | 'good': good, 131 | 'sadness': sad, 132 | 'surprise': surprise, 133 | 'happy': happy, 134 | 135 | } 136 | 137 | indexs = ['length', 'positive', 'negative', 'anger', 'disgust', 'fear', 'sadness', 'surprise', 'good', 'happy'] 138 | # return pd.Series(emotion_info, index=indexs), anger_list, disgust_list, fear_list, sad_list, surprise_list, good_list, happy_list 139 | #('length', 16) ('positive', 5) ('negative', 0) ('anger', 0) ('disgust', 0) ('fear', 0) ('sadness', 0) ('surprise', 0) ('good', 4) ('happy', 1) 140 | return pd.Series(emotion_info, index=indexs) 141 | 142 | 143 | # 测试 (res, anger_list, disgust_list, fear_list, sad_list, surprise_list, good_list, happy_list) 144 | text = """ 145 | 晚安,人生最累的莫过于站在幸福里找幸福,生在福中不知福。懂得知足,才能常常感到心满意足 146 | """ 147 | # res, anger, disgust, fear, sad, surprise, good, happy = emotion_caculate(text) 148 | res = emotion_caculate(text) #输出 length 16 positive 5 negtive 0 anger 0 disgust 0...... 
happy 1 149 | print(res) 150 | 151 | 152 | 153 | # -------------------------------------获取数据集--------------------------------- 154 | # weibo_path=r"../source/2019-12-08info.json" 155 | # 156 | # def weiboread(filepath,start,end): 157 | # ''' 158 | # :param filepath: json的路径 159 | # :param start: 帖子开始的索引 160 | # :param end: 帖子结束的索引 161 | # :return: 列表,元素为一篇帖子下所有评论的字符串 162 | # ''' 163 | # f = open(filepath, encoding='utf-8') 164 | # comment_list=json.load(f) 165 | # out=[] 166 | # for i in range(start,end): 167 | # out.append(''.join(comment_list['weibo'][i]['评论'])) 168 | # #返回列表,每个元素是帖子下所有评论的拼接 169 | # return out 170 | # 171 | # 172 | # comment_list=weiboread(weibo_path,0,3) 173 | # #list转为dataframe 174 | # c={ 175 | # "review":comment_list 176 | # } 177 | # weibo_df=pd.DataFrame(c) 178 | # emotion_df = weibo_df['review'].apply(emotion_caculate) 179 | # 180 | # #---------------------------------------情感计算--------------------------------- 181 | # output_df = pd.concat([weibo_df, emotion_df], axis=1) 182 | # 183 | # # 储存结果 184 | # outPath=r"../source/weibo_qx.csv" 185 | # output_df.to_csv(outPath, encoding='utf-8', index=False) 186 | 187 | 188 | -------------------------------------------------------------------------------- /Analyse/sentimentDictionary/polarSocre.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import math 3 | import sys 4 | import gzip 5 | from collections import defaultdict 6 | import datetime 7 | from itertools import product 8 | import jieba 9 | import csv 10 | import pandas as pd 11 | import json 12 | import os 13 | 14 | 15 | class Struct(object): 16 | def __init__(self, word, sentiment, pos, value, class_value): 17 | self.word = word 18 | self.sentiment = sentiment 19 | self.pos = pos 20 | self.value = value 21 | self.class_value = class_value 22 | 23 | 24 | class Result(object): 25 | def __init__(self, score, score_words, not_word, degree_word): 26 | self.score = score 27 | self.score_words = score_words 28 | self.not_word = not_word 29 | self.degree_word = degree_word 30 | 31 | 32 | class Score(object): 33 | # 七个情感大类对应的小类简称: 尊敬 34 | score_class = {'乐': ['PA', 'PE'], 35 | '好': ['PD', 'PH', 'PG', 'PB', 'PK'], 36 | '怒': ['NA'], 37 | '哀': ['NB', 'NJ', 'NH', 'PF'], 38 | '惧': ['NI', 'NC', 'NG'], 39 | '恶': ['NE', 'ND', 'NN', 'NK', 'NL'], 40 | '惊': ['PC'] 41 | } 42 | # 大连理工大学 -> ICTPOS 3.0 43 | POS_MAP = { 44 | 'noun': 'n', 45 | 'verb': 'v', 46 | 'adj': 'a', 47 | 'adv': 'd', 48 | 'nw': 'al', # 网络用语 49 | 'idiom': 'al', 50 | 'prep': 'p', 51 | } 52 | 53 | # 否定词 TODO 找否定语料库,从文件加载 54 | NOT_DICT = set(['不是', '不大', '没', '非', '莫', '弗', '毋', '没有','勿', '未', '否', '别', '無', '休', 55 | '缺乏', '缺少', '不', '甭', '勿', '别', '未', '反', '没', '否', '木有', '非', '无', '请勿', '无须', '并非', '毫无', '决不', '休想', '永不', '不要', 56 | '未尝', '未曾', '毋', '莫', '从未', '从未有过', '尚未', '一无', '并未', '尚无', '从没', '绝非', '远非', '切莫', '绝不', '毫不', '禁止', '忌', '拒绝', 57 | '杜绝', '弗']) 58 | 59 | 60 | def __init__(self, sentiment_dict_path, degree_dict_path, stop_dict_path): 61 | self.sentiment_struct, self.sentiment_dict = self.load_sentiment_dict(sentiment_dict_path) 62 | self.degree_dict = self.load_degree_dict(degree_dict_path) 63 | self.stop_words = self.load_stop_words(stop_dict_path) 64 | 65 | def load_stop_words(self, stop_dict_path): 66 | stop_words = [w.strip() for w in open(stop_dict_path, encoding='utf-8').readlines()] 67 | # print (stop_words[:100]) 68 | return stop_words 69 | 70 | def remove_stopword(self, words): 71 | words = [w.strip() for w in words if w not in 
self.stop_words] 72 | return words 73 | 74 | def load_degree_dict(self, dict_path): 75 | """读取程度副词词典 76 | Args: 77 | dict_path: 程度副词词典路径. 格式为 word\tdegree 78 | 所有的词可以分为6个级别,分别对应极其, 很, 较, 稍, 欠, 超 79 | Returns: 80 | 返回 dict = {word: degree} 81 | """ 82 | degree_dict = {} 83 | with open(dict_path, 'r', encoding='UTF-8') as f: 84 | for line in f: 85 | line = line.strip() 86 | word, degree = line.split('\t') 87 | degree = float(degree) 88 | degree_dict[word] = degree 89 | return degree_dict 90 | 91 | def load_sentiment_dict(self, dict_path): 92 | """读取情感词词典 93 | Args: 94 | dict_path: 情感词词典路径. 格式请看 README.md 95 | Returns: 96 | 返回 dict = {(word, postag): 极性} 97 | """ 98 | sentiment_dict = {} 99 | sentiment_struct = [] 100 | 101 | with open(dict_path, 'r', encoding='UTF-8') as f: 102 | # with gzip.open(dict_path) as f: 103 | for index, line in enumerate(f): 104 | if index == 0: # title,即第一行的标题 105 | continue 106 | items = line.split('\t') 107 | word = items[0] 108 | pos = items[1] 109 | sentiment = items[4] 110 | intensity = items[5] # 1, 3, 5, 7, 9五档, 9表示强度最大, 1为强度最小. 111 | polar = items[6] # 极性 112 | 113 | # 将词性转为 ICTPOS 词性体系 114 | pos = self.__class__.POS_MAP[pos] 115 | intensity = int(intensity) 116 | polar = int(polar) 117 | 118 | # 转换情感倾向的表现形式, 负数为消极, 0 为中性, 正数为积极 119 | # 数值绝对值大小表示极性的强度 // 分成3类,极性:褒(+1)、中(0)、贬(-1); 强度为权重值 120 | value = None 121 | if polar == 0: # neutral 122 | value = 0 123 | elif polar == 1: # positive 124 | value = intensity 125 | elif polar == 2: # negtive 126 | value = -1 * intensity 127 | else: # invalid 128 | continue 129 | 130 | # key = (word, pos, sentiment ) 131 | key = word 132 | sentiment_dict[key] = value 133 | 134 | # 找对应的大类 135 | for item in self.score_class.items(): 136 | key = item[0] 137 | values = item[1] 138 | # print(key) 139 | # print(value) 140 | for x in values: 141 | if (sentiment == x): 142 | class_value = key # 如果values中包含,则获取key 143 | sentiment_struct.append(Struct(word, sentiment, pos, value, class_value)) 144 | return sentiment_struct, sentiment_dict 145 | 146 | def findword(self, text): # 查找文本中包含哪些情感词 147 | word_list = [] 148 | for item in self.sentiment_struct: 149 | if item.word in text: 150 | word_list.append(item) 151 | return word_list 152 | 153 | def classify_words(self, words): 154 | # 这3个键是词的序号(索引) 155 | 156 | sen_word = {} 157 | not_word = {} 158 | degree_word = {} 159 | # 找到对应的sent, not, degree; words 是分词后的列表 160 | for index, word in enumerate(words): 161 | if word in self.sentiment_dict and word not in self.__class__.NOT_DICT and word not in self.degree_dict: 162 | sen_word[index] = self.sentiment_dict[word] 163 | elif word in self.__class__.NOT_DICT and word not in self.degree_dict: 164 | not_word[index] = -1 165 | elif word in self.degree_dict: 166 | degree_word[index] = self.degree_dict[word] 167 | return sen_word, not_word, degree_word 168 | 169 | def get2score_position(self, words): 170 | sen_word, not_word, degree_word = self.classify_words(words) # 是字典 171 | 172 | score = 0 173 | start = 0 174 | # 存所有情感词、否定词、程度副词的位置(索引、序号)的列表 175 | sen_locs = sen_word.keys() 176 | not_locs = not_word.keys() 177 | degree_locs = degree_word.keys() 178 | senloc = -1 179 | # 遍历句子中所有的单词words,i为单词的绝对位置 180 | for i in range(0, len(words)): 181 | if i in sen_locs: 182 | W = 1 # 情感词间权重重置 183 | not_locs_index = 0 184 | degree_locs_index = 0 185 | 186 | # senloc为情感词位置列表的序号,之前的sen_locs是情感词再分词后列表中的位置序号 187 | senloc += 1 188 | # score += W * float(sen_word[i]) 189 | if (senloc == 0): # 第一个情感词,前面是否有否定词,程度词 190 | start = 0 191 | elif senloc < len(sen_locs): # 
和前面一个情感词之间,是否有否定词,程度词 192 | # j为绝对位置 193 | start = previous_sen_locs 194 | 195 | for j in range(start, i): # 词间的相对位置 196 | # 如果有否定词 197 | if j in not_locs: 198 | W *= -1 199 | not_locs_index = j 200 | # 如果有程度副词 201 | elif j in degree_locs: 202 | W *= degree_word[j] 203 | degree_locs_index = j 204 | 205 | # 判断否定词和程度词的位置:1)否定词在前,程度词减半(加上正值);不是很 2)否定词在后,程度增强(不变),很不是 206 | if ((not_locs_index > 0) and (degree_locs_index > 0)): 207 | if (not_locs_index < degree_locs_index): 208 | degree_reduce = (float(degree_word[degree_locs_index] / 2)) 209 | W += degree_reduce 210 | # print (W) 211 | score += W * float(sen_word[i]) # 直接添加该情感词分数 212 | # print(score) 213 | previous_sen_locs = i 214 | return score 215 | 216 | def getscore(self, text):#所有情感的得分 217 | word_list = self.findword(text) ##查找文本中包含哪些情感词 218 | # 增加程度副词+否定词 219 | not_w = 1 220 | not_word = [] 221 | for notword in self.__class__.NOT_DICT: # 否定词 222 | if notword in text: 223 | not_w = not_w * -1 224 | not_word.append(notword) 225 | degree_word = [] 226 | degree=0 227 | for degreeword in self.degree_dict.keys(): 228 | if degreeword in text: 229 | degree = self.degree_dict[degreeword] 230 | # polar = polar + degree if polar > 0 else polar - degree 231 | degree_word.append(degreeword) 232 | # 7大类找对应感情大类的词语,分别统计分数= 词极性*词权重 233 | result = [] 234 | for key in self.score_class.keys(): # 区分7大类 235 | score = 0 236 | score_words = [] 237 | for word in word_list: 238 | 239 | if (key == word.class_value): 240 | score = score + word.value 241 | score_words.append(word.word) 242 | if score > 0: 243 | score = score + degree 244 | elif score < 0: 245 | score = score - degree # 看分数>0,程度更强; 分数<0,程度减弱? 246 | score = score * not_w 247 | 248 | x = '{}_score={}; word={}; nor_word={}; degree_word={};'.format(key, score, score_words, not_word, 249 | degree_word) 250 | # x='{}'.format(score) 251 | # print(x) 252 | result.append(score) 253 | # key + '_score=%d; word=%s; nor_word=%s; degree_word=%s;'% (score, score_words,not_word, degree_word)) 254 | return result 255 | 256 | #文件读取 257 | 258 | def weiboread(filepath): 259 | ''' 260 | :param filepath: json的路径 261 | :param start: 帖子开始的索引 262 | :param end: 帖子结束的索引 263 | :return: 列表,元素为一篇帖子下所有评论的字符串 264 | ''' 265 | out=[] 266 | try: 267 | f = open(filepath, encoding='utf-8') 268 | comment_list = json.load(f,strict=False) 269 | 270 | end=len(comment_list) 271 | # print(end) 272 | for i in range(0, end): 273 | out.append(''.join(comment_list[i]['评论']))#单一帖子的所有评论合成一个字符串 274 | except Exception as e: 275 | print(e) 276 | # print(out) 277 | # 返回列表,每个元素是帖子下所有评论的拼接 278 | return out 279 | 280 | 281 | #输入int数字,返回'2020-01-01' 282 | def timeitr(smonth,sday,emonth,eday,year=2020): #遍历一定范围内的日期,返回日期字符串列表,闭区间 283 | begin = datetime.date(year, smonth, sday) 284 | end = datetime.date(year, emonth, eday) 285 | outDaylst=[] 286 | for i in range((end - begin).days + 1): 287 | outday = begin + datetime.timedelta(days=i) 288 | outDaylst.append(str(outday)) 289 | return outDaylst 290 | 291 | #返回列表[最强情感的字符串 第二强情感的字符串] 292 | def find_1st2nd_max(intlist_moodScore): 293 | moods = ["乐", "好", "怒", "哀", "惧", "恶", "惊"] 294 | max1st_index=intlist_moodScore.index(max(intlist_moodScore)) 295 | mood1st=moods[max1st_index] 296 | 297 | min_index=intlist_moodScore.index(min(intlist_moodScore))#将最大值换为最小值去找第二强情感 298 | intlist_moodScore[max1st_index]=intlist_moodScore[min_index] 299 | max2nd_index=intlist_moodScore.index(max(intlist_moodScore)) 300 | mood2nd=moods[max2nd_index] 301 | 302 | return [mood1st,mood2nd] 303 | 304 | 305 | def jstvRead(csv_path): 306 | 
csvFile = open(csv_path, "r",encoding='utf-8') 307 | reader = csv.reader(csvFile) 308 | out=[] 309 | for item in reader: 310 | out.append(item[3]) 311 | #列表,每个元素是当日新闻正文 312 | return out 313 | 314 | 315 | def weiboScore(smonth,sday,emonth,eday,year,stage): 316 | timeStage = timeitr(smonth, sday, emonth, eday, year) # 日期参数 317 | for ymd in timeStage: 318 | weibo_path = r"../source/SplitedWeibo/stage{0:}/{1:}-/{2:}-blog-COV.json".format(stage,ymd,ymd) 319 | out_path = r"../out/weibo/{}weiboScore.csv".format(ymd) 320 | comment_list = weiboread(weibo_path) 321 | c = { 322 | "review": comment_list 323 | } 324 | weibodf = pd.DataFrame(c) 325 | # print(weibodf) 326 | 327 | # 文件写入 328 | outFile = open(out_path, 'a+', encoding='utf-8') 329 | 330 | # 写入表头 331 | moodlist = ["polar", "乐", "好", "怒", "哀", "惧", "恶", "惊", "最强情感", "次强情感"] 332 | for moodType_index in range(len(moodlist) - 1): 333 | outFile.write(moodlist[moodType_index] + ',') 334 | outFile.write(moodlist[-1] + '\n') 335 | 336 | # 写入极性和每种情感的得分,一条评论有太多否定词会出现负分 337 | for temp in weibodf['review']: 338 | score = Score(sentiment_dict_path, degree_dict_path, stop_dict_path) 339 | words = [x.strip() for x in jieba.cut(temp)] # 分词 340 | words_noStopWords = score.remove_stopword(words) 341 | commentLen = len(words_noStopWords) 342 | # 分词->情感词间是否有否定词/程度词+前后顺序->分数累加 343 | 344 | # polar分 345 | result = score.get2score_position(words_noStopWords) # polar 346 | polarScore = 0 347 | if (commentLen): # 因爬虫原因,json的评论下可能没评论 348 | polarScore = float(result) / math.log(commentLen) 349 | outFile.write(str(polarScore) + ',') 350 | 351 | # 乐,好,怒,哀,惧,恶,惊 352 | emotionScore_list = score.getscore(words_noStopWords) # 6种情感 353 | # 大连理工情感词典里表示好和恶的情感很多,消除情感字典情感词数量的影响 354 | weight = [1967, 10640, 388, 2314, 1179, 10282, 288] # 每个情感词汇个数 355 | for i in range(len(emotionScore_list)): # 除以对数评论字符串长度 356 | emotionScore_list[i] = emotionScore_list[i] / math.log((commentLen + 2) / math.log(weight[i])) 357 | 358 | emotionScore_list=changeScore(emotionScore_list)#分数修正 359 | 360 | for i in range(len(emotionScore_list)): 361 | outFile.write(str(emotionScore_list[i]) + ',') # 写入情感分数 362 | 363 | moods1st2nd = find_1st2nd_max(emotionScore_list) 364 | outFile.write(moods1st2nd[0] + ',') 365 | outFile.write(moods1st2nd[1] + '\n') # 写入最强,次强情感 366 | 367 | outFile.close() 368 | # woc 我靠 常见口头语表现为怒(微博评论区吵架会用到,表示惊叹也会用到),可能结果有些偏 369 | 370 | 371 | def jstvSocre(smonth,sday,emonth,eday,year,stage): 372 | timeStage = timeitr(smonth, sday, emonth, eday, year) # 日期参数 373 | for ymd in timeStage: 374 | try: 375 | jstv_csv_path = r"../source/jstv/stage{}/{}-/jstvRAW.csv".format(stage,ymd)#有些天会没有 376 | out_path = r"../out/jstv/{}jstvScore.csv".format(ymd) 377 | comment_list = jstvRead(jstv_csv_path) 378 | c = { 379 | "newsContent": comment_list 380 | } 381 | jstvdf = pd.DataFrame(c) 382 | # print(weibodf) 383 | 384 | # 文件写入 385 | outFile = open(out_path, 'a+', encoding='utf-8') 386 | 387 | # 写入表头 388 | moodlist = ["polar", "乐", "好", "怒", "哀", "惧", "恶", "惊", "最强情感", "次强情感"] 389 | for moodType_index in range(len(moodlist) - 1): 390 | outFile.write(moodlist[moodType_index] + ',') 391 | outFile.write(moodlist[-1] + '\n') 392 | 393 | # 写入极性和每种情感的得分,一条评论有太多否定词会出现负分 394 | for temp in jstvdf['newsContent']: 395 | score = Score(sentiment_dict_path, degree_dict_path, stop_dict_path) 396 | words = [x.strip() for x in jieba.cut(temp)] # 分词 397 | words_noStopWords = score.remove_stopword(words) 398 | commentLen = len(words_noStopWords) 399 | # 分词->情感词间是否有否定词/程度词+前后顺序->分数累加 400 | 401 | # polar分 402 | result = 
score.get2score_position(words_noStopWords) # polar 403 | polarScore = 0 404 | if (commentLen): # 因爬虫原因,json的评论下可能没评论 405 | polarScore = float(result) / math.log(commentLen) 406 | outFile.write(str(polarScore) + ',') 407 | 408 | # 乐,好,怒,哀,惧,恶,惊 409 | emotionScore_list = score.getscore(words_noStopWords) # 6种情感 410 | # 大连理工情感词典里表示好和恶的情感很多,消除情感字典情感词数量的影响 411 | weight = [1967, 10640, 388, 2314, 1179, 10282, 288] # 每个情感词汇个数 412 | for i in range(len(emotionScore_list)): # 除以对数评论字符串长度 413 | emotionScore_list[i] = emotionScore_list[i] / math.log((commentLen + 2) / math.log(weight[i])) 414 | 415 | #由于有负分,进行情感修正 416 | emotionScore_list=changeScore(emotionScore_list) 417 | 418 | for i in range(len(emotionScore_list)): 419 | outFile.write(str(emotionScore_list[i]) + ',') # 写入情感分数 420 | 421 | moods1st2nd = find_1st2nd_max(emotionScore_list) 422 | outFile.write(moods1st2nd[0] + ',') 423 | outFile.write(moods1st2nd[1] + '\n') # 写入最强,次强情感 424 | except Exception as e: 425 | print(e) 426 | outFile.close() 427 | 428 | 429 | def changeScore(scoreList): 430 | ''' 431 | 432 | :param scoreList: 433 | :return: 434 | ''' 435 | 436 | key = [ "乐", "好", "怒", "哀", "惧", "恶", "惊"] 437 | ''' 438 | 乐 反义 0.4哀 0.1恶 0.5惧 439 | 好 反义 0.5惧 0.5恶 440 | 怒 反义 乐 441 | 哀 反义 0.4好 0.6乐 442 | 惧 反义 好 443 | 恶 反义 乐 444 | 惊 反义 0.6乐 0.4好 445 | ''' 446 | anti_dict = { 447 | # 乐 好 怒 哀 惧 恶 惊" 448 | '乐': [0, 0, 0, 0.4, 0.5, 0.1, 0], 449 | '好': [0, 0, 0, 0, 0.5, 0.5, 0], 450 | '怒': [1, 0, 0, 0, 0, 0, 0], 451 | '哀': [0.6, 0.4, 0, 0, 0, 0, 0], 452 | '惧': [0, 1, 0, 0, 0, 0, 0], 453 | '恶': [0, 1, 0, 0, 0, 0, 0], 454 | '惊': [0.6, 0.4, 0, 0, 0, 0, 0] 455 | 456 | } 457 | initial_mood_score_dict = dict(zip(key, scoreList)) 458 | minusScore_dict = {} # {'惧': -1.7929313807730507} 459 | for kv in initial_mood_score_dict.items(): 460 | if (kv[1] < -0.000000001 ): 461 | minusScore_dict[kv[0]] = kv[1] 462 | 463 | adj_scoreList_dict = dict(zip(key, scoreList)) 464 | 465 | for kv in minusScore_dict: 466 | adj_scoreList_dict[kv[0]] = 0.0 467 | # print(minusScore_dict,adj_scoreList_dict,end=' ') 468 | for kv in minusScore_dict.items(): 469 | moodType = kv[0] 470 | score = kv[1] 471 | 472 | if (moodType == '乐'): 473 | adj_scoreList_dict['哀'] += -0.5 * score 474 | adj_scoreList_dict['恶'] += -0.1 * score 475 | adj_scoreList_dict['惧'] += -0.4 * score 476 | if (moodType == '好'): 477 | adj_scoreList_dict['惧'] += -0.5 * score 478 | adj_scoreList_dict['恶'] += -0.5 * score 479 | if (moodType == '怒'): 480 | adj_scoreList_dict['乐'] += -score 481 | if (moodType == '哀'): 482 | adj_scoreList_dict['好'] += -0.5 * score 483 | adj_scoreList_dict['乐'] += -0.5 * score 484 | if (moodType == '惧'): 485 | adj_scoreList_dict['好'] += -score 486 | if (moodType == '恶'): 487 | adj_scoreList_dict['好'] += -0.7 * score 488 | adj_scoreList_dict['乐'] += -0.3 * score 489 | if (moodType == '惊'): 490 | adj_scoreList_dict['乐'] += -0.4 * score 491 | adj_scoreList_dict['好'] += -0.6 * score 492 | # print(adj_scoreList_dict) 493 | adjScore_list=list(adj_scoreList_dict.values())#添加修正后的分数 494 | return adjScore_list 495 | 496 | if __name__ == '__main__': 497 | sentiment_dict_path = r"../source/sentiment_words_chinese.tsv" 498 | degree_dict_path = r"../source/degree_dict.txt" 499 | stop_dict_path = r"../source/哈工大停用词表.txt" 500 | # weibo_path=r"../source/2019-12-08info.json" 501 | # out_path=r"../source/Score.csv" 502 | 503 | weiboScore(4,29,6,20,2020,5) 504 | 505 | # jstvSocre(6,21,12,21,2020,6) 506 | 507 | 508 | -------------------------------------------------------------------------------- 
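The scoring rule implemented by Score.get2score_position above boils down to: for each sentiment word, walk the tokens since the previous sentiment word, flip the running weight on every negation, scale it by every degree adverb, then add weight times polarity (the real method additionally applies its "degree_reduce" adjustment when a negation precedes a degree word). A self-contained toy version of just that core rule, with hypothetical names and tiny inline dictionaries standing in for the TSV/TXT lexicons the real class loads:

def window_score(tokens, senti, negations, degrees):
    """senti: {word: signed polarity}; degrees: {word: multiplier}."""
    score, start = 0.0, 0
    for i, tok in enumerate(tokens):
        if tok not in senti:
            continue
        w = 1.0
        for j in range(start, i):           # tokens since the last sentiment word
            if tokens[j] in negations:
                w *= -1                     # a negation flips the sign
            elif tokens[j] in degrees:
                w *= degrees[tokens[j]]     # a degree adverb scales the weight
        score += w * senti[tok]
        start = i + 1                       # the next window starts after this hit
    return score

print(window_score(['不', '很', '开心'], {'开心': 5}, {'不'}, {'很': 2.0}))  # (-1 * 2.0) * 5 = -10.0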
/Analyse/sentimentDictionary/redme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Analyse/sentimentDictionary/snownlp-01.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from snownlp import SnowNLP 3 | s1 = SnowNLP(u"我今天很开心") 4 | print(u"s1情感分数:") 5 | print(s1.sentiments) 6 | 7 | s2 = SnowNLP(u"我今天很沮丧") 8 | print(u"s2情感分数:") 9 | print(s2.sentiments) 10 | 11 | s3 = SnowNLP(u"大傻瓜,你脾气真差,动不动就打人") 12 | print(u"s3情感分数:") 13 | print(s3.sentiments) 14 | -------------------------------------------------------------------------------- /Data/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/四阶段新浪新闻标题_title_肺炎/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/四阶段新浪新闻标题_title_肺炎/stage1News2019_sina_title.csv: -------------------------------------------------------------------------------- 1 | 我家宝宝是肺炎吗?这6个症状爸妈收好了,不明白就对照一下,https://k.sina.com.cn/article_1557303822_m5cd2920e03300l76u.html?from=baby,儿科医生鱼小南,2019-12-08 2 | 小儿肺炎可大可小,每次宝宝肺炎,妈妈都会非常担心,那么,https://k.sina.com.cn/article_2055410091_m7a8311ab03300md7i.html?from=baby,辣妈潮爸萌宝宝,2019-12-08 3 | 投资者提问:您好!公司13价肺炎球菌多糖结合疫苗目前处于样品检验阶段,待检...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-09/doc-iihnzhfz4707564.shtml,问董秘,2019-12-09 4 | 「肺炎 X 光病灶识别」挑战赛:几行代码,就能让医疗检测准确率 20% 的提高!,https://tech.sina.com.cn/roll/2019-12-09/doc-iihnzahi6354768.shtml,雷锋网,2019-12-09 5 | 康泰生物:13价肺炎球菌结合疫苗申请新药生产注册获得受理,https://finance.sina.com.cn/stock/relnews/cn/2019-12-09/doc-iihnzahi6342940.shtml,金融界网站,2019-12-09 6 | 70亿市场再迎入局者 康泰生物申请生产13价肺炎疫苗,https://news.sina.com.cn/c/2019-12-10/doc-iihnzahi6634089.shtml,新京报,2019-12-10 7 | 尤亮因发烧昏睡不起,怎料到医院都快烧成肺炎了,让小花十分担心,https://k.sina.com.cn/article_6824565942_m196c69cb600100oea2.html,爆料影视,2019-12-10 8 | 康泰生物(300601)13价肺炎疫苗报产受理,在研品种即将步入收获期,https://finance.sina.com.cn/stock/relnews/cn/2019-12-10/doc-iihnzhfz4931384.shtml,安信证券,2019-12-10 9 | 康泰生物(300601)13价肺炎疫苗上市申请获得受理,研发管线步入收获期,https://finance.sina.com.cn/stock/relnews/cn/2019-12-10/doc-iihnzahi6550405.shtml,广发证券,2019-12-10 10 | 儿子想让父亲住院治疗,只好让下属一起欺骗他们,向是肺炎,https://k.sina.com.cn/article_6427141121_m17f16640100100l4o6.html?from=movie,悲痛炭黑焦,2019-12-10 11 | 康泰生物(300601):重磅爆款13价肺炎疫苗正式报产 有望于2020年获批上市,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629257833199/index.phtml,中泰证券,2019-12-10 12 | 康泰生物(300601)公告点评:13价肺炎疫苗上市申请获得受理 研发管线步入收获期,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629257561373/index.phtml,广发证券,2019-12-10 13 | 康泰生物(300601):13价肺炎疫苗报产受理 在研品种即将步入收获期,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629243101365/index.phtml,安信证券,2019-12-10 14 | 新生宝宝口吐粘稠泡泡 当心是因肺炎,http://baby.sina.com.cn/health/bbjk/hxse/2019-12-10/doc-iihnzhfz4552714.shtml,信息时报,2019-12-10 15 | 妹妹生病得肺炎,小伙没钱治疗,不料遇见好心人帮助他们,https://k.sina.com.cn/article_6831302378_m1972d66ea00100oap6.html?from=ent&subch=star,小猴叔玩综艺,2019-12-11 16 | 康泰生物(300601)13价肺炎报产获受理,重磅品种陆续进入收获期,https://finance.sina.com.cn/stock/relnews/cn/2019-12-11/doc-iihnzahi6741569.shtml,太平洋证券,2019-12-11 17 | 注意了,加湿器使用不当可能会引发肺炎!,https://tech.sina.com.cn/roll/2019-12-11/doc-iihnzhfz5053589.shtml,中国家电网,2019-12-11 18 | 
康泰生物(300601):13价肺炎报产获受理 重磅品种陆续进入收获期,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629341595811/index.phtml,太平洋证券,2019-12-11 19 | 小秋沥川聊天,沥川说自己得了肺炎,哄小秋入睡!,https://k.sina.com.cn/article_6179910863_m17059f4cf03300liz2.html?from=health,舍友和你聊影视,2019-12-12 20 | 遇见王沥川:小秋沥川聊天,沥川说自己得了肺炎,哄小秋入睡!,http://v.sina.com.cn/mobile/2019-12-12/detail-iihnzhfz5353943.d.html,舍友和你聊影视,2019-12-12 21 | 贵州威宁一婴儿突发肺炎,却遇大雪封路,民警冒雪驱车护送就医,https://k.sina.com.cn/article_6539404397_m185c7646d02000lczz.html?from=local,冰视频,2019-12-16 22 | 间质性肺炎吃什么食物好?医生详解,https://k.sina.com.cn/article_3902018328_me894131803300l8g9.html?from=health,名医在线网,2019-12-16 23 | 医生讲堂 间质性肺炎是绝症吗,https://k.sina.com.cn/article_3902018328_me894131803300l8g7.html?from=health,名医在线网,2019-12-16 24 | 得了间质性肺炎能治好吗?医生这么说,https://k.sina.com.cn/article_3902018328_me894131803300l8fz.html?from=health,名医在线网,2019-12-16 25 | 孩子不发烧?肺部无异响?有些肺炎很隐匿、很狡猾,https://news.sina.com.cn/c/2019-12-17/doc-iihnzhfz6525011.shtml,新京报,2019-12-17 26 | 沈阳近期支原体肺炎发病率上升 孩子反复咳嗽发热需警惕,https://k.sina.com.cn/article_1829303331_m6d08f42303300uhgh.html?from=baby,直播生活官方微博,2019-12-17 27 | 投资者提问:董秘好!公司的23介多糖肺炎疫苗研发即将进入临床研究阶段,与此...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-17/doc-iihnzahi8028416.shtml,问董秘,2019-12-17 28 | 如何辨别儿童感冒还是肺炎?,https://k.sina.com.cn/article_1979899604_p7602ded402700kdbz.html?from=finance,涨涨麻麻,2019-12-18 29 | 河南新乡7岁女童王梓乐病情危重,因一次发烧引发肺炎肺水肿,https://k.sina.com.cn/article_1015473395_m3c86e4f303300qmp5.html?from=news&subch=onews,河北高速交警,2019-12-18 30 | 河南新乡7岁女童王梓乐病情危重,因一次发烧引发肺炎肺水肿,http://v.sina.com.cn/auto/2019-12-18/detail-iihnzahi8337506.d.html,河北高速交警,2019-12-18 31 | 重症肺炎患者引发呼吸困难,如何正确急救,https://k.sina.com.cn/article_5103305851_m1302e447b03300lpsf.html?from=health,医学微视,2019-12-18 32 | 孩子不发烧?肺部无异响?有些肺炎很隐匿、很狡猾,http://cq.sina.com.cn/health/tips/2019-12-18/detail-iihnzhfz6649053.shtml,地方站-新京报,2019-12-18 33 | 河南省新乡市7岁女童 王梓乐 肺炎肺水肿 引发多器官衰竭,http://v.sina.com.cn/auto/2019-12-18/detail-iihnzhfz6624077.d.html,河北高速交警,2019-12-18 34 | 宝宝感冒总是反反复复好不了,会发展成肺炎吗?,http://blog.sina.com.cn/s/blog_70db28510102yd95.html,鲍秀兰诊室,2019-12-18 35 | 婴儿肺炎窒息 交警紧急救助火速送医,https://k.sina.com.cn/article_1746072563_m6812f3f303300kl6t.html?from=news&subch=onews,早安江苏JSBC,2019-12-19 36 | 小儿肺炎的几大典型症状家长都知道吗,https://k.sina.com.cn/article_6545062778_m1861dbb7a00100ms1d.html?from=baby,小艳美食L,2019-12-19 37 | 婴幼儿发生呛奶,原因多样!如果处理不及时,轻者肺炎重者窒息,https://k.sina.com.cn/article_2718644634_ma20b399a02000mu7h.html?from=news&subch=onews,福建台新闻频道,2019-12-20 38 | 前苏联最后一位领导人戈尔巴乔夫因肺炎住院,http://v.sina.com.cn/mobile/2019-12-20/detail-iihnzhfz7119537.d.html,新京报我们视频,2019-12-20 39 | 情陷夜中环2大结局:江妈感染肺炎,海澜愧对母亲,决定去自首,https://k.sina.com.cn/article_7061291463_m1a4e2c1c7001014hkf.html?from=movie,娱乐刺客小蔓,2019-12-21 40 | 新生儿肺炎症状及表现,https://k.sina.com.cn/article_1817302641_m6c51d67102000ornj.html?from=health,寻医问药,2019-12-21 41 | 男子因为太辛苦的了肺炎,一家都很着急,https://k.sina.com.cn/article_6829993608_m197196e8800100v1d7.html?from=ent&subch=star,综艺我最全,2019-12-23 42 | 肺炎的“克星”发现了,经常吃一点,健脾润肺,赶走肺部炎症!,https://k.sina.com.cn/article_6832727138_m19743246200100njqn.html?from=mood,Queen之声,2019-12-24 43 | 肺炎很难除?绿豆加一物,常吃清除肺部垃圾,呼吸更平稳!,https://k.sina.com.cn/article_6811981953_m19606988100100oy7v.html?from=mood,好剧不能停2018,2019-12-24 44 | 男子一年得10次肺炎 一查竟是因为家里这东西,https://k.sina.com.cn/article_1807058715_m6bb5871b03300red7.html,辽视说天下,2019-12-25 45 | 投资者提问:请问贵公司13价肺炎疫苗何时才能进3合1.谢谢,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-25/doc-iihnzahi9945187.shtml,问董秘,2019-12-25 46 | 生死时速!河南交警接力救助肺炎早产儿 
下高速到医院用时仅十分钟,http://v.sina.com.cn/mobile/2019-12-25/detail-iihnzahi9938434.d.html,中国网,2019-12-25 47 | 肺炎是潜伏在老人身边的“杀手”,http://blog.sina.com.cn/s/blog_ab9e9eef0102ygwf.html,健康指南杂志社官博,2019-12-25 48 | 肺炎,http://v.sina.com.cn/mobile/2019-12-26/detail-iihnzahk0210440.d.html,澳洲妇幼Dr韩,2019-12-26 49 | 一段关于肺炎的科普视频,讲解和动画展示还是蛮清晰的。来源,https://k.sina.com.cn/article_2231724507_m850569db03300mdu9.html?from=animation,韩珊珊,2019-12-26 50 | 小伙子吃的不错,肺炎的猫咪。,https://k.sina.com.cn/article_1732807563_m67488b8b03300lvao.html?from=pet,兽医陈满福,2019-12-26 51 | 郭德纲使坏把于谦弄成肺炎,http://v.sina.com.cn/mobile/2019-12-26/detail-iihnzahk0140926.d.html,相声集结号,2019-12-26 52 | 沃森生物13价肺炎疫苗上市在即,https://finance.sina.com.cn/roll/2019-12-26/doc-iihnzhfz8454439.shtml,金融界,2019-12-26 53 | 这次打针太快了,哈哈,铁锤没反应过来,搞定,最后一针肺炎疫苗!,http://v.sina.com.cn/mobile/2019-12-26/detail-iihnzhfz8376331.d.html,鲸鱼老爸,2019-12-26 54 | 疫苗批签发周度跟踪:默沙东HPV全年超800万支 13价肺炎达475万支,https://finance.sina.com.cn/stock/relnews/us/2019-12-26/doc-iihnzahk0031844.shtml,安信证券股份有限公司,2019-12-26 55 | 药闻速递 | 首个国产13价肺炎疫苗即将获批上市;中药协发布致歉函 撤销对鸿茅药酒的表彰,https://finance.sina.com.cn/stock/stockzmt/2019-12-27/doc-iihnzahk0426758.shtml,新浪财经-自媒体综合,2019-12-27 56 | 投资者提问:13价肺炎疫苗黄镇老师采用多价肺炎多糖原料研发,这种方法是他发...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-27/doc-iihnzahk0400760.shtml,问董秘,2019-12-27 57 | 肺部的“清道夫”发现了,经常吃一点,清肺化痰,预防肺炎!,https://k.sina.com.cn/article_3964658581_mec4fe39500100le1v.html?from=mood,综艺笑呵呵,2019-12-27 58 | 6岁孩子得肺炎,母亲坚持带她去摄影导致晕倒高空坠落,真狠心,https://k.sina.com.cn/article_7241624853_m1afa26d1500100m3ta.html?from=movie,把我推下去莹莹,2019-12-27 59 | 带娃赴香港接种肺炎疫苗攻略,https://k.sina.com.cn/article_2141934923_p7fab554b02700qors.html?from=baby,孕事,2019-12-28 60 | 第一诊室老人得了肺炎必须用抗生素吗,https://k.sina.com.cn/article_6364273313_m17b571aa100100l31x.html?from=health,永日音响,2019-12-29 61 | 肺炎很难除?穿心莲加一物,常吃排除肺部毒素,声音更动听,https://k.sina.com.cn/article_6778174212_m19402bb0400100k3o4.html,毛小驴说剧情,2019-12-29 62 | 小儿肺炎症状,https://k.sina.com.cn/article_1817302641_m6c51d67102000p3na.html?from=health,寻医问药,2019-12-29 63 | 冬日使用羽绒寝具,出现长达3个月气喘虚弱,民众因此患上肺炎,https://k.sina.com.cn/article_2718644634_ma20b399a02000n49u.html?from=news&subch=onews,福建台新闻频道,2019-12-30 64 | 2015乙未年,巳午未火用神三合绊住,火减力不克金,金为肺,得肺炎,https://k.sina.com.cn/article_1098047880_m4172e18803300wyfs.html?from=health,武汉风水师谢淳西微博,2019-12-30 65 | 曾经的他红遍全国,一生只爱章子怡终身未娶,却因肺炎去世,https://k.sina.com.cn/article_7240660682_p1af93b6ca00100ov0l.html?from=ent&subch=star,王者天黑君翎儿,2019-12-30 66 | 郑贵浪:持续高热+咳嗽进展,小心儿童腺病毒肺炎,http://v.sina.com.cn/mobile/2019-12-30/detail-iihnzhfz9213723.d.html,广州日报,2019-12-30 67 | 关注!武汉市卫健委通报肺炎疫情:已发现27例病例,7例病情严重,https://k.sina.com.cn/article_6145283913_m16e49974902000y8g9.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 68 | 关于武汉肺炎疫情 武汉市委机关报回答6个问题,https://news.sina.com.cn/s/2019-12-31/doc-iihnzahk1223010.shtml,新浪新闻综合,2019-12-31 69 | 肺炎后的武汉华南海鲜城:卫生消毒比以往都频繁,https://news.sina.com.cn/s/2019-12-31/doc-iihnzahk1218328.shtml,界面,2019-12-31 70 | 武汉发现27例肺炎病例,7例病情严重,大部分为华南海鲜城经营户,https://k.sina.com.cn/article_6145283913_m16e49974902000y8e3.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 71 | 武汉现27例不明原因肺炎,鲁抗医药领携禽流感板块掀涨停潮,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1219428.shtml,金融界,2019-12-31 72 | 沃森生物13价肺炎球菌多糖结合疫苗上市注册申请获得批准 日出东方澄清称公司业务及研发未涉及有关网红带货领域,https://tech.sina.com.cn/roll/2019-12-31/doc-iihnzahk1208367.shtml,财联社,2019-12-31 73 | 武汉中心医院承认30日凌晨收治多位疑似肺炎病人,https://news.sina.com.cn/o/2019-12-31/doc-iihnzahk1201015.shtml,中国经营网,2019-12-31 74 | 武汉现27例病毒性肺炎,尚未分离出具体病毒类型,https://tech.sina.com.cn/roll/2019-12-31/doc-iihnzhfz9542283.shtml,《财经》杂志,2019-12-31 75 | 
武汉卫健委通报27例肺炎疫情:初步认为是病毒性肺炎,未发现人传人现象,https://finance.sina.com.cn/wm/2019-12-31/doc-iihnzahk1192017.shtml,新京报公众号,2019-12-31 76 | 首个国产13价肺炎疫苗获批上市,挑战辉瑞“利润奶牛”,https://news.sina.com.cn/o/2019-12-31/doc-iihnzahk1190540.shtml,第一财经网,2019-12-31 77 | #武汉肺炎未发现明显人传人现象#【实探病,http://v.sina.com.cn/finance/2019-12-31/detail-iihnzahk1186688.d.html,秒拍,2019-12-31 78 | 沃森生物:13价肺炎球菌多糖结合疫苗上市注册申请获批,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzhfz9534034.shtml,中国证券网,2019-12-31 79 | 医药股异动!针对武汉不明原因肺炎,官方回应:未发现明显人传人现象,https://finance.sina.com.cn/wm/2019-12-31/doc-iihnzahk1185283.shtml,中国证券报,2019-12-31 80 | 武汉:已发现27例肺炎病例 大部分为华南海鲜城经营户,https://k.sina.com.cn/article_6145283913_m16e49974902000y86o.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 81 | 沃森生物:13价肺炎球菌多糖结合疫苗上市注册获批,https://finance.sina.com.cn/stock/s/2019-12-31/doc-iihnzahk1182178.shtml,新浪财经,2019-12-31 82 | 实探病毒性肺炎收治医院:运营正常 无专门防疫通知,https://k.sina.com.cn/article_1649173367_m624c637705300qqox.html?from=health,每日经济新闻,2019-12-31 83 | 首个国产13价肺炎结合疫苗获批上市,每经独家获悉:价格低于进口疫苗,距正式接种还需3个月以上,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1181284.shtml,每经网,2019-12-31 84 | 沃森生物:公司13价肺炎球菌多糖结合疫苗上市注册申请获得批准,https://finance.sina.com.cn/stock/relnews/cn/2019-12-31/doc-iihnzhfz9522140.shtml,东方财富,2019-12-31 85 | 实探病毒性肺炎收治医院:运营正常 无专门防疫通知,https://k.sina.com.cn/article_6452231600_m180953db003301qx6r.html?from=finance,NBD视频,2019-12-31 86 | 沃森生物:13价肺炎结合疫苗上市注册申请获批,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1178969.shtml,金融界,2019-12-31 87 | 我家宝宝是肺炎吗?这6个症状爸妈收好了,不明白就对照一下,https://k.sina.com.cn/article_1557303822_m5cd2920e03300l76u.html?from=baby,儿科医生鱼小南,2019-12-08 88 | 小儿肺炎可大可小,每次宝宝肺炎,妈妈都会非常担心,那么,https://k.sina.com.cn/article_2055410091_m7a8311ab03300md7i.html?from=baby,辣妈潮爸萌宝宝,2019-12-08 89 | 投资者提问:您好!公司13价肺炎球菌多糖结合疫苗目前处于样品检验阶段,待检...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-09/doc-iihnzhfz4707564.shtml,问董秘,2019-12-09 90 | 「肺炎 X 光病灶识别」挑战赛:几行代码,就能让医疗检测准确率 20% 的提高!,https://tech.sina.com.cn/roll/2019-12-09/doc-iihnzahi6354768.shtml,雷锋网,2019-12-09 91 | 康泰生物:13价肺炎球菌结合疫苗申请新药生产注册获得受理,https://finance.sina.com.cn/stock/relnews/cn/2019-12-09/doc-iihnzahi6342940.shtml,金融界网站,2019-12-09 92 | 70亿市场再迎入局者 康泰生物申请生产13价肺炎疫苗,https://news.sina.com.cn/c/2019-12-10/doc-iihnzahi6634089.shtml,新京报,2019-12-10 93 | 尤亮因发烧昏睡不起,怎料到医院都快烧成肺炎了,让小花十分担心,https://k.sina.com.cn/article_6824565942_m196c69cb600100oea2.html,爆料影视,2019-12-10 94 | 康泰生物(300601)13价肺炎疫苗报产受理,在研品种即将步入收获期,https://finance.sina.com.cn/stock/relnews/cn/2019-12-10/doc-iihnzhfz4931384.shtml,安信证券,2019-12-10 95 | 康泰生物(300601)13价肺炎疫苗上市申请获得受理,研发管线步入收获期,https://finance.sina.com.cn/stock/relnews/cn/2019-12-10/doc-iihnzahi6550405.shtml,广发证券,2019-12-10 96 | 儿子想让父亲住院治疗,只好让下属一起欺骗他们,向是肺炎,https://k.sina.com.cn/article_6427141121_m17f16640100100l4o6.html?from=movie,悲痛炭黑焦,2019-12-10 97 | 康泰生物(300601):重磅爆款13价肺炎疫苗正式报产 有望于2020年获批上市,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629257833199/index.phtml,中泰证券,2019-12-10 98 | 康泰生物(300601)公告点评:13价肺炎疫苗上市申请获得受理 研发管线步入收获期,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629257561373/index.phtml,广发证券,2019-12-10 99 | 康泰生物(300601):13价肺炎疫苗报产受理 在研品种即将步入收获期,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629243101365/index.phtml,安信证券,2019-12-10 100 | 新生宝宝口吐粘稠泡泡 当心是因肺炎,http://baby.sina.com.cn/health/bbjk/hxse/2019-12-10/doc-iihnzhfz4552714.shtml,信息时报,2019-12-10 101 | 妹妹生病得肺炎,小伙没钱治疗,不料遇见好心人帮助他们,https://k.sina.com.cn/article_6831302378_m1972d66ea00100oap6.html?from=ent&subch=star,小猴叔玩综艺,2019-12-11 102 | 
康泰生物(300601)13价肺炎报产获受理,重磅品种陆续进入收获期,https://finance.sina.com.cn/stock/relnews/cn/2019-12-11/doc-iihnzahi6741569.shtml,太平洋证券,2019-12-11 103 | 注意了,加湿器使用不当可能会引发肺炎!,https://tech.sina.com.cn/roll/2019-12-11/doc-iihnzhfz5053589.shtml,中国家电网,2019-12-11 104 | 康泰生物(300601):13价肺炎报产获受理 重磅品种陆续进入收获期,http://stock.finance.sina.com.cn/stock/go.php/vReport_Show/kind/lastest/rptid/629341595811/index.phtml,太平洋证券,2019-12-11 105 | 小秋沥川聊天,沥川说自己得了肺炎,哄小秋入睡!,https://k.sina.com.cn/article_6179910863_m17059f4cf03300liz2.html?from=health,舍友和你聊影视,2019-12-12 106 | 遇见王沥川:小秋沥川聊天,沥川说自己得了肺炎,哄小秋入睡!,http://v.sina.com.cn/mobile/2019-12-12/detail-iihnzhfz5353943.d.html,舍友和你聊影视,2019-12-12 107 | 贵州威宁一婴儿突发肺炎,却遇大雪封路,民警冒雪驱车护送就医,https://k.sina.com.cn/article_6539404397_m185c7646d02000lczz.html?from=local,冰视频,2019-12-16 108 | 间质性肺炎吃什么食物好?医生详解,https://k.sina.com.cn/article_3902018328_me894131803300l8g9.html?from=health,名医在线网,2019-12-16 109 | 医生讲堂 间质性肺炎是绝症吗,https://k.sina.com.cn/article_3902018328_me894131803300l8g7.html?from=health,名医在线网,2019-12-16 110 | 得了间质性肺炎能治好吗?医生这么说,https://k.sina.com.cn/article_3902018328_me894131803300l8fz.html?from=health,名医在线网,2019-12-16 111 | 孩子不发烧?肺部无异响?有些肺炎很隐匿、很狡猾,https://news.sina.com.cn/c/2019-12-17/doc-iihnzhfz6525011.shtml,新京报,2019-12-17 112 | 沈阳近期支原体肺炎发病率上升 孩子反复咳嗽发热需警惕,https://k.sina.com.cn/article_1829303331_m6d08f42303300uhgh.html?from=baby,直播生活官方微博,2019-12-17 113 | 投资者提问:董秘好!公司的23介多糖肺炎疫苗研发即将进入临床研究阶段,与此...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-17/doc-iihnzahi8028416.shtml,问董秘,2019-12-17 114 | 如何辨别儿童感冒还是肺炎?,https://cj.sina.com.cn/articles/view/1979899604/p7602ded402700kdbz?from=finance,涨涨麻麻,2019-12-18 115 | 河南新乡7岁女童王梓乐病情危重,因一次发烧引发肺炎肺水肿,https://k.sina.com.cn/article_1015473395_m3c86e4f303300qmp5.html?from=news&subch=onews,河北高速交警,2019-12-18 116 | 河南新乡7岁女童王梓乐病情危重,因一次发烧引发肺炎肺水肿,http://v.sina.com.cn/auto/2019-12-18/detail-iihnzahi8337506.d.html,河北高速交警,2019-12-18 117 | 重症肺炎患者引发呼吸困难,如何正确急救,https://k.sina.com.cn/article_5103305851_m1302e447b03300lpsf.html?from=health,医学微视,2019-12-18 118 | 孩子不发烧?肺部无异响?有些肺炎很隐匿、很狡猾,http://cq.sina.com.cn/health/tips/2019-12-18/detail-iihnzhfz6649053.shtml,地方站-新京报,2019-12-18 119 | 河南省新乡市7岁女童 王梓乐 肺炎肺水肿 引发多器官衰竭,http://v.sina.com.cn/auto/2019-12-18/detail-iihnzhfz6624077.d.html,河北高速交警,2019-12-18 120 | 宝宝感冒总是反反复复好不了,会发展成肺炎吗?,http://blog.sina.com.cn/s/blog_70db28510102yd95.html,鲍秀兰诊室,2019-12-18 121 | 婴儿肺炎窒息 交警紧急救助火速送医,https://k.sina.com.cn/article_1746072563_m6812f3f303300kl6t.html?from=news&subch=onews,早安江苏JSBC,2019-12-19 122 | 小儿肺炎的几大典型症状家长都知道吗,https://k.sina.com.cn/article_6545062778_m1861dbb7a00100ms1d.html?from=baby,小艳美食L,2019-12-19 123 | 婴幼儿发生呛奶,原因多样!如果处理不及时,轻者肺炎重者窒息,https://k.sina.com.cn/article_2718644634_ma20b399a02000mu7h.html?from=news&subch=onews,福建台新闻频道,2019-12-20 124 | 前苏联最后一位领导人戈尔巴乔夫因肺炎住院,http://v.sina.com.cn/mobile/2019-12-20/detail-iihnzhfz7119537.d.html,新京报我们视频,2019-12-20 125 | 情陷夜中环2大结局:江妈感染肺炎,海澜愧对母亲,决定去自首,https://k.sina.com.cn/article_7061291463_m1a4e2c1c7001014hkf.html?from=movie,娱乐刺客小蔓,2019-12-21 126 | 新生儿肺炎症状及表现,https://k.sina.com.cn/article_1817302641_m6c51d67102000ornj.html?from=health,寻医问药,2019-12-21 127 | 男子因为太辛苦的了肺炎,一家都很着急,https://k.sina.com.cn/article_6829993608_m197196e8800100v1d7.html?from=ent&subch=star,综艺我最全,2019-12-23 128 | 肺炎的“克星”发现了,经常吃一点,健脾润肺,赶走肺部炎症!,https://k.sina.com.cn/article_6832727138_m19743246200100njqn.html?from=mood,Queen之声,2019-12-24 129 | 肺炎很难除?绿豆加一物,常吃清除肺部垃圾,呼吸更平稳!,https://k.sina.com.cn/article_6811981953_m19606988100100oy7v.html?from=mood,好剧不能停2018,2019-12-24 130 | 男子一年得10次肺炎 
一查竟是因为家里这东西,https://k.sina.com.cn/article_1807058715_m6bb5871b03300red7.html,辽视说天下,2019-12-25 131 | 投资者提问:请问贵公司13价肺炎疫苗何时才能进3合1.谢谢,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-25/doc-iihnzahi9945187.shtml,问董秘,2019-12-25 132 | 生死时速!河南交警接力救助肺炎早产儿 下高速到医院用时仅十分钟,http://v.sina.com.cn/mobile/2019-12-25/detail-iihnzahi9938434.d.html,中国网,2019-12-25 133 | 肺炎是潜伏在老人身边的“杀手”,http://blog.sina.com.cn/s/blog_ab9e9eef0102ygwf.html,健康指南杂志社官博,2019-12-25 134 | 肺炎,http://v.sina.com.cn/mobile/2019-12-26/detail-iihnzahk0210440.d.html,澳洲妇幼Dr韩,2019-12-26 135 | 一段关于肺炎的科普视频,讲解和动画展示还是蛮清晰的。来源,https://k.sina.com.cn/article_2231724507_m850569db03300mdu9.html?from=animation,韩珊珊,2019-12-26 136 | 小伙子吃的不错,肺炎的猫咪。,https://k.sina.com.cn/article_1732807563_m67488b8b03300lvao.html?from=pet,兽医陈满福,2019-12-26 137 | 郭德纲使坏把于谦弄成肺炎,http://v.sina.com.cn/mobile/2019-12-26/detail-iihnzahk0140926.d.html,相声集结号,2019-12-26 138 | 沃森生物13价肺炎疫苗上市在即,https://finance.sina.com.cn/roll/2019-12-26/doc-iihnzhfz8454439.shtml,金融界,2019-12-26 139 | 这次打针太快了,哈哈,铁锤没反应过来,搞定,最后一针肺炎疫苗!,http://v.sina.com.cn/mobile/2019-12-26/detail-iihnzhfz8376331.d.html,鲸鱼老爸,2019-12-26 140 | 疫苗批签发周度跟踪:默沙东HPV全年超800万支 13价肺炎达475万支,https://finance.sina.com.cn/stock/relnews/us/2019-12-26/doc-iihnzahk0031844.shtml,安信证券股份有限公司,2019-12-26 141 | 药闻速递 | 首个国产13价肺炎疫苗即将获批上市;中药协发布致歉函 撤销对鸿茅药酒的表彰,https://finance.sina.com.cn/stock/stockzmt/2019-12-27/doc-iihnzahk0426758.shtml,新浪财经-自媒体综合,2019-12-27 142 | 投资者提问:13价肺炎疫苗黄镇老师采用多价肺炎多糖原料研发,这种方法是他发...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-27/doc-iihnzahk0400760.shtml,问董秘,2019-12-27 143 | 肺部的“清道夫”发现了,经常吃一点,清肺化痰,预防肺炎!,https://k.sina.com.cn/article_3964658581_mec4fe39500100le1v.html?from=mood,综艺笑呵呵,2019-12-27 144 | 6岁孩子得肺炎,母亲坚持带她去摄影导致晕倒高空坠落,真狠心,https://k.sina.com.cn/article_7241624853_m1afa26d1500100m3ta.html?from=movie,把我推下去莹莹,2019-12-27 145 | 带娃赴香港接种肺炎疫苗攻略,https://k.sina.com.cn/article_2141934923_p7fab554b02700qors.html?from=baby,孕事,2019-12-28 146 | 第一诊室老人得了肺炎必须用抗生素吗,https://k.sina.com.cn/article_6364273313_m17b571aa100100l31x.html?from=health,永日音响,2019-12-29 147 | 肺炎很难除?穿心莲加一物,常吃排除肺部毒素,声音更动听,https://k.sina.com.cn/article_6778174212_m19402bb0400100k3o4.html,毛小驴说剧情,2019-12-29 148 | 小儿肺炎症状,https://k.sina.com.cn/article_1817302641_m6c51d67102000p3na.html?from=health,寻医问药,2019-12-29 149 | 冬日使用羽绒寝具,出现长达3个月气喘虚弱,民众因此患上肺炎,https://k.sina.com.cn/article_2718644634_ma20b399a02000n49u.html?from=news&subch=onews,福建台新闻频道,2019-12-30 150 | 2015乙未年,巳午未火用神三合绊住,火减力不克金,金为肺,得肺炎,https://k.sina.com.cn/article_1098047880_m4172e18803300wyfs.html?from=health,武汉风水师谢淳西微博,2019-12-30 151 | 曾经的他红遍全国,一生只爱章子怡终身未娶,却因肺炎去世,https://k.sina.com.cn/article_7240660682_p1af93b6ca00100ov0l.html?from=ent&subch=star,王者天黑君翎儿,2019-12-30 152 | 郑贵浪:持续高热+咳嗽进展,小心儿童腺病毒肺炎,http://v.sina.com.cn/mobile/2019-12-30/detail-iihnzhfz9213723.d.html,广州日报,2019-12-30 153 | 关注!武汉市卫健委通报肺炎疫情:已发现27例病例,7例病情严重,https://k.sina.com.cn/article_6145283913_m16e49974902000y8g9.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 154 | 关于武汉肺炎疫情 武汉市委机关报回答6个问题,https://news.sina.com.cn/s/2019-12-31/doc-iihnzahk1223010.shtml,新浪新闻综合,2019-12-31 155 | 肺炎后的武汉华南海鲜城:卫生消毒比以往都频繁,https://news.sina.com.cn/s/2019-12-31/doc-iihnzahk1218328.shtml,界面,2019-12-31 156 | 武汉发现27例肺炎病例,7例病情严重,大部分为华南海鲜城经营户,https://k.sina.com.cn/article_6145283913_m16e49974902000y8e3.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 157 | 武汉现27例不明原因肺炎,鲁抗医药领携禽流感板块掀涨停潮,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1219428.shtml,金融界,2019-12-31 158 | 沃森生物13价肺炎球菌多糖结合疫苗上市注册申请获得批准 
日出东方澄清称公司业务及研发未涉及有关网红带货领域,https://tech.sina.com.cn/roll/2019-12-31/doc-iihnzahk1208367.shtml,财联社,2019-12-31 159 | 武汉中心医院承认30日凌晨收治多位疑似肺炎病人,https://news.sina.com.cn/o/2019-12-31/doc-iihnzahk1201015.shtml,中国经营网,2019-12-31 160 | 武汉现27例病毒性肺炎,尚未分离出具体病毒类型,https://tech.sina.com.cn/roll/2019-12-31/doc-iihnzhfz9542283.shtml,《财经》杂志,2019-12-31 161 | 武汉卫健委通报27例肺炎疫情:初步认为是病毒性肺炎,未发现人传人现象,https://finance.sina.com.cn/wm/2019-12-31/doc-iihnzahk1192017.shtml,新京报公众号,2019-12-31 162 | 首个国产13价肺炎疫苗获批上市,挑战辉瑞“利润奶牛”,https://news.sina.com.cn/o/2019-12-31/doc-iihnzahk1190540.shtml,第一财经网,2019-12-31 163 | #武汉肺炎未发现明显人传人现象#【实探病,http://v.sina.com.cn/finance/2019-12-31/detail-iihnzahk1186688.d.html,秒拍,2019-12-31 164 | 沃森生物:13价肺炎球菌多糖结合疫苗上市注册申请获批,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzhfz9534034.shtml,中国证券网,2019-12-31 165 | 医药股异动!针对武汉不明原因肺炎,官方回应:未发现明显人传人现象,https://finance.sina.com.cn/wm/2019-12-31/doc-iihnzahk1185283.shtml,中国证券报,2019-12-31 166 | 武汉:已发现27例肺炎病例 大部分为华南海鲜城经营户,https://k.sina.com.cn/article_6145283913_m16e49974902000y86o.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 167 | 沃森生物:13价肺炎球菌多糖结合疫苗上市注册获批,https://finance.sina.com.cn/stock/s/2019-12-31/doc-iihnzahk1182178.shtml,新浪财经,2019-12-31 168 | 实探病毒性肺炎收治医院:运营正常 无专门防疫通知,https://k.sina.com.cn/article_1649173367_m624c637705300qqox.html?from=health,每日经济新闻,2019-12-31 169 | 首个国产13价肺炎结合疫苗获批上市,每经独家获悉:价格低于进口疫苗,距正式接种还需3个月以上,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1181284.shtml,每经网,2019-12-31 170 | 沃森生物:公司13价肺炎球菌多糖结合疫苗上市注册申请获得批准,https://finance.sina.com.cn/stock/relnews/cn/2019-12-31/doc-iihnzhfz9522140.shtml,东方财富,2019-12-31 171 | 实探病毒性肺炎收治医院:运营正常 无专门防疫通知,https://k.sina.com.cn/article_6452231600_m180953db003301qx6r.html?from=finance,NBD视频,2019-12-31 172 | 沃森生物:13价肺炎结合疫苗上市注册申请获批,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1178969.shtml,金融界,2019-12-31 173 | 武汉市卫健委通报肺炎疫情情况:已发现27例病例 7例病情严重,https://k.sina.com.cn/article_6145283913_m16e49974902000y85f.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 174 | 沃森生物晚间公告称,公司13价肺炎球菌多糖结合疫苗上市注册申请获得批准。,https://finance.sina.com.cn/7x24/2019-12-31/doc-iihnzhfz9517349.shtml,新浪财经,2019-12-31 175 | 武汉肺炎疫情未现明显人传人现象,预防“病毒性肺炎”记住这3点,https://k.sina.com.cn/article_6145283913_m16e49974902000y844.html?from=news&subch=onews,看看新闻KNEWS,2019-12-31 176 | 武汉已发现27例病毒性肺炎:该肺炎常见流感病毒引发,https://k.sina.com.cn/article_5463794433_m145aae30100100qmdv.html?from=society,新京报动新闻,2019-12-31 177 | 武汉肺炎病因尚未明确 一大波抗生素公司纷纷涨停,https://finance.sina.com.cn/stock/stockzmt/2019-12-31/doc-iihnzahk1169960.shtml,新浪财经-自媒体综合,2019-12-31 178 | 病毒性肺炎,http://blog.sina.com.cn/s/blog_3f7db8260102yxb1.html,张友平医生,2019-12-31 179 | 最新!武汉市卫健委通报当前肺炎疫情,https://k.sina.com.cn/article_1977460817_m75dda85103300py4l.html?from=health,央视网快看,2019-12-31 180 | 什么是肺炎,http://v.sina.com.cn/mobile/2019-12-31/detail-iihnzahk1160146.d.html,秦岭二月,2019-12-31 181 | 华大基因:网传武汉肺炎检测报告并非来自华大基因,https://finance.sina.com.cn/stock/s/2019-12-31/doc-iihnzahk1158633.shtml,新浪财经,2019-12-31 182 | 华大基因:网传武汉肺炎检测报告并非来自华大基因,https://finance.sina.com.cn/7x24/2019-12-31/doc-iihnzhfz9499591.shtml,新浪财经,2019-12-31 183 | 首个国产十三价肺炎球菌多糖结合疫苗获批上市,https://finance.sina.com.cn/7x24/2019-12-31/doc-iihnzhfz9498406.shtml,新浪财经,2019-12-31 184 | 华大基因:网传武汉肺炎检测报告并非来自华大基因,https://news.sina.com.cn/c/2019-12-31/doc-iihnzahk1157815.shtml,新京报,2019-12-31 185 | 首个国产十三价肺炎球菌多糖结合疫苗获批上市,https://tech.sina.com.cn/roll/2019-12-31/doc-iihnzahk1157297.shtml,21世纪经济报道,2019-12-31 186 | 武汉发现不明原因肺炎 国家卫建委已派专家实地检测,http://v.sina.com.cn/mobile/2019-12-31/detail-iihnzahk1154929.d.html,中国经济网,2019-12-31 187 | 
武汉确认27例不明原因肺炎是病毒性肺炎,不确定是SARS,https://tech.sina.com.cn/roll/2019-12-31/doc-iihnzahk1152450.shtml,钛媒体,2019-12-31 188 | 为什么吸烟的人容易得肺炎?知道这几点,不用害怕二次患上肺炎!,https://k.sina.com.cn/article_6183821245_m170959fbd03300n71u.html?from=health,生活健康的小诀窍,2019-12-31 189 | 投资者提问:公司有治疗肺炎的中药颗粒吗,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-31/doc-iihnzhfz9489624.shtml,问董秘,2019-12-31 190 | 【财经下午茶】直击武汉肺炎事发海鲜批发市场,商户仍在正常营业,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1154727.shtml,中新经纬,2019-12-31 191 | 武汉肺炎未发现明显人传人现象,http://v.sina.com.cn/mobile/2019-12-31/detail-iihnzhfz9518300.d.html,新文化报,2019-12-31 192 | 湖南卫生热线:如有不明原因肺炎需登记,https://k.sina.com.cn/article_6066193547_m16992c48b05300kjjt.html?from=health,七环视频,2019-12-31 193 | 投资者提问:董秘,武汉肺炎事件是否能申请加快新药进度?或者针对此病的临床?,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-31/doc-iihnzhfz9485851.shtml,问董秘,2019-12-31 194 | 投资者提问:刚刚发生的肺炎,看之前的新药对肺炎等也有疗效,请问董秘,这样是...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-31/doc-iihnzhfz9485967.shtml,问董秘,2019-12-31 195 | 肺炎高发季,病毒性肺炎如何预防,专家强调这3点,https://k.sina.com.cn/article_6046246944_m16862682002000ouoi.html?from=health,尚医健康,2019-12-31 196 | 投资者提问:甲基化能检测病毒性肺炎吗?,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-31/doc-iihnzahk1144620.shtml,问董秘,2019-12-31 197 | 武汉出现不明原因肺炎 A股疫苗概念板块多股涨停,https://finance.sina.com.cn/roll/2019-12-31/doc-iihnzahk1146962.shtml,江苏经济报,2019-12-31 198 | 视频-紧邻湖北 湖南卫生热线:如有不明原因肺炎需登记,http://video.sina.com.cn/p/news/2019-12-31/detail-iihnzhfz9483093.d.html,澎湃新闻,2019-12-31 199 | 投资者提问:请问:武汉发现了不明原因肺炎,公司是否有产品可以治疗肺炎?,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-31/doc-iihnzahk1141588.shtml,问董秘,2019-12-31 200 | 武汉市卫健委通报:多例肺炎病例为病毒性肺炎,https://news.sina.com.cn/o/2019-12-31/doc-iihnzahk1140782.shtml,新华网,2019-12-31 201 | 华大基因否认武汉不明原因肺炎事件疑似检测报告,https://news.sina.com.cn/s/2019-12-31/doc-iihnzahk1140397.shtml,界面,2019-12-31 202 | 紧邻湖北 湖南卫生热线:如有不明原因肺炎需登记,https://news.sina.com.cn/c/2019-12-31/doc-iihnzhfz9481422.shtml,澎湃新闻,2019-12-31 203 | 武汉肺炎疫情样品已经送至武汉国家生物安全实验室,https://finance.sina.com.cn/china/gncj/2019-12-31/doc-iihnzhfz9477163.shtml,新浪财经综合,2019-12-31 204 | 武汉肺炎病原检测正在进行 中国已具备最危险病毒研究条件,https://finance.sina.com.cn/7x24/2019-12-31/doc-iihnzhfz9476131.shtml,新浪财经,2019-12-31 205 | 武汉肺炎病原检测预计今天出结果:按常规流程处理,https://news.sina.com.cn/o/2019-12-31/doc-iihnzhfz9475740.shtml,第一财经网,2019-12-31 206 | 最新!武汉通报肺炎疫情:尚未发现明显人传人现象,https://news.sina.com.cn/o/2019-12-31/doc-iihnzhfz9477649.shtml,中国青年报,2019-12-31 207 | 不明原因肺炎出现地:商户戴口罩做生意,http://v.sina.com.cn/mobile/2019-12-31/detail-iihnzhfz9473185.d.html,新浪视频,2019-12-31 208 | 【[话筒]科普!#肺炎的症状及发病原因是,http://v.sina.com.cn/finance/2019-12-31/detail-iihnzahk1130757.d.html,秒拍,2019-12-31 209 | 视频-武汉发现不明原因肺炎:事发海鲜市场商户戴口罩营业,http://video.sina.com.cn/p/news/2019-12-31/detail-iihnzahk1131302.d.html,澎湃新闻,2019-12-31 210 | 武汉通报肺炎疫情 有2例病情好转拟于近期出院,https://finance.sina.com.cn/wm/2019-12-31/doc-iihnzhfz9471686.shtml,长安街知事,2019-12-31 211 | 武汉出现肺炎疫情 7例病情严重 啥是不明原因肺炎?,https://k.sina.com.cn/article_5463794433_m145aae30100100qmax.html?from=society,新京报动新闻,2019-12-31 212 | 投资者提问:请问公司有没生产抗病毒的药物?尤其是对肺炎之类有预防的药物有吗...,https://finance.sina.com.cn/stock/relnews/dongmiqa/2019-12-31/doc-iihnzhfz9468204.shtml,问董秘,2019-12-31 213 | -------------------------------------------------------------------------------- /Data/四阶段新浪新闻标题_title_肺炎/stage2News_sina_title.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Ironstarboy/emotionAnalysis/098c669f15f99e48d38bbfb6137b2d13bbde13c7/Data/四阶段新浪新闻标题_title_肺炎/stage2News_sina_title.csv
--------------------------------------------------------------------------------
/Data/四阶段新浪新闻标题_title_肺炎/stage3News_sina_title.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ironstarboy/emotionAnalysis/098c669f15f99e48d38bbfb6137b2d13bbde13c7/Data/四阶段新浪新闻标题_title_肺炎/stage3News_sina_title.csv
--------------------------------------------------------------------------------
/Data/四阶段新浪新闻标题_title_肺炎/stage4News_sina_title.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ironstarboy/emotionAnalysis/098c669f15f99e48d38bbfb6137b2d13bbde13c7/Data/四阶段新浪新闻标题_title_肺炎/stage4News_sina_title.csv
--------------------------------------------------------------------------------
/Data/相关内容在NJUBOX上.md:
--------------------------------------------------------------------------------
https://box.nju.edu.cn/f/d76288f81d8d4aaaa516/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# emotionAnalysis
Sentiment analysis of news and Weibo comments during the COVID-19 epidemic, based on a sentiment dictionary and machine learning

# Data Science Basics in SWI, NJU, 2020-Fall

> ## Computational Sociology: A Study of Social Mentality under COVID-19 Based on NLP Techniques

Cong Jin, YDJSIR, Sugar Xu's project of 2020 Data Science Basic Course in SWI, NJU.

This is the open-source release, not the version used in the development environment.

## File structure

```bash
│ LICENSE
│ README.md
├─Analyse # all code used while analysing the data
├─Data # the raw data and everything derived from it
├─Report # sources of the report and the finished report
└─Spyder # crawler code
```

The file structure was tidied up after the fact and is not the state of the working directory, so the paths referenced in the code need minor adjustment before the scripts will run.

The raw report data will be released after grading, with the relevant keywords removed.

> ### Layout of the `Data` directory
>
> The directory contains 7 folders, corresponding to `stage0` - `stage6`.
>
> ##### Layout inside each stage folder
>
> ```bash
> │ COVkeywords-Stage-.json # epidemic-related keywords after manual screening
> │ COVkeywords-Stage.json # epidemic-related keywords before screening
> │ keywords-Stage.json # raw results extracted from JSTV Lizhi News
> │ ratioByDate.png # daily share of key epidemic-related Weibo posts within the stage
> │ SaveTest.png # epidemic-relevance distribution fit, figure 1
> │ SaveTest_Fit.png # epidemic-relevance distribution fit, figure 2
> │ stageCOVWeibo.json # key epidemic-related Weibo posts of the stage (in chronological order)
> │ stageCOVWeiboByImportance.json # key epidemic-related Weibo posts of the stage (by epidemic relevance)
> | SaveTest-热度.png # share of each popularity metric
> │ stageInfo.json # basic information about the stage
> │ weiboPolar.png # sentiment polarity of the key epidemic-related Weibo posts
> | weiboEmotion.png # sentiment tendency of the stage's epidemic-related Weibo posts
> ├─YYYY-MM-DD-
> ├─YYYY-MM-DD-
> ├─YYYY-MM-DD-
> ├─YYYY-MM-DD-
> ...
> └─YYYY-MM-DD-
> ```
>
> ##### Layout inside each date folder
>
> ```bash
> YYYY-MM-DD
> | jstvRAW.csv # raw Lizhi News rows retrieved with the epidemic keyword search
> | keywords.json # keywords extracted from Lizhi News bodies, with their TextRank weights multiplied by 100
> | wordcloud.html # word cloud generated from Lizhi News
> | blog-Scored.json # every Weibo post together with its epidemic-relevance score
> | blog-COV.json # the Weibo posts filtered as COVID-related
> | blogInfo.json # basic information about the day's posts
> | weiboEmotion.png # radar chart of the dictionary-based sentiment analysis of the day's key epidemic-related comments
> └─weiboEmotion.csv # raw data behind that dictionary-based sentiment analysis
> ```
--------------------------------------------------------------------------------
/Report/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Report/数据获取.md:
--------------------------------------------------------------------------------
## Data acquisition

We crawled Sina Weibo, JSTV's Lizhi News (荔枝新闻) and Sina News. Sina Weibo was crawled through the mobile-web API (m.weibo.cn), while Lizhi News and Sina News were crawled by parsing the HTML source; details can be found in our open-source repository. Because we cooperated closely with Hao Chenjie while developing the Weibo crawler, we used, with his consent, the data he collected. After processing, all of the data is in JSON format, which makes reading and writing straightforward.

## Data analysis

To trace, at a finer granularity, how social mentality shifted as the epidemic evolved, we took the day as the basic unit of study. Under each stage folder, the raw data split by day and the processed results sit in the corresponding date folder, while the stage-level summary information is written at the stage folder level. Because the stages differ in character, the kinds of data available on different dates may vary, but the naming scheme is uniform. For example, when selecting candidate epidemic-related keywords in stage 0 and stage 1, on days where Lizhi News had no matching articles, the source text for keyword generation was supplemented with Sina News headlines. See the `README` under the `Data` folder for details.

We first extracted epidemic keywords for each stage, and this step was carried out separately per stage. From the Lizhi News bodies retrieved with the query "肺炎" (Sina News headlines were substituted on some days), we segmented the text, removed stop words, and generated the top 36 keywords for each day (the results are normalized so that the heaviest word has weight 1); we then multiplied the weights by 100, floored all values, rendered a word cloud per day, and stored it. In these word clouds the day-by-day evolution of the focus of epidemic coverage is plainly visible. On this basis we merged the daily results by summing the weights of identical words into one larger keyword table, then took the base-10 logarithm of each word's weight and normalized again, producing a raw table of epidemic-related keywords. This design effectively prevents a single keyword's weight from growing so large (e.g. "新冠" or "肺炎") that any text containing the word would score too high, and it also avoids what can happen when TF-IDF or TextRank is applied directly to all keywords at once: losing words that appear only on particular days yet track the course of the epidemic closely. Obviously no manual step is involved up to this point, so some noise inevitably slips into the keywords; we therefore also screened them by hand.
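To make the aggregation above concrete, here is a minimal sketch of the merge-log-normalize step. It assumes each day's keywords are stored as a JSON list of `[word, weight]` pairs and adds 1 before taking the logarithm to keep weights positive; both details are illustrative choices, not the exact code under `Analyse/keyWordsGeneration`.

```python
import json
import math
from collections import defaultdict


def merge_daily_keywords(daily_files):
    """Sum per-day keyword weights, damp them with log10, renormalize to [0, 1]."""
    total = defaultdict(float)
    for path in daily_files:
        with open(path, encoding='utf-8') as fp:
            for word, weight in json.load(fp):  # assumed layout: [["word", weight], ...]
                total[word] += weight
    # the logarithm keeps ubiquitous words such as "肺炎" from dominating the table
    logged = {word: math.log10(value + 1) for word, value in total.items()}
    top = max(logged.values())
    return {word: value / top for word, value in logged.items()}
```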
Next, on the basis of each stage's epidemic-related keywords, we filtered out the Weibo posts most relevant to the epidemic as the data source for further analysis. In this step we first segment each post body, then check the tokens one by one against the keyword table; every hit adds the keyword's weight times its number of occurrences to the post's epidemic-relevance score. At the same step, each post's comments are trimmed down, keeping only the first fifth. Once every post has a relevance score, we sort the scores in ascending order and fit a function to them:
$$
f(x) = a \cdot b^{\,x-d} + c
$$
After fitting, we first compute the $MSE$, $RMSE$ and $R^2$ to check the goodness of fit, then plot the corresponding figure and inspect the distribution of relevance in it. The same figure also carries scatter plots of the posts' popularity, ordered by ascending relevance (seen through three metrics: likes, comments and reposts), to probe the relationship between epidemic relevance and popularity.
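A minimal sketch of this fitting step with `scipy`; the helper name, the initial guess `p0` and the `maxfev` setting are illustrative choices rather than the project's exact code.

```python
import numpy as np
from scipy.optimize import curve_fit


def f(x, a, b, c, d):
    return a * np.power(b, x - d) + c


def fit_relevance(scores):
    """Fit the sorted relevance scores and report MSE, RMSE and R^2."""
    y = np.sort(np.asarray(scores, dtype=float))
    x = np.arange(len(y), dtype=float)
    popt, _ = curve_fit(f, x, y, p0=[1.0, 1.01, 0.0, 0.0], maxfev=10000)
    pred = f(x, *popt)
    mse = float(np.mean((y - pred) ** 2))
    rmse = float(np.sqrt(mse))
    r2 = 1.0 - float(np.sum((y - pred) ** 2) / np.sum((y - np.mean(y)) ** 2))
    return popt, mse, rmse, r2
```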
After that, we use the fitted curve to take an upper-percentile point as a threshold (by default the "knee" of the curve, subject to manual review and adjustment). Posts whose epidemic relevance is above the threshold are treated as key epidemic-related posts, and all of them are stored in the corresponding files. At the same time, we compute and plot the daily share of epidemic-related posts, which reflects how much attention the central media paid to epidemic news that day and how important that day's epidemic-related news was.

With these steps done, we have a reasonable grasp of the distribution, popularity and importance of epidemic-related information in the different stages, which lays the groundwork for the analyses below.
--------------------------------------------------------------------------------
/Report/研究背景.md:
--------------------------------------------------------------------------------
# Research background

## Social context

As society becomes more information-driven and technologies such as big data and artificial intelligence keep spreading, public emotions, ever more diverse and complex, are changing at an unprecedented pace under the internet's powerful capacity for dissemination. The spread of the internet allows more people's feelings and ideas to be expressed and propagated in digital space, freeing researchers to some extent from the constraints of traditional survey methods for studying social mentality: raw research material can be collected by digital means, and the concrete process of emotional change can be studied quantitatively. The novel coronavirus (COVID-19) is currently ravaging the world; it has had an enormous impact on people's work and daily life, and it has given rise to a distinctive online social mentality and public mood. Grounded in this COVID-19 epidemic (hereafter "the epidemic"), and aided by suitable data, quantitative methods and NLP (Natural Language Processing) techniques, we can therefore gauge with reasonable accuracy how this singular event affected the mentality of the Chinese public across its different phases, and offer some advice to the central media and the government on guiding social mentality in the phase of normalized epidemic control.

## Literature review

> TODO

## Research overview

Using crawlers, we first collected the People's Daily's official Sina Weibo posts and their comments from December 8, 2019 to June 20, 2020, all Sina News headlines over the same period, and the full text of every JSTV Lizhi News article matching the keyword "肺炎" from March 2019 to December 2020. On top of this data, we used TextRank and related algorithms together with a variety of statistical methods to extract epidemic-related keyword corpora from the People's Daily Weibo content and comments, the Sina News headlines and the JSTV Lizhi News articles, and to analyse their distribution. We then applied a mentality dictionary to the key epidemic-related People's Daily posts to analyse multiple emotions, and used machine learning to assign positive/negative polarity to Lizhi News bodies, in order to study how public mentality changed during the epidemic; we also did extensive visualization work on the results. Finally, synthesising these analyses, we sketch a broad picture of how the mentality of Chinese society has evolved since the COVID-19 outbreak.

In the course of the study we divide the epidemic into roughly seven periods. Stage 0 runs from December 8, 2019, when the Wuhan Health Commission reported the first COVID-19 case, to December 26, 2019. Stages 1 through 5 correspond to the first through fifth stages in the white paper "Fighting COVID-19: China in Action": stage 1, immediate response to the sudden outbreak (December 27, 2019 to January 19, 2020); stage 2, initial containment of the spread (January 20 to February 20, 2020); stage 3, new domestic cases gradually dropping to single digits (February 21 to March 17, 2020); stage 4, decisive results in the battles to defend Wuhan and Hubei (March 18 to April 28, 2020); and stage 5, nationwide epidemic control entering normalization (April 29 to June 20, 2020). Stage 6 covers the period after that (with data collected up to December 21, 2020). All stages mentioned below refer to this division.
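For reference, the division above is easy to keep in machine-readable form. The sketch below is only an illustration of such a lookup table (the folder names and field names are assumptions); the trailing dash follows the `YYYY-MM-DD-` date format used elsewhere in the project.

```python
# Stage boundaries from the division above; the dictionary layout is assumed.
stage = {
    0: {'path': 'stage0', 'beginDate': '2019-12-08-', 'endDate': '2019-12-26-'},
    1: {'path': 'stage1', 'beginDate': '2019-12-27-', 'endDate': '2020-01-19-'},
    2: {'path': 'stage2', 'beginDate': '2020-01-20-', 'endDate': '2020-02-20-'},
    3: {'path': 'stage3', 'beginDate': '2020-02-21-', 'endDate': '2020-03-17-'},
    4: {'path': 'stage4', 'beginDate': '2020-03-18-', 'endDate': '2020-04-28-'},
    5: {'path': 'stage5', 'beginDate': '2020-04-29-', 'endDate': '2020-06-20-'},
    6: {'path': 'stage6', 'beginDate': '2020-06-21-', 'endDate': '2020-12-21-'},
}
```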
> On one hand, the spread of the internet lets more people's emotions and ideas be expressed and transmitted in digital space, so emotions now spread wider than ever; on the other hand, with precision targeting and personalized push technologies in use, "information cocoons" have begun to appear, polarizing particular emotions in certain groups to some degree.
--------------------------------------------------------------------------------
/Spyder/Weibo.py:
--------------------------------------------------------------------------------
import json
import requests
import time
import random
from pyquery import PyQuery as pq

limitOfErrors = 10
startPage = 0


def run():
    BrowserHeaders = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    }

    # common URL of the Weibo user-homepage API
    url = 'https://m.weibo.cn/api/container/getIndex'

    # uids of every user to crawl
    uid_list = ['2212518065']

    # outermost loop: finish one user per iteration
    for uid in uid_list:
        # url + param gives the homepage API address of the given user
        param = {
            'type': 'uid',
            'value': uid,
        }
        # request the user's homepage
        response = requests.get(url=url, params=param, headers=BrowserHeaders)
        homepage = response.json()

        userInfo = {
            '微博用户名': homepage['data']['userInfo']['screen_name'],
            '微博主页地址': homepage['data']['userInfo']['profile_url'],
            '微博认证名': homepage['data']['userInfo']['verified_reason'],
            '微博说明': homepage['data']['userInfo']['description'],
            '关注数量': homepage['data']['userInfo']['follow_count'],
            '粉丝数量': homepage['data']['userInfo']['followers_count'],
        }

        print(userInfo)
        # write the dict to the file, overwriting it; the file is created if absent
        with open('dingxiangdoctor.json', 'w', encoding='utf-8') as fp:
            fp.write('[\n')
            fp.write(json.dumps(userInfo, indent=4, separators=(',', ':'), ensure_ascii=False))
            # fp.write('\n')
        fp = open('dingxiangdoctor.json', 'a', encoding='utf-8')

        # pick out the containerid
        tab_list = homepage['data']['tabsInfo']['tabs']
        containerid = ''

        for tab in tab_list:
            if tab['tabKey'] == 'weibo':
                containerid = tab['containerid']

        # with containerid added to param, url + param fetches the first packet of the
        # user's posts, from which the IDs of the following packets can be taken
        param['containerid'] = containerid

        i = 0  # number of posts crawled during this run
        pageNo = startPage
        errCount = 0

        cookie = input('Input First Cookie\n')
        BrowserHeaders['cookie'] = cookie

        # keep requesting pages of the user's posts
        while True:
            pageNo += 1
            param['page'] = str(pageNo)
            response = requests.get(url=url, params=param, headers=BrowserHeaders)
            page = response.json()
            # print(page)
            print(pageNo)

            # inner loop: one card (post) of the current page per iteration
            for card in page['data']['cards']:
                try:
                    if card['card_type'] == 9:
                        print(card['mblog']['created_at'])
                        content_dic = {
                            "微博地址:": card['scheme'],
                            "发布时间": card['mblog']['created_at'],
                            "转发数": card['mblog']['reposts_count'],
                            "评论数": card['mblog']['comments_count'],
                            "点赞数:": card['mblog']['attitudes_count'],
                            "微博内容:": get_weibo_content(card['mblog']['id']),
                            "评论": get_comment(card['mblog']['mid'], card['scheme'], BrowserHeaders['cookie'])
                        }
                        fp.write(',\n')
                        fp.write(json.dumps(content_dic, indent=4, separators=(',', ':'), ensure_ascii=False))
                        fp.flush()
                        print('crawled ' + str(i) + ' posts so far')
                        print(pageNo)
                        i += 1
                except Exception as e:
                    print(e)
                    errCount += 1
                    if errCount == limitOfErrors:  # after enough errors, ask for a fresh cookie
                        print('the failing page is ' + str(pageNo))
                        BrowserHeaders['cookie'] = input('New Cookie\n')
                        errCount = 0
                    continue
        # note: a closing ']' has to be appended by hand afterwards to make the file valid JSON


# fetch the full text of one post
def get_weibo_content(id):
    url = 'https://m.weibo.cn/statuses/extend'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    }
    param = {
        'id': id
    }
    response = requests.get(url=url, params=param, headers=headers)
    return pq(response.json()['data']['longTextContent']).text()


# fetch a post's comments; by default all of them are crawled
def get_comment(mid, scheme, cookie):
    url = 'https://m.weibo.cn/comments/hotflow'
    headers = {
        'Accept': 'application / json, text / plain, * / *',
        'MWeibo-Pwa': '1',
        'Referer': scheme,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'X-XSRF-TOKEN': 'c7e587',
        'cookie': cookie
    }
    param = {
        'id': mid,
        'mid': mid,
        'max_id_type': '0'
    }

    # fetch the first packet of comments
    response = requests.get(url=url, params=param, headers=headers)
    comment_page = response.json()

    # an empty packet, which any of several errors can cause
    if comment_page['ok'] == 0:
        return []

    comment_content = []

    for user in comment_page['data']['data']:
        comment_content.append(pq(user['text']).text())

    # keep crawling as long as data remains
    if comment_page['data']['max_id'] != 0:
        while True:
            try:
                param['max_id'] = comment_page['data']['max_id']  # cursor of the next packet
                response = requests.get(url=url, params=param, headers=headers)
                comment_page = response.json()
                if comment_page['ok'] == 0 or comment_page['data']['max_id'] == 0:  # break on an empty packet or no more content
                    break
                for user in comment_page['data']['data']:
                    comment_content.append(pq(user['text']).text())
                sleepTime = random.uniform(1, 2.13)
                time.sleep(sleepTime)
            except Exception as e:
                print(url)
                print(e)
                print('comment crawl failed; the cookie may need refreshing')
                headers['cookie'] = input('Cookie')
                # headers['TOKEN'] = input('token')
                continue
    return comment_content


if __name__ == "__main__":
    run()
--------------------------------------------------------------------------------
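As the comment at the end of `run()` notes, the crawler leaves `dingxiangdoctor.json` without its closing bracket. A one-off fix-up sketch (run it once only, or the bracket is duplicated):

```python
import json

# append the missing ']' so the dump becomes a valid JSON array
with open('dingxiangdoctor.json', 'a', encoding='utf-8') as fp:
    fp.write('\n]')

# verify that the file now parses
with open('dingxiangdoctor.json', encoding='utf-8') as fp:
    records = json.load(fp)
print(len(records), 'records')
```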

/Spyder/jstvSpyder.py:
--------------------------------------------------------------------------------
import re
from bs4 import BeautifulSoup as bs
import newsSpyder


def urlParam(page, keyword='肺炎'):
    '''Build the search-result URL for the given page number and keyword.'''
    out = 'https://so.jstv.com/?keyword={keyword}&page={page}'.format(keyword=keyword, page=page)
    return out


def getOutcomeHtmlText(htmltext):
    '''Soup out the list of HTML fragments, one per result on a search page.'''
    soup = bs(htmltext, 'html.parser', )
    eachOutcomeText = soup.find_all('div', attrs={'class': "lzxw_per_r"})  # fragment of each result; annoyingly, this class misses some info while its parent carries too much
    return eachOutcomeText


def get_content_htmlText(htmlText):
    '''Return the body part (title included) of an article page's HTML.'''
    soup = bs(htmlText, 'html.parser', )
    contentText = soup.find_all('div', attrs={'class': "article"})  # fragment holding the article
    return contentText


def write_jumpUrl2csv(htmlText, filename):
    jumpUrl = ''
    try:
        # NOTE: the tag pattern here is an approximate reconstruction; the literal was stripped from this copy
        jumpUrl = re.search('<a href="(.*?)"', htmlText).group(1).strip()  # on every page a few results fail to match
    except Exception as e:
        jumpUrl = ''

    with open(filename, 'a+', encoding='utf-8') as f:  # encoding errors may occur here
        try:
            f.write(jumpUrl + '\n')
        except:
            pass
    pass


def save_newsDetial(news_wholehtml, article_htmltext, fileName):  # article_htmltext: one souped fragment as text
    '''Save title, body, date and source to csv.'''
    title = ''
    content = ''
    publish_time = ''
    source = ''

    try:
        # NOTE: the tag patterns below are approximate reconstructions; the literals were stripped from this copy
        title = re.search('<title>(.*?)</title>', news_wholehtml[:300]).group(1)

        something_with_content = re.findall('<p(.*?)>(.*?)</p>', article_htmltext)  # a list of tuples, still holding junk such as \u3000
        content_with_word = ''
        for tup in something_with_content:
            content_with_word += tup[1]
        content = re.sub('[\u3000]|(<[^>]+>)|(&nbsp;)', '', content_with_word)

        publish_time_withClock = re.search('<span class="time">(.*?)</span>', article_htmltext).group(1)  # comes with hh:mm:ss attached
        spaceIndex = publish_time_withClock.index(' ')
        publish_time = publish_time_withClock[:spaceIndex]

        source = re.search('来源:(.*?)<', article_htmltext).group(1)
    except Exception as e:
        print('regex match failed on article body')

    with open(fileName, 'a+', encoding='utf-8') as f:
        f.write(publish_time + ',')
        f.write(title + ',')
        f.write(source + ',')
        f.write(content + '\n')


def save_outComeUrl(spage, epage, keyword, outcomeUrl_filename):
    '''Stage one: save the jump URLs of all result pages to a local file.'''
    for pageNum in range(spage, epage + 1):
        current_pageOutcome_url = urlParam(str(pageNum), keyword)  # URL of the current page for the keyword
        current_pageOutcome_text = newsSpyder.get_html_text(current_pageOutcome_url)  # HTML of the current result page
        outcomePrecise_htmlText_list = getOutcomeHtmlText(current_pageOutcome_text)

        for outcomePrecise_htmltext in outcomePrecise_htmlText_list:
            write_jumpUrl2csv(str(outcomePrecise_htmltext).replace('\n', ''), outcomeUrl_filename)
    print('all result jump URLs crawled and saved!')


@newsSpyder.print_run_time
def run():
    # tune the parameters here
    keyword = '肺炎'
    total_page = 2000  # there are usually several thousand pages; kept for reference
    spage = 1  # when deploying on several machines, change the start and end page here (inclusive)
    epage = 200
    outcomeUrl_filename = 'jstv_肺炎_搜索结果链接_{}-{}.csv'.format(spage, epage)  # file holding every result's jump URL
    newsDetail_filename = 'jstv_肺炎_{}-{}.csv'.format(spage, epage)  # file holding article bodies and other details

    save_outComeUrl(spage, epage, keyword, outcomeUrl_filename)

    count = 0
    with open(outcomeUrl_filename, 'r', encoding='utf-8') as f:
        urllines = f.readlines()
        for url in urllines:
            if url != '':
                count += 1
                url = url.replace('\n', '')
                text = newsSpyder.get_html_text(url)
                content_htmltext_list = get_content_htmlText(text)
                for content_htmltext in content_htmltext_list:
                    save_newsDetial(text, str(content_htmltext).replace('\n', ''), newsDetail_filename)
                if (count % 50) == 0:
                    print('{} article bodies crawled'.format(count))

    print('done!')


if __name__ == '__main__':

    run()
--------------------------------------------------------------------------------
/Spyder/newsSpyder-.py:
--------------------------------------------------------------------------------
import datetime
from fake_useragent import UserAgent
import requests
import re
from bs4 import BeautifulSoup as bs
import time
import random


def show_time(seconds):
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return "%d:%02d:%02d" % (h, m, s)


def get_random_header():  # random User-Agent
    ua = UserAgent()
    user_agent = ua.random
    return user_agent


def get_html_text(url):
    '''Fetch the HTML source of the given url.'''
    sleepTime = random.uniform(1, 2.33)  # polite delay; better not make it too small
    time.sleep(sleepTime)

    myheader = get_random_header()

    try:
        r = requests.request("GET", url, headers={'user-agent': myheader}, timeout=3)
        r.encoding = 'utf-8'
        # r.apparent_encoding
        return r.text
    except Exception as e:
        return ''


'''
A search-result block looks roughly like this (markup approximate; the tags of
the original sample were stripped from this copy):

<div class="box-result clearfix">
  <h2><a href="https://news.sina.com.cn/....shtml" target="_blank">国家卫健委:目前全国传染病疫情形势总体平稳</a>
  <span class="fgray_time">中国新闻网 2019-12-26 15:23:32</span></h2>
  <div class="r-info">
    <p class="content">国家卫健委:目前中国传染病疫情形势总体平稳 中新社北京12月26日电 (记者 李亚南)中国国家卫生健康委员会疾病预防控制局副局长王斌26日在北京表示...</p>
  </div>
</div>
'''
def getOutcomeHtmlText(htmltext):  # list of fragments, one per search result, shaped like the sample above
    soup = bs(htmltext, 'html.parser',)
    eachOutcomeText = soup.find_all('div', attrs={'class': "box-result clearfix"})  # the fragment of each result
    return eachOutcomeText


def save_outcome_info2csv(htmlText, filename):
    title = ''
    jumpUrl = ''
    source_and_time = ''
    source = ''
    publish_time = ''
    try:
        # NOTE: the tag patterns below are approximate reconstructions; the literals were stripped from this copy
        title = re.search('target="_blank">(.*?)</a>', htmlText).group(1).strip().replace('<font color="red">', '').replace('</font>', '')
        jumpUrl = re.search('<a href="(.*?)" target="_blank">', htmlText).group(1).strip()
        source_and_time = re.search('<span class="fgray_time">(.*?)</span>', htmlText).group(1).strip()
        spaceIndex = source_and_time.index(' ')
        source = source_and_time[:spaceIndex]
        publish_time = source_and_time[spaceIndex + 1:spaceIndex + 11]
    except Exception as e:
        print(e)

    with open(filename, 'a+') as f:  # encoding errors may occur here
        try:
            f.write(title + ',')
            f.write(jumpUrl + ',')
            f.write(source + ',')
            f.write(publish_time + '\n')
        except:
            f.write('\n')


def urlParam(stime, etime, page, keyword='%e8%82%ba%e7%82%8e', my_range='title'):  # my_range: 'all' = full text, 'title' = headline
    '''time format: 2020-01-01'''
    out = 'https://search.sina.com.cn/?q={keyword}&c=news&range={my_range}&size=20&time=2020&stime={stime}%2000:00:00&etime={etime}%2023:59:59&num=10&page={page}'.format(keyword=keyword, my_range=my_range, stime=stime, etime=etime, page=page)
    return out


def timeitr(smonth, sday, emonth, eday, year=2020):  # iterate over a date range (inclusive); returns a list of date strings
    begin = datetime.date(year, smonth, sday)
    end = datetime.date(year, emonth, eday)
    outDaylst = []
    for i in range((end - begin).days + 1):
        outday = begin + datetime.timedelta(days=i)
        outDaylst.append(str(outday))
    return outDaylst


def run():
    # tune the parameters here
    keyword = '肺炎'
    my_range = 'all'  # full text: all, headline: title
    fileName = r'test.csv'
    days = timeitr(3, 18, 3, 18, 2020)  # inclusive; a range that crosses New Year must be split in two

    for ymd in days:  # ymd: year month day
        for page in range(1):
            currentPageUrl = urlParam(ymd, ymd, str(page), keyword, my_range)
            currentPageText = get_html_text(currentPageUrl)
            outcomeTextList = getOutcomeHtmlText(currentPageText)
            for i in range(len(outcomeTextList)):
                text = str(outcomeTextList[i]).replace('\n', '')
                save_outcome_info2csv(text, fileName)
        print(ymd + ' done!')

    print('done!')


if __name__ == '__main__':

    start_time = datetime.datetime.now()  # time the whole run

    run()

    end_time = datetime.datetime.now()
    seconds = (end_time - start_time).seconds
    spendTime = show_time(seconds)
    print(spendTime)
--------------------------------------------------------------------------------

/Spyder/newsSpyder.py:
--------------------------------------------------------------------------------
import datetime
from fake_useragent import UserAgent
import requests
import re
from bs4 import BeautifulSoup as bs
import time
import random


def print_run_time(func):
    '''Decorator that prints the total running time of the wrapped function.'''
    def wrapper(*args, **kw):
        local_time = time.time()
        func(*args, **kw)
        spendTime = int(time.time() - local_time)
        print('total time {}'.format(show_time(spendTime)))
    return wrapper


def show_time(seconds):
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return "%d:%02d:%02d" % (h, m, s)


def get_random_header():  # random User-Agent
    ua = UserAgent()
    user_agent = ua.random
    return user_agent
def get_html_text(url):
    '''Fetch the HTML source of the given url.'''
    sleepTime = random.uniform(1, 2.33)  # polite delay; better not make it too small
    time.sleep(sleepTime)

    myheader = get_random_header()

    try:
        r = requests.request("GET", url, headers={'user-agent': myheader}, timeout=3)
        r.encoding = 'utf-8'
        # r.apparent_encoding
        return r.text
    except Exception as e:
        return ''


'''
A search-result block looks roughly like this (markup approximate; the tags of
the original sample were stripped from this copy):

<div class="box-result clearfix">
  <h2><a href="https://news.sina.com.cn/....shtml" target="_blank">国家卫健委:目前全国传染病疫情形势总体平稳</a>
  <span class="fgray_time">中国新闻网 2019-12-26 15:23:32</span></h2>
  <div class="r-info">
    <p class="content">国家卫健委:目前中国传染病疫情形势总体平稳 中新社北京12月26日电 (记者 李亚南)中国国家卫生健康委员会疾病预防控制局副局长王斌26日在北京表示...</p>
  </div>
</div>
'''
def getOutcomeHtmlText(htmltext):  # list of fragments, one per search result, shaped like the sample above
    soup = bs(htmltext, 'html.parser',)
    eachOutcomeText = soup.find_all('div', attrs={'class': "box-result clearfix"})  # the fragment of each result
    return eachOutcomeText


def save_outcome_info2csv(htmlText, filename):
    title = ''
    jumpUrl = ''
    source_and_time = ''
    source = ''
    publish_time = ''
    try:
        # NOTE: the tag patterns below are approximate reconstructions; the literals were stripped from this copy
        title = re.search('target="_blank">(.*?)</a>', htmlText).group(1).strip().replace('<font color="red">', '').replace('</font>', '')
        jumpUrl = re.search('<a href="(.*?)" target="_blank">', htmlText).group(1).strip()
        source_and_time = re.search('<span class="fgray_time">(.*?)</span>', htmlText).group(1).strip()
        spaceIndex = source_and_time.index(' ')
        source = source_and_time[:spaceIndex]
        publish_time = source_and_time[spaceIndex + 1:spaceIndex + 11]
    except Exception as e:
        print(e)

    with open(filename, 'a+', encoding='utf-8') as f:  # encoding errors may occur; the default seems to be gbk
        try:
            f.write(title + ',')
            f.write(jumpUrl + ',')
            f.write(source + ',')
            f.write(publish_time + '\n')
        except:
            f.write('\n')


def urlParam(stime, etime, page, keyword='%e8%82%ba%e7%82%8e', my_range='title'):  # my_range: 'all' = full text, 'title' = headline
    '''time format: 2020-01-01'''
    out = 'https://search.sina.com.cn/?q={keyword}&c=news&range={my_range}&size=20&time=2020&stime={stime}%2000:00:00&etime={etime}%2023:59:59&num=10&page={page}'.format(keyword=keyword, my_range=my_range, stime=stime, etime=etime, page=page)
    return out


def timeitr(smonth, sday, emonth, eday, year=2020):  # iterate over a date range (inclusive); returns a list of date strings
    begin = datetime.date(year, smonth, sday)
    end = datetime.date(year, emonth, eday)
    outDaylst = []
    for i in range((end - begin).days + 1):
        outday = begin + datetime.timedelta(days=i)
        outDaylst.append(str(outday))
    return outDaylst


@print_run_time
def run():
    # tune the parameters here
    keyword = '肺炎'
    my_range = 'all'  # full text: all, headline: title
    fileName = r'test.csv'
    days = timeitr(1, 18, 1, 18, 2020)  # inclusive; a range that crosses New Year must be split in two

    for ymd in days:  # ymd: year month day
        for page in range(1):
            currentPageUrl = urlParam(ymd, ymd, str(page), keyword, my_range)
            currentPageText = get_html_text(currentPageUrl)
            outcomeTextList = getOutcomeHtmlText(currentPageText)
            for i in range(len(outcomeTextList)):
                text = str(outcomeTextList[i]).replace('\n', '')
                save_outcome_info2csv(text, fileName)
        print(ymd + ' done!')

    print('done!')


if __name__ == '__main__':

    run()
--------------------------------------------------------------------------------
/Spyder/tryloadjson.py:
--------------------------------------------------------------------------------
import json

# quick sanity check that a day's info file parses as JSON
if __name__ == "__main__":
    filePath = '2019-12-09info.json'
    fp = open(filePath, 'r', encoding='utf-8')
    res = fp.read()
    jsonA = json.loads(res)
    print(jsonA)
    print(jsonA[0])
--------------------------------------------------------------------------------