├── 情感分析结果.xlsx
├── myDict
│   ├── punc_dict.txt
│   ├── degree_dict.txt
│   ├── not_dict.txt
│   └── stop_dict.txt
└── WRD_DataMining.ipynb

/情感分析结果.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CheneyWoo/Weibo-Sentiment-Calculating/master/情感分析结果.xlsx
--------------------------------------------------------------------------------
/myDict/punc_dict.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CheneyWoo/Weibo-Sentiment-Calculating/master/myDict/punc_dict.txt
--------------------------------------------------------------------------------
/myDict/degree_dict.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CheneyWoo/Weibo-Sentiment-Calculating/master/myDict/degree_dict.txt
--------------------------------------------------------------------------------
/myDict/not_dict.txt:
--------------------------------------------------------------------------------
不
不是
并不
没
没有
无
非
并非
莫
弗
勿
毋
未
尚未
否
别
無
休
难道
差
--------------------------------------------------------------------------------
/myDict/stop_dict.txt:
--------------------------------------------------------------------------------
!
!
,
"
#
$
%
&
'
(
)
*
+
,
-
--
.
..
...
......
@
./
.一
.数
.日
\
\\
/
//
0
1
2
3
4
5
6
7
8
9
:
://
::
;
<
=
>
>>
?
?
@
A
Lex
[
\
]
^
_
`
exp
sub
sup
|
}
~
~~~~
·
×
×××
Δ
Ψ
γ
μ
φ
φ.
В
—
——
———
‘
’
’‘
“
”
”,
…
……
…………………………………………………③
′∈
′|
℃
Ⅲ
↑
→
∈[
∪φ∈
≈
①
②
②c
③
③]
④
⑤
⑥
⑦
⑧
⑨
⑩
──
■
▲
　
、
。
〈
〉
《
》
》),
」
『
』
【
】
〔
〕
--------------------------------------------------------------------------------
/WRD_DataMining.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick SnowNLP check on one post. Note: content_list is built from the CSV in a\n",
    "# later cell, which must be run first (hence the out-of-order execution counts).\n",
    "from snownlp import SnowNLP\n",
    "s = SnowNLP(content_list[8])\n",
    "for sentence in s.sentences:\n",
    "    print sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s1 = SnowNLP(s.sentences[0])\n",
    "s1.sentiments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import os\n",
    "import re\n",
    "import codecs\n",
    "import sys\n",
    "reload(sys)\n",
    "sys.setdefaultencoding(\"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "#coding=utf-8\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import csv\n",
    "\n",
    "df1 = pd.read_csv('Kunshan_Case/凤凰网视频.csv')\n",
    "content_list = list(df1['转发微博内容'])[8000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace NaN (missing) posts with 0 so later cells can skip them.\n",
    "import math\n",
    "for i in range(len(content_list)):\n",
    "    if isinstance(content_list[i], float) and math.isnan(float(content_list[i])):\n",
    "        print i, content_list[i]\n",
    "        content_list[i] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the retweet chain ('//xxx:') with a Chinese full stop, keeping only the comment text.\n",
    "for i in range(140, len(content_list)):\n",
    "    content_list[i] = re.sub(r'//.*:', '。', content_list[i])\n",
    "    content_list[i] = re.sub(r'//.*:', '。', content_list[i])\n",
    "    #print i,content_list[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check of jieba word segmentation.\n",
    "import jieba\n",
    "seg_list = jieba.cut(\"你今天有点美\", cut_all = False)\n",
    "content_str = \"/ \".join(seg_list)\n",
    "print content_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Segment every post with jieba; splitting the '/ '-joined string on '/' leaves a\n",
    "# leading space on each token (stripped later in SentScore via word[1:]).\n",
    "result_list = []\n",
    "for i in range(len(content_list)):\n",
    "    if isinstance(content_list[i], str):\n",
    "        seg_list = jieba.cut(content_list[i].decode('utf-8'), cut_all = False)\n",
    "        content_str = \"/ \".join(seg_list)\n",
    "        print content_str\n",
    "        temp_list = content_str.split('/')\n",
    "        result_list.append(temp_list)"
   ]
  },
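  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Editor's note — this cell and the next one are not part of the original notebook.* The\n",
    "loading cell below parses each lexicon line as `word<TAB>weight` and trims the last two\n",
    "characters (apparently a Windows-style `\\r\\n` line ending). The actual `emotion_dict.txt`\n",
    "and `degree_dict.txt` files are not reproduced in this dump, so the sketch below only\n",
    "illustrates the assumed file format and a slightly more defensive way to load it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editor's sketch (assumption): lexicon files look like 'word<TAB>weight', e.g.\n",
    "#   高兴\\t5\n",
    "#   难过\\t-5\n",
    "# A more defensive loader than the cell below (Python 2 compatible):\n",
    "import codecs\n",
    "\n",
    "def load_weight_dict(path):\n",
    "    weights = {}\n",
    "    with codecs.open(path, 'r', encoding='utf-8') as fin:\n",
    "        for line in fin:\n",
    "            parts = line.strip().split('\\t')\n",
    "            if len(parts) == 2 and parts[0]:\n",
    "                weights[parts[0]] = float(parts[1])\n",
    "    return weights\n",
    "\n",
    "# Hypothetical usage with the same paths as the original cell:\n",
    "# emotionDict = load_weight_dict('myDict/emotion_dict.txt')\n",
    "# degreeDict  = load_weight_dict('myDict/degree_dict.txt')"
   ]
  },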
124 | "source": [ 125 | "#情感词\n", 126 | "emotionDict = defaultdict()\n", 127 | "emotionList = open('myDict/emotion_dict.txt', 'rw+')\n", 128 | "lines = emotionList.readlines()\n", 129 | "for item in lines:\n", 130 | " emotionDict[item.split('\\t')[0]] = item.split('\\t')[1][:-2]\n", 131 | "del emotionDict['']\n", 132 | "#否定词\n", 133 | "notDict = defaultdict()\n", 134 | "notList = open('myDict/not_dict.txt', 'rw+')\n", 135 | "lines = notList.readlines()\n", 136 | "for item in lines:\n", 137 | " notDict[item[:-1]] = -1\n", 138 | "#程度副词\n", 139 | "degreeDict = defaultdict()\n", 140 | "degreeList = open('myDict/degree_dict.txt', 'rw+')\n", 141 | "lines = degreeList.readlines()\n", 142 | "for item in lines:\n", 143 | " degreeDict[item.split('\\t')[0]] = item.split('\\t')[1][:-2]\n", 144 | "#结尾语气\n", 145 | "puncDict = defaultdict()\n", 146 | "puncDict['?'] = -1.5\n", 147 | "puncDict['!'] = 2\n", 148 | "puncDict['~'] = 1.2\n", 149 | "puncDict['?'] = -1.5\n", 150 | "puncDict['!'] = 2\n", 151 | "#停顿\n", 152 | "f = open('myDict/stop_dict.txt', 'rw+')\n", 153 | "stopList = f.readlines()\n", 154 | "for i in range(len(stopList)):\n", 155 | " stopList[i] = stopList[i][:-1]" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 63, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "#计算句子得分\n", 165 | "def SentScore(sentence):\n", 166 | " Score = 0\n", 167 | " emotion = 0\n", 168 | " degree = 1\n", 169 | " notword = 0\n", 170 | " punc = 0\n", 171 | " for word in sentence:\n", 172 | " word = word[1:]\n", 173 | " if word not in stopList:\n", 174 | " if word in emotionDict.keys() and word not in notDict.keys() and word not in degreeDict.keys():\n", 175 | " emotion += float(emotionDict[word.encode('utf-8')])\n", 176 | " #print \"emotion:\",word,float(emotionDict[word.encode('utf-8')])\n", 177 | " elif word in notDict.keys() and word not in degreeDict.keys():\n", 178 | " notword = -1\n", 179 | " #print \"not:\", word\n", 180 | " elif word in degreeDict.keys():\n", 181 | " degree += float(degreeDict[word.encode('utf-8')])\n", 182 | " #print \"degree:\", float(degreeDict[word.encode('utf-8')])\n", 183 | " else:\n", 184 | " continue\n", 185 | " elif word in stopList:\n", 186 | " #print \"stop\"\n", 187 | " if word in puncDict.keys():\n", 188 | " punc += float(puncDict[word.encode('utf-8')])\n", 189 | " #print \"punc:\", word, float(puncDict[word.encode('utf-8')])\n", 190 | " Score += ((-1)**notword)*degree*emotion+punc\n", 191 | " return Score" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 64, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "SentResult = []\n", 201 | "for i in range(len(result_list)):\n", 202 | " result = SentScore(result_list[i])\n", 203 | " SentResult.append(result)\n", 204 | " print i,result\n", 205 | "print \"Finished\"" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 65, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "import xlrd \n", 215 | "import xlwt\n", 216 | "\n", 217 | "f = xlwt.Workbook()\n", 218 | "sheet1 = f.add_sheet(u'sheet1',cell_overwrite_ok=True) \n", 219 | "for i in range(len(SentResult)):\n", 220 | " #SentResult[i] = MaxMinNormalization(SentResult[i], max(SentResult), min(SentResult))\n", 221 | " sheet1.write(i,0,SentResult[i])\n", 222 | "print \"Finished\"\n", 223 | "f.save('result.xls')" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 18, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def 
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rescale a score to roughly [-10, 10] relative to the extreme scores.\n",
    "def MaxMinNormalization(x, Max, Min):\n",
    "    if x > 0:\n",
    "        x = ((x) / Max) * 10\n",
    "    elif x < 0:\n",
    "        x = -(x) / (Min) * 10\n",
    "    elif x == 0:\n",
    "        x = 0\n",
    "    return x\n",
    "print SentResult[9]\n",
    "print MaxMinNormalization(SentResult[9], max(SentResult), min(SentResult))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
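
Editor's note — not part of the original repository. The notebook above targets Python 2.7
(print statements, reload(sys), str/unicode handling). For readers on Python 3, the following
is a minimal standalone sketch of the same lexicon-based scoring rule. It assumes the myDict/
files have the format implied by the loading cell (word<TAB>weight for the emotion and degree
lexicons, one word per line otherwise); the helper names load_lines, load_weights and
sent_score are the editor's own, not names from the repository.

# -*- coding: utf-8 -*-
# Editor's sketch: Python 3 re-implementation of the notebook's SentScore rule.
import jieba


def load_lines(path):
    # One entry per line; blank lines are skipped.
    with open(path, encoding='utf-8') as fin:
        return [line.rstrip('\r\n') for line in fin if line.strip()]


def load_weights(path):
    # Assumed format: 'word<TAB>weight' per line.
    weights = {}
    for line in load_lines(path):
        parts = line.split('\t')
        if len(parts) == 2:
            weights[parts[0]] = float(parts[1])
    return weights


emotion_dict = load_weights('myDict/emotion_dict.txt')
degree_dict = load_weights('myDict/degree_dict.txt')
not_words = set(load_lines('myDict/not_dict.txt'))
stop_words = set(load_lines('myDict/stop_dict.txt'))
punc_weights = {'?': -1.5, '!': 2, '~': 1.2, '？': -1.5, '！': 2}


def sent_score(text):
    """Same rule as SentScore: sign * degree * sum(emotion weights) + punctuation bonus."""
    emotion, degree, sign, punc = 0.0, 1.0, 1.0, 0.0
    for word in jieba.cut(text):
        if word in stop_words:
            # Stop words are ignored except for tone punctuation.
            punc += punc_weights.get(word, 0.0)
        elif word in degree_dict:
            degree += degree_dict[word]
        elif word in not_words:
            sign = -1.0
        elif word in emotion_dict:
            emotion += emotion_dict[word]
    return sign * degree * emotion + punc


print(sent_score('你今天有点美!'))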