├── data └── README.md ├── spider ├── README.md ├── chromedriver.exe └── main.py ├── README.md ├── model.py ├── utils.py └── train.py /data/README.md: -------------------------------------------------------------------------------- 1 | # 训练数据和训练结果保存 2 | 3 | rt 4 | 5 | -------------------------------------------------------------------------------- /spider/README.md: -------------------------------------------------------------------------------- 1 | # 使用chromedriver简单爬取抖音评论 2 | 理论上能使用,几个月前写的啦!代码很简单 傻瓜式操作 3 | -------------------------------------------------------------------------------- /spider/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fengzi1013/Sentiment-Analysis-for-Douyin/HEAD/spider/chromedriver.exe -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment-Analysis-for-Douyin 2 | > 1,使用spider中代码爬取抖音短视频中评论,点赞数,用户id等信息,供后续使用。 3 | > 4 | > 2,使用BERT等预训练模型,针对抖音评论进行简单的多分类情感分析任务。 5 | 6 | > + 情感分类数据格式 7 | > 8 | > | text | label | 9 | > | ------------------------------------------------------------ | ----- | 10 | > | 今天晚上跟着你俩运动了一会,感觉两教练很有正能量,原来你们就是最近很火的踢毽子舞教练[比心] | 1 | 11 | > | 一点都不喜欢那个台湾腔,不喜欢,没多漂亮 | -1 | 12 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # __author:Administrator 2 | # date: 2022/9/15 3 | 4 | import torch 5 | import torch.nn as nn 6 | from transformers import BertModel 7 | 8 | 9 | class EmotionClassifier(nn.Module): 10 | def __init__(self, parms): 11 | super(EmotionClassifier, self).__init__() 12 | self.parms = parms 13 | self.drop = nn.Dropout(self.parms['drop_rate']) 14 | self.bert = BertModel.from_pretrained(self.parms['pretrain_file']) # 加载预训练模型 15 | self.classifier = nn.Linear(self.bert.config.hidden_size, self.parms['n_classes']) 16 | 17 | def forward(self, input_ids, attention_mask, token_type_ids, label_ids=None): 18 | 19 | _, pooled_output = self.bert( 20 | input_ids=input_ids, 21 | attention_mask=attention_mask, 22 | token_type_ids=token_type_ids, 23 | return_dict = False 24 | ) 25 | 26 | logits = self.classifier(self.drop(pooled_output)) 27 | total_loss = 0 28 | loss_func = nn.CrossEntropyLoss() 29 | loss = loss_func(logits.view(-1, self.parms['n_classes']), label_ids.view(-1)) 30 | total_loss += loss 31 | 32 | outputs = (logits,) 33 | outputs = (total_loss,) + outputs 34 | 35 | return outputs # loss, logits -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # __author:Administrator 2 | # 一些配置文件 3 | # date: 2022/9/15 4 | 5 | import numpy as np 6 | import torch 7 | from torch.utils.data import DataLoader, SequentialSampler 8 | from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class myDataset(Dataset): 13 | def __init__(self, text_len, texts, labels, tokenizer): 14 | self.text_len = text_len 15 | self.texts = texts 16 | self.labels = labels 17 | self.tokenizer = tokenizer 18 | 19 | def __len__(self): 20 | return len(self.texts) 21 | 22 | def __getitem__(self, item): 23 | text = str(self.texts[item]) 24 | label = self.labels[item] 25 | encoding = self.tokenizer.encode_plus( 26 | text, 27 | add_special_tokens=True, 28 | max_length=self.text_len, 29 | padding='max_length', 30 | truncation=True, 31 | return_token_type_ids=True, 32 | return_attention_mask=True, 33 | return_tensors='pt', 34 | ) 35 | 36 | samples = { 37 | 'text':text, 38 | 'input_ids':encoding['input_ids'].flatten(), 39 | 'attention_mask':encoding['attention_mask'].flatten(), 40 | 'token_type_ids':encoding['token_type_ids'].flatten(), 41 | 'label':torch.tensor(label, dtype=torch.long) 42 | } 43 | 44 | return samples 45 | 46 | 47 | def compute_metrics(true_labels, pred_labels): 48 | assert len(pred_labels) == len(true_labels) 49 | results = {} 50 | acc = {'acc': accuracy_score(true_labels, pred_labels)} 51 | p = {'precision': precision_score(true_labels, pred_labels, average='macro')} 52 | r = {'recall': recall_score(true_labels, pred_labels, average='macro')} 53 | f1 = {'f1': f1_score(true_labels, pred_labels,average='macro')} 54 | rp = {'report': classification_report(true_labels, pred_labels, target_names=['负面', '中性', '正面'])} 55 | results.update(acc) 56 | results.update(p) 57 | results.update(r) 58 | results.update(f1) 59 | results.update(rp) 60 | 61 | return results 62 | 63 | def evaluate(model, dataset, device, parms): 64 | '''验证''' 65 | eval_sampler = SequentialSampler(dataset) 66 | eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=parms['batch_size']) 67 | 68 | eval_loss = 0.0 69 | eval_step = 0 70 | label_pred = None 71 | label_true = None 72 | 73 | model.eval() 74 | for batch in eval_dataloader: 75 | input_ids = batch['input_ids'].to(device) 76 | attention_mask = batch['attention_mask'].to(device) 77 | token_type_ids = batch['token_type_ids'].to(device) 78 | label = batch['label'].to(device) 79 | 80 | with torch.no_grad(): 81 | outputs = model(input_ids, attention_mask, token_type_ids, label) 82 | tmp_loss, label_logits = outputs 83 | eval_loss += tmp_loss.mean().item() 84 | 85 | eval_step += 1 86 | # 87 | if label_pred is None: 88 | label_pred = label_logits.detach().cpu().numpy() 89 | label_true = label.detach().cpu().numpy() 90 | else: 91 | label_pred = np.append(label_pred, label_logits.detach().cpu().numpy(), axis=0) 92 | label_true = np.append(label_true, label.detach().cpu().numpy(), axis=0) 93 | 94 | eval_loss = eval_loss / eval_step 95 | results = { 96 | 'val_loss': eval_loss 97 | } 98 | label_pred = np.argmax(label_pred, axis=1) 99 | total_result = compute_metrics(label_true, label_pred) # 验证 100 | results.update(total_result) 101 | 102 | for key in sorted(results.keys()): 103 | print(key, str(results[key])) 104 | 105 | return results -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | # __author:Administrator 2 | # 抖音文本情感分类, 使用Bert+softmax 3 | # date: 2022/9/15 4 | 5 | import os 6 | import time 7 | import pandas as pd 8 | import numpy as np 9 | from tqdm import tqdm 10 | import random 11 | from sklearn.model_selection import train_test_split 12 | import torch 13 | from utils import evaluate, myDataset 14 | from model import EmotionClassifier 15 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 16 | from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup 17 | 18 | import warnings 19 | warnings.filterwarnings('ignore') 20 | 21 | seed = 1234 22 | random.seed(seed) 23 | np.random.seed(seed) 24 | torch.manual_seed(seed) 25 | if torch.cuda.is_available(): 26 | torch.cuda.manual_seed_all(seed) 27 | 28 | # 参数设置 29 | parms = { 30 | 'learn_rate': 5e-5, # 学习率 31 | 'batch_size': 32, # 批处理大小· 32 | 'max_length': 32, # 文本长度 33 | 'epochs': 10, # 训练轮次 34 | 'n_classes': 3, # 类别数量 35 | 'logger_steps': 50, # 验证步数 36 | 'drop_rate': 0.3, # 丢失率 37 | 'pretrain_file': "./bert-base-chinese", # 预训练模型 38 | 'data_file': './data', # 数据读取和保存文件 39 | 'raw_data': './DY.csv', # 原始数据文件 40 | 'save_file': './save' # 保存文件 41 | } 42 | 43 | if not os.path.exists(parms['data_file'] + parms['save_file']): 44 | os.makedirs(parms['data_file'] + parms['save_file']) 45 | 46 | start_time = time.time() 47 | # 数据loading train: val: test: = 8: 1: 1 48 | texts = pd.read_csv(parms["data_file"] + parms['raw_data']) 49 | texts, labels = texts['text'].tolist(), (texts['label'] + 1).tolist() 50 | x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=1234, stratify=labels) 51 | x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=1234, stratify=y_test) 52 | 53 | # tokenizer 54 | tokenizer = BertTokenizer.from_pretrained(parms['pretrain_file']) 55 | train_dataset = myDataset(parms['max_length'], x_train, y_train, tokenizer) 56 | val_dataset = myDataset(parms['max_length'], x_val, y_val, tokenizer) 57 | test_dataset = myDataset(parms['max_length'], x_test, y_test, tokenizer) 58 | 59 | # 加载模型 60 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 61 | model = EmotionClassifier(parms) 62 | model.to(device) 63 | # 差分学习率 64 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 65 | bert_param_optimizer = list(model.bert.named_parameters()) 66 | 67 | optimizer_grouped_parameters = [ 68 | {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)], 69 | 'weight_decay': 0.1}, 70 | {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}, 71 | ] 72 | 73 | optimizer = AdamW(optimizer_grouped_parameters, lr=parms['learn_rate'], eps=1e-8) 74 | 75 | # 训练 76 | train_sampler = RandomSampler(train_dataset) 77 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=parms['batch_size']) 78 | t_total = len(train_dataloader) * parms['epochs'] 79 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*t_total, num_training_steps=t_total) 80 | global_step = 0 81 | tr_loss = 0.0 82 | best_acc = 0.0 83 | epoch = 0 84 | while epoch < int(parms['epochs']): 85 | losses = [] 86 | epoch_iterator = tqdm(train_dataloader, desc='Iteration') 87 | for step, batch in enumerate(epoch_iterator): 88 | model.train() 89 | input_ids = batch['input_ids'].to(device) 90 | attention_mask = batch['attention_mask'].to(device) 91 | token_type_ids = batch['token_type_ids'].to(device) 92 | label = batch['label'].to(device) 93 | outputs = model(input_ids, attention_mask, token_type_ids, label) 94 | loss = outputs[0] 95 | losses.append(loss.item()) 96 | loss.backward() 97 | 98 | optimizer.step() 99 | scheduler.step() 100 | optimizer.zero_grad() 101 | global_step += 1 102 | 103 | if global_step % parms['logger_steps'] == 0: 104 | 105 | print("global step %d, epoch: %d, loss: %.5f, lr: %.10f" 106 | % (global_step, epoch, np.mean(losses), float(scheduler.get_last_lr()[0]))) 107 | 108 | result = evaluate(model, val_dataset, device, parms) 109 | # saving best model 110 | if result['acc'] > best_acc: 111 | best_acc = result['acc'] 112 | torch.save(model.state_dict(), parms['data_file'] + parms['save_file'] + '/model.pt') 113 | 114 | epoch += 1 115 | 116 | print("************train_best_acc: %.5f**********" % best_acc) 117 | 118 | # 测试 119 | print("************测试结果**********") 120 | model.load_state_dict(torch.load(parms['data_file'] + parms['save_file']+ '/model.pt')) 121 | evaluate(model, test_dataset, device, parms) 122 | print("***********消耗时间:%.5f***********" % (time.time() -start_time )) 123 | print("***********结束***********") 124 | -------------------------------------------------------------------------------- /spider/main.py: -------------------------------------------------------------------------------- 1 | # __author:Administrator 2 | # date: 2022/5/19 3 | 4 | # 打开页面后,尽量直接拉到评论下面!!!! 5 | 6 | 7 | 8 | import random 9 | import pandas as pd 10 | import time 11 | from selenium import webdriver 12 | from tqdm import tqdm, trange 13 | from selenium.webdriver.common.by import By 14 | from selenium.webdriver import ChromeOptions 15 | 16 | # 实现规避检测 17 | option = ChromeOptions() 18 | option.add_experimental_option('excludeSwitches',['enable-automation']) 19 | 20 | 21 | def getData(url): 22 | 23 | # chromedriver.exe,下载,这个看自己安装的Google的版本,下载解压后放到当前代码路径下。下载地址 http://chromedriver.storage.googleapis.com/index.html 24 | driver = webdriver.Chrome(options=option) 25 | driver.get(url) 26 | time.sleep(20) # 手动点下弹窗关闭登录,或者自己扫码登录!!! 27 | 28 | userNames = [] # 用户名 29 | userId = [] # 用户抖音号 30 | userAddress = [] # 用户属地 31 | userFollow = [] # 用户关注 32 | userFan = [] # 用户粉丝 33 | userLiked = [] # 用户获赞数 34 | timeList = [] # 发表时间 35 | comments = [] # 评论文本 36 | likeNums = [] # 该条评论的点赞数 37 | 38 | 39 | for i in trange(1, 1000): # 自行设定爬取条数,不建议太多!!! 40 | 41 | try: 42 | # 去掉中途出现的登录页面 43 | driver.find_element(by=By.XPATH, 44 | value='//*[@id="login-pannel"]/div[2]').click() 45 | except: 46 | try: 47 | t = random.uniform(1.5, 2) # 随机浮点数t 48 | sw = random.randint(150, 180) # 滑动像素点 49 | time.sleep(t) # 睡眠t时间 50 | 51 | # 用户名 52 | userName = driver.find_element(by=By.XPATH, 53 | value= f"//*[@id='root']//div[{i}]/div/div[2]/div[1]/div[2]/div[1]/div/a/span/span/span/span/span").text 54 | 55 | # 发表时间 56 | time_ = driver.find_element(by= By.XPATH, 57 | value= f"//*[@id='root']//div[{i}]/div/div[2]/div[1]/div[2]/div[1]/p").text 58 | 59 | # 评论 60 | comment = driver.find_element(by= By.XPATH, 61 | value= f"//*[@id='root']//div[{i}]/div/div[2]/div[1]/p/span/span/span/span/span/span").text 62 | 63 | # 该条评论的点赞数 64 | likeNum = driver.find_element(by=By.XPATH, 65 | value= f"//*[@id='root']//div[{i}]/div/div[2]/div[1]/div[3]/div/p/span").text 66 | 67 | # 跳转到用户主页 68 | # 可能获取不到用户信息,该用户也许注销了 https://www.douyin.com/user/MS4wLjABAAAAOwQt3GN0ydFoV7cEc_bzjS-wT7CWuxOTW7wTcDKS3_c 69 | 70 | driver.find_element(by=By.XPATH, 71 | value= f"//*[@id='root']//div[{i}]/div/div[2]/div[1]/div[2]/div[1]/div/a/span/span/span/span/span").click() 72 | 73 | 74 | # time.sleep(t) 75 | driver.switch_to.window(driver.window_handles[1]) 76 | time.sleep(t) 77 | 78 | # 抖音号 79 | try: 80 | id = driver.find_element(by=By.XPATH, 81 | value="//*[@id='root']/div/div[2]/div/div/div[2]/div[1]/p[1]/span[1]").text.split(':')[1].strip() 82 | except: 83 | id = "" 84 | # 关注 85 | try: 86 | follow = driver.find_element(by=By.XPATH, 87 | value="//*[@id='root']/div/div[2]/div/div/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]").text 88 | except: 89 | follow = "" 90 | # 粉丝 91 | try: 92 | fan = driver.find_element(by=By.XPATH, 93 | value="//*[@id='root']/div/div[2]/div/div/div[2]/div[1]/div[1]/div[2]/div[2]/div[2]").text 94 | except: 95 | fan = "" 96 | # 获赞 97 | try: 98 | liked = driver.find_element(by=By.XPATH, 99 | value="//*[@id='root']/div/div[2]/div/div/div[2]/div[1]/div[1]/div[2]/div[3]/div[2]").text 100 | except: 101 | liked = "" 102 | 103 | # ip属地 104 | try: 105 | ip_address = driver.find_element(by=By.XPATH, 106 | value="//*[@id='root']/div/div[2]/div/div/div[2]/div[1]/p[1]/span[2]").text.split(':')[1].strip() 107 | except: 108 | ip_address = "" 109 | 110 | time.sleep(0.2) 111 | driver.close() 112 | driver.switch_to.window(driver.window_handles[0]) 113 | driver.execute_script(f"window.scrollBy(0, {sw})") 114 | 115 | userId.append(id) 116 | userNames.append(userName) 117 | userAddress.append(ip_address) 118 | userFan.append(fan) 119 | userFollow.append(follow) 120 | userLiked.append(liked) 121 | timeList.append(time_) 122 | comments.append(comment) 123 | likeNums.append(likeNum) 124 | print(f"第{i}条下载完成!!!") 125 | 126 | except: 127 | continue 128 | 129 | return userId, userNames, userAddress, userFan, userFollow, userLiked, timeList, comments, likeNums 130 | 131 | 132 | 133 | if __name__ == "__main__": 134 | 135 | id = "7045926793802501416" # 这串数字是视频ID 136 | url = f"https://www.douyin.com/video/{id}" 137 | userId, userNames, userAddress, userFan, userFollow, userLiked, timeList, comments, likeNums = getData(url) 138 | data = pd.DataFrame({"userId":userId, "userName":userNames, "userAddress": userAddress, "userFan": userFan, 139 | "userFollow": userFollow, "userLiked": userLiked,"date": timeList, "comments": comments, "likeNuns": likeNums}) 140 | data.to_csv(f"./result_ID{id}.csv") # save path 141 | print("**********done***********") --------------------------------------------------------------------------------