├── .gitignore
├── keyword.csv
├── zh-README.md
├── README.md
├── EN-README.md
├── main.py
├── GetFunsNum.py
└── functions.py

/.gitignore:
--------------------------------------------------------------------------------
/.idea/
__pycache__

--------------------------------------------------------------------------------
/keyword.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/QingYunA/Twitter-Crawler/HEAD/keyword.csv
--------------------------------------------------------------------------------
/zh-README.md:
--------------------------------------------------------------------------------
# Twitter Crawler
This project is a crawler that automatically collects information from Twitter based on the keywords you set. With it you can obtain:
- username
- display name
- tweet text
- date
- tweet language
- number of comments
- number of retweets
- number of likes
- number of followers
- ...

## Parameters
- keywords_path: the keywords to search for
- stop_num: the number of tweets to collect
- output_dir: the path where the data is saved
- date_time: the date range to query
- limit_language: the language to search in. The default value is 'all', i.e. no language restriction.

## Usage
Set the correct chromedriver path and the related parameters in `main.py`, then run the script.

## Requirements
- python>=3.6
- Note that the chromedriver version must match your Chrome version.

## Contact
- Email: serein7z@163.com

Feel free to reach out with any questions or suggestions.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Twitter Crawler
English README [here](https://github.com/QingYunA/Twitter-Crawler/blob/master/EN-README.md)

This project is a crawler that automatically collects information from Twitter based on the keywords you set. With it you can obtain:
- username
- display name
- tweet text
- date
- tweet language
- number of comments
- number of retweets
- number of likes
- number of followers
- ...

## Changelog
- [x] A simple GUI version of the crawler is available for anyone who wants to try it~ Branch: ""

## Parameters
- keywords_path: the keywords to search for
- stop_num: the number of tweets to collect
- output_dir: the path where the data is saved
- date_time: the date range to query
- limit_language: the language to search in. The default value is 'all', i.e. no language restriction.

## Usage
Set the correct chromedriver path and the related parameters in `main.py`, then run the script.

## Requirements
- **a chromedriver that matches your Chrome version**
- python>=3.6

## Contact
- Email: serein7z@163.com

Feel free to reach out with any questions or suggestions.
--------------------------------------------------------------------------------
/EN-README.md:
--------------------------------------------------------------------------------
# Twitter-Crawler

### Read the Chinese README here: [zh-README](https://github.com/QingYunA/Twitter-Crawler/blob/master/zh-README.md)

## Introduction
This project is a crawler that automatically collects information from Twitter based on the keywords you set. With it you can get:
- username
- name
- date
- language
- number of comments
- number of likes
- number of retweets

## Parameters
- keywords_path: the keywords you want to search for
- stop_num: the number of tweets you want to collect
- output_dir: the directory where the results are saved
- date_time: the date range you want to search
- limit_language: the language you want to search in; its default value is 'all'

## Usage
- Set the chromedriver path and the parameters in `main.py` correctly, then run it (the parameter block is sketched below).
- If you want to get the follower count of a user, use `GetFunsNum.py`. You only need to supply the chromedriver path and the path of a CSV file produced by `main.py`.
- In fact, `GetFunsNum.py` works with any CSV file that contains a username column.
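For reference, the parameter block at the top of the `__main__` section in `main.py` looks roughly like this; the chromedriver path below is a placeholder, and the other values simply mirror the example defaults shipped in `main.py`:

```python
Chrome_path = r"C:\path\to\chromedriver.exe"  # placeholder: path to a chromedriver matching your Chrome version
Keyword_Path = 'keyword.csv'   # CSV with the keywords to search
Stop_num = 10                  # number of tweets to collect per keyword
kw_start_point = 0             # index of the first keyword to use
save_path = 'data'             # directory where the result CSVs are written
start_date = '2022-04-29'      # start date of the search window
end_date = '2020-01-01'        # end date of the search window
limit_language = 'all'         # 'all' means no language restriction
```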
## To do
- [x] Get the follower count from Twitter (currently done in a fairly naive way)

## Requirements
- python>=3.6
- The chromedriver version needs to match your Chrome version.

## Contact Information
- Email: serein7z@163.com

If you have any questions, please contact me.

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from functions import *
# from selenium.webdriver.chrome.options import Options  # alternative way to import Chrome options
from selenium.webdriver import ChromeOptions
from selenium import webdriver
import os


def Chrome_Config(Chrome_path):
    options = ChromeOptions()
    # hide the "Chrome is being controlled by automated test software" banner
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    # options.add_argument("--headless")  # uncomment to run Chrome in headless mode

    driver = webdriver.Chrome(Chrome_path, options=options)
    return driver


if __name__ == '__main__':

    Chrome_path = r"F:\Code\2022\chromedriver_win32\chromedriver.exe"  # Enter your chromedriver path here
    driver = Chrome_Config(Chrome_path)  # configure the Chrome driver in this function
    print('------------------------------------------------------------------------')
    # --------------------------------------------------------------------------------
    # parameters
    Keyword_Path = 'keyword.csv'  # CSV file with the keywords to search
    Stop_num = 10  # the number of items you want to crawl per keyword
    kw_start_point = 0  # index of the keyword the crawler starts from; default is 0
    save_path = 'data'  # the directory where the crawled data is saved
    start_date = '2022-04-29'  # the start date of the search
    end_date = '2020-01-01'  # the end date of the search
    limit_language = 'all'  # the language filter; 'all' means no language restriction
    # ----------------------------------------------------------------------------------
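    # Note on keyword.csv: Twitter_Crawler reads it with pandas using GB18030 encoding and
    # iterates over the column named '关键词', so the file needs that header, e.g.:
    #   关键词
    #   some keyword      (example row, replace with your own keywords)
    #   another keyword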
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    Twitter_Crawler(driver, Keyword_Path, Stop_num, kw_start_point, save_path, start_date, end_date, limit_language)
    print('------------------------------------------------------------------------')
    driver.close()
--------------------------------------------------------------------------------
/GetFunsNum.py:
--------------------------------------------------------------------------------
import os
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By


def creatBroserDriver(path):
    options = ChromeOptions()
    # hide the "Chrome is being controlled by automated test software" banner
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument("--headless")  # run Chrome in headless mode
    driver = webdriver.Chrome(path, options=options)
    return driver


# read one column from a csv file
def readDf(path, elementName):
    df = pd.read_csv(path, encoding='UTF-8')
    return df[elementName]


def searchFunsNum(driver, userName):
    # search for the user and read the follower count from the first result
    urlPath = 'https://twitter.com/search?q=%s&src=typed_query&f=user' % userName
    driver.get(urlPath)
    driver.implicitly_wait(10)
    try:
        nameElement = driver.find_element(by=By.XPATH,
                                          value="//*[contains(@class,'css-1dbjc4n r-12181gd r-1pi2tsx r-1ny4l3l r-o7ynqc r-6416eg r-13qz1uu')]")
        ActionChains(driver).move_to_element(nameElement).click().perform()
    except:
        return 'Cant find this user!'
    divs = []
    noResTimes = 0
    while not divs:
        try:
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            divs = soup.find_all('span',
                                 {'class': 'css-901oao css-16my406 r-18jsvk2 r-poiln3 r-b88u0q r-bcqeeo r-qvutc0'})
            noResTimes += 1
            if noResTimes >= 20:
                return 'Cant find this user!'
        except:
            return 'Cant find this user!'

    funsNum = divs[1].get_text()
    if ',' in funsNum:
        # e.g. '12,345' -> 12345
        return int(funsNum.replace(',', ''))
    elif '万' in funsNum:
        # e.g. '3.5万' -> 35000
        return float(funsNum.split('万')[0]) * 10000
    else:
        return funsNum


def saveToCsv(savePath, oriDfPath, insertData, columnName):
    # append the collected data as a new column and write the result to savePath
    df = pd.read_csv(oriDfPath, encoding="UTF-8")
    df.insert(loc=len(df.columns), column=columnName, value=insertData)
    df.to_csv(savePath, encoding='utf_8_sig')
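
# Note: follower counts scraped from the page come in two display formats: with thousands
# separators (e.g. '12,345') or with the Chinese unit 万 (= 10,000, e.g. '3.5万' -> 35000).
# tranFunNum below converts a previously saved 'Funs' column of such strings to plain numbers.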
def tranFunNum(csvPath, savePath):
    # normalize the 'Funs' column of a previously saved csv to plain numbers
    df = pd.read_csv(csvPath, encoding="UTF-8")
    FunsNum = df['Funs']
    procFun = []
    for i in FunsNum:
        if ',' in i:
            procFun.append(int(i.replace(',', '')))
        elif '万' in i:
            procFun.append(float(i.split('万')[0]) * 10000)
        elif i == 'Cant find this user!':
            procFun.append('Cant find this user!')
        else:
            procFun.append(int(i))
    df.drop(columns='Funs', axis="columns", inplace=True)
    df.insert(loc=len(df.columns), column='Funs', value=procFun)
    df.to_csv(savePath, encoding='utf_8_sig')


def transDir(basePath, savebasePath):
    # run tranFunNum on every csv file in basePath
    fileNames = os.listdir(basePath)
    for filePath in fileNames:
        csvPath = os.path.join(basePath, filePath)
        savePath = savebasePath + 'Trans—%s' % filePath
        print('Transferring file %s' % csvPath)
        tranFunNum(csvPath, savePath)
        print('File %s transfer done!' % csvPath)


if __name__ == '__main__':
    # -----------------------------------------------------
    driverPath = r"F:\Code\2022\chromedriver_win32\chromedriver.exe"  # Enter your chromedriver path here
    basePath = 'data/'  # directory with the csv files you want to process
    savaBasePath = '0313data/addFuns/'  # directory where the processed csv files are saved
    # -----------------------------------------------------
    if not os.path.exists(savaBasePath):
        os.makedirs(savaBasePath)
    driver = creatBroserDriver(driverPath)
    fileNames = os.listdir(basePath)
    for filePath in fileNames:
        csvPath = os.path.join(basePath, filePath)
        if not os.path.isdir(csvPath):
            print('processing file %s' % filePath)
            savePath = savaBasePath + 'Done-%s' % filePath
            funsArray = []

            userNames = readDf(csvPath, elementName='User_name')
            for index, userName in enumerate(userNames):
                funsNum = searchFunsNum(driver, userName=userName)
                funsArray.append(funsNum)
                if len(funsArray) % 500 == 0:
                    print('Processed %s items' % len(funsArray))

            saveToCsv(savePath=savePath, oriDfPath=csvPath, insertData=funsArray, columnName='Funs')
            print('Processing done! Results saved to %s' % savePath)
--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
import time
import datetime
import pandas as pd
from bs4 import BeautifulSoup


def Twitter_Crawler(driver, Keyword_Path, Stop_num, kw_start_point=0, save_path=None, start_date=None, end_date=None,
                    limit_language='all'):
    '''
    Core function.
    :param driver: Chrome driver
    :param Keyword_Path: path of the keyword file, which should be a csv
    :param Stop_num: the number of items to collect per keyword
    :param kw_start_point: index of the first keyword to use
    :param save_path: directory where the result csv files are saved
    :param start_date: start date of the search
    :param end_date: end date of the search
    :param limit_language: language filter; 'all' means no restriction
    :return:
    '''
    df = pd.read_csv(Keyword_Path, encoding='GB18030')
    page_index = 1
    search_end = False
    for index, kw in enumerate(df['关键词']):
        if index >= kw_start_point:
            Data_List = []
            History_data = []
            url = 'https://twitter.com/search?q=%s&src=typed_query&f=live' % kw
            driver.get(url)
            driver.implicitly_wait(10)
            try:
                old_scroll_height = 0  # the page starts at the very top
                js1 = 'return document.body.scrollHeight'  # JavaScript that returns the page height
                js2 = 'window.scrollTo(0, document.body.scrollHeight)'  # JavaScript that scrolls to the bottom
                # keep scrolling while the page keeps growing and we still need more items
                while driver.execute_script(js1) > old_scroll_height and len(Data_List) < Stop_num and not search_end:
                    old_scroll_height = driver.execute_script(js1)  # record the current page height
                    driver.execute_script(js2)  # scroll the browser down
                    time.sleep(3)  # give the new tweets time to load
                    html = driver.page_source
                    soup = BeautifulSoup(html, 'html.parser')
                    divs = soup.find_all('div', {'class': 'css-1dbjc4n r-1iusvr4 r-16y2uox r-1777fci r-kzbkwu'})
                    for divIndex, div in enumerate(divs):
                        data_list = []

                        try:
                            content = div.find('div', {
                                'class': 'css-901oao r-18jsvk2 r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0'})
                            if content:
                                str_content = content.get_text()
                            else:
                                content = div.find('div', {
                                    'class': 'css-901oao r-18jsvk2 r-1tl8opc r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0'})
                                str_content = content.get_text()

                            # get the display name
                            name = div.find(
                                'span', {'class': 'css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0'}).get_text()
                            # get the username
                            user_name = div.find(
                                'div', {'class': 'css-1dbjc4n r-18u37iz r-1wbh5a2 r-13hce6t'}).get_text()
                            # get the publication date
                            date = div.find('time')
                            date = date['datetime']
                            date = date.split('T')[0]
                            # check whether the tweet date is within the requested range
                            # if (date > start_date):
                            #     continue
                            # if (date < end_date):
                            #     search_end = True
                            #     print('Date earlier than %s, stopping the search!' % end_date)
                            # get the comment / retweet / like counts
                            temp = div.find_all('span', {
                                'class': 'css-901oao css-16my406 r-poiln3 r-n6v787 r-1cwl3u0 r-1k6nrdp r-1e081e0 r-qvutc0'})
                            interactionDatas = []
                            for span in temp:
                                interactionDatas.append(span.get_text())
                            try:
                                language = content.get('lang')
                            except:
                                language = 'unknown'
                            # append to the data set
                            if (language == limit_language or limit_language == 'all') and (str_content not in History_data):
                                data_list.append(name)  # display name
                                data_list.append(user_name)  # username
                                data_list.append(date)
                                data_list.append(str(str_content).strip().replace('\n', ''))  # content
                                for interactionData in interactionDatas:
                                    data_list.append(interactionData)
                                data_list.append(language)
                                History_data.append(str_content)
                            else:
                                continue
                            Data_List.append(data_list)
                        except:
                            continue
            except Exception as e:
                print(e)
            SaveToCSV(Data_List, index, df, page_index, save_path)
            page_index += 1
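
# Each row of Data_List follows the column order used by SaveToCSV below:
# Name, User_name, Date, Content, Comments, Forward, Like, Language.
# One file per keyword is written to save_path, named 'kw=<keyword>-<yymmdd-HHMMSS>.csv'.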
def SaveToCSV(Data_List, index, keyword_df, page_index, save_path):
    '''
    Save the collected data to a csv file.
    '''
    df_Sheet = pd.DataFrame(Data_List, columns=[
        'Name', 'User_name', 'Date', 'Content', 'Comments', 'Forward', 'Like', 'Language'])
    TIMEFORMAT = '%y%m%d-%H%M%S'
    now = datetime.datetime.now().strftime(TIMEFORMAT)
    kw = keyword_df['关键词'][index]
    kw = kw.split(' ')[0]
    csv_path = save_path + '/kw=%s-%s.csv' % (kw, now)
    df_Sheet.to_csv(csv_path, encoding='utf_8_sig')
    print('Finished collecting data for URL {}.'.format(page_index))
    try:
        print('Collected %s items in total' % len(Data_List))
    except:
        print('Unexpected error')
--------------------------------------------------------------------------------