├── crawler
    ├── __init__.py
    ├── api.py
    ├── decorator.py
    ├── database.py
    ├── basicinfo_crawler.py
    ├── web_crawler.py
    ├── relation_crawler.py
    └── tweets_crawler.py
├── portrayal
    ├── __init__.py
    ├── tools
    │   ├── __init__.py
    │   ├── preprocess.py
    │   ├── function.py
    │   └── generate_xml.py
    ├── influence
    │   ├── __init__.py
    │   └── calculate_influence.py
    ├── career_classify
    │   ├── __init__.py
    │   ├── classify.py
    │   ├── training.py
    │   └── preprocess.py
    ├── interest_extract
    │   ├── __init__.py
    │   ├── tag_cloud.py
    │   └── interest_extract.py
    ├── sentiment_classify
    │   ├── __init__.py
    │   ├── classify.py
    │   ├── process_dict.py
    │   ├── sentiment_classify.py
    │   ├── sentiment_dict.py
    │   └── training.py
    ├── user_profile.py
    └── resource
    │   └── stop_words.txt
├── README.md
├── testing.py
├── .gitattributes
├── .gitignore
├── neo4j.py
├── crawling.py
└── typical.py

/crawler/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/influence/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/career_classify/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/interest_extract/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## user-portrait
2 | 
3 | ### 用户画像 (User Portrait)
--------------------------------------------------------------------------------
/portrayal/sentiment_classify/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/testing.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from crawling import get_user_all_info
3 | from portrayal.user_profile import user_profile
4 | 
5 | 
6 | def main():
7 |     # pass
8 |     user = get_user_all_info(screen_name = 'David_Cameron')
9 |     user = user_profile(user)
10 |     del user['tweets']
11 |     print user
12 | 
13 | 
14 | 
15 | if __name__ == '__main__':
16 |     main()
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | 
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
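Note on configuration: the crawler package imports APP_INFO, MYSQL, MONGO_DB, NEO4J and THREAD_NUM from a config module, and the portrayal package imports PROJECT_PATH from its own config, but config.py is listed in .gitignore and is therefore absent from this listing. The sketch below is only an illustrative template reconstructed from those imports: the key names come from the code in this dump, while every value (credentials, hosts, ports, paths) is a placeholder, not the author's actual settings.

# config.py -- illustrative template only (the real file is gitignored)
# crawler/config.py: Twitter app credentials and database settings
APP_INFO = [
    {
        'consumer_key': 'YOUR_CONSUMER_KEY',          # placeholder
        'consumer_secret': 'YOUR_CONSUMER_SECRET',    # placeholder
        'access_token_key': 'YOUR_ACCESS_TOKEN',      # placeholder
        'access_token_secret': 'YOUR_ACCESS_SECRET'   # placeholder
    },
    # append more app credentials here; the Api class rotates through them
]

THREAD_NUM = 10  # assumed thread count used by the crawler classes

MYSQL = {
    'DB_HOST': 'localhost', 'DB_USER': 'root',
    'DB_PASSWORD': '******', 'DB_DATABASE': 'twitter'
}

MONGO_DB = {
    'DB_HOST': '127.0.0.1', 'DB_PORT': 27017,
    'DB_USER': 'user', 'DB_PASSWORD': '******',
    'DB_DATABASE': 'twitter'
}

NEO4J = {
    'DB_HOST': 'http://localhost:7474',
    'DB_USER': 'neo4j', 'DB_PASSWORD': '******'
}

# portrayal/config.py: absolute project root with a trailing slash, since the
# portrayal modules build paths as PROJECT_PATH + "portrayal/resource/..."
PROJECT_PATH = '/path/to/user-portrait/'

With such a config in place, testing.py above is the intended entry point: it crawls one account via get_user_all_info and then builds its profile with user_profile.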
-------------------------------------------------------------------------------- /crawler/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from twitter import Api as TwitterAPI 3 | from config import APP_INFO 4 | 5 | API_LIST = [] 6 | API_COUNT = len(APP_INFO) 7 | 8 | for i in range(API_COUNT): 9 | API_LIST.append(TwitterAPI(consumer_key = APP_INFO[i]['consumer_key'], 10 | consumer_secret = APP_INFO[i]['consumer_secret'], 11 | access_token_key = APP_INFO[i]['access_token_key'], 12 | access_token_secret = APP_INFO[i]['access_token_secret'], 13 | cache = None)) 14 | 15 | class Api: 16 | def __init__(self): 17 | self.api_index = 0 18 | 19 | ''' 20 | 获取 twitter app,每次调用返回一个新的 app 21 | ''' 22 | def get_api(self): 23 | api_index = self.api_index 24 | api_index = (api_index + 1) % API_COUNT 25 | self.api_index = api_index 26 | 27 | return API_LIST[api_index] -------------------------------------------------------------------------------- /portrayal/interest_extract/tag_cloud.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from pytagcloud import create_tag_image, make_tags 5 | from pytagcloud.lang.counter import get_tag_counts 6 | 7 | from .. config import PROJECT_PATH 8 | 9 | file_path = PROJECT_PATH + "portrayal/resource/tag_cloud/" 10 | 11 | 12 | def generate_tag_cloud(text, user_id): 13 | word_count = [] 14 | word_list = text.split(',') 15 | length = len(word_list) * 2 16 | 17 | for word in word_list: 18 | word_count.append((word, length / 10)) 19 | length -= 1 20 | 21 | tags = make_tags(word_count, maxsize = 48) 22 | 23 | for item in tags: 24 | item['tag'] = re.sub(r'label(\w+)label',r'#\1', item['tag']) 25 | 26 | file_name = file_path + '%d.png' % user_id 27 | create_tag_image(tags, file_name, size = (999, 688), fontname = 'Lobster', background=(0, 0, 0, 0)) 28 | 29 | return file_name -------------------------------------------------------------------------------- /crawler/decorator.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import time 4 | 5 | from twitter import error 6 | from api import API_COUNT 7 | 8 | ''' 9 | 生成装饰器 10 | ''' 11 | def generate_decorator(sleep_time = 700): 12 | 13 | # 处理Twitter异常装饰器 14 | def handle_exception(func): 15 | def wrapper(*args, **kw): 16 | sleep_count = 0 17 | 18 | while True: 19 | try: 20 | return func(*args, **kw) 21 | except error.TwitterError as te: 22 | try: 23 | if te.message[0]['code'] == 88: 24 | sleep_count += 1 25 | 26 | if sleep_count >= API_COUNT: 27 | print "sleeping..." 
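# Twitter error code 88 means "rate limit exceeded"; once it has been raised
# API_COUNT times in a row (i.e. every registered app appears to be exhausted),
# the counter is reset and the wrapper sleeps for sleep_time seconds before
# retrying the wrapped call.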
28 | sleep_count = 0 29 | time.sleep(sleep_time) 30 | continue 31 | 32 | else: 33 | print te 34 | return None 35 | except Exception as ee: 36 | print ee 37 | return None 38 | 39 | except Exception as e: 40 | print e 41 | return None 42 | 43 | return wrapper 44 | 45 | return handle_exception -------------------------------------------------------------------------------- /crawler/database.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import MySQLdb 4 | 5 | from py2neo import Graph 6 | from pymongo import MongoClient 7 | from config import MYSQL, MONGO_DB, NEO4J 8 | 9 | 10 | class Mysql: 11 | def connect(self): 12 | db = MySQLdb.connect(MYSQL['DB_HOST'], MYSQL['DB_USER'], MYSQL['DB_PASSWORD'], MYSQL['DB_DATABASE']) 13 | cursor = db.cursor() 14 | self.cursor = cursor 15 | self.db = db 16 | return db 17 | 18 | def execute(self, sql): 19 | self.cursor.execute(sql) 20 | self.db.commit() 21 | 22 | def fetchall(self, sql): 23 | self.cursor.execute(sql) 24 | res = self.cursor.fetchall() 25 | 26 | return res 27 | 28 | def close(self): 29 | self.db.close() 30 | 31 | 32 | class MongoDB: 33 | def connect(self, db_name = MONGO_DB['DB_DATABASE']): 34 | client = MongoClient(MONGO_DB['DB_HOST'], MONGO_DB['DB_PORT']) 35 | db = client[db_name] 36 | db.authenticate(MONGO_DB['DB_USER'], MONGO_DB['DB_PASSWORD']) 37 | self.db = db 38 | 39 | return db 40 | 41 | 42 | class Neo4j: 43 | def connect(self): 44 | graph = Graph(NEO4J['DB_HOST'], 45 | username = NEO4J['DB_USER'], 46 | password = NEO4J['DB_PASSWORD']) 47 | 48 | return graph -------------------------------------------------------------------------------- /portrayal/user_profile.py: -------------------------------------------------------------------------------- 1 | from sentiment_classify import sentiment_classify 2 | from career_classify.classify import exe_classify 3 | from interest_extract.interest_extract import extract_tags 4 | from influence.calculate_influence import calculate_influence, calc_activity_sequence 5 | 6 | 7 | def user_profile(user): 8 | tweets = user['tweets'] 9 | 10 | if not tweets or len(tweets) == 0: 11 | return user 12 | 13 | final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 = sentiment_classify.exe_sentiment_classify(tweets) 14 | user['psy'] = final_sentiment 15 | user['psy_with_time1'] = psy_with_time1 16 | user['psy_with_time2'] = psy_with_time2 17 | user['psy_with_count1'] = psy_with_count1 18 | user['psy_with_count2'] = psy_with_count2 19 | 20 | text = '' 21 | for tweet in tweets: 22 | text += tweet['text'] 23 | 24 | category, categories_score = exe_classify(text) 25 | user['category'] = category 26 | user['category_score'] = categories_score 27 | 28 | user['interest_tags'] = extract_tags(tweets, user['description']) 29 | 30 | influence_score, activity = calculate_influence(user['followers_count'], tweets) 31 | user['influence_score'] = influence_score 32 | user['activity'] = activity 33 | 34 | user['activity_list'] = calc_activity_sequence(tweets) 35 | 36 | return user -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 
19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | /portrayal/career_classify/data 49 | /portrayal/career_classify/data_category 50 | /portrayal/career_classify/data_processed 51 | /portrayal/career_classify/data_pruned 52 | /portrayal/career_classify/pickle 53 | /portrayal/career_classify/pickle_category 54 | /portrayal/career_classify/statistics 55 | /portrayal/interest_extract/data 56 | /portrayal/interest_extract/pickle 57 | /portrayal/sentiment_classify/data 58 | /portrayal/sentiment_classify/data1 59 | /portrayal/sentiment_classify/pickle 60 | /portrayal/resource/tag_cloud 61 | /portrayal/resource/users_xml 62 | *.pyc 63 | config.py -------------------------------------------------------------------------------- /portrayal/sentiment_classify/classify.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-09 14:29:43 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-09 14:29:43 7 | ''' 8 | import os 9 | import pickle 10 | import training 11 | from statistics import mode 12 | 13 | from .. tools.preprocess import preprocess_del_stopwords 14 | from .. config import PROJECT_PATH 15 | 16 | 17 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 18 | pickle_path = module_path + "pickle/" 19 | 20 | 21 | class VotingClassifier: 22 | classifier_list = [] 23 | words_feature = None 24 | 25 | def init(self): 26 | self.load_classifier() 27 | 28 | 29 | def load_classifier(self): 30 | classifier_names = [ 31 | 'naivebayes', 32 | 'mnb_classifier', 33 | 'bnb_classifier', 34 | 'lr_classifier', 35 | 'lsv_classifier', 36 | 'sgd_classifier' 37 | ] 38 | 39 | # 加载之前保存的训练好的分类器模型 40 | for name in classifier_names: 41 | if not os.path.exists(pickle_path + name + ".pickle"): 42 | training.training() 43 | 44 | classifier_file = open(pickle_path + name + ".pickle", "rb") 45 | classifier = pickle.load(classifier_file) 46 | classifier_file.close() 47 | 48 | self.classifier_list.append(classifier) 49 | 50 | 51 | def classify(self, tts): 52 | text = '' 53 | for item in tts: 54 | text += item['text'] + ' ' 55 | 56 | if len(self.classifier_list) == 0: 57 | self.load_classifier() 58 | 59 | feature = self.word2features(text) 60 | 61 | if not feature: 62 | return None 63 | 64 | votes = [] 65 | for classifier in self.classifier_list: 66 | vote = classifier.classify(feature) 67 | votes.append(vote) 68 | 69 | try: 70 | res= mode(votes) 71 | except Exception as e: 72 | print e 73 | return 0 74 | 75 | return res 76 | 77 | def word2features(self, document): 78 | if not self.words_feature: 79 | feature_file = open(pickle_path + "words_feature.pickle") 80 | self.words_feature = pickle.load(feature_file) 81 | feature_file.close() 82 | 83 | word_list = set(preprocess_del_stopwords(document)) 84 | 85 | if not word_list: 86 | return None 87 | 88 | feature = {} 89 | for w in self.words_feature: 90 | feature[w] = w in word_list 91 | 92 | return feature 93 | 94 | 95 | voting_classifier = 
VotingClassifier() 96 | 97 | 98 | def classify(tweets): 99 | return voting_classifier.classify(tweets) -------------------------------------------------------------------------------- /portrayal/tools/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import re 3 | import sys 4 | import nltk 5 | 6 | from nltk.tokenize import word_tokenize 7 | 8 | from function import get_stop_words, get_slang_set 9 | 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | stop_words = get_stop_words() 14 | slang_set = get_slang_set() 15 | 16 | def data_cleaning(text): 17 | # clear @/#/链接/RT 18 | # 去除表达较口语化的语言时,经常使用重复的字符 19 | text = text.lower() 20 | text = re.sub(r"(\w)\1{2,}", r"\1\1", text) 21 | text = re.sub(r"(..)\1{2,}", r"\1\1", text) 22 | text = re.sub(r'(rt)?\s?@\w+:?|#|(ht|f)tp[^\s]+', " ", text) 23 | text = text.replace('wanna', 'want to').replace('gonna', 'will').replace('gotta', 'must').replace('have to', 'haveto').replace('hungrryy', 'hungry') 24 | 25 | return text.strip() 26 | 27 | 28 | def preprocess(text, return_type = "string"): 29 | text = text.lower() 30 | text = re.sub(r'rt @\w+:|@\w+|#|(ht|f)tp[^\s]+', " ", text) 31 | 32 | try: 33 | words = word_tokenize(text) 34 | except Exception as e: 35 | print e 36 | return None 37 | 38 | word_list = [w for w in words if w not in stop_words and w.isalpha()] 39 | 40 | return word_list if return_type == 'list' else ' '.join(word_list) 41 | 42 | 43 | def preprocess_del_stopwords(text): 44 | try: 45 | words = word_tokenize(text) 46 | except Exception as e: 47 | print e 48 | return None 49 | 50 | word_list = [w for w in words if w not in stop_words and w.isalpha()] 51 | 52 | return word_list 53 | 54 | 55 | def preprocess_postag(text): 56 | text = text.lower() 57 | text = re.sub(r"(\w)\1{2,}", r"\1\1", text) 58 | text = re.sub(r"(..)\1{2,}", r"\1\1", text) 59 | text = re.sub(r'(rt)?\s?@\w+:?|#|(ht|f)tp[^\s]+', " ", text) 60 | 61 | try: 62 | words = word_tokenize(text) 63 | word_tags = nltk.pos_tag(words) 64 | except Exception as e: 65 | print e 66 | return None 67 | 68 | res = [] 69 | for item in word_tags: 70 | if item[0] not in stop_words and item[0].isalpha(): 71 | res.append(item) 72 | 73 | return res 74 | 75 | 76 | def preprocess_postag_label(text): 77 | text = text.lower() 78 | text = re.sub(r"(\w)\1{2,}", r"\1\1", text) 79 | text = re.sub(r"(..)\1{2,}", r"\1\1", text) 80 | text = re.sub(r'#(\w+)', "label\g<1>label ", text) 81 | text = re.sub(r'(rt)?\s?@\w+:?|#|hahah\w*|(ht|f)tp[^\s]+', " ", text) 82 | text = text.replace('new york', "newyork") 83 | 84 | try: 85 | words = word_tokenize(text) 86 | word_tags = nltk.pos_tag(words) 87 | except Exception as e: 88 | print e 89 | return None 90 | 91 | res = [] 92 | for item in word_tags: 93 | if item[0] not in stop_words and item[0].isalpha() and item[0] not in slang_set: 94 | word = re.sub(r'label(\w+)label', r'#\1' , item[0]) 95 | res.append((word, item[1])) 96 | 97 | return res -------------------------------------------------------------------------------- /neo4j.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from py2neo import Graph, Node, Relationship 3 | from crawler.database import Neo4j, MongoDB 4 | 5 | 6 | ''' 7 | 创建neo4j节点及关系 8 | ''' 9 | def create_relation(): 10 | graph = Neo4j().connect() 11 | mongo = MongoDB().connect() 12 | 13 | # 清空neo4j数据库 14 | # graph = graph.delete_all() 15 | 16 | tus = mongo['typical'].find({}, {'_id': 1}) 17 | 18 | for item 
in tus: 19 | # 创建用户节点 20 | user = Node("Typical", user_id = item['_id']) 21 | graph.create(user) 22 | 23 | tus = mongo['relation'].find({}, {'_id': 1}) 24 | user_list = map(lambda item: item['_id'], tus) 25 | 26 | # 创建用户节点之间的关系 27 | for user_id in user_list: 28 | friends = mongo['relation'].find_one({'_id': user_id}) 29 | friends = set(friends['friends']) 30 | 31 | node1 = graph.find_one("Typical", 32 | property_key = "user_id", 33 | property_value = user_id) 34 | 35 | for user_id1 in user_list: 36 | if user_id1 == user_id: 37 | continue 38 | 39 | if user_id1 in friends: 40 | node2 = graph.find_one("Typical", 41 | property_key = "user_id", 42 | property_value = user_id1) 43 | 44 | following = Relationship(node1, 'following', node2) 45 | graph.create(following) 46 | 47 | 48 | ''' 49 | 更新neo4j节点属性 50 | ''' 51 | def update_attr(): 52 | graph = Neo4j().connect() 53 | mongo = MongoDB().connect() 54 | 55 | tus = mongo['typical'].find({}, {'name': 1, 'category': 1, 'followers_count': 1, 'location': 1, 'utc_offset': 1, 56 | 'statuses_count': 1, 'description': 1, 'friends_count': 1, 'psy': 1, 'verified': 1, 'lang': 1, 'favourites_count': 1, 57 | 'screen_name': 1, 'influence_score': 1, 'created_at': 1, 'time_zone': 1, 'protected': 1, 'activity': 1}) 58 | 59 | for item in tus: 60 | node = graph.find_one("Typical", 61 | property_key = "user_id", 62 | property_value = item['_id']) 63 | node['name'] = item['name'] 64 | node['category'] = item['category'] 65 | node['followers_count'] = item['followers_count'] 66 | node['location'] = item['location'] 67 | node['utc_offset'] = item['utc_offset'] 68 | node['statuses_count'] = item['statuses_count'] 69 | node['description'] = item['description'] 70 | node['friends_count'] = item['friends_count'] 71 | node['psy'] = item['psy'] 72 | node['verified'] = item['verified'] 73 | node['lang'] = item['lang'] 74 | node['favourites_count'] = item['favourites_count'] 75 | node['screen_name'] = item['screen_name'] 76 | node['influence_score'] = item['influence_score'] 77 | node['created_at'] = item['created_at'] 78 | node['time_zone'] = item['time_zone'] 79 | node['protected'] = item['protected'] 80 | node['activity'] = item['activity'] 81 | 82 | graph.push(node) 83 | 84 | 85 | if __name__ == '__main__': 86 | # create_relation() 87 | update_attr() -------------------------------------------------------------------------------- /portrayal/sentiment_classify/process_dict.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from .. 
config import PROJECT_PATH 3 | 4 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 5 | pickle_path = module_path + "pickle/" 6 | 7 | sentiment_dict = None 8 | 9 | def load_sentiment_dict(): 10 | if not sentiment_dict: 11 | if not os.path.exists(pickle_path + "sentiment_dict.pickle"): 12 | sentiment_dict = generate_sentiment_dict() 13 | 14 | 15 | def generate_new_dict(): 16 | new_sentiment_dict = {} 17 | 18 | if not sentiment_dict: 19 | load_sentiment_dict() 20 | 21 | for item in sentiment_dict: 22 | sl = item.split("#") 23 | if(sl[1] == 'a' or sl[1] == 'v' or sl[1] == 'r') and sl[0].isalpha() and (sl[0] not in sentiment_dict): 24 | score = sentiment_dict[item] * 30 / 5 25 | 26 | if int(score) > 5 or int(score) < -5: 27 | print sl 28 | print score 29 | else: 30 | if abs(score) >= 1: 31 | new_sentiment_dict[sl[0]] = int(score) 32 | 33 | elif abs(score) > 0.66: 34 | if score < 0: 35 | new_sentiment_dict[sl[0]] = -1 36 | if score > 0: 37 | new_sentiment_dict[sl[0]] = 1 38 | 39 | senti_file = open(module_path + "data/sentiment_temp.txt", 'w') 40 | for item in new_sentiment_dict: 41 | senti_file.write(item + "\t" + str(new_sentiment_dict[item]) + "\n") 42 | 43 | return 44 | 45 | 46 | def attr_change(word_tuple, score): 47 | word = word_tuple[0] 48 | 49 | if (word_tuple[1] == 'r' or word_tuple[1] == 'v') and (word + '#a' in sentiment_dict): 50 | score += sentiment_dict[word + '#a'] * rate 51 | print score 52 | elif (word_tuple[1] == 'n') and (word + '#v' in sentiment_dict): 53 | score += sentiment_dict[word + '#v'] * rate 54 | print score 55 | 56 | 57 | def generate_sentiment_dict(): 58 | sentiment_dict = {} 59 | file = open(module_path + 'data/sentiment_words.txt') 60 | 61 | data = [] 62 | while 1: 63 | lines = file.readlines(100000) 64 | if not lines: 65 | break 66 | 67 | for line in lines: 68 | if line.strip().startswith("#"): 69 | continue 70 | else: 71 | data = line.split("\t") 72 | if len(data) != 6: 73 | print line 74 | print 'invalid data' 75 | continue 76 | 77 | word_type = data[0] 78 | synset_score = float(data[2]) - float(data[3]) 79 | syn_terms_list = data[4].split(" ") 80 | 81 | for w in syn_terms_list: 82 | term_and_num = w.split("#") 83 | 84 | syn_term = term_and_num[0] + "#" + word_type 85 | term_num = int(term_and_num[1]) 86 | 87 | if sentiment_dict.has_key(syn_term): 88 | sentiment_dict[syn_term].append((term_num, synset_score)) 89 | 90 | else: 91 | sentiment_dict[syn_term] = [] 92 | sentiment_dict[syn_term].append((term_num, synset_score)) 93 | 94 | res = {} 95 | for key in sentiment_dict: 96 | score_sum = 0 97 | count = 0 98 | for word_tuple in sentiment_dict[key]: 99 | score_sum += word_tuple[1] * word_tuple[0] 100 | count += word_tuple[0] 101 | 102 | if score_sum / count != 0: 103 | res[key] = score_sum / count 104 | 105 | file = open(pickle_path + "sentiment_dict.pickle", 'w') 106 | pickle.dump(res, file) 107 | file.close() 108 | 109 | return res -------------------------------------------------------------------------------- /portrayal/tools/function.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import datetime 4 | 5 | from .. 
config import PROJECT_PATH 6 | 7 | slang = None 8 | stop_words = None 9 | slang_set = None 10 | 11 | ''' 12 | 读取停用词 13 | ''' 14 | def get_stop_words(file_path = PROJECT_PATH + "portrayal/resource/stop_words.txt"): 15 | global stop_words 16 | 17 | if not stop_words: 18 | stop_words = set() 19 | else: 20 | return stop_words 21 | 22 | file = open(file_path, "r") 23 | for line in file: 24 | stop_words.add(line[0 : -1]) 25 | 26 | file.close() 27 | 28 | return stop_words 29 | 30 | 31 | ''' 32 | 读取俚语 33 | ''' 34 | def get_slang(file_path = PROJECT_PATH + "portrayal/resource/slang.txt"): 35 | global slang 36 | 37 | if not slang: 38 | slang = {} 39 | else: 40 | return slang 41 | 42 | file = open(file_path, "r") 43 | for line in file: 44 | l_l = line.split(":") 45 | slang[l_l[0].strip()] = l_l[1].strip() 46 | 47 | file.close() 48 | 49 | return slang 50 | 51 | 52 | ''' 53 | 读取俚语 54 | ''' 55 | def get_slang_set(file_path = PROJECT_PATH + "portrayal/resource/slang.txt"): 56 | global slang_set 57 | 58 | if not slang_set: 59 | slang_set = set() 60 | else: 61 | return slang_set 62 | 63 | file = open(file_path, "r") 64 | for line in file: 65 | l_l = line.split(":") 66 | slang_set.add(l_l[0].strip()) 67 | 68 | file.close() 69 | 70 | return slang_set 71 | 72 | 73 | ''' 74 | 计算两个时间相差的天数 75 | ''' 76 | def calc_time_differ(t1, t2): 77 | t1 = time.strptime(t1, "%Y-%m-%d %H:%M:%S") 78 | t2 = time.strptime(t2, "%Y-%m-%d %H:%M:%S") 79 | t1 = datetime.datetime(t1[0], t1[1], t1[2], t1[3], t1[4], t1[5]) 80 | t2 = datetime.datetime(t2[0], t2[1], t2[2], t2[3], t2[4], t2[5]) 81 | 82 | return abs((t2 - t1).days) 83 | 84 | 85 | ''' 86 | 将推文分割为按月为单位的推文列表 87 | 返回: 88 | 二维推文列表 89 | ''' 90 | def split_tweets_same_time(tweets = [], period = 1): 91 | threshold = period * 30 92 | 93 | if len(tweets) == 0: 94 | return 95 | 96 | start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweets[0]['created_at'].replace('+0000 ',''))) 97 | start_time_temp = start_time 98 | 99 | tts = [] 100 | tweets_list = [] 101 | 102 | for tweet in tweets: 103 | time_temp = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'].replace('+0000 ',''))) 104 | 105 | if calc_time_differ(time_temp, start_time_temp) <= threshold: 106 | tts.append(tweet) 107 | else: 108 | start_time_temp = time_temp 109 | tweets_list.append(tts) 110 | tts = [] 111 | tts.append(tweet) 112 | 113 | if len(tts) != 0: 114 | tweets_list.append(tts) 115 | 116 | return tweets_list if len(tweets_list[-1]) > 20 else tweets_list[0 : -1] 117 | 118 | 119 | def split_tweets_same_count(tweets = [], count = 66): 120 | count = count if count <= 100 else 100 121 | count = count if count >= 40 else 40 122 | 123 | if len(tweets) < 1200: 124 | count = 40 125 | 126 | tts = [] 127 | tweets_list = [] 128 | 129 | i = 0 130 | for tweet in tweets: 131 | i += 1 132 | tts.append(tweet) 133 | 134 | if i > count: 135 | tweets_list.append(tts) 136 | tts = [] 137 | i = 0 138 | 139 | if len(tts) > 20: 140 | tweets_list.append(tts) 141 | 142 | return tweets_list -------------------------------------------------------------------------------- /portrayal/sentiment_classify/sentiment_classify.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import nltk 4 | 5 | from statistics import mode 6 | from classify import classify 7 | from nltk.tokenize import word_tokenize 8 | from sentiment_dict import calc_sentiment_score 9 | 10 | from .. config import PROJECT_PATH 11 | from .. tools.preprocess import data_cleaning 12 | from .. 
tools.function import split_tweets_same_time, split_tweets_same_count 13 | 14 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 15 | 16 | def replace_emotion(tweets): 17 | tweets_temp = [] 18 | emotion = { 19 | ":33": ". happy", 20 | "^_^": ". happy", 21 | ":-)": ". happy", 22 | ":)))": ". happy happy", 23 | ":)": ". happy", 24 | "(:": ". happy", 25 | "(-:": ". happy", 26 | "<3": ". happy", 27 | ":*": ". happy", 28 | ":-D": ". happy", 29 | ":D": ". happy", 30 | "X-D": ". happy happy", 31 | "XD": ". happy happy", 32 | "xD": ". happy happy", 33 | ";-)": ". happy", 34 | ";)": ". happy", 35 | ";-D": ". happy", 36 | ";D": ". happy", 37 | "(;": ". happy", 38 | "(-;": ". happy", 39 | ":-(": ". unhappy", 40 | ":((": ". sad", 41 | ":(": ". unhappy", 42 | "(:": ". unhappy", 43 | "(-:": ". unhappy", 44 | ":,(": ". sad", 45 | ":'(": ". sad", 46 | ":”(": ". sad" 47 | } 48 | 49 | for tweet in tweets: 50 | text = tweet['text'] 51 | 52 | for item in emotion: 53 | text = text.replace(item, emotion[item]) 54 | 55 | text = data_cleaning(text) 56 | 57 | tweets_temp.append({ 58 | 'text': text, 59 | 'created_at': tweet['created_at'] 60 | }) 61 | 62 | return tweets_temp 63 | 64 | 65 | def sentiment_with_time(tweets, time_span = 1): 66 | tweets_list = split_tweets_same_time(tweets, time_span) 67 | 68 | sequence1 = [] 69 | sequence2 = [] 70 | for tts in tweets_list: 71 | res = classify(tts) 72 | 73 | if res == 'pos': 74 | sequence1.append(1) 75 | elif res == 'neg': 76 | sequence1.append(-1) 77 | else: 78 | sequence1.append(0) 79 | 80 | score = calc_sentiment_score(tts) 81 | 82 | if not score: 83 | sequence2.append(0) 84 | else: 85 | sequence2.append(score) 86 | 87 | return sequence1, sequence2 88 | 89 | 90 | def sentiment_with_count(tweets, count = 66): 91 | tweets_list = split_tweets_same_count(tweets, count) 92 | 93 | sequence1 = [] 94 | sequence2 = [] 95 | for tts in tweets_list: 96 | res = classify(tts) 97 | 98 | if res == 'pos': 99 | sequence1.append(1) 100 | elif res == 'neg': 101 | sequence1.append(-1) 102 | else: 103 | sequence1.append(0) 104 | 105 | score = calc_sentiment_score(tts) 106 | 107 | if not score: 108 | sequence2.append(0) 109 | else: 110 | sequence2.append(score) 111 | 112 | return sequence1, sequence2 113 | 114 | 115 | def exe_sentiment_classify(tweets): 116 | if not tweets or len(tweets) == 0: 117 | return None 118 | 119 | tweets = replace_emotion(tweets) 120 | 121 | psy_with_time1, psy_with_time2 = sentiment_with_time(tweets) 122 | psy_with_count1, psy_with_count2 = sentiment_with_count(tweets) 123 | 124 | count = 0 125 | for item in psy_with_time2: 126 | if item > 0: 127 | count += 1 128 | elif item < 0: 129 | count -= 1 130 | 131 | if count > 0: 132 | final_sentiment = 1 133 | 134 | elif count < 0: 135 | final_sentiment = -1 136 | 137 | else: 138 | final_sentiment = 0 139 | 140 | return final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 -------------------------------------------------------------------------------- /portrayal/career_classify/classify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-05 16:18:19 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-05 16:18:19 7 | ''' 8 | import math 9 | import pickle 10 | 11 | from .. 
config import PROJECT_PATH 12 | 13 | module_path = PROJECT_PATH + "portrayal/career_classify/" 14 | 15 | 16 | def classify(text = '', pickle_path = module_path + "pickle/"): 17 | categories_path = pickle_path + "categories.pickle" 18 | count_vector_path = pickle_path + "count_vector.pickle" 19 | tf_idf_transformer_path = pickle_path + "tf_idf_transformer.pickle" 20 | 21 | if text == "": 22 | return None 23 | 24 | text = [text] 25 | 26 | count_vector_file = open(count_vector_path) 27 | count_vector = pickle.load(count_vector_file) 28 | count_vector_file.close() 29 | 30 | count_feature_matrix = count_vector.transform(text) 31 | 32 | tf_idf_transformer_file = open(tf_idf_transformer_path) 33 | tf_idf_transformer = pickle.load(tf_idf_transformer_file) 34 | tf_idf_transformer_file.close() 35 | 36 | tf_idf_feature_matrix = tf_idf_transformer.transform(count_feature_matrix) 37 | 38 | # 分类器 39 | classifier_path = pickle_path + 'multi_classifier.pickle' 40 | # classifier_path = pickle_path + 'bagging_classifier.pickle' 41 | 42 | # 分类 43 | multi_classifier_file = open(classifier_path) 44 | multi_classifier = pickle.load(multi_classifier_file) 45 | multi_classifier_file.close() 46 | 47 | categories_file = open(categories_path) 48 | target_names = pickle.load(categories_file) 49 | categories_file.close() 50 | 51 | category = target_names[multi_classifier.predict(tf_idf_feature_matrix.toarray())[0]] 52 | # return category 53 | 54 | score_list = multi_classifier._joint_log_likelihood(tf_idf_feature_matrix.toarray())[0] 55 | 56 | min_value = min(score_list) 57 | min_value = math.floor(min_value) 58 | 59 | categories_score = {} 60 | for i in range(len(score_list)): 61 | categories_score[target_names[i]] = round((score_list[i] - min_value) * 25, 2) 62 | 63 | return category, categories_score 64 | 65 | 66 | def classify_special_category(category_list, text = '', pickle_path = module_path + "pickle_category/"): 67 | category_list.sort() 68 | 69 | dir_name = '' 70 | for item in category_list: 71 | dir_name += '_' + item 72 | 73 | dir_name = dir_name[1 : ] 74 | 75 | return classify(text, pickle_path + dir_name + '/') 76 | 77 | 78 | def exe_classify(text = None): 79 | if not text: 80 | return None 81 | 82 | related_category_dict = { 83 | 'Politics': ["Education"], 84 | # 'Technology': ["Economy"] 85 | } 86 | 87 | category, categories_score = classify(text) 88 | 89 | if related_category_dict.has_key(category): 90 | for related_category in related_category_dict[category]: 91 | if (categories_score[category] < categories_score[related_category] * 2) and (categories_score[category] - categories_score[related_category]) < 36: 92 | category_temp, categories_score_temp = classify_special_category([category, related_category], text) 93 | 94 | if category_temp != category: 95 | # if categories_score_temp[category_temp] < categories_score_temp[category] * 1.2: 96 | # continue 97 | 98 | categories_score[category_temp] = categories_score[category] + (categories_score_temp[category_temp] - categories_score_temp[category]) * 5 99 | 100 | return category_temp, categories_score 101 | 102 | return category, categories_score -------------------------------------------------------------------------------- /portrayal/career_classify/training.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-05 14:07:32 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-05 14:07:32 7 | */ 8 | ''' 9 | import os 10 | import 
pickle 11 | 12 | from sklearn import datasets 13 | from sklearn.feature_extraction.text import CountVectorizer 14 | from sklearn.feature_extraction.text import TfidfTransformer 15 | 16 | from sklearn.naive_bayes import MultinomialNB 17 | from sklearn.ensemble import BaggingClassifier 18 | from .. config import PROJECT_PATH 19 | 20 | module_path = PROJECT_PATH + "portrayal/career_classify/" 21 | 22 | ''' 23 | BCC: business/entertainment/politics/sport/technology 24 | CNN: agriculture/economy/education/entertainment/military/politics/religion/sports/technology 25 | 26 | data: BCC新闻数据集 + 维基词条文章 + 部分CNN文本 27 | ''' 28 | def training(dataset_path = module_path + "data_processed", pickle_path = module_path + "pickle/"): 29 | print "读入训练数据..." 30 | training_dataset = datasets.load_files(dataset_path) 31 | 32 | # 类别标签 33 | categories = training_dataset.target_names 34 | print categories 35 | categories_path = pickle_path + "categories.pickle" 36 | categories_file = open(categories_path, "wb") 37 | pickle.dump(categories, categories_file) 38 | categories_file.close() 39 | 40 | # 词频统计 41 | count_vector = CountVectorizer(stop_words = "english", decode_error = "ignore") 42 | count_feature_matrix = count_vector.fit_transform(training_dataset.data) 43 | 44 | count_vector_path = pickle_path + "count_vector.pickle" 45 | count_vector_file = open(count_vector_path, "wb") 46 | pickle.dump(count_vector, count_vector_file) 47 | count_vector_file.close() 48 | 49 | # 计算词频-逆文档频率 50 | tf_idf_transformer = TfidfTransformer().fit(count_feature_matrix) 51 | 52 | # 持久化 tf_idf_transformer 53 | tf_idf_transformer_path = pickle_path + "tf_idf_transformer.pickle" 54 | tf_idf_transformer_file = open(tf_idf_transformer_path, "wb") 55 | pickle.dump(tf_idf_transformer, tf_idf_transformer_file) 56 | tf_idf_transformer_file.close() 57 | print "词频-逆文档频率已保存" 58 | 59 | tf_idf_feature_matrix = tf_idf_transformer.transform(count_feature_matrix) 60 | 61 | # bagging 62 | # bagging = BaggingClassifier(base_estimator=MultinomialNB(),max_features=0.5,n_estimators=60,n_jobs=-1) 63 | # bagging_classifier = bagging.fit(tf_idf_feature_matrix, training_dataset.target) 64 | # print "Bagging classifier has been trained" 65 | # bagging_classifier_path = pickle_path + "bagging_classifier.pickle" 66 | # bagging_classifier_file = open(bagging_classifier_path,'wb') 67 | # pickle.dump(bagging_classifier,bagging_classifier_file) 68 | # bagging_classifier_file.close() 69 | # print "bagging classifier has been saved" 70 | 71 | 72 | # 多项式贝叶斯分类器分类 73 | multi_classifier = MultinomialNB().fit(tf_idf_feature_matrix, training_dataset.target) 74 | print "多项式贝叶斯分类器训练完成" 75 | 76 | multi_classifier_path = pickle_path + "multi_classifier.pickle" 77 | multi_classifier_file = open(multi_classifier_path, "wb") 78 | pickle.dump(multi_classifier, multi_classifier_file) 79 | multi_classifier_file.close() 80 | print "多项式分类器已保存" 81 | 82 | 83 | def training_special_category(category_list, dataset_path = module_path + "data_category/", pickle_path = module_path + "pickle_category/"): 84 | category_list.sort() 85 | 86 | dir_name = '' 87 | for item in category_list: 88 | dir_name += '_' + item 89 | 90 | dir_name = dir_name[1 : ] 91 | 92 | pickle_category_path = pickle_path + dir_name + "/" 93 | 94 | if not os.path.exists(pickle_category_path): 95 | os.makedirs(pickle_category_path) 96 | 97 | training(dataset_path + dir_name, pickle_category_path) -------------------------------------------------------------------------------- /crawling.py: 
-------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import time 3 | 4 | from pymongo import MongoClient 5 | from crawler.basicinfo_crawler import BasicinfoCrawler 6 | from crawler.tweets_crawler import TweetsCrawler 7 | from crawler.relation_crawler import RelationCrawler 8 | 9 | tweets_crawler = TweetsCrawler() 10 | relation_crawler = RelationCrawler() 11 | basicinfo_crawler = BasicinfoCrawler() 12 | 13 | 14 | ''' 15 | 获取用户基础信息和推文信息,以字典形式返回 16 | ''' 17 | def get_user_all_info(user_id = None, screen_name = None): 18 | if not user_id and not screen_name: 19 | return None 20 | 21 | try: 22 | user = basicinfo_crawler.get_user(user_id = user_id, screen_name = screen_name) 23 | except Exception as e: 24 | print e 25 | return None 26 | 27 | if not user: 28 | return None 29 | 30 | tweets = [] 31 | if not user.protected: 32 | tweets = tweets_crawler.get_user_all_timeline_return(user_id = user_id, screen_name = screen_name) 33 | 34 | if not tweets: 35 | return None 36 | 37 | return { 38 | 'user_id': long(user.id), 39 | 'screen_name': user.screen_name, 40 | 'name': user.name, 41 | 'verified': user.verified, 42 | 'friends_count': user.friends_count, 43 | 'description': user.description, 44 | 'crawler_date': time.strftime('%Y-%m-%d',time.localtime(time.time())), 45 | 'followers_count': user.followers_count, 46 | 'location': user.location, 47 | 'statuses_count': user.statuses_count, 48 | 'favourites_count': user.favourites_count, 49 | 'lang': user.lang, 50 | 'utc_offset': user.utc_offset, 51 | 'protected': user.protected, 52 | 'profile_background_color': user.profile_background_color, 53 | 'default_profile_image': user.default_profile_image, 54 | 'created_at': user.created_at, 55 | 'time_zone': user.time_zone, 56 | 'profile_image_url': user.profile_image_url, 57 | 'listed_count': user.listed_count, 58 | 'geo_enabled': user.geo_enabled, 59 | 'profile_sidebar_fill_color': user.profile_sidebar_fill_color, 60 | 'profile_banner_url': user.profile_banner_url, 61 | 'tweets': tweets 62 | } 63 | 64 | 65 | # 同步典型人物的关系信息 66 | def get_friends(): 67 | client = MongoClient('127.0.0.1', 27017) 68 | db = client['twitter'] 69 | 70 | collect1 = db['typical'] 71 | collect2 = db['relation'] 72 | 73 | tus = collect1.find({}, {'_id': 1}) 74 | 75 | user_list = [] 76 | for item in tus: 77 | user_list.append(item['_id']) 78 | 79 | relation_list = [] 80 | tur = collect2.find({}, {'_id': 1}) 81 | 82 | for item in tur: 83 | if item['_id'] not in user_list: 84 | collect2.delete_one({'_id': item['_id']}) 85 | 86 | else: 87 | relation_list.append(item['_id']) 88 | 89 | for user_id in user_list: 90 | 91 | if user_id in relation_list: 92 | continue 93 | 94 | cursor = -1 95 | friends = [] 96 | 97 | while cursor != 0: 98 | out = relation_crawler.get_friendids_paged_sleep(user_id = user_id, 99 | cursor = cursor, 100 | count = 5000) 101 | if not out: 102 | break 103 | 104 | friends = friends + out[2] 105 | cursor = out[0] 106 | 107 | collect2.insert_one({ 108 | '_id': user_id, 109 | 'friends': friends 110 | }) 111 | 112 | 113 | # 获取所有用户的朋友信息 114 | def get_all_users_friends(user_list): 115 | client = MongoClient('127.0.0.1', 27017) 116 | db = client['twitter'] 117 | 118 | collect = db['1020_friends'] 119 | 120 | for user_id in user_list: 121 | 122 | cursor = -1 123 | friends = [] 124 | 125 | while cursor != 0: 126 | out = relation_crawler.get_friendids_paged_sleep(user_id = user_id, 127 | cursor = cursor, 128 | count = 5000) 129 | if not out: 130 | break 131 | 132 | friends = friends + 
out[2] 133 | cursor = out[0] 134 | 135 | collect.insert_one({ 136 | '_id': user_id, 137 | 'friends': friends 138 | }) -------------------------------------------------------------------------------- /portrayal/influence/calculate_influence.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-08-29 15:48:56 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-08-29 15:48:56 7 | ''' 8 | import re 9 | import math 10 | import time 11 | 12 | from .. tools.function import split_tweets_same_time, calc_time_differ 13 | 14 | 15 | ''' 16 | 参数计算 17 | ''' 18 | def calc_parameters(tweets): 19 | origin_count = rt_count = 0 # 原创推文和转发推文 20 | origin_retweet_count = origin_retweet_average = origin_retweet_max = 0 # 原创推文转发 总数、平均值、最大值 21 | origin_favorite_count = origin_favorite_average = origin_favorite_max = 0 # 原创推文点赞 总数、平均值、最大值 22 | 23 | if len(tweets) == 0: 24 | return 25 | 26 | tweet_start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweets[0]['created_at'].replace('+0000 ',''))) 27 | tweet_end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweets[-1]['created_at'].replace('+0000 ',''))) 28 | 29 | for tweet in tweets: 30 | # 转推 31 | if re.match(r"^RT @[\w|\d|_]+", tweet["text"]) != None: 32 | rt_count += 1 33 | 34 | # 非转推 35 | else: 36 | retweet_count = tweet["retweet_count"] 37 | favorite_count = tweet["favorite_count"] 38 | 39 | origin_count += 1 40 | origin_retweet_count += retweet_count 41 | origin_favorite_count += favorite_count 42 | 43 | if retweet_count > origin_retweet_max: 44 | origin_retweet_max = retweet_count 45 | 46 | if favorite_count > origin_favorite_max: 47 | origin_favorite_max = favorite_count 48 | 49 | origin_retweet_average = origin_retweet_count * 1.0 / origin_count if origin_count else 0 50 | origin_favorite_average = origin_favorite_count * 1.0 / origin_count if origin_count else 0 51 | 52 | return tweet_start_time, tweet_end_time, origin_count, rt_count, origin_retweet_count, \ 53 | origin_retweet_average, origin_retweet_max, origin_favorite_count, origin_favorite_average, origin_favorite_max 54 | 55 | 56 | ''' 57 | 参数计算:只返回原创推文数和转发推文数 58 | ''' 59 | def calc_parameters_4sequence(tweets): 60 | origin_count = rt_count = 0 # 原创推文和转发推文 61 | 62 | if len(tweets) == 0: 63 | return 64 | 65 | for tweet in tweets: 66 | if re.match(r"^RT @\w+", tweet["text"]) != None: 67 | rt_count += 1 68 | 69 | else: 70 | origin_count += 1 71 | 72 | return origin_count, rt_count 73 | 74 | 75 | ''' 76 | 计算活跃度 77 | ''' 78 | def calc_activity(origin_count, rt_count, time_span): 79 | time_span = time_span if time_span else 1 80 | rate = 1000.0 / time_span 81 | total = 0.65 * math.log(origin_count * rate + 1) + 0.35 * math.log(rt_count * rate + 1) 82 | 83 | return total 84 | 85 | 86 | ''' 87 | 活跃度序列计算 88 | 参数: 89 | period:时间跨度,默认为 1,表示每一个月计算一次活跃度 90 | ''' 91 | def calc_activity_sequence(tweets, period = 1): 92 | tweets_list = split_tweets_same_time(tweets, 1) 93 | 94 | res = [] 95 | for tts in tweets_list: 96 | origin_count, rt_count = calc_parameters_4sequence(tts) 97 | 98 | activity = calc_activity(origin_count, rt_count, period * 30) 99 | res.append(activity) 100 | 101 | return res 102 | 103 | 104 | ''' 105 | 计算推文影响力 106 | ''' 107 | def calc_tweet_influence(origin_retweet_count, origin_retweet_average, origin_retweet_max, \ 108 | origin_favorite_count, origin_favorite_average, origin_favorite_max): 109 | retweet_rate = 0.45 * math.log(origin_retweet_count + 1) + 0.35 * 
math.log(origin_retweet_average + 1) + 0.2 * math.log(origin_retweet_max + 1) 110 | favorite_rate = 0.45 * math.log(origin_favorite_count + 1) + 0.35 * math.log(origin_favorite_average + 1) + 0.2 * math.log(origin_favorite_max + 1) 111 | 112 | return 0.6 * retweet_rate + 0.4 * favorite_rate 113 | 114 | 115 | ''' 116 | 计算粉丝影响力 117 | ''' 118 | def calc_follower_influence(followers_count): 119 | return math.log(followers_count + 1) 120 | 121 | 122 | ''' 123 | 影响力计算 124 | ''' 125 | def calculate_influence(followers_count, tweets): 126 | tweet_start_time, tweet_end_time, origin_count, rt_count, origin_retweet_count, origin_retweet_average, \ 127 | origin_retweet_max, origin_favorite_count, origin_favorite_average, origin_favorite_max = calc_parameters(tweets) 128 | 129 | time_span = calc_time_differ(tweet_start_time, tweet_end_time) 130 | 131 | activity = calc_activity(origin_count, rt_count, time_span) 132 | tweet_influence = calc_tweet_influence(origin_retweet_count, origin_retweet_average, origin_retweet_max, \ 133 | origin_favorite_count, origin_favorite_average, origin_favorite_max) 134 | 135 | follower_influence = calc_follower_influence(followers_count) 136 | 137 | return (0.5 * tweet_influence + 0.2 * activity + 0.3 * follower_influence) * 10, activity -------------------------------------------------------------------------------- /portrayal/sentiment_classify/sentiment_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-08-30 14:16:44 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-08-30 14:16:44 7 | ''' 8 | import re 9 | import os 10 | import sys 11 | import math 12 | import nltk 13 | import pickle 14 | 15 | from classify import classify 16 | from nltk.tokenize import word_tokenize 17 | 18 | from .. config import PROJECT_PATH 19 | from .. tools.preprocess import preprocess_postag 20 | from .. tools.function import get_stop_words 21 | from .. 
tools.function import get_slang 22 | 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | 26 | slang = get_slang() 27 | stop_words = get_stop_words() 28 | 29 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 30 | pickle_path = module_path + "pickle/" 31 | 32 | 33 | class SentimentDict: 34 | sentiment_dict = None 35 | 36 | def preprocess(self, tweets): 37 | res = [] 38 | but_words = set(["but", "however"]) 39 | hope_words = set(["hope", "wish"]) 40 | deny_words = set(['not', "n't", 'no', 'never', 'none', 'hardly', 'isnt', 'doesnt']) 41 | degree_words = set(["fairly", "pretty", "quite", "very", "much", "too", "greatly", "highly", "really", "extremely", "so"]) 42 | filter_set = set(['affected', 'allow', 'allows', 'backed', 'backing', 'backs', 'best', 'better', 'big', 'certain', 'clear', 'clearly', 'good', 'greetings', 'ha', 'haa', 'hah', 'haha', 'hahaa', 'help', 'hid', 'hopefully', 'ignored', 'importance', 'important', 'kind', 'like', 'liked', 'lmao', 'matter', 'miss', 'novel', 'please', 'sorry', 'substantially', 'thk', 'thx', 'thank', 'thanks', 'thanx', 'thaanks', 'true', 'unfortunately', 'useful', 'usefully', 'usefulness', 'want', 'welcome', 'woohoo', 'yeah', 'yeahh', 'yes']) 43 | 44 | for tweet in tweets: 45 | text = tweet['text'].lower() 46 | 47 | if text == '': 48 | continue 49 | 50 | try: 51 | words = word_tokenize(text) 52 | 53 | for i in range(len(words)): 54 | if words[i] in slang: 55 | words[i] = slang[words[i]] 56 | 57 | word_tags = nltk.pos_tag(words) 58 | 59 | except Exception as e: 60 | print e 61 | continue 62 | 63 | deny = False 64 | degree = False 65 | but = False 66 | hope = False 67 | 68 | length = len(word_tags) 69 | 70 | for i in range(length): 71 | item = word_tags[i] 72 | word = item[0] 73 | 74 | if word in deny_words: 75 | deny = True 76 | degree = False 77 | continue 78 | elif word in but_words: 79 | but = True 80 | j = i - 1 81 | flag = True 82 | 83 | while j >= 0 and (flag or word_tags[j][0].isalpha()): 84 | if not word_tags[j][0].isalpha() or i - j > 2: 85 | flag = False 86 | 87 | w_t = word_tags[j][0] 88 | t_t = word_tags[j][1][0] 89 | if w_t not in stop_words: 90 | flag = False 91 | if t_t == 'J' or t_t == 'V' or t_t == 'R' or t_t == 'N': 92 | res.append("FOT_" + w_t) 93 | 94 | j -= 1 95 | continue 96 | elif word in degree_words: 97 | degree = True 98 | continue 99 | elif word in hope_words: 100 | hope = True 101 | continue 102 | 103 | if not word.isalpha() and not (item[1][0] == 'J' or item[1][0] == 'V' or item[1][0] == 'R' or item[1][0] == 'N'): 104 | deny = False 105 | degree = False 106 | hope = False 107 | 108 | if i == 0 or word_tags[i - 1] not in but_words: 109 | but = False 110 | 111 | elif word not in stop_words or word in filter_set: 112 | prefix = "" 113 | if deny: 114 | prefix += "NOT_" 115 | if hope: 116 | prefix += "HOP_" 117 | if degree and item[1][0] == 'J': 118 | prefix += "TWO_" 119 | 120 | if not word.isalpha(): 121 | temp_list = word.split(" ") 122 | for item_temp in temp_list: 123 | if item_temp.isalpha() and (item_temp not in stop_words or item_temp in filter_set): 124 | res.append(prefix + item_temp) 125 | 126 | elif item[1][0] == 'J' or item[1][0] == 'V' or item[1][0] == 'R' or item[1][0] == 'N': 127 | res.append(prefix + word) 128 | 129 | return res 130 | 131 | 132 | def calc_sentiment_score(self, tweets): 133 | if not self.sentiment_dict: 134 | self.sentiment_dict = {} 135 | 136 | senti_file = open(module_path + "data/sentiment_words1.txt").read() 137 | 138 | for line in senti_file.split("\n"): 139 | sp = 
line.split("\t") 140 | self.sentiment_dict[sp[0].strip()] = int(sp[1]) 141 | 142 | score = 0 143 | word_list = self.preprocess(tweets) 144 | 145 | if not word_list: 146 | return None 147 | 148 | for word in word_list: 149 | rate = 1 150 | 151 | if "FOT_" in word: 152 | rate *= -0.9 153 | word = word.replace("FOT_", '') 154 | 155 | if "NOT_" in word: 156 | if "TWO_" in word: 157 | rate *= -0.3 158 | word = word.replace("TWO_", '') 159 | else: 160 | rate *= -0.8 161 | 162 | if "HOP_" in word: 163 | rate *= -0.4 164 | word = word.replace("HOP_", '') 165 | 166 | word = word.replace("NOT_", '') 167 | else: 168 | if "TWO_" in word: 169 | rate *= 1.8 170 | word = word.replace("TWO_", '') 171 | 172 | if "HOP_" in word: 173 | rate *= 0.6 174 | word = word.replace("HOP_", '') 175 | 176 | if word in self.sentiment_dict: 177 | score += self.sentiment_dict[word] * rate 178 | 179 | return score 180 | 181 | 182 | sentiment_dict = SentimentDict() 183 | 184 | def calc_sentiment_score(tts): 185 | return sentiment_dict.calc_sentiment_score(tts) -------------------------------------------------------------------------------- /portrayal/career_classify/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import pickle 4 | 5 | from nltk.tokenize import word_tokenize 6 | 7 | from .. tools.preprocess import preprocess 8 | from .. config import PROJECT_PATH 9 | 10 | 11 | module_path = PROJECT_PATH + "portrayal/career_classify" 12 | data_dir = module_path + '/data/' 13 | data_processed_dir = module_path + '/data_processed/' 14 | statistics_dir = module_path + '/statistics/' 15 | data_category_dir = module_path + '/data_category/' 16 | 17 | def process_training_data(): 18 | category_dirs = os.listdir(data_dir) 19 | 20 | for item in category_dirs: 21 | if not os.path.exists(data_processed_dir + item): 22 | os.makedirs(data_processed_dir + item) 23 | 24 | cat_files = os.listdir(data_dir + item) 25 | 26 | for cat_file in cat_files: 27 | file = open(data_dir + item + "/" + cat_file) 28 | file_processed = open(data_processed_dir + item + "/" + cat_file, 'w') 29 | 30 | for line in file: 31 | if line.strip() == '': 32 | continue 33 | 34 | line = preprocess(line.strip()) 35 | 36 | try: 37 | file_processed.write(line + '\n') 38 | except: 39 | continue 40 | 41 | 42 | def word_count(): 43 | count_dict = {} 44 | category_dirs = os.listdir(data_processed_dir) 45 | 46 | for item in category_dirs: 47 | cat_files = os.listdir(data_processed_dir + item) 48 | 49 | for cat_file in cat_files: 50 | file_processed = open(data_processed_dir + item + "/" + cat_file, 'r') 51 | 52 | for line in file_processed: 53 | word_list = line.split(" ") 54 | 55 | for word in word_list: 56 | word = word.strip() 57 | 58 | if count_dict.has_key(word): 59 | count_dict[word] += 1 60 | 61 | else: 62 | count_dict[word] = 1 63 | 64 | count_dict = sorted(count_dict.iteritems(), key = lambda i: i[1], reverse = True) 65 | 66 | file = open(statistics_dir + item + ".pickle", 'wb') 67 | pickle.dump(count_dict, file) 68 | file.close() 69 | 70 | count_dict = {} 71 | 72 | 73 | # def get_ambiguity_words(category_pair = None): 74 | # if not category_pair: 75 | # category_pair = [ 76 | # ("Entertainment", "Sports", 40), 77 | # ("Economy", "Agriculture", 40), 78 | # ("Politics", "Religion", 40), 79 | # ("Military", "Politics", 40), 80 | # ("Education", "Technology", 40), 81 | # ("Education", "Politics", 40), 82 | # ("Education", "Entertainment", 40), 83 | # ("Education", "Agriculture", 
40), 84 | # ("Technology", "Entertainment", 40), 85 | # ("Economy", "Technology", 40), 86 | # ("Economy", "Politics", 40) 87 | # ] 88 | 89 | # delete_set = set() 90 | # for pair in category_pair: 91 | # top_words0 = set() 92 | 93 | # file = open(statistics_dir + pair[0] + ".pickle") 94 | # count_dict = pickle.load(file) 95 | 96 | # for item in count_dict: 97 | # if item[1] >= pair[2]: 98 | # top_words0.add(item[0]) 99 | 100 | # else: 101 | # break 102 | 103 | # top_words1 = set() 104 | 105 | # file = open(statistics_dir + pair[1] + ".pickle") 106 | # count_dict = pickle.load(file) 107 | 108 | # for item in count_dict: 109 | # if item[1] >= pair[2]: 110 | # top_words1.add(item[0]) 111 | 112 | # else: 113 | # break 114 | 115 | # delete_set |= top_words0 & top_words1 116 | 117 | # print len(delete_set) 118 | # return delete_set 119 | 120 | 121 | # def delete_ambiguity(category_pair = None): 122 | # ambiguity_words = get_ambiguity_words(category_pair) 123 | 124 | # category_dirs = os.listdir(data_processed_dir) 125 | 126 | # for item in category_dirs: 127 | # if not os.path.exists(data_pruned_dir + item): 128 | # os.makedirs(data_pruned_dir + item) 129 | 130 | # cat_files = os.listdir(data_processed_dir + item) 131 | 132 | # for cat_file in cat_files: 133 | # file = open(data_processed_dir + item + "/" + cat_file) 134 | # file_pruned = open(data_pruned_dir + item + "/" + cat_file, 'w') 135 | 136 | # for line in file: 137 | # if line.strip() == '': 138 | # continue 139 | 140 | # word_list = line.strip().split(" ") 141 | # word_list = [w for w in word_list if w not in ambiguity_words] 142 | 143 | # try: 144 | # file_pruned.write(' '.join(word_list) + '\n') 145 | # except: 146 | # continue 147 | 148 | 149 | def get_ambiguity_words(category_list, count = 20): 150 | delete_set = None 151 | 152 | for category in category_list: 153 | top_words = set() 154 | 155 | file = open(statistics_dir + category + ".pickle") 156 | count_dict = pickle.load(file) 157 | file.close() 158 | 159 | for item in count_dict: 160 | if item[1] >= count: 161 | top_words.add(item[0]) 162 | 163 | else: 164 | break 165 | print len(top_words) 166 | if not delete_set: 167 | delete_set = top_words 168 | else: 169 | delete_set &= top_words 170 | 171 | print len(delete_set) 172 | return delete_set 173 | 174 | 175 | def delete_ambiguity(category_list, count = 20): 176 | category_list.sort() 177 | 178 | ambiguity_words = get_ambiguity_words(category_list) 179 | 180 | dir_name = '' 181 | for item in category_list: 182 | dir_name += '_' + item 183 | 184 | dir_name = dir_name[1 : ] 185 | 186 | category_path = data_category_dir + dir_name + "/" 187 | 188 | if not os.path.exists(category_path): 189 | os.makedirs(category_path) 190 | 191 | for category in category_list: 192 | 193 | if not os.path.exists(category_path + category): 194 | os.makedirs(category_path + category) 195 | 196 | cat_files = os.listdir(data_processed_dir + category) 197 | 198 | for cat_file in cat_files: 199 | file = open(data_processed_dir + category + "/" + cat_file) 200 | file_category = open(category_path + category + "/" + cat_file, 'w') 201 | 202 | for line in file: 203 | if line.strip() == '': 204 | continue 205 | 206 | word_list = line.strip().split(" ") 207 | word_list = [w for w in word_list if w not in ambiguity_words] 208 | 209 | try: 210 | file_category.write(' '.join(word_list) + '\n') 211 | except: 212 | continue -------------------------------------------------------------------------------- /portrayal/sentiment_classify/training.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-07 20:18:27 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-07 20:18:27 7 | ''' 8 | import os 9 | import nltk 10 | import pickle 11 | import random 12 | 13 | from sklearn.svm import LinearSVC 14 | from nltk.tokenize import word_tokenize 15 | from nltk.classify.scikitlearn import SklearnClassifier 16 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 17 | from sklearn.linear_model import LogisticRegression, SGDClassifier 18 | 19 | from .. tools.function import get_stop_words 20 | from .. tools.preprocess import preprocess, preprocess_postag 21 | from .. config import PROJECT_PATH 22 | 23 | stop_words = get_stop_words() 24 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 25 | pickle_path = module_path + "pickle/" 26 | 27 | 28 | # 对一段文档建立特征 29 | def word2features(document, word_features): 30 | features = {} 31 | words = word_tokenize(document, language='english') 32 | 33 | for w in word_features: 34 | features[w] = w in words 35 | 36 | return features 37 | 38 | 39 | def save_feature_document(): 40 | # 加载语料库 41 | pos_corpus = open(module_path + "data/positive.txt", "r").read() 42 | neg_corpus = open(module_path + "data/negative.txt", "r").read() 43 | 44 | documents = [] 45 | words_feature = [] 46 | 47 | # J是代表形容词,R代表副词,V代表动词 48 | allowed_types = ['J'] 49 | 50 | n = 0 51 | p_temp = '' 52 | 53 | for p in pos_corpus.split("\n"): 54 | word_tags = preprocess_postag(p) 55 | 56 | if not word_tags: 57 | continue 58 | 59 | # 形容词对情感影响较大,所以选取形容词为特征 60 | for item in word_tags: 61 | p_temp += item[0] + " " 62 | 63 | if item[1][0] in allowed_types: 64 | words_feature.append(item[0]) 65 | 66 | n += 1 67 | if n % 15 == 0: 68 | documents.append((p_temp, "pos")) 69 | p_temp = '' 70 | 71 | if n > 7: 72 | documents.append((p_temp, "pos")) 73 | 74 | n = 0 75 | p_temp = '' 76 | 77 | for p in neg_corpus.split("\n"): 78 | word_tags = preprocess_postag(p) 79 | 80 | if not word_tags: 81 | continue 82 | 83 | for item in word_tags: 84 | p_temp += item[0] + " " 85 | 86 | if item[1][0] in allowed_types: 87 | words_feature.append(item[0]) 88 | 89 | n += 1 90 | if n % 15 == 0: 91 | documents.append((p_temp, "neg")) 92 | p_temp = '' 93 | 94 | if n > 7: 95 | documents.append((p_temp, "neg")) 96 | 97 | # 将处理好的文档持久化 98 | documents_file = open(pickle_path + "documents.pickle","wb") 99 | pickle.dump(documents, documents_file) 100 | documents_file.close() 101 | print "Documents saved!" 102 | 103 | # 统计 104 | words_feature = nltk.FreqDist(words_feature) 105 | words_feature_temp = sorted(words_feature.iteritems(), key = lambda i: i[1], reverse = True) 106 | 107 | words_feature = map(lambda tuple: tuple[0], words_feature_temp) 108 | words_feature = words_feature[0 : 5000] 109 | 110 | # 将特征属性持久化 111 | feature_file = open(pickle_path + "words_feature.pickle", "wb") 112 | pickle.dump(words_feature, feature_file) 113 | 114 | feature_file.close() 115 | print "words_feature saved!" 116 | 117 | feature_sets = [(word2features(p, words_feature), category) for (p, category) in documents] 118 | 119 | # 将特征属性持久化 120 | feature_file = open(pickle_path + "feature_sets.pickle", "wb") 121 | pickle.dump(feature_sets, feature_file) 122 | 123 | feature_file.close() 124 | print "feature_sets saved!" 
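# Feature engineering above: roughly every 15 corpus lines are merged into one
# labelled document, adjectives (POS tags starting with 'J') are collected as
# candidate features, and the 5000 most frequent ones become the boolean
# word-presence features consumed by training() below.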
125 | 126 | 127 | def training(): 128 | if not os.path.exists(pickle_path + "feature_sets.pickle"): 129 | save_feature_document() 130 | 131 | feature_file = open(pickle_path + "feature_sets.pickle") 132 | feature_sets = pickle.load(feature_file) 133 | 134 | # Shuffle so that training and testing sets can be drawn 135 | random.shuffle(feature_sets) 136 | print "Length of feature_sets: " 137 | print len(feature_sets) 138 | 139 | testing_set = feature_sets[150:] 140 | print("testing: %d" % len(testing_set)) 141 | training_set = feature_sets 142 | print("training: %d" % len(training_set)) 143 | 144 | # Classifier selection 145 | # Naive Bayes - NLTK's built-in classifier 146 | classifier = nltk.NaiveBayesClassifier.train(training_set) 147 | print "NaiveBayesClassifier accuracy:" 148 | print(nltk.classify.accuracy(classifier, testing_set)) 149 | # Persist the classifier 150 | classifier_file = open(pickle_path + "naivebayes.pickle", "wb") 151 | pickle.dump(classifier, classifier_file) 152 | classifier_file.close() 153 | 154 | # Multinomial Naive Bayes classifier - sklearn 155 | mnb_classifier = SklearnClassifier(MultinomialNB()) 156 | mnb_classifier.train(training_set) 157 | print "MultinomialNB accuracy:" 158 | print(nltk.classify.accuracy(mnb_classifier, testing_set)) 159 | # Persist the classifier 160 | classifier_file = open(pickle_path + "mnb_classifier.pickle", "wb") 161 | pickle.dump(mnb_classifier, classifier_file) 162 | classifier_file.close() 163 | 164 | # Bernoulli Naive Bayes classifier - sklearn 165 | bnb_classifier = SklearnClassifier(BernoulliNB()) 166 | bnb_classifier.train(training_set) 167 | print "BernoulliNB accuracy:" 168 | print(nltk.classify.accuracy(bnb_classifier, testing_set)) 169 | # Persist the classifier 170 | classifier_file = open(pickle_path + "bnb_classifier.pickle", "wb") 171 | pickle.dump(bnb_classifier, classifier_file) 172 | classifier_file.close() 173 | 174 | # Logistic regression classifier - sklearn 175 | lr_classifier = SklearnClassifier(LogisticRegression()) 176 | lr_classifier.train(training_set) 177 | print "LogisticRegression accuracy:" 178 | print(nltk.classify.accuracy(lr_classifier, testing_set)) 179 | # Persist the classifier 180 | classifier_file = open(pickle_path + "lr_classifier.pickle", "wb") 181 | pickle.dump(lr_classifier, classifier_file) 182 | classifier_file.close() 183 | 184 | # Linear support vector classifier - sklearn 185 | lsv_classifier = SklearnClassifier(LinearSVC()) 186 | lsv_classifier.train(training_set) 187 | print "LinearSVC accuracy:" 188 | print(nltk.classify.accuracy(lsv_classifier, testing_set)) 189 | # Persist the classifier 190 | classifier_file = open(pickle_path + "lsv_classifier.pickle", "wb") 191 | pickle.dump(lsv_classifier, classifier_file) 192 | classifier_file.close() 193 | 194 | # Stochastic gradient descent classifier - sklearn 195 | sgd_classifier = SklearnClassifier(SGDClassifier()) 196 | sgd_classifier.train(training_set) 197 | print "SGDClassifier accuracy:" 198 | print(nltk.classify.accuracy(sgd_classifier, testing_set)) 199 | # Persist the classifier 200 | classifier_file = open(pickle_path + "sgd_classifier.pickle","wb") 201 | pickle.dump(sgd_classifier, classifier_file) 202 | classifier_file.close() -------------------------------------------------------------------------------- /crawler/basicinfo_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import gc 3 | import time 4 | import threading 5 | 6 | from config import THREAD_NUM 7 | from twitter import error 8 | from api import API_COUNT, Api 9 | from database import Mysql 10 | from decorator import generate_decorator 11 | 12 | handle_exception = generate_decorator(400) 13 | 14 | class BasicinfoCrawler: 15 | get_api = Api().get_api 16 | 17 | ''' 18 | Fetch information about users related to term 19 | 20 | Parameters: 21 | term – Term to
search by. 22 | page – Page of results to return. Default is 1 [Optional] 23 | count – Number of results to return. Default is 20 [Optional] 24 | include_entities – If True, each tweet will include a node called “entities,”. 25 | This node offers a variety of metadata about the tweet in a discrete structure, 26 | including: user_mentions, urls, and hashtags. [Optional] 27 | Returns: 28 | A sequence of twitter.User instances, one for each message containing the term 29 | ''' 30 | def get_users_search(self, 31 | term = None, 32 | page = 1, 33 | count = 20, 34 | include_entities = True): 35 | 36 | if term == None: 37 | return None 38 | 39 | return self.get_api().GetUsersSearch(term = term, 40 | page = page, 41 | count = count, 42 | include_entities = include_entities) 43 | 44 | 45 | ''' 46 | 获取单个用户的信息 47 | 48 | Parameters: 49 | user_id (int, optional): 50 | The id of the user to retrieve. 51 | screen_name (str, optional): 52 | The screen name of the user for whom to return results for. 53 | Either a user_id or screen_name is required for this method. 54 | include_entities (bool, optional): 55 | The entities node will be omitted when set to False. 56 | Returns: 57 | A twitter.User instance representing that user 58 | ''' 59 | def get_user(self, 60 | user_id = None, 61 | screen_name = None, 62 | include_entities = True): 63 | 64 | if user_id == None and screen_name == None: 65 | return None 66 | 67 | return self.get_api().GetUser(user_id = user_id, 68 | screen_name = screen_name, 69 | include_entities = include_entities) 70 | 71 | 72 | ''' 73 | 获取单个用户的基础信息并返回,如果超时则休眠 400s 后返回 74 | ''' 75 | def get_user_sleep(self, 76 | user_id = None, 77 | screen_name = None, 78 | include_entities = True): 79 | 80 | if user_id == None and screen_name == None: 81 | return None 82 | 83 | wrapper_func = handle_exception(self.get_user) 84 | 85 | user = wrapper_func(user_id = user_id, screen_name = screen_name, include_entities = include_entities) 86 | 87 | return user 88 | 89 | ''' 90 | 获取单个用户的基础信息并保存(参考 get_user ) 91 | 92 | 参数: 93 | table_name (str, optional): 94 | 存储数据表名,默认 user_task (保证数据库中存在该表) 95 | 96 | 返回: 97 | None 98 | ''' 99 | def get_user_save(self, 100 | user_id = None, 101 | table_name = "user_task", 102 | screen_name = None, 103 | include_entities = True): 104 | 105 | wrapper_func = handle_exception(self.get_user) 106 | 107 | user = wrapper_func(user_id = user_id, screen_name = screen_name, include_entities = include_entities) 108 | 109 | user and self.save_user(user, table_name) 110 | 111 | 112 | ''' 113 | 获取多个用户的信息,并存入数据库中 114 | 115 | 参数: 116 | user_list (list, optional): 117 | 存放用户 user_id / screen_name 的列表 118 | table_name (str, optional): 119 | 存储数据表名,默认 user_task (保证数据库中存在该表) 120 | search_type (str, optional): 121 | 抓取方式,如果为 screen_name ,则认为 user_list 中 存放的是用户 screen_name, 122 | 否则认为 user_list 中 存放的是用户 user_id 123 | ''' 124 | def get_all_users(self, 125 | user_list = None, 126 | table_name = "user_task", 127 | search_type = "user_id"): 128 | 129 | if len(user_list) == 0: 130 | return None 131 | 132 | i = 0 133 | thread_pool = [] 134 | length = len(user_list) 135 | 136 | per_thread = length / THREAD_NUM 137 | 138 | while i < THREAD_NUM: 139 | if i + 1 == THREAD_NUM: 140 | crawler_thread = threading.Thread(target = self.get_users_thread, args = (user_list[i * per_thread : ], table_name, search_type,)) 141 | else: 142 | crawler_thread = threading.Thread(target = self.get_users_thread, args = (user_list[i * per_thread : (i + 1) * per_thread], table_name, search_type,)) 143 | 144 | 
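# How the work is partitioned (a sketch with hypothetical numbers, since THREAD_NUM comes from config): with Python 2 integer division, per_thread = length / THREAD_NUM; the first THREAD_NUM - 1 threads each take per_thread users and the final thread also takes the remainder. For example, 23 users with THREAD_NUM = 5 gives per_thread = 4, so the slices are [0:4], [4:8], [8:12], [12:16] and [16:23].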
crawler_thread.start() 145 | thread_pool.append(crawler_thread) 146 | 147 | i += 1 148 | 149 | for thread in thread_pool: 150 | thread.join() 151 | 152 | 153 | ''' 154 | 线程:获取多个用户信息(参考 get_all_users ) 155 | ''' 156 | def get_users_thread(self, 157 | user_list = None, 158 | table_name = "user_task", 159 | search_type = "user_id"): 160 | 161 | if search_type != "screen_name": 162 | while len(user_list) > 0: 163 | user_id = user_list.pop(0) 164 | self.get_user_save(user_id = user_id, table_name = table_name) 165 | 166 | else: 167 | while len(user_list) > 0: 168 | screen_name = user_list.pop(0) 169 | self.get_user_save(screen_name = screen_name, table_name = table_name) 170 | 171 | 172 | ''' 173 | 保存用户信息 174 | 175 | 参数: 176 | user(User, optional): 177 | 要保存的用户 178 | table_name (str, optional): 179 | 存储数据表名,默认 user_task (保证数据库中存在该表) 180 | ''' 181 | def save_user(self, 182 | user = None, 183 | table_name = "user_task"): 184 | 185 | if not user: 186 | return 187 | 188 | mysql = Mysql() 189 | mysql.connect() 190 | 191 | try: 192 | name = user.name.replace('\\','\\\\').replace("'","\\'").replace(':','\\:') 193 | location = user.location.replace('\\','\\\\').replace("'","\\'").replace(':','\\:') if user.location else '' 194 | description = user.description.replace('\\','\\\\').replace("'","\\'").replace(':','\\:') if user.description else '' 195 | 196 | time_zone = user.time_zone.replace("'","\\'") if user.time_zone else '' 197 | utc_offset = user.utc_offset if user.utc_offset else '' 198 | profile_banner_url = user.profile_banner_url if user.profile_banner_url else '' 199 | 200 | protected = 1 if user.protected else 0 201 | verified = 1 if user.verified else 0 202 | geo_enabled = 1 if user.geo_enabled else 0 203 | default_profile_image = 1 if user.default_profile_image else 0 204 | 205 | sql = """INSERT INTO %s (user_id, screen_name, name, location, created_at, description, statuses_count, friends_count, 206 | followers_count, favourites_count, lang, protected, time_zone, verified, utc_offset, geo_enabled, listed_count, 207 | default_profile_image, profile_background_color, profile_sidebar_fill_color, profile_image_url, profile_banner_url, crawler_date) VALUES 208 | ('%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, '%s', %d, '%s', %d, '%s', %d, %d, %d, 209 | '%s', '%s', '%s', '%s', '%s')""" % (table_name, user.id, user.screen_name, name, location, user.created_at, description, \ 210 | user.statuses_count, user.friends_count, user.followers_count, user.favourites_count, user.lang, protected, time_zone, verified, \ 211 | utc_offset, geo_enabled, user.listed_count, default_profile_image, user.profile_background_color, \ 212 | user.profile_sidebar_fill_color, user.profile_image_url, profile_banner_url, time.strftime('%Y-%m-%d',time.localtime(time.time()))) 213 | 214 | sql = sql.encode("utf-8").decode("latin1") 215 | except Exception as e: 216 | print e 217 | mysql.close() 218 | return 219 | 220 | try: 221 | mysql.execute(sql) 222 | except Exception as e: 223 | print e 224 | mysql.close() 225 | return 226 | 227 | mysql.close() 228 | 229 | del user 230 | gc.collect() 231 | 232 | 233 | if __name__ == '__main__': 234 | bc = BasicinfoCrawler() -------------------------------------------------------------------------------- /portrayal/interest_extract/interest_extract.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-10 22:54:55 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-10 
22:54:55 7 | ''' 8 | import re 9 | import os 10 | import math 11 | import pickle 12 | 13 | from collections import Counter 14 | from nltk.stem import WordNetLemmatizer 15 | 16 | from .. tools.preprocess import preprocess_postag_label 17 | 18 | from .. config import PROJECT_PATH 19 | 20 | from .. tools.function import get_stop_words 21 | 22 | 23 | module_path = PROJECT_PATH + "portrayal/interest_extract/" 24 | data_path = module_path + "data/" 25 | pickle_path = module_path + "pickle/" 26 | 27 | corpus = None 28 | stop_words = get_stop_words() 29 | 30 | def generate_pickle(): 31 | files = os.listdir(data_path + 'corpus') 32 | corpus_list = [] 33 | 34 | for f in files: 35 | text = '' 36 | corpus_set = set() 37 | file = open(data_path + 'corpus/' + f, "r").read() 38 | for p in file.split("\n"): 39 | text += p 40 | 41 | word_tags = preprocess_postag_label(text) 42 | 43 | if not word_tags: 44 | continue 45 | 46 | for word in word_tags: 47 | if word[1][0] == 'N': 48 | corpus_set.add(word[0]) 49 | 50 | corpus_list.append(corpus_set) 51 | 52 | file = open(pickle_path + "corpus.pickle", 'w') 53 | pickle.dump(corpus_list, file) 54 | file.close() 55 | 56 | return corpus_list 57 | 58 | 59 | def import_corpus(): 60 | global corpus 61 | if not corpus: 62 | if not os.path.exists(pickle_path + "corpus.pickle"): 63 | corpus = generate_pickle() 64 | else: 65 | file = open(pickle_path + "corpus.pickle", 'r') 66 | corpus = pickle.load(file) 67 | file.close() 68 | 69 | 70 | def generate_candidate(word_tags): 71 | if len(word_tags) < 1: 72 | return [] 73 | 74 | candidate_list = [] 75 | phrase_list = [] 76 | 77 | lemmatizer = WordNetLemmatizer() 78 | 79 | for item in word_tags: 80 | if item[1][0] == 'N' or item[0][0] == '#': 81 | if item[0][0] == '#': 82 | candidate_list.append(item[0]) 83 | 84 | else: 85 | word = lemmatizer.lemmatize(item[0], 'n') 86 | if word not in stop_words: 87 | candidate_list.append(word) 88 | 89 | if len(word_tags) == 2 and (word_tags[0][1][0] == 'J' or word_tags[0][1][0] == 'V') and word_tags[1][1][0] == 'N': 90 | if word_tags[0][1][0] == 'J': 91 | prefix = lemmatizer.lemmatize(word_tags[0][0], 'a') 92 | 93 | if word_tags[0][1][0] == 'V': 94 | prefix = lemmatizer.lemmatize(word_tags[0][0], 'v') 95 | 96 | suffix = lemmatizer.lemmatize(word_tags[1][0], 'n') 97 | 98 | if prefix not in stop_words and suffix not in stop_words: 99 | phrase_list.append(prefix + " " + suffix) 100 | 101 | i = 0 102 | while(i < len(word_tags) - 2): 103 | if word_tags[i][0][0] == '#': 104 | i += 1 105 | continue 106 | 107 | if word_tags[i + 1][0][0] == '#': 108 | i += 2 109 | continue 110 | 111 | if word_tags[i][1][0] == 'V' and (word_tags[i + 1][1][0] == 'N' or (word_tags[i + 1][1][0] == 'J' and word_tags[i + 2][1][0] == 'N')): 112 | if word_tags[i + 1][1][0] == 'J': 113 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'a') + " " 114 | suffix += lemmatizer.lemmatize(word_tags[i + 2][0], 'n') 115 | phrase_list.append(lemmatizer.lemmatize(word_tags[i][0], 'v') + " " + suffix) 116 | i += 3 117 | else: 118 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'v') 119 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 120 | 121 | if prefix not in stop_words and suffix not in stop_words: 122 | phrase_list.append(prefix + " " + suffix) 123 | 124 | i += 2 125 | 126 | elif word_tags[i][1][0] == 'J' and word_tags[i + 1][1][0] == 'N': 127 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'a') 128 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 129 | 130 | if prefix not in stop_words and suffix not in 
stop_words: 131 | phrase_list.append(prefix + " " + suffix) 132 | 133 | i += 2 134 | 135 | else: 136 | i += 1 137 | 138 | if i != 0 and i == len(word_tags) - 2 and word_tags[i + 1][1][0] == 'N': 139 | if word_tags[i][1][0] == 'J': 140 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'a') 141 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 142 | 143 | if prefix not in stop_words and suffix not in stop_words: 144 | phrase_list.append(prefix + " " + suffix) 145 | 146 | elif word_tags[i][1][0] == 'V': 147 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'v') 148 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 149 | 150 | if prefix not in stop_words and suffix not in stop_words: 151 | phrase_list.append(prefix + " " + suffix) 152 | 153 | return candidate_list + phrase_list 154 | 155 | 156 | def calc_tf_idf(candidate_list): 157 | if corpus == None: 158 | import_corpus() 159 | 160 | count = Counter(candidate_list) 161 | common_word = count.most_common(150) 162 | 163 | tf_idf = {} 164 | corpus_len = len(corpus) 165 | for item in common_word: 166 | n = 1 167 | for corpus_set in corpus: 168 | if item[0] in corpus_set: 169 | n += 1 170 | 171 | idf = math.log(corpus_len * 1.0 / n, 10) 172 | tf_idf[item[0]] = item[1] * idf 173 | 174 | candidate_list = sorted(tf_idf.iteritems(), key = lambda item: item[1], reverse = True) 175 | 176 | return candidate_list[:90] 177 | 178 | 179 | def calc_weight(tweets, candidates): 180 | weight_dict = {} 181 | 182 | length = len(candidates) 183 | 184 | for item in candidates: 185 | weight_dict[item] = {} 186 | 187 | for sub_item in candidates: 188 | if item != sub_item: 189 | weight_dict[item][sub_item] = 0 190 | 191 | for i in range(length): 192 | item = candidates[i] 193 | j = i + 1 194 | 195 | while j < length: 196 | sub_item = candidates[j] 197 | j += 1 198 | 199 | for tweet in tweets: 200 | text = tweet['text'] 201 | 202 | if item in text and sub_item in text: 203 | weight_dict[item][sub_item] += 1 204 | weight_dict[sub_item][item] += 1 205 | 206 | o_vector = {} 207 | 208 | for item in candidates: 209 | o_vector[item] = 0 210 | 211 | for sub_item in candidates: 212 | if item != sub_item: 213 | o_vector[item] += weight_dict[item][sub_item] 214 | 215 | return weight_dict, o_vector 216 | 217 | 218 | def text_rank(tweets, candidates_list): 219 | candidates = {} 220 | for item in candidates_list: 221 | candidates[item[0]] = item[1] 222 | 223 | weight_dict, o_vector = calc_weight(tweets, candidates.keys()) 224 | 225 | alpha = 0.85 226 | score_vector = {} 227 | related_items = {} 228 | 229 | for item in candidates: 230 | score_vector[item] = 1 231 | related_items[item] = [] 232 | 233 | for sub_item in candidates: 234 | if item != sub_item and weight_dict[item][sub_item] != 0: 235 | related_items[item].append(sub_item) 236 | 237 | i = 0 238 | while i < 88: 239 | score_vector_temp = {} 240 | 241 | for item in candidates: 242 | score_temp = (1 - alpha) * candidates[item] 243 | 244 | for sub_item in related_items[item]: 245 | score_temp += weight_dict[item][sub_item] * 1.0 / o_vector[sub_item] * score_vector[sub_item] 246 | 247 | score_vector_temp[item] = alpha * score_temp 248 | 249 | if calc_differ(score_vector, score_vector_temp) < 1: 250 | score_vector = score_vector_temp 251 | break 252 | 253 | score_vector = score_vector_temp 254 | 255 | i += 1 256 | 257 | return sorted(score_vector.iteritems(), key = lambda item: item[1], reverse = True) 258 | 259 | 260 | def calc_differ(score_vector1, score_vector2): 261 | differ = 0 262 | 263 | for item in 
score_vector1: 264 | differ += abs(score_vector1[item] - score_vector2[item]) 265 | 266 | return differ 267 | 268 | 269 | def get_top_tags(candidate_tags, count, filter_set): 270 | interset_tags = map(lambda tag: tag[0], candidate_tags) 271 | 272 | res_tags = [] 273 | 274 | for item in interset_tags: 275 | if len(res_tags) >= count: 276 | break 277 | 278 | item_temp = item.replace('#', '') 279 | word_list = item.split(' ') 280 | 281 | if len(word_list) == 1: 282 | if item_temp not in filter_set and len(item) > 2: 283 | res_tags.append(item) 284 | filter_set.add(item_temp) 285 | else: 286 | for word in word_list: 287 | if word.strip() != '': 288 | filter_set.add(word) 289 | if word in res_tags: 290 | res_tags.remove(word) 291 | 292 | res_tags.append(item) 293 | 294 | return res_tags[:count] 295 | 296 | 297 | def join_top_tags(tfidf_tags, textrank_tags, count): 298 | final_set = tfidf_tags[:count * 3 / 5] 299 | 300 | for item in tfidf_tags[count * 3 / 5:]: 301 | if item[0] == '#': 302 | final_set.append(item) 303 | 304 | for item in textrank_tags: 305 | if item not in final_set: 306 | final_set.append(item) 307 | 308 | return final_set[:count] 309 | 310 | def extract_tags(tweets, description = '', count = 36): 311 | text = '' 312 | for tweet in tweets: 313 | text += tweet['text'] + " , " 314 | 315 | word_tags = preprocess_postag_label(description + text + description) 316 | candidate_list = generate_candidate(word_tags) 317 | 318 | filter_set = set(["dis", "fuck", "hell", "damn", "shit", "bitch", "wow", "cool", "fun", "glad", 319 | "joy", "luck", "laugh", "bless", "appreciate", "wish", "hope", "play", "set", "close", "talk", 320 | "change", "join", "move", "watch", "meet", "post", "wait", "live", "deal", "eat", "call", 321 | "pick", "start", "end", "kid", "boy", "home", "tweet", "video", "bang", "dope", 322 | "year", "month", "hour", "minute", "second", "moment", "morning", "afternoon", "evening"]) 323 | 324 | candidate_tags = calc_tf_idf(candidate_list) 325 | tfidf_tags = get_top_tags(candidate_tags, count, filter_set) 326 | 327 | candidate_tags = text_rank(tweets, candidate_tags) 328 | textrank_tags = get_top_tags(candidate_tags, count, filter_set) 329 | 330 | tfidf_tags = join_top_tags(tfidf_tags, textrank_tags, count) 331 | 332 | return ','.join(tfidf_tags) -------------------------------------------------------------------------------- /crawler/web_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import urllib 3 | import urllib2 4 | import MySQLdb 5 | import config 6 | import time 7 | import cookielib 8 | import random 9 | import re 10 | import socket 11 | from bs4 import BeautifulSoup 12 | from pybloom import BloomFilter 13 | 14 | ''' 15 | 采用 web 方式抓取 Twitter 的类(已经弃用) 16 | ''' 17 | class Crawler: 18 | def __init__(self): 19 | 20 | #获取一个保存cookie的对象 21 | cj = cookielib.LWPCookieJar() 22 | #将一个保存cookie对象,和一个HTTP的cookie的处理器绑定 23 | cookie_support = urllib2.HTTPCookieProcessor(cj) 24 | #创建一个opener,将保存了cookie的http处理器,还有设置一个handler用于处理http的URL的打开 25 | opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler) 26 | #将包含了cookie、http处理器、http的handler的资源和urllib2对象板顶在一起 27 | urllib2.install_opener(opener) 28 | 29 | headers = { 30 | 'User-Agent':config.USER_AGENT, 31 | 'referer':'https://twitter.com/login' 32 | } 33 | self.headers = [{ 34 | 'User-Agent':config.USER_AGENT, 35 | 'referer':'https://twitter.com' 36 | }, { 37 | 'User-Agent':config.USER_AGENT, 38 | 'referer':'https://twitter.com/login' 39 | }, { 40 | 
'User-Agent':config.USER_AGENT, 41 | 'referer':'https://twitter.com/mrmarcohan' 42 | }] 43 | request = urllib2.Request("https://twitter.com/login", headers = headers) 44 | response = urllib2.urlopen(request) 45 | pageHtml = response.read() 46 | soup = BeautifulSoup(pageHtml, 'html.parser') 47 | csrf = soup.find_all("input", attrs={"name": "authenticity_token"})[0]['value'] 48 | 49 | postdata = { 50 | 'session[username_or_email]':'mrmarcohan', 51 | 'session[password]':'han123456', 52 | 'authenticity_token':csrf, 53 | 'scribe_log':'', 54 | 'redirect_after_login':'' 55 | } 56 | 57 | req = urllib2.Request( 58 | url = 'https://twitter.com/sessions', 59 | data = urllib.urlencode(postdata), 60 | headers = headers 61 | ) 62 | 63 | res = urllib2.urlopen(req) 64 | page = res.read() 65 | 66 | socket.setdefaulttimeout(5) 67 | # request = urllib2.Request("https://twitter.com/taylorswift13/following", headers = headers) 68 | # response = urllib2.urlopen(request) 69 | # pageHtml = response.read() 70 | 71 | # file_obj = open('a.html','w') 72 | # file_obj.write(pageHtml) 73 | # file_obj.close() 74 | 75 | # cookie = cookielib.CookieJar() 76 | # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie)) 77 | # response = opener.open('https://twitter.com') 78 | # for item in cookie: 79 | # print item 80 | # # if item.name == 'some_cookie_item_name': 81 | # # print item.value 82 | # return 83 | 84 | self.urlList = [config.INITIAL_USER] 85 | self.months = dict(Jan = '1', Feb = '2', Mar = '3', Apr = '4', \ 86 | May = '5', Jun = '6', Jul = '7', Aug = '8', \ 87 | Sep = '9', Oct = '10', Nov = '11', Dec = '12') 88 | 89 | db = MySQLdb.connect(config.DB_HOST, config.DB_USER, config.DB_PASSWORD, config.DB_DATABASE) 90 | # 使用cursor()方法获取操作游标 91 | cursor = db.cursor() 92 | self.cursor = cursor 93 | self.db = db 94 | self.bf = BloomFilter(capacity=1000000, error_rate=0.001) 95 | self.bf.add(config.INITIAL_USER) 96 | self.restart() 97 | 98 | def getUsersInfo(self): 99 | urlList = self.urlList 100 | count = 0 101 | print "starting..." 102 | while len(urlList) > 0: 103 | count = count + 1 104 | if count % 3000 == 0: 105 | print count 106 | print "sleeping..." 
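# Back-off step: after every 3000 profile requests the crawler pauses for 400 s plus a random 200-1000 s (roughly 10-23 minutes) before continuing, to reduce the chance of the logged-in web session being throttled or blocked; the thresholds appear to be heuristic choices rather than values tied to a documented rate limit.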
107 | time.sleep(400 + random.randint(200,1000)) 108 | user = urlList.pop() 109 | url = "https://twitter.com/" + user 110 | print url 111 | self.currentUser = user 112 | time.sleep(1 + random.uniform(1, 4)) 113 | try: 114 | flag = 1 115 | flag = self.getBasicInfo() 116 | # print flag 117 | if flag != -1: 118 | time.sleep(1 + random.uniform(1, 4)) 119 | self.getFollowing() 120 | # time.sleep(1 + random.uniform(1, 3)) 121 | # self.getFollowers() 122 | except: 123 | print "something wrong" 124 | continue 125 | # if self.getBasicInfo() != -1: 126 | # time.sleep(2 + random.uniform(1, 3)) 127 | # self.getFollowing() 128 | 129 | def getBasicInfo(self): 130 | url = "https://twitter.com/" + self.currentUser 131 | 132 | try: 133 | request = urllib2.Request(url, headers = self.headers[0]) 134 | response = urllib2.urlopen(request, timeout = 5) 135 | pageHtml = response.read() 136 | # file_obj = open('a.html','w') 137 | # file_obj.write(pageHtml) 138 | # file_obj.close() 139 | except: 140 | print "basic info 请求超时" 141 | return -1 142 | 143 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 144 | 145 | name = soup.select_one(".ProfileHeaderCard-nameLink").text 146 | screenname = soup.select_one(".u-linkComplex-target").text 147 | bio = soup.select_one(".ProfileHeaderCard-bio").text 148 | jd = soup.select_one(".ProfileHeaderCard-joinDateText")['title'] 149 | location = soup.select_one(".ProfileHeaderCard-locationText").text 150 | try: 151 | jd = jd.split(' ')[2] 152 | joindate = re.sub('[^\d]+',"-",jd) 153 | joindate = joindate[0 : -1] 154 | except: 155 | joindate = "" 156 | 157 | try: 158 | tn = soup.select_one(".ProfileNav-item--tweets") \ 159 | .select_one(".ProfileNav-stat--link")['title'] 160 | tweetNum = tn.split(' ')[0].replace(',','') 161 | if int(tweetNum) < 60: 162 | return -1 163 | except: 164 | return -1 165 | 166 | try: 167 | fing = soup.select_one(".ProfileNav-item--following") \ 168 | .select_one(".ProfileNav-stat--link")['title'] 169 | following = fing.split(' ')[0].replace(',','') 170 | except: 171 | following = 0 172 | try: 173 | fers = soup.select_one(".ProfileNav-item--followers") \ 174 | .select_one(".ProfileNav-stat--link")['title'] 175 | followers = fers.split(' ')[0].replace(',','') 176 | except: 177 | followers = 0 178 | 179 | try: 180 | fates = soup.select_one(".ProfileNav-item--favorites") \ 181 | .select_one(".ProfileNav-stat--link")['title'] 182 | favorites = fates.split(' ')[0].replace(',','') 183 | except: 184 | favorites = 0 185 | 186 | # SQL 插入语句 187 | sql = """INSERT INTO user(screenname, name, location, joinDate, bio, tweetNum, watchNum, 188 | fansNum, likeNum, created_at) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', 189 | '%s', '%s', '%s')""" % (screenname, name, location, joindate, bio, tweetNum, \ 190 | following, followers, favorites, time.strftime('%Y-%m-%d',time.localtime(time.time()))) 191 | try: 192 | # 执行sql语句 193 | self.cursor.execute(sql) 194 | # 提交到数据库执行 195 | self.db.commit() 196 | except: 197 | return -1 198 | 199 | tweets = soup.select(".js-stream-item") 200 | file_obj = open('tweet/' + self.currentUser + '.txt','a') 201 | for i in range(len(tweets)): 202 | try: 203 | tt = tweets[i].select_one(".js-tweet-text-container").text.replace(u'\xa0', u' ').replace('\n',' ') 204 | file_obj.write(tt.encode('utf-8')) 205 | file_obj.write("\n") 206 | except: 207 | continue 208 | try: 209 | timestamp = tweets[i].select_one(".stream-item-header").select_one(".js-short-timestamp")['data-time'] 210 | user = 
tweets[i].select_one(".stream-item-header").select_one(".username").select_one('b').text 211 | itemFooter = tweets[i].select_one(".stream-item-footer") 212 | reply = itemFooter.select_one(".ProfileTweet-action--reply").select_one(".ProfileTweet-actionCount")['data-tweet-stat-count'] 213 | retweet = itemFooter.select_one(".ProfileTweet-action--retweet").select_one(".ProfileTweet-actionCount ")['data-tweet-stat-count'] 214 | favorite = itemFooter.select_one(".ProfileTweet-action--favorite").select_one(".ProfileTweet-actionCount ")['data-tweet-stat-count'] 215 | except: 216 | print "tweets bottom error" 217 | file_obj.write(user + " " + timestamp + " " + reply + " " + retweet + " " + favorite) 218 | file_obj.write('\n') 219 | file_obj.close() 220 | 221 | def getTweet(self): 222 | return 223 | 224 | def getFollowing(self): 225 | url = "https://twitter.com/" + self.currentUser + "/following" 226 | 227 | try: 228 | request = urllib2.Request(url, headers = self.headers[1]) 229 | response = urllib2.urlopen(request, timeout = 5) 230 | pageHtml = response.read() 231 | except: 232 | print "following 请求超时" 233 | return 234 | 235 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 236 | pcList = soup.select(".ProfileCard") 237 | file_obj = open('following/' + self.currentUser + '.txt','a') 238 | for i in range(len(pcList)): 239 | pc = pcList[i].select_one(".ProfileCard-screennameLink").select_one(".u-linkComplex-target").text.replace(u'\xa0', u' ') 240 | if pc not in self.bf: 241 | self.bf.add(pc) 242 | self.urlList.append(pc) 243 | try: 244 | file_obj.write(pc + " ") 245 | except: 246 | print pc 247 | continue 248 | 249 | def getFollowers(self): 250 | url = "https://twitter.com/" + self.currentUser + "/following" 251 | 252 | try: 253 | request = urllib2.Request(url, headers = self.headers[2]) 254 | response = urllib2.urlopen(request, timeout = 5) 255 | pageHtml = response.read() 256 | 257 | except urllib2.URLError, e: 258 | if hasattr(e,"reason"): 259 | print e.reason 260 | return 261 | 262 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 263 | 264 | return 265 | 266 | def getFavorite(self): 267 | url = "https://twitter.com/" + self.currentUser + "/following" 268 | 269 | try: 270 | request = urllib2.Request(url, headers = self.headers[0]) 271 | response = urllib2.urlopen(request, timeout = 5) 272 | pageHtml = response.read() 273 | 274 | except urllib2.URLError, e: 275 | if hasattr(e,"reason"): 276 | print e.reason 277 | return 278 | 279 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 280 | 281 | return 282 | 283 | def crawlerFinish(self): 284 | self.db.close() 285 | 286 | def restart(self): 287 | sql = "select screenname from user" 288 | try: 289 | # 执行sql语句 290 | self.cursor.execute(sql) 291 | info = self.cursor.fetchall() 292 | for ii in info: 293 | self.bf.add(ii[0]) 294 | except: 295 | return -1 296 | self.getUsersInfo() 297 | 298 | spider = Crawler() 299 | -------------------------------------------------------------------------------- /typical.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | # import nltk 4 | 5 | from crawler.database import MongoDB 6 | # from portrayal.config import PROJECT_PATH 7 | # from nltk.tokenize import word_tokenize 8 | # from portrayal.tools import preprocess 9 | # from portrayal.career_classify import training, classify 10 | # from portrayal.interest_extract import interest_extract 11 | # from 
portrayal.sentiment_classify import sentiment_classify 12 | # from portrayal.sentiment_classify import sentiment_dict as sentiment_dict_classifier 13 | # from portrayal.tools import preprocess 14 | # from portrayal.user_profile import user_profile 15 | 16 | import numpy as np 17 | import matplotlib as mpl 18 | import matplotlib.pyplot as plt 19 | from scipy.interpolate import spline 20 | 21 | 22 | 23 | # graph = Neo4j().connect() 24 | 25 | ''' 26 | 职业领域分类 27 | ''' 28 | def classify_career(): 29 | db = MongoDB().connect() 30 | users = db['typical'].find() 31 | 32 | n = 0 33 | err_dict = {} 34 | for user in users: 35 | tweets = user['tweets'] 36 | text = user['description'] 37 | for tweet in tweets: 38 | text += ' ' + tweet['text'] 39 | 40 | text = preprocess.preprocess(text) 41 | res = classify.exe_classify(text) 42 | 43 | if err_dict.has_key(res[0]): 44 | err_dict[res[0]] += 1 45 | else: 46 | err_dict[res[0]] = 1 47 | 48 | if res[0] == user['category']: 49 | n += 1 50 | 51 | print err_dict 52 | print n 53 | 54 | 55 | ''' 56 | 兴趣标签导出 57 | ''' 58 | def extract_interset(): 59 | db = MongoDB().connect() 60 | users = db['typical'].find() 61 | 62 | for u in users: 63 | text = '' 64 | for item in u['tweets']: 65 | text += item['text'] + ' ' 66 | 67 | try: 68 | tags = interest_extract.extract_tags(text, u['description']) 69 | except Exception as e: 70 | print u['_id'] 71 | print e 72 | continue 73 | 74 | db['typical'].update({'_id': u['_id']}, {"$set": {"interest_tags": tags}}) 75 | 76 | 77 | ''' 78 | 心理状态 79 | ''' 80 | def calc_sentiment(): 81 | db = MongoDB().connect() 82 | users = db['typical'].find() 83 | 84 | for user in users: 85 | try: 86 | final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 = sentiment_classify.exe_sentiment_classify(user['tweets']) 87 | except Exception as e: 88 | print user['_id'] 89 | print e 90 | continue 91 | 92 | db['typical'].update({'_id': user['_id']}, {"$set": {"psy": final_sentiment, "psy_with_time1": psy_with_time1, "psy_with_time2": psy_with_time2, "psy_with_count1": psy_with_count1, "psy_with_count2": psy_with_count2}}) 93 | 94 | 95 | def calc_sentiment_score(): 96 | sentiment_dict = dict(map(lambda (k,v): (k,int(v)), 97 | [ line.split('\t') for line in open("portrayal/sentiment_classify/data/sentiment_words1.txt") ])) 98 | 99 | db = MongoDB().connect() 100 | users = db['typical'].find({'screen_name': 'EP_Agriculture'}).limit(1) 101 | 102 | for user in users: 103 | final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 = sentiment_classify.exe_sentiment_classify(user['tweets']) 104 | # print tags 105 | db['typical'].update({'_id': user['_id']}, {"$set": {"psy": final_sentiment, "psy_with_time1": psy_with_time1, "psy_with_time2": psy_with_time2, "psy_with_count1": psy_with_count1, "psy_with_count2": psy_with_count2}}) 106 | 107 | 108 | def sentiment_dict_test(): 109 | # sentiment_dict = dict(map(lambda (k,v): (k,int(v)), 110 | # [ line.split('\t') for line in open("portrayal/sentiment_classify/data/sentiment_words1.txt") ])) 111 | text = sentiment_classify.replace_emotion([{'text': 'hope the er isnt to busy today but the nice weather doesnt keep people healthy or safe', 'created_at': '1'}]) 112 | print text 113 | print sentiment_dict_classifier.calc_sentiment_score(text) 114 | return 115 | n = 0 116 | tts = [] 117 | total = 0 118 | wrong = 0 119 | for line in open("portrayal/sentiment_classify/data/positive.txt"): 120 | n += 1 121 | tts.append({'text': line}) 122 | 123 | if n % 1 == 0: 124 | score = 
sentiment_dict_classifier.calc_sentiment_score(tts) 125 | total += 1 126 | if score < 0: 127 | wrong += 1 128 | # print score 129 | # if wrong == 7: 130 | # print tts 131 | # break 132 | tts = [] 133 | 134 | print wrong 135 | print total 136 | 137 | 138 | def update_user_category(): 139 | db = MongoDB().connect() 140 | users = db['typical'].find({}, {'_id': 1, 'screen_name': 1, 'category': 1, 'category_score': 1}) 141 | 142 | count = 0 143 | category_name = ['Politics', 'Religion', 'Military', 'Economy', 'Technology', 'Education', 'Agriculture', 'Entertainment', 'Sports'] 144 | 145 | users_temp = [] 146 | 147 | for item in users: 148 | sorted_list = sorted(item['category_score'].iteritems(), key = lambda asd:asd[1], reverse = True) 149 | 150 | if sorted_list[0][1] > 2 * sorted_list[1][1] or sorted_list[0][1] - sorted_list[1][1] > 50: 151 | if sorted_list[0][0] != item['category']: 152 | count += 1 153 | continue 154 | 155 | score_differ = (2 * sorted_list[0][1] - sorted_list[1][1] - sorted_list[-1][1]) / 2 156 | 157 | relation_dict = { 158 | sorted_list[0][0]: 0, 159 | sorted_list[1][0]: 0, 160 | sorted_list[2][0]: 0, 161 | sorted_list[3][0]: 0 162 | } 163 | # for name in category_name: 164 | # relation_dict[name] = 0 165 | 166 | cql = '''MATCH(a{user_id:%s})-[:following]->(f) return distinct f.user_id as user_id''' % (item['_id']) 167 | res = graph.data(cql) 168 | 169 | for f in res: 170 | user = db['typical'].find_one({'_id': f['user_id']}, {'category_score': 1}) 171 | category_score = user['category_score'] 172 | max_category = max(category_score, key = lambda x: category_score[x]) 173 | 174 | if max_category in relation_dict: 175 | relation_dict[max_category] += 1 176 | 177 | cql = '''MATCH(a{user_id:%s})<-[:following]-(f) return distinct f.user_id as user_id''' % (item['_id']) 178 | res = graph.data(cql) 179 | 180 | for f in res: 181 | user = db['typical'].find_one({'_id': f['user_id']}, {'category_score': 1}) 182 | category_score = user['category_score'] 183 | max_category = max(category_score, key = lambda x: category_score[x]) 184 | 185 | if max_category in relation_dict: 186 | relation_dict[max_category] += 1 187 | 188 | relation_total = 0 189 | 190 | for ri in relation_dict: 191 | relation_total += relation_dict[ri] 192 | 193 | if relation_total < 10: 194 | if sorted_list[0][0] != item['category']: 195 | count += 1 196 | continue 197 | 198 | for ri in relation_dict: 199 | item['category_score'][ri] += round(score_differ * relation_dict[ri] / relation_total, 2) 200 | 201 | users_temp.append({'_id':item['_id'], "category_score": item['category_score']}) 202 | 203 | s1 = sorted_list[0][0] 204 | 205 | sorted_list = sorted(item['category_score'].iteritems(), key = lambda asd:asd[1], reverse = True) 206 | 207 | # if sorted_list[0][0] == item['category'] and s1 != item['category']: 208 | # print item['screen_name'] 209 | 210 | if sorted_list[0][0] != item['category']: 211 | count += 1 212 | 213 | print count 214 | 215 | for item in users_temp: 216 | db['typical'].update({'_id': item['_id']}, {"$set": {"category_score": item['category_score']}}) 217 | 218 | 219 | 220 | 221 | 222 | if __name__ == "__main__": 223 | # update_user_category() 224 | # db = MongoDB().connect() 225 | # users = db['typical'].find_one({'_id': 4418090668}) 226 | 227 | # for t in users['tweets']: 228 | # try: 229 | # print t['text'] 230 | # except Exception as e: 231 | # continue 232 | # count = 0 233 | # for user in users: 234 | # # tags = interest_extract.extract_tags(user['tweets'], user['description']) 235 | # # 
print tags 236 | 237 | # max_score = sorted(user['category_score'].iteritems(), key = lambda asd:asd[1], reverse=True) 238 | # # print 239 | # if user['category'] != max_score[0][0]: 240 | # # print max_score[0][0] 241 | # count += 1 242 | # print user['screen_name'] 243 | # # break 244 | # print count 245 | # for tt in user['tweets']: 246 | # print tt['text'] 247 | # extract_interset() 248 | # calc_sentiment() 249 | # calc_sentiment_score() 250 | # sentiment_dict_test() 251 | 252 | # try: 253 | # words = word_tokenize("What a beautiful sunday . happy") 254 | # print nltk.pos_tag(words) 255 | # except Exception as e: 256 | # print e 257 | db = MongoDB().connect() 258 | users = db['typical'].find() 259 | 260 | count = 1 261 | data_set = { 262 | 'retweet_favorite_rate': [], 263 | 'fans_retweet_rate': [], 264 | 'fans_favorite_rate': [] 265 | } 266 | for user in users: 267 | tweets = user['tweets'] 268 | fans = user['followers_count'] 269 | 270 | # if fans > 2000000: 271 | # continue 272 | count += 1 273 | 274 | if count > 150: 275 | break 276 | tweet_count = 0 277 | retweet_count = 0 278 | favorite_count = 0 279 | for tweet in tweets: 280 | if 'RT @' not in tweet['text']: 281 | tweet_count += 1. 282 | 283 | retweet_count += tweet['retweet_count'] 284 | favorite_count += tweet['favorite_count'] 285 | 286 | fans_retweet_rate = fans / (retweet_count / tweet_count) 287 | if fans_retweet_rate > 600000 or fans_retweet_rate < 50: 288 | continue 289 | 290 | fans_favorite_rate = fans / (favorite_count / tweet_count) 291 | if fans_favorite_rate > 600000 or fans_favorite_rate < 50: 292 | continue 293 | 294 | retweet_favorite_rate = (retweet_count / tweet_count) / (favorite_count / tweet_count) 295 | # print fans, tweet_count, retweet_count, retweet_count / tweet_count 296 | if fans_retweet_rate < 0: 297 | print user['_id'] 298 | data_set['retweet_favorite_rate'].append(retweet_favorite_rate) 299 | data_set['fans_retweet_rate'].append(fans_retweet_rate) 300 | data_set['fans_favorite_rate'].append(fans_favorite_rate) 301 | 302 | x_axix = range(len(data_set['retweet_favorite_rate'])) 303 | x_axix = np.array(x_axix) 304 | x_axix_new = np.linspace(x_axix.min(), x_axix.max(), 4000) 305 | 306 | y_axix = data_set['retweet_favorite_rate'] 307 | y_axix_new = spline(x_axix ,y_axix, x_axix_new) 308 | 309 | plt.plot(x_axix_new, y_axix_new, color='green', label='Retweet Favorite Rate') 310 | 311 | y_axix = data_set['fans_retweet_rate'] 312 | y_axix_new = spline(x_axix ,y_axix, x_axix_new) 313 | 314 | plt.plot(x_axix_new, y_axix_new, color='red', label='Fans Retweet Rate') 315 | 316 | # y_axix = data_set['fans_favorite_rate'] 317 | # y_axix_new = spline(x_axix, y_axix, x_axix_new) 318 | plt.plot(x_axix, y_axix, color='blue', label='Fans Retweet Rate') 319 | # plt.plot(x_axix, train_pn_dis, color='skyblue', label='PN distance') 320 | # plt.plot(x_axix, thresholds, color='blue', label='threshold') 321 | plt.legend() # 显示图例 322 | 323 | # # print y_axix, x_axix_new 324 | # for i in x_axix_new: 325 | # print i 326 | plt.xlabel('Users') 327 | plt.ylabel('Rate') 328 | plt.show() 329 | 330 | -------------------------------------------------------------------------------- /portrayal/resource/stop_words.txt: -------------------------------------------------------------------------------- 1 | 'd 2 | 'll 3 | 'm 4 | 're 5 | 's 6 | 't 7 | 've 8 | ZT 9 | ZZ 10 | a 11 | a's 12 | able 13 | about 14 | above 15 | abst 16 | accordance 17 | according 18 | accordingly 19 | across 20 | act 21 | actually 22 | added 23 | adj 24 | adopted 25 | 
affected 26 | affecting 27 | affects 28 | after 29 | afterwards 30 | again 31 | against 32 | ah 33 | aha 34 | ahh 35 | ain't 36 | all 37 | allow 38 | allows 39 | almost 40 | along 41 | already 42 | also 43 | although 44 | always 45 | am 46 | among 47 | amongst 48 | amp 49 | an 50 | and 51 | announce 52 | another 53 | any 54 | anybody 55 | anyhow 56 | anymore 57 | anyone 58 | anything 59 | anyway 60 | anyways 61 | anywhere 62 | apart 63 | apparently 64 | appear 65 | approximately 66 | are 67 | area 68 | areas 69 | aren 70 | aren't 71 | arent 72 | arise 73 | around 74 | as 75 | aside 76 | ask 77 | asked 78 | asking 79 | asks 80 | associated 81 | at 82 | auth 83 | away 84 | b 85 | back 86 | backed 87 | backing 88 | backs 89 | be 90 | became 91 | because 92 | become 93 | becomes 94 | becoming 95 | been 96 | before 97 | beforehand 98 | began 99 | begin 100 | beginning 101 | beginnings 102 | begins 103 | behind 104 | being 105 | beings 106 | believe 107 | below 108 | beside 109 | besides 110 | best 111 | better 112 | between 113 | beyond 114 | big 115 | biol 116 | bit 117 | both 118 | brief 119 | briefly 120 | bro 121 | bruh 122 | but 123 | by 124 | c 125 | c'mon 126 | c's 127 | ca 128 | came 129 | can 130 | can't 131 | cannot 132 | cant 133 | case 134 | cases 135 | cause 136 | causes 137 | certain 138 | certainly 139 | changes 140 | clear 141 | clearly 142 | click 143 | co 144 | com 145 | come 146 | comes 147 | concerning 148 | consequently 149 | consider 150 | considering 151 | contain 152 | containing 153 | contains 154 | corresponding 155 | could 156 | couldn't 157 | couldnt 158 | course 159 | currently 160 | d 161 | date 162 | dawg 163 | day 164 | definitely 165 | describe 166 | described 167 | despite 168 | did 169 | didn 170 | didn't 171 | differ 172 | different 173 | differently 174 | discuss 175 | do 176 | does 177 | doesn 178 | doesn't 179 | doing 180 | don't 181 | done 182 | down 183 | downed 184 | downing 185 | downs 186 | downwards 187 | dub 188 | dude 189 | due 190 | during 191 | e 192 | each 193 | early 194 | ed 195 | edu 196 | effect 197 | eg 198 | eight 199 | eighty 200 | either 201 | else 202 | elsewhere 203 | end 204 | ended 205 | ending 206 | ends 207 | enough 208 | entirely 209 | especially 210 | et 211 | et-al 212 | etc 213 | even 214 | evenly 215 | ever 216 | every 217 | everybody 218 | everyone 219 | everything 220 | everywhere 221 | ex 222 | exactly 223 | example 224 | except 225 | f 226 | face 227 | faces 228 | fact 229 | facts 230 | far 231 | fella 232 | felt 233 | few 234 | ff 235 | fifth 236 | find 237 | finds 238 | first 239 | five 240 | fix 241 | followed 242 | following 243 | follows 244 | for 245 | former 246 | formerly 247 | forth 248 | found 249 | four 250 | from 251 | full 252 | fully 253 | further 254 | furthered 255 | furthering 256 | furthermore 257 | furthers 258 | g 259 | gave 260 | general 261 | generally 262 | get 263 | gets 264 | getting 265 | give 266 | given 267 | gives 268 | giving 269 | go 270 | goes 271 | going 272 | gon 273 | gone 274 | goo 275 | good 276 | goods 277 | got 278 | gotten 279 | greetings 280 | group 281 | grouped 282 | grouping 283 | groups 284 | guy 285 | God 286 | god 287 | h 288 | ha 289 | haa 290 | had 291 | hadn't 292 | hah 293 | haha 294 | hahaa 295 | happens 296 | hardly 297 | has 298 | hasn't 299 | have 300 | haven't 301 | having 302 | he 303 | he's 304 | hed 305 | heh 306 | hello 307 | help 308 | hence 309 | her 310 | here 311 | here's 312 | hereafter 313 | hereby 314 | herein 315 | heres 316 | hereupon 317 | hers 318 | 
herself 319 | hes 320 | hey 321 | hi 322 | hid 323 | hii 324 | high 325 | higher 326 | highest 327 | him 328 | himself 329 | his 330 | hither 331 | home 332 | homie 333 | homies 334 | hoo 335 | hopefully 336 | how 337 | howbeit 338 | however 339 | http 340 | https 341 | hundred 342 | i 343 | i'd 344 | i'll 345 | i'm 346 | i've 347 | id 348 | ie 349 | if 350 | ignored 351 | im 352 | immediate 353 | immediately 354 | importance 355 | important 356 | in 357 | inasmuch 358 | inc 359 | include 360 | indeed 361 | index 362 | indicate 363 | indicated 364 | indicates 365 | information 366 | inner 367 | insofar 368 | instead 369 | into 370 | inward 371 | is 372 | isn 373 | isn't 374 | it 375 | it'd 376 | it'll 377 | it's 378 | itd 379 | its 380 | itself 381 | j 382 | just 383 | k 384 | keep 385 | keeps 386 | kept 387 | keys 388 | kg 389 | kind 390 | km 391 | knew 392 | know 393 | known 394 | knows 395 | l 396 | large 397 | largely 398 | last 399 | lately 400 | later 401 | latest 402 | latter 403 | latterly 404 | least 405 | less 406 | lest 407 | let 408 | let's 409 | lets 410 | life 411 | like 412 | liked 413 | likely 414 | lil 415 | line 416 | link 417 | little 418 | lmao 419 | long 420 | longer 421 | longest 422 | look 423 | looking 424 | looks 425 | lot 426 | ltd 427 | m 428 | made 429 | mainly 430 | make 431 | makes 432 | making 433 | man 434 | many 435 | matter 436 | may 437 | maybe 438 | me 439 | mean 440 | means 441 | meantime 442 | meanwhile 443 | member 444 | members 445 | men 446 | merely 447 | mg 448 | might 449 | million 450 | miss 451 | ml 452 | more 453 | moreover 454 | most 455 | mostly 456 | mr 457 | mrs 458 | much 459 | mug 460 | must 461 | my 462 | myself 463 | n 464 | n't 465 | na 466 | name 467 | namely 468 | nay 469 | nd 470 | near 471 | nearly 472 | necessarily 473 | necessary 474 | need 475 | needed 476 | needing 477 | needs 478 | neither 479 | never 480 | nevertheless 481 | new 482 | newer 483 | newest 484 | next 485 | nine 486 | ninety 487 | night 488 | no 489 | nobody 490 | non 491 | none 492 | nonetheless 493 | noone 494 | nope 495 | nor 496 | normally 497 | nos 498 | not 499 | noted 500 | nothing 501 | novel 502 | now 503 | nowhere 504 | number 505 | numbers 506 | o 507 | obtain 508 | obtained 509 | obviously 510 | of 511 | off 512 | often 513 | oh 514 | ok 515 | okay 516 | old 517 | older 518 | oldest 519 | omitted 520 | on 521 | once 522 | one 523 | ones 524 | only 525 | onto 526 | open 527 | opened 528 | opening 529 | opens 530 | or 531 | ord 532 | order 533 | ordered 534 | ordering 535 | orders 536 | other 537 | others 538 | otherwise 539 | ought 540 | our 541 | ours 542 | ourselves 543 | out 544 | outside 545 | over 546 | overall 547 | owing 548 | own 549 | p 550 | page 551 | pages 552 | part 553 | parted 554 | particular 555 | particularly 556 | parting 557 | parts 558 | past 559 | per 560 | perhaps 561 | people 562 | place 563 | placed 564 | places 565 | please 566 | plus 567 | point 568 | pointed 569 | pointing 570 | points 571 | poorly 572 | possible 573 | possibly 574 | potentially 575 | pp 576 | predominantly 577 | present 578 | presented 579 | presenting 580 | presents 581 | presumably 582 | previously 583 | primarily 584 | probably 585 | promptly 586 | provides 587 | put 588 | puts 589 | q 590 | que 591 | quickly 592 | quite 593 | qv 594 | r 595 | ran 596 | rather 597 | rd 598 | re 599 | readily 600 | really 601 | recent 602 | recently 603 | ref 604 | refs 605 | regarding 606 | regardless 607 | regards 608 | related 609 | relatively 610 | research 611 | 
respectively 612 | resulted 613 | resulting 614 | results 615 | right 616 | room 617 | rooms 618 | run 619 | RT 620 | rt 621 | s 622 | said 623 | same 624 | saw 625 | say 626 | saying 627 | says 628 | sec 629 | second 630 | secondly 631 | seconds 632 | section 633 | see 634 | seeing 635 | seem 636 | seemed 637 | seeming 638 | seems 639 | seen 640 | sees 641 | self 642 | selves 643 | sensible 644 | sent 645 | serious 646 | seriously 647 | seven 648 | several 649 | shall 650 | she 651 | she'll 652 | shed 653 | shes 654 | should 655 | shouldn't 656 | show 657 | showed 658 | showing 659 | shown 660 | showns 661 | shows 662 | side 663 | sides 664 | similar 665 | similarly 666 | since 667 | sir 668 | six 669 | slightly 670 | small 671 | smaller 672 | smallest 673 | smh 674 | so 675 | some 676 | somebody 677 | somehow 678 | someone 679 | somethan 680 | something 681 | sometime 682 | sometimes 683 | somewhat 684 | somewhere 685 | soon 686 | sorry 687 | specifically 688 | specified 689 | specify 690 | specifying 691 | state 692 | states 693 | still 694 | stop 695 | strongly 696 | sub 697 | substantially 698 | successfully 699 | such 700 | suggest 701 | sup 702 | sure 703 | t 704 | t's 705 | take 706 | taken 707 | taking 708 | tell 709 | tends 710 | th 711 | thk 712 | thx 713 | than 714 | thank 715 | thaanks 716 | thanks 717 | thanx 718 | that 719 | that'll 720 | that's 721 | that've 722 | thats 723 | the 724 | their 725 | theirs 726 | them 727 | themselves 728 | then 729 | thence 730 | there 731 | there'll 732 | there's 733 | there've 734 | thereafter 735 | thereby 736 | thered 737 | therefore 738 | therein 739 | thereof 740 | therere 741 | theres 742 | thereto 743 | thereupon 744 | these 745 | they 746 | they'd 747 | they'll 748 | they're 749 | they've 750 | theyd 751 | theyre 752 | thing 753 | things 754 | think 755 | thinks 756 | third 757 | this 758 | tho 759 | thorough 760 | thoroughly 761 | those 762 | thou 763 | though 764 | thoughh 765 | thought 766 | thoughts 767 | thousand 768 | three 769 | throug 770 | through 771 | throughout 772 | thru 773 | thus 774 | til 775 | time 776 | tip 777 | to 778 | tday 779 | today 780 | together 781 | tomorrow 782 | tonight 783 | too 784 | took 785 | toward 786 | towards 787 | tried 788 | tries 789 | true 790 | truly 791 | try 792 | trying 793 | ts 794 | turn 795 | turned 796 | turning 797 | turns 798 | twice 799 | two 800 | u 801 | un 802 | under 803 | unfortunately 804 | unless 805 | unlike 806 | unlikely 807 | until 808 | unto 809 | up 810 | upon 811 | ups 812 | us 813 | use 814 | used 815 | useful 816 | usefully 817 | usefulness 818 | uses 819 | using 820 | usually 821 | uucp 822 | URL 823 | v 824 | value 825 | various 826 | very 827 | via 828 | viz 829 | vol 830 | vols 831 | vs 832 | w 833 | want 834 | wanted 835 | wanting 836 | wants 837 | was 838 | wasn't 839 | way 840 | ways 841 | we 842 | we'd 843 | we'll 844 | we're 845 | we've 846 | wed 847 | week 848 | welcome 849 | well 850 | wells 851 | went 852 | were 853 | weren't 854 | what 855 | what'll 856 | what's 857 | whatever 858 | whats 859 | when 860 | whence 861 | whenever 862 | where 863 | where's 864 | whereafter 865 | whereas 866 | whereby 867 | wherein 868 | wheres 869 | whereupon 870 | wherever 871 | whether 872 | which 873 | while 874 | whim 875 | whither 876 | who 877 | whoa 878 | who'll 879 | who's 880 | whod 881 | whoever 882 | whole 883 | whom 884 | whomever 885 | whos 886 | whose 887 | why 888 | widely 889 | will 890 | willing 891 | wish 892 | with 893 | within 894 | without 895 | won't 
896 | wonder 897 | woohoo 898 | words 899 | work 900 | worked 901 | working 902 | works 903 | world 904 | would 905 | wouldn't 906 | www 907 | x 908 | y 909 | yeah 910 | yeahh 911 | year 912 | years 913 | yes 914 | yet 915 | yesterday 916 | you 917 | you'd 918 | you'll 919 | you're 920 | you've 921 | youd 922 | young 923 | younger 924 | youngest 925 | your 926 | youre 927 | yours 928 | yourself 929 | yourselves 930 | yuu 931 | z 932 | zero 933 | zt 934 | zz -------------------------------------------------------------------------------- /portrayal/tools/generate_xml.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import sys 3 | import xml.dom.minidom 4 | 5 | from .. config import PROJECT_PATH, XML_PATH 6 | 7 | # reload(sys) 8 | # sys.setdefaultencoding('utf8') 9 | 10 | # 获取DOM树实现对象 11 | impl = xml.dom.minidom.getDOMImplementation() 12 | 13 | # 生成用户的XML文件 14 | def generate_user_xml(user): 15 | dom = impl.createDocument(None, 'TwitterUser', None) 16 | root = dom.documentElement 17 | 18 | # 创建子节点 19 | basic_info = dom.createElement('基础信息') 20 | implicit_info = dom.createElement('隐性属性') 21 | root.appendChild(basic_info) 22 | root.appendChild(implicit_info) 23 | 24 | 25 | # id 26 | id_ele = dom.createElement('用户ID') 27 | 28 | if user.has_key('_id'): 29 | id_text = dom.createTextNode(str(user['_id'])) 30 | else: 31 | id_text = dom.createTextNode(str(user['user_id'])) 32 | 33 | id_ele.appendChild(id_text) 34 | 35 | 36 | # screen_name 37 | sn_ele = dom.createElement('screen_name') 38 | sn_text = dom.createTextNode(user['screen_name']) 39 | sn_ele.appendChild(sn_text) 40 | 41 | 42 | # name 43 | name_ele = dom.createElement('name') 44 | # 标签增加属性,设置编码方式 45 | # name_ele.setAttribute("coding", "utf-8") 46 | _name = user['name'] if user['name'] else '' 47 | name_text = dom.createTextNode(_name) 48 | name_ele.appendChild(name_text) 49 | 50 | 51 | # 个人描述 52 | des_ele = dom.createElement('简介') 53 | _description = user['description'] if user['description'] else '' 54 | des_text = dom.createTextNode(_description) 55 | des_ele.appendChild(des_text) 56 | 57 | 58 | # 地理位置 59 | location_ele = dom.createElement('地理位置') 60 | _location = user['location'] if user['location'] else '' 61 | location_text = dom.createTextNode(_location) 62 | location_ele.appendChild(location_text) 63 | 64 | 65 | # 帐号创建日期 66 | create_ele = dom.createElement('帐号创建日期') 67 | create_text = dom.createTextNode(str(user['created_at'])) 68 | create_ele.appendChild(create_text) 69 | 70 | 71 | # 粉丝数 72 | follower_ele = dom.createElement('粉丝数') 73 | follower_text = dom.createTextNode(str(user['followers_count'])) 74 | follower_ele.appendChild(follower_text) 75 | 76 | 77 | # 朋友数 78 | friends_ele = dom.createElement('朋友数') 79 | friends_text = dom.createTextNode(str(user['friends_count'])) 80 | friends_ele.appendChild(friends_text) 81 | 82 | 83 | # 推文数 84 | status_ele = dom.createElement('推文数') 85 | status_text = dom.createTextNode(str(user['statuses_count'])) 86 | status_ele.appendChild(status_text) 87 | 88 | 89 | # 喜欢的推文数 90 | favourite_ele = dom.createElement('喜欢的推文数') 91 | favourite_text = dom.createTextNode(str(user['favourites_count'])) 92 | favourite_ele.appendChild(favourite_text) 93 | 94 | 95 | # 列表数量 96 | list_ele = dom.createElement('所属列表数') 97 | list_text = dom.createTextNode(str(user['listed_count'])) 98 | list_ele.appendChild(list_text) 99 | 100 | 101 | # 是否认证 102 | verified_ele = dom.createElement('官方认证') 103 | verified_text = dom.createTextNode(str(user['verified'])) 
104 | verified_ele.appendChild(verified_text) 105 | 106 | 107 | # 隐私保护 108 | pro_ele = dom.createElement('隐私保护') 109 | pro_text = dom.createTextNode(str(user['protected'])) 110 | pro_ele.appendChild(pro_text) 111 | 112 | 113 | #地理位置共享 114 | geo_enabled_ele = dom.createElement('地理位置共享') 115 | geo_enabled_text = dom.createTextNode(str(user['geo_enabled'])) 116 | geo_enabled_ele.appendChild(geo_enabled_text) 117 | 118 | 119 | # 使用语言 120 | lang_ele = dom.createElement('语言') 121 | _lang = user['lang'] if user['lang'] else '' 122 | lang_text = dom.createTextNode(_lang) 123 | lang_ele.appendChild(lang_text) 124 | 125 | 126 | # 时区 127 | time_zone = dom.createElement('时区') 128 | _time_zone = user['time_zone'] if user['time_zone'] else '' 129 | time_text = dom.createTextNode(_time_zone) 130 | time_zone.appendChild(time_text) 131 | 132 | 133 | # 国际协调时偏移量 134 | utc_ele = dom.createElement('国际协调时偏移量') 135 | utc_text = dom.createTextNode(str(user['utc_offset'])) 136 | utc_ele.appendChild(utc_text) 137 | 138 | 139 | # 是否使用默认头像 140 | default_ele = dom.createElement('是否使用默认头像') 141 | default_text = dom.createTextNode(str(user['default_profile_image'])) 142 | default_ele.appendChild(default_text) 143 | 144 | 145 | # 头像链接 146 | profile_ele = dom.createElement('头像链接') 147 | profile_text = dom.createTextNode(user['profile_image_url']) 148 | profile_ele.appendChild(profile_text) 149 | 150 | 151 | # 背景图片链接 152 | banner_ele = dom.createElement('背景图片链接') 153 | _profile_banner_url = user['profile_banner_url'] if user['profile_banner_url'] else '' 154 | banner_text = dom.createTextNode(_profile_banner_url) 155 | banner_ele.appendChild(banner_text) 156 | 157 | 158 | # 主页背景颜色 159 | bgcolor_ele = dom.createElement('主页背景颜色') 160 | bgcolor_text = dom.createTextNode(user['profile_background_color']) 161 | bgcolor_ele.appendChild(bgcolor_text) 162 | 163 | 164 | #侧边栏填充颜色 165 | profile_sidebar_ele = dom.createElement('侧边栏填充颜色') 166 | profile_sidebar_text = dom.createTextNode(user['profile_sidebar_fill_color']) 167 | profile_sidebar_ele.appendChild(profile_sidebar_text) 168 | 169 | 170 | #抓取到的推文数 171 | tweets_crawled_ele = dom.createElement('抓取到的推文数') 172 | tweets_crawled_text = dom.createTextNode(str(len(user['tweets']))) 173 | tweets_crawled_ele.appendChild(tweets_crawled_text) 174 | 175 | 176 | #已抓取推文开始时间 177 | tweets_crawled_start_ele = dom.createElement('已抓取推文开始时间') 178 | tweets_crawled_start_text = dom.createTextNode(user['tweets'][0]['created_at'] if len(user['tweets']) > 0 else '') 179 | tweets_crawled_start_ele.appendChild(tweets_crawled_start_text) 180 | 181 | 182 | #已抓取推文结束时间 183 | tweets_crawled_end_ele = dom.createElement('已抓取推文结束时间') 184 | tweets_crawled_end_text = dom.createTextNode(user['tweets'][-1]['created_at'] if len(user['tweets']) > 0 else '') 185 | tweets_crawled_end_ele.appendChild(tweets_crawled_end_text) 186 | 187 | 188 | # 抓取日期 189 | crawler_date_ele = dom.createElement('抓取日期') 190 | crawler_date_text = dom.createTextNode(str(user['crawler_date'])) 191 | crawler_date_ele.appendChild(crawler_date_text) 192 | 193 | 194 | # 把基本信息加入到basic_info节点中 195 | basic_info.appendChild(id_ele) 196 | basic_info.appendChild(sn_ele) 197 | basic_info.appendChild(name_ele) 198 | basic_info.appendChild(des_ele) 199 | basic_info.appendChild(location_ele) 200 | basic_info.appendChild(create_ele) 201 | basic_info.appendChild(follower_ele) 202 | basic_info.appendChild(friends_ele) 203 | basic_info.appendChild(status_ele) 204 | basic_info.appendChild(favourite_ele) 205 | basic_info.appendChild(list_ele) 206 | 
basic_info.appendChild(verified_ele) 207 | basic_info.appendChild(pro_ele) 208 | basic_info.appendChild(geo_enabled_ele) 209 | basic_info.appendChild(lang_ele) 210 | basic_info.appendChild(time_zone) 211 | basic_info.appendChild(utc_ele) 212 | basic_info.appendChild(default_ele) 213 | basic_info.appendChild(profile_ele) 214 | basic_info.appendChild(banner_ele) 215 | basic_info.appendChild(bgcolor_ele) 216 | basic_info.appendChild(profile_sidebar_ele) 217 | basic_info.appendChild(tweets_crawled_ele) 218 | basic_info.appendChild(tweets_crawled_start_ele) 219 | basic_info.appendChild(tweets_crawled_end_ele) 220 | basic_info.appendChild(crawler_date_ele) 221 | 222 | 223 | 224 | # 职业领域分类 225 | category_ele = dom.createElement("职业领域") 226 | category_text = dom.createTextNode(user['category']) 227 | category_ele.appendChild(category_text) 228 | 229 | 230 | # 职业领域得分 231 | category_score_ele = dom.createElement("职业领域得分") 232 | category_score_str = '' 233 | 234 | for item in user['category_score']: 235 | category_score_str += item + ": " + str(user['category_score'][item]) + "; " 236 | 237 | category_score_text = dom.createTextNode(category_score_str[0:-2]) 238 | category_score_ele.appendChild(category_score_text) 239 | 240 | 241 | # 用户社交影响力 242 | influence_ele = dom.createElement("影响力分数") 243 | influence_text = dom.createTextNode(str(user['influence_score'])) 244 | influence_ele.appendChild(influence_text) 245 | 246 | 247 | if user['influence_score'] >= 110: 248 | influence_rank = '高' 249 | elif user['influence_score'] >= 60: 250 | influence_rank = '中' 251 | else: 252 | influence_rank = '低' 253 | # 用户社交影响力大小 254 | influence_rank_ele = dom.createElement("影响力等级") 255 | influence_rank_text = dom.createTextNode(influence_rank) 256 | influence_rank_ele.appendChild(influence_rank_text) 257 | 258 | 259 | # 用户心里状态标签 260 | psy_ele = dom.createElement("心理状态") 261 | 262 | if user['psy'] == 1: 263 | psy_temp = '正面' 264 | elif user['psy'] == -1: 265 | psy_temp = '负面' 266 | else: 267 | psy_temp = '中性' 268 | 269 | psy_text = dom.createTextNode(psy_temp) 270 | psy_ele.appendChild(psy_text) 271 | 272 | 273 | # 用户兴趣爱好标签 274 | interest_ele = dom.createElement("兴趣爱好标签") 275 | interest_text = dom.createTextNode(user['interest_tags']) 276 | interest_ele.appendChild(interest_text) 277 | 278 | 279 | # 活跃度 280 | activity_ele = dom.createElement("活跃度") 281 | activity_text = dom.createTextNode(str(user['activity'])) 282 | activity_ele.appendChild(activity_text) 283 | 284 | 285 | # 活跃度变化 286 | activity_list_ele = dom.createElement("活跃度变化") 287 | activity_list_str = '' 288 | 289 | for item in user['activity_list']: 290 | activity_list_str += str(item) + ", " 291 | 292 | activity_list_text = dom.createTextNode(activity_list_str[0:-2]) 293 | activity_list_ele.appendChild(activity_list_text) 294 | 295 | 296 | # 心理状态变化(相同推文数,方法1) 297 | psy_with_count1_ele = dom.createElement("心理状态变化") 298 | psy_with_count1_ele.setAttribute("type", "相同推文数") 299 | psy_with_count1_ele.setAttribute("method", "分类器分类") 300 | psy_with_count1_str = '' 301 | 302 | for item in user['psy_with_count1']: 303 | psy_with_count1_str += str(item) + ", " 304 | 305 | psy_with_count1_text = dom.createTextNode(psy_with_count1_str[0:-2]) 306 | psy_with_count1_ele.appendChild(psy_with_count1_text) 307 | 308 | 309 | # 心理状态变化(相同推文数,方法2) 310 | psy_with_count2_ele = dom.createElement("心理状态变化") 311 | psy_with_count2_ele.setAttribute("type", "相同推文数") 312 | psy_with_count2_ele.setAttribute("method", "情感字典") 313 | psy_with_count2_str = '' 314 | 315 | for item in 
user['psy_with_count2']: 316 | psy_with_count2_str += str(item) + ", " 317 | 318 | psy_with_count2_text = dom.createTextNode(psy_with_count2_str[0:-2]) 319 | psy_with_count2_ele.appendChild(psy_with_count2_text) 320 | 321 | 322 | # 心理状态变化(相同时间间隔,方法1) 323 | psy_with_time1_ele = dom.createElement("心理状态变化") 324 | psy_with_time1_ele.setAttribute("type", "相同时间间隔") 325 | psy_with_time1_ele.setAttribute("method", "分类器分类") 326 | psy_with_time1_str = '' 327 | 328 | for item in user['psy_with_time1']: 329 | psy_with_time1_str += str(item) + ", " 330 | 331 | psy_with_time1_text = dom.createTextNode(psy_with_time1_str[0:-2]) 332 | psy_with_time1_ele.appendChild(psy_with_time1_text) 333 | 334 | 335 | # 心理状态变化(相同时间间隔,方法2) 336 | psy_with_time2_ele = dom.createElement("心理状态变化") 337 | psy_with_time2_ele.setAttribute("type", "相同时间间隔") 338 | psy_with_time2_ele.setAttribute("method", "情感字典") 339 | psy_with_time2_str = '' 340 | 341 | for item in user['psy_with_time2']: 342 | psy_with_time2_str += str(item) + ", " 343 | 344 | psy_with_time2_text = dom.createTextNode(psy_with_time2_str[0:-2]) 345 | psy_with_time2_ele.appendChild(psy_with_time2_text) 346 | 347 | 348 | # 将隐性属性标签加入到隐性标签中 349 | implicit_info.appendChild(category_ele) 350 | implicit_info.appendChild(category_score_ele) 351 | implicit_info.appendChild(influence_ele) 352 | implicit_info.appendChild(influence_rank_ele) 353 | implicit_info.appendChild(psy_ele) 354 | implicit_info.appendChild(interest_ele) 355 | implicit_info.appendChild(activity_ele) 356 | implicit_info.appendChild(activity_list_ele) 357 | implicit_info.appendChild(psy_with_count1_ele) 358 | implicit_info.appendChild(psy_with_count2_ele) 359 | implicit_info.appendChild(psy_with_time1_ele) 360 | implicit_info.appendChild(psy_with_time2_ele) 361 | 362 | 363 | # 将用户信息写入文件 364 | with open(XML_PATH + '%s.xml' % user['screen_name'], 'w') as f: 365 | dom.writexml(f, addindent=" ", newl='\n', encoding="utf-8") 366 | 367 | 368 | return XML_PATH + '%s.xml' % user['screen_name'] -------------------------------------------------------------------------------- /crawler/relation_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import time 3 | import threading 4 | 5 | from twitter import error 6 | from api import Api, API_COUNT 7 | from decorator import generate_decorator 8 | 9 | handle_exception = generate_decorator(720) 10 | 11 | class RelationCrawler: 12 | get_api = Api().get_api 13 | 14 | 15 | ''' 16 | Returns information about the relationship between the two users. 17 | 18 | Parameters: 19 | source_id – The user_id of the subject user [Optional] 20 | source_screen_name – The screen_name of the subject user [Optional] 21 | target_id – The user_id of the target user [Optional] 22 | target_screen_name – The screen_name of the target user [Optional] 23 | Returns: 24 | A Twitter Json structure. 
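        Example (an illustrative sketch only; 'alice' and 'bob' are placeholder
        screen names, and working credentials in APP_INFO, the list imported by
        crawler/api.py, are assumed):

            rc = RelationCrawler()
            relation = rc.show_friendship(source_screen_name = 'alice',
                                          target_screen_name = 'bob')
            # 'relation' mirrors Twitter's friendships/show payload, typically
            # {'relationship': {'source': {...}, 'target': {...}}}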
25 | ''' 26 | def show_friendship(self, 27 | source_user_id = None, 28 | source_screen_name = None, 29 | target_user_id = None, 30 | target_screen_name = None): 31 | 32 | if not source_user_id and not source_screen_name: 33 | return None 34 | 35 | if not target_user_id and not target_screen_name: 36 | return None 37 | 38 | return self.get_api().ShowFriendship(source_user_id, 39 | source_screen_name, 40 | target_user_id, 41 | target_screen_name) 42 | 43 | 44 | ''' 45 | 获取用户关系信息,如果超时则会休眠800s,然后返回关系信息(参考 show_friendship ) 46 | ''' 47 | def show_friendship_sleep(self, 48 | source_user_id = None, 49 | source_screen_name = None, 50 | target_user_id = None, 51 | target_screen_name = None): 52 | 53 | wrapper_func = handle_exception(self.show_friendship) 54 | relation = wrapper_func(source_user_id, source_screen_name, target_user_id, target_screen_name) 55 | 56 | return relation 57 | 58 | 59 | ''' 60 | Fetch a sequence of user ids, one for each friend. Returns a list of all the given user’s friends’ IDs. 61 | 62 | Parameters: 63 | user_id – The id of the user to retrieve the id list for. [Optional] 64 | screen_name – The screen_name of the user to retrieve the id list for. [Optional] 65 | cursor – Specifies the Twitter API Cursor location to start at. Note: there are pagination limits. [Optional] 66 | total_count – The total amount of UIDs to retrieve. Good if the account has many followers and you don’t want to get rate limited. 67 | 68 | Returns: 69 | A list of integers, one for each user id. 70 | ''' 71 | def get_friendids(self, 72 | user_id = None, 73 | screen_name = None, 74 | cursor = None, 75 | total_count = 60000): 76 | 77 | if user_id == None and screen_name == None: 78 | return None 79 | 80 | return self.get_api().GetFriendIDs(user_id = user_id, 81 | screen_name = screen_name, 82 | cursor = cursor, 83 | total_count = total_count) 84 | 85 | 86 | ''' 87 | Make a cursor driven call to return the list of all friends 88 | The caller is responsible for handling the cursor value and looping to gather all of the data 89 | 90 | Parameters: 91 | user_id – The twitter id of the user whose friends you are fetching. [Optional] 92 | screen_name – The twitter name of the user whose friends you are fetching. If not specified, defaults to the authenticated user. [Optional] 93 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 94 | stringify_ids – if True then twitter will return the ids as strings instead of integers. [Optional] 95 | count – The number of user id’s to retrieve per API request. Please be aware that this might get you rate-limited if set to a small number. 96 | By default Twitter will retrieve 5000 UIDs per call. 
[Optional] 97 | 98 | Returns: 99 | next_cursor, previous_cursor, data sequence of user ids, one for each friend 100 | ''' 101 | def get_friendids_paged(self, 102 | user_id = None, 103 | screen_name = None, 104 | cursor = -1, 105 | count = 5000, 106 | stringify_ids = False): 107 | 108 | if user_id == None and screen_name == None: 109 | return None 110 | 111 | return self.get_api().GetFriendIDsPaged(user_id = user_id, 112 | screen_name = screen_name, 113 | cursor = cursor, 114 | count = count, 115 | stringify_ids = stringify_ids) 116 | 117 | 118 | ''' 119 | 分页获取用户朋友id,如果超时则会休眠800s,然后返回朋友信息(参考 get_friendids_paged ) 120 | ''' 121 | def get_friendids_paged_sleep(self, 122 | user_id = None, 123 | screen_name = None, 124 | cursor = -1, 125 | stringify_ids = False, 126 | count = 5000): 127 | 128 | wrapper_func = handle_exception(self.get_friendids_paged) 129 | friendids = wrapper_func(user_id = user_id, 130 | screen_name = screen_name, 131 | cursor = cursor, 132 | stringify_ids = stringify_ids, 133 | count = count) 134 | 135 | return friendids 136 | 137 | 138 | ''' 139 | Fetch the sequence of twitter.User instances, one for each friend. 140 | If both user_id and screen_name are specified, this call will return the followers of the user specified by screen_name, 141 | however this behavior is undocumented by Twitter and may change without warning. 142 | 143 | Parameters: 144 | user_id – The twitter id of the user whose friends you are fetching. [Optional] 145 | screen_name – The twitter name of the user whose friends you are fetching. [Optional] 146 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 147 | total_count – The upper bound of number of users to return. 148 | skip_status – If True the statuses will not be returned in the user items. [Optional] 149 | include_user_entities – When True, the user entities will be included.
[Optional] 150 | 151 | Returns: 152 | A sequence of twitter.User instances, one for each friend 153 | ''' 154 | def get_friends(self, 155 | user_id = None, 156 | screen_name = None, 157 | cursor = None, 158 | total_count = 2500, 159 | skip_status = True, 160 | include_user_entities = True): 161 | 162 | if user_id == None and screen_name == None: 163 | return None 164 | 165 | return self.get_api().GetFriends(user_id = user_id, 166 | screen_name = screen_name, 167 | cursor = cursor, 168 | total_count = total_count, 169 | skip_status = skip_status, 170 | include_user_entities = include_user_entities) 171 | 172 | ''' 173 | 分页获取用户朋友信息(参考 get_friends ) 174 | ''' 175 | def get_friends_paged(self, 176 | user_id = None, 177 | screen_name = None, 178 | cursor = -1, 179 | count = 200, 180 | skip_status = True, 181 | include_user_entities = True): 182 | 183 | if user_id == None and screen_name == None: 184 | return None 185 | 186 | return self.get_api().GetFriendsPaged(user_id = user_id, 187 | screen_name = screen_name, 188 | cursor = cursor, 189 | count = count, 190 | skip_status = skip_status, 191 | include_user_entities = include_user_entities) 192 | 193 | ''' 194 | 获取用户所有朋友的id,并保存 195 | ''' 196 | # def get_all_friendids(self, 197 | # user_id = None, 198 | # screen_name = None): 199 | 200 | # cursor = -1 201 | # while cursor != 0: 202 | # out = self.get_friendids_paged_sleep(user_id = user_id, 203 | # screen_name = screen_name, 204 | # cursor = cursor, 205 | # count = 5000) 206 | # if not out: 207 | # return None 208 | 209 | # cursor = out[0] 210 | # friend_list = out[2] 211 | 212 | 213 | ''' 214 | Returns a list of twitter user id’s for every person that is following the specified user. 215 | 216 | Parameters: 217 | user_id – The id of the user to retrieve the id list for. [Optional] 218 | screen_name – The screen_name of the user to retrieve the id list for. [Optional] 219 | cursor – Specifies the Twitter API Cursor location to start at. Note: there are pagination limits. [Optional] 220 | total_count – The total amount of UIDs to retrieve. Good if the account has many followers and you don’t want to get rate limited. 221 | 222 | Returns: 223 | A list of integers, one for each user id. 224 | ''' 225 | def get_followerids(self, 226 | user_id = None, 227 | screen_name = None, 228 | cursor = None, 229 | total_count = 60000): 230 | 231 | if user_id == None and screen_name == None: 232 | return None 233 | 234 | return self.get_api().GetFollowerIDs(user_id = user_id, 235 | screen_name = screen_name, 236 | cursor = cursor, 237 | total_count = total_count) 238 | 239 | 240 | ''' 241 | Make a cursor driven call to return a list of one page followers. 242 | The caller is responsible for handling the cursor value and looping to gather all of the data 243 | 244 | Parameters: 245 | user_id – The twitter id of the user whose followers you are fetching. [Optional] 246 | screen_name – The twitter name of the user whose followers you are fetching. [Optional] 247 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 248 | stringify_ids – if True then twitter will return the ids as strings instead of integers. [Optional] 249 | count – The number of user id’s to retrieve per API request. Please be aware that this might get you rate-limited if set to a small number. 250 | By default Twitter will retrieve 5000 UIDs per call. 
[Optional] 251 | 252 | Returns: 253 | next_cursor, previous_cursor, data sequence of user ids, one for each follower 254 | ''' 255 | def get_followerids_paged(self, 256 | user_id = None, 257 | screen_name = None, 258 | cursor = -1, 259 | stringify_ids = False, 260 | count = 5000): 261 | 262 | if user_id == None and screen_name == None: 263 | return None 264 | 265 | return self.get_api().GetFollowerIDsPaged(user_id = user_id, 266 | screen_name = screen_name, 267 | cursor = cursor, 268 | count = count, 269 | stringify_ids = stringify_ids) 270 | 271 | 272 | ''' 273 | 分页获取用户粉丝id,如果超时则会休眠800s,然后返回粉丝信息(参考 get_followerids_page ) 274 | ''' 275 | def get_followerids_paged_sleep(self, 276 | user_id = None, 277 | screen_name = None, 278 | cursor = -1, 279 | stringify_ids = False, 280 | count = 5000): 281 | 282 | wrapper_func = handle_exception(self.get_followerids_paged) 283 | followerids = wrapper_func(user_id = user_id, 284 | screen_name = screen_name, 285 | cursor = cursor, 286 | stringify_ids = stringify_ids, 287 | count = count) 288 | 289 | return followerids 290 | 291 | 292 | ''' 293 | Fetch the sequence of twitter.User instances, one for each follower. 294 | If both user_id and screen_name are specified, this call will return the followers of the user specified by screen_name, 295 | however this behavior is undocumented by Twitter and may change without warning. 296 | 297 | Parameters: 298 | user_id – The twitter id of the user whose followers you are fetching. [Optional] 299 | screen_name – The twitter name of the user whose followers you are fetching. [Optional] 300 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 301 | total_count – The upper bound of number of users to return, defaults to None. 302 | skip_status – If True the statuses will not be returned in the user items. [Optional] 303 | include_user_entities – When True, the user entities will be included. 
[Optional] 304 | 305 | Returns: 306 | A sequence of twitter.User instances, one for each follower 307 | ''' 308 | def get_followers(self, 309 | user_id = None, 310 | screen_name = None, 311 | cursor = None, 312 | total_count = 2500, 313 | skip_status = True, 314 | include_user_entities = True): 315 | 316 | if user_id == None and screen_name == None: 317 | return None 318 | 319 | return self.get_api().GetFollowers(user_id = user_id, 320 | screen_name = screen_name, 321 | cursor = cursor, 322 | total_count = total_count, 323 | skip_status = skip_status, 324 | include_user_entities = include_user_entities) 325 | 326 | 327 | ''' 328 | 分页获取用户粉丝信息(参考 get_followers ) 329 | ''' 330 | def get_followers_paged(self, 331 | user_id = None, 332 | screen_name = None, 333 | cursor = -1, 334 | count = 200, 335 | skip_status = True, 336 | include_user_entities = True): 337 | 338 | if user_id == None and screen_name == None: 339 | return None 340 | 341 | return self.get_api().GetFollowersPaged(user_id = user_id, 342 | screen_name = screen_name, 343 | cursor = cursor, 344 | count = count, 345 | skip_status = skip_status, 346 | include_user_entities = include_user_entities) 347 | 348 | 349 | ''' 350 | 获取用户所有粉丝的id,并保存 351 | ''' 352 | # def get_all_followersids(self, 353 | # user_id = None, 354 | # screen_name = None): 355 | 356 | # cursor = -1 357 | # while cursor != 0: 358 | # out = self.get_followerids_paged_sleep(user_id = user_id, 359 | # screen_name = screen_name, 360 | # cursor = cursor, 361 | # count = 5000) 362 | # if not out: 363 | # return None 364 | 365 | # cursor = out[0] 366 | # follower_list = out[2] 367 | 368 | 369 | if __name__ == '__main__': 370 | rc = RelationCrawler() 371 | print rc.get_followers_paged(screen_name='mrmarcohan') -------------------------------------------------------------------------------- /crawler/tweets_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import time 3 | import threading 4 | 5 | from config import THREAD_NUM 6 | from twitter import error 7 | from api import Api, API_COUNT 8 | from database import MongoDB 9 | from decorator import generate_decorator 10 | 11 | handle_exception = generate_decorator(300) 12 | 13 | class TweetsCrawler: 14 | get_api = Api().get_api 15 | 16 | 17 | ''' 18 | Fetch the sequence of public Status messages for a single user. 19 | 20 | Parameters: 21 | user_id (int, optional) – Specifies the ID of the user for whom to return the user_timeline. 22 | Helpful for disambiguating when a valid user ID is also a valid screen name. 23 | screen_name (str, optional) – Specifies the screen name of the user for whom to return the user_timeline. 24 | Helpful for disambiguating when a valid screen name is also a user ID. 25 | since_id (int, optional) – Returns results with an ID greater than (that is, more recent than) the specified ID. 26 | There are limits to the number of Tweets which can be accessed through the API. If the limit of Tweets has 27 | occurred since the since_id, the since_id will be forced to the oldest ID available. 28 | max_id (int, optional) – Returns only statuses with an ID less than (that is, older than) or equal to the specified ID. 29 | count (int, optional) – Specifies the number of statuses to retrieve. May not be greater than 200. 30 | include_rts (bool, optional) – If True, the timeline will contain native retweets (if they exist) in addition to the standard stream of tweets. 
31 | trim_user (bool, optional) – If True, statuses will only contain the numerical user ID only. Otherwise a full user object will be returned for each status. 32 | exclude_replies (bool, optional) – If True, this will prevent replies from appearing in the returned timeline. Using exclude_replies with the 33 | count parameter will mean you will receive up-to count tweets - this is because the count parameter retrieves that many tweets 34 | before filtering out retweets and replies. This parameter is only supported for JSON and XML responses. 35 | 36 | Returns: 37 | A sequence of Status instances, one for each message up to count 38 | ''' 39 | def get_user_timeline(self, 40 | user_id = None, 41 | screen_name = None, 42 | since_id = None, 43 | max_id = None, 44 | count = None, 45 | include_rts = True, 46 | trim_user = True, 47 | exclude_replies = False): 48 | 49 | if user_id == None and screen_name == None: 50 | return None 51 | 52 | return self.get_api().GetUserTimeline(user_id = user_id, 53 | screen_name = screen_name, 54 | since_id = since_id, 55 | max_id = max_id, 56 | count = count, 57 | include_rts = include_rts, 58 | trim_user = trim_user, 59 | exclude_replies = exclude_replies) 60 | 61 | 62 | ''' 63 | 获取用户所有推文信息,并保存在数据库(MongoDB)中(参考 get_user_timeline ) 64 | 65 | 参数: 66 | collect_name:数据库集合名,默认 tweets_task 67 | ''' 68 | def get_user_all_timeline(self, 69 | user_id = None, 70 | collect_name = "tweets_task", 71 | screen_name = None, 72 | include_rts = True, 73 | exclude_replies = False): 74 | 75 | if user_id == None and screen_name == None: 76 | return None 77 | 78 | if user_id: 79 | try: 80 | user_id = long(user_id) 81 | except Exception as e: 82 | print e 83 | return None 84 | 85 | flag = True 86 | tweets = [0] 87 | sleep_count = 0 88 | 89 | db = MongoDB().connect() 90 | collect = db[collect_name] 91 | get_api = self.get_api 92 | 93 | while len(tweets) > 0: 94 | try: 95 | if flag: 96 | tweets = get_api().GetUserTimeline(user_id = user_id, 97 | screen_name = screen_name, 98 | include_rts = include_rts, 99 | exclude_replies = exclude_replies, 100 | trim_user = True, 101 | count = 200) 102 | flag = False 103 | 104 | else: 105 | tweets = get_api().GetUserTimeline(user_id = user_id, 106 | screen_name = screen_name, 107 | include_rts = include_rts, 108 | exclude_replies = exclude_replies, 109 | trim_user = True, 110 | count = 200, 111 | max_id = tweets[-1].id - 1) 112 | 113 | except error.TwitterError as te: 114 | try: 115 | if te.message == 'Not authorized.': 116 | print 'Not authorized.' 117 | return 118 | 119 | if te.message[0]['code'] == 88: 120 | sleep_count += 1 121 | 122 | if sleep_count >= API_COUNT: 123 | print "sleeping..." 
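# At this point every registered app has answered with Twitter error code 88
# (rate limit exceeded), so the counter is reset and the crawler backs off for
# 300 seconds before retrying the same timeline request.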
124 | sleep_count = 0 125 | time.sleep(300) 126 | continue 127 | 128 | else: 129 | print te 130 | break 131 | except Exception as ee: 132 | print ee 133 | break 134 | except Exception as e: 135 | break 136 | 137 | for tt in tweets: 138 | tweet = self.tweetobj_to_dict(tt) 139 | 140 | if not tweet: 141 | continue 142 | 143 | try: 144 | collect.insert_one(tweet) 145 | except Exception as e: 146 | continue 147 | 148 | 149 | ''' 150 | 获取用户所有推文信息,并返回(参考 get_user_timeline ) 151 | ''' 152 | def get_user_all_timeline_return(self, 153 | user_id = None, 154 | screen_name = None, 155 | include_rts = True, 156 | exclude_replies = False): 157 | 158 | if user_id == None and screen_name == None: 159 | return None 160 | 161 | if user_id: 162 | try: 163 | user_id = long(user_id) 164 | except Exception as e: 165 | print e 166 | return None 167 | 168 | flag = True 169 | tweets = [0] 170 | sleep_count = 0 171 | 172 | tweet_list = [] 173 | 174 | get_api = self.get_api 175 | 176 | while len(tweets) > 0: 177 | try: 178 | if flag: 179 | tweets = get_api().GetUserTimeline(user_id = user_id, 180 | screen_name = screen_name, 181 | include_rts = include_rts, 182 | exclude_replies = exclude_replies, 183 | trim_user = True, 184 | count = 200) 185 | flag = False 186 | 187 | else: 188 | tweets = get_api().GetUserTimeline(user_id = user_id, 189 | screen_name = screen_name, 190 | include_rts = include_rts, 191 | exclude_replies = exclude_replies, 192 | trim_user = True, 193 | count = 200, 194 | max_id = tweets[-1].id - 1) 195 | 196 | except error.TwitterError as te: 197 | try: 198 | if te.message == 'Not authorized.': 199 | print 'Not authorized.' 200 | return None 201 | 202 | if te.message[0]['code'] == 88: 203 | sleep_count += 1 204 | 205 | if sleep_count >= API_COUNT: 206 | print "sleeping..." 
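# Same back-off as in get_user_all_timeline: all apps are rate limited (error
# code 88), so reset the counter and sleep 300 seconds before requesting the
# next page of tweets.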
207 | sleep_count = 0 208 | time.sleep(300) 209 | continue 210 | 211 | else: 212 | print te 213 | break 214 | except Exception as ee: 215 | print ee 216 | break 217 | except Exception as e: 218 | print e 219 | break 220 | 221 | for tt in tweets: 222 | tweet = self.tweetobj_to_dict(tt) 223 | 224 | if not tweet: 225 | continue 226 | 227 | try: 228 | tweet_list.append(tweet) 229 | except Exception as e: 230 | continue 231 | 232 | return tweet_list 233 | 234 | 235 | ''' 236 | 获取所有用户推文信息 237 | 238 | 参数: 239 | user_list (list, optional): 240 | 存放用户 user_id / screen_name 的列表 241 | collect_name (str, optional): 242 | 存储数据集合名,默认 tweets_task 243 | search_type (str, optional): 244 | 抓取方式,如果为 screen_name ,则认为 user_list 中 存放的是用户 screen_name, 245 | 否则认为 user_list 中 存放的是用户 user_id 246 | 247 | ''' 248 | def get_all_users_timeline(self, 249 | user_list = [], 250 | collect_name = "tweets_task", 251 | search_type = "user_id", 252 | include_rts = True, 253 | exclude_replies = False): 254 | 255 | if len(user_list) == 0: 256 | return 257 | 258 | i = 0 259 | thread_pool = [] 260 | length = len(user_list) 261 | per_thread = length / THREAD_NUM 262 | 263 | while i < THREAD_NUM: 264 | if i + 1 == THREAD_NUM: 265 | crawler_thread = threading.Thread(target = self.get_all_users_timeline_thread, 266 | args = (user_list[i * per_thread : ], collect_name, search_type, include_rts, exclude_replies,)) 267 | else: 268 | crawler_thread = threading.Thread(target = self.get_all_users_timeline_thread, 269 | args = (user_list[i * per_thread : (i + 1) * per_thread], collect_name, search_type, include_rts, exclude_replies,)) 270 | 271 | crawler_thread.start() 272 | thread_pool.append(crawler_thread) 273 | 274 | i += 1 275 | 276 | for t in thread_pool: 277 | t.join() 278 | 279 | 280 | ''' 281 | 线程:获取多个用户推文信息(参考 get_all_users_timeline ) 282 | ''' 283 | def get_all_users_timeline_thread(self, 284 | user_list = [], 285 | collect_name = "tweets_task", 286 | search_type = "user_id", 287 | include_rts = True, 288 | exclude_replies = False): 289 | 290 | if search_type != "screen_name": 291 | while len(user_list) > 0: 292 | user_id = user_list.pop(0) 293 | 294 | self.get_user_all_timeline(user_id = user_id, 295 | collect_name = collect_name, 296 | include_rts = include_rts, 297 | exclude_replies = exclude_replies) 298 | else: 299 | while len(user_list) > 0: 300 | screen_name = user_list.pop(0) 301 | 302 | self.get_user_all_timeline(screen_name = screen_name, 303 | collect_name = collect_name, 304 | include_rts = include_rts, 305 | exclude_replies = exclude_replies) 306 | 307 | 308 | ''' 309 | Returns a single status message, specified by the status_id parameter. 310 | 311 | Parameters: 312 | status_id – The numeric ID of the status you are trying to retrieve. 313 | trim_user – When set to True, each tweet returned in a timeline will include a user object including only the status authors numerical ID. 314 | Omit this parameter to receive the complete user object. [Optional] 315 | include_entities – If False, the entities node will be disincluded. This node offers a variety of metadata about the tweet in a 316 | discreet structure, including: user_mentions, urls, and hashtags. 
[Optional] 317 | 318 | Returns: 319 | A twitter.Status instance representing that status message 320 | ''' 321 | def get_status(self, 322 | status_id = None, 323 | trim_user = True, 324 | include_entities = True): 325 | 326 | if status_id == None: 327 | return None 328 | 329 | return self.get_api().GetStatus(status_id = status_id, 330 | trim_user = trim_user, 331 | include_my_retweet = False, 332 | include_entities = include_entities) 333 | 334 | 335 | ''' 336 | 根据推文ID获取所有推文信息(参考 get_status ) 337 | 338 | 参数: 339 | status_list (list, optional): 340 | 存放tweet id 的列表 341 | collect_name (str, optional): 342 | 存储数据集合名,默认 status 343 | ''' 344 | def get_all_status(self, 345 | status_list = [], 346 | collect_name = 'status', 347 | trim_user = True, 348 | include_entities = True): 349 | 350 | if len(status_list) == 0: 351 | return 352 | 353 | i = 0 354 | thread_pool = [] 355 | length = len(status_list) 356 | per_thread = length / THREAD_NUM 357 | 358 | while i < THREAD_NUM: 359 | if i + 1 == THREAD_NUM: 360 | crawler_thread = threading.Thread(target = self.get_all_status_thread, 361 | args = (status_list[i * per_thread : ], collect_name, trim_user, include_entities,)) 362 | else: 363 | crawler_thread = threading.Thread(target = self.get_all_status_thread, 364 | args = (status_list[i * per_thread : (i + 1) * per_thread], collect_name, trim_user, include_entities,)) 365 | 366 | crawler_thread.start() 367 | thread_pool.append(crawler_thread) 368 | 369 | i += 1 370 | 371 | for t in thread_pool: 372 | t.join() 373 | 374 | 375 | ''' 376 | 线程:根据推文ID获取所有推文信息(参考 get_all_status ) 377 | ''' 378 | def get_all_status_thread(self, 379 | status_list = [], 380 | collect_name = 'status', 381 | trim_user = True, 382 | include_entities = True): 383 | 384 | wrapper_func = handle_exception(self.get_status) 385 | 386 | db = MongoDB().connect() 387 | collect = db[collect_name] 388 | 389 | while len(status_list) > 0: 390 | status_id = status_list.pop(0) 391 | status_obj = wrapper_func(status_id) 392 | 393 | status = self.tweetobj_to_dict(status_obj) 394 | 395 | if not status: 396 | continue 397 | 398 | try: 399 | collect.insert_one(status) 400 | except Exception as e: 401 | continue 402 | 403 | 404 | ''' 405 | 将推文对象转换为字典类型 406 | ''' 407 | def tweetobj_to_dict(self, tt): 408 | if tt == None: 409 | return None 410 | 411 | try: 412 | tweet = { 413 | 'coordinates': tt.coordinates, # Coordinates 414 | 'created_at': tt.created_at, # String 415 | 'favorite_count': tt.favorite_count, # int 416 | 'filter_level': tt.filter_level if hasattr(tt, 'filter_level') else '', # String 417 | 'hashtags': map(lambda x: x.text, tt.hashtags), # {'0': ,'1':} 418 | '_id': tt.id_str, # String 419 | 'in_reply_to_status_id': tt.in_reply_to_status_id, 420 | 'in_reply_to_user_id': tt.in_reply_to_user_id, 421 | 'lang': tt.lang, # String 422 | 'place': tt.place, # Place 423 | 'possibly_sensitive': tt.possibly_sensitive, # Boolean 424 | 'retweet_count': tt.retweet_count, # int 425 | 'source': tt.source, # String 426 | 'text': tt.text, # String 427 | 'user_id': tt.user.id, # int 428 | 'user_mentions': map(lambda x: x.id, tt.user_mentions), # [] 429 | 'withheld_copyright': tt.withheld_copyright, # Boolean 430 | 'withheld_in_countries': tt.withheld_in_countries, # Array of String 431 | 'withheld_scope': tt.withheld_scope, #String 432 | } 433 | 434 | except Exception as e: 435 | print e 436 | return None 437 | 438 | return tweet 439 | 440 | 441 | if __name__ == '__main__': 442 | ts = TweetsCrawler() 443 | print ts.get_user_all_timeline(screen_name = 
'mrmarcohan') --------------------------------------------------------------------------------