├── crawler
    ├── __init__.py
    ├── api.py
    ├── decorator.py
    ├── database.py
    ├── basicinfo_crawler.py
    ├── web_crawler.py
    ├── relation_crawler.py
    └── tweets_crawler.py
├── portrayal
    ├── __init__.py
    ├── tools
    │   ├── __init__.py
    │   ├── preprocess.py
    │   ├── function.py
    │   └── generate_xml.py
    ├── influence
    │   ├── __init__.py
    │   └── calculate_influence.py
    ├── career_classify
    │   ├── __init__.py
    │   ├── classify.py
    │   ├── training.py
    │   └── preprocess.py
    ├── interest_extract
    │   ├── __init__.py
    │   ├── tag_cloud.py
    │   └── interest_extract.py
    ├── sentiment_classify
    │   ├── __init__.py
    │   ├── classify.py
    │   ├── process_dict.py
    │   ├── sentiment_classify.py
    │   ├── sentiment_dict.py
    │   └── training.py
    ├── user_profile.py
    └── resource
    │   └── stop_words.txt
├── README.md
├── testing.py
├── .gitattributes
├── .gitignore
├── neo4j.py
├── crawling.py
└── typical.py

/crawler/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/influence/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/career_classify/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/portrayal/interest_extract/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## user-portrait
2 | 
3 | ### 用户画像 (User Portrait)
--------------------------------------------------------------------------------
/portrayal/sentiment_classify/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/testing.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from crawling import get_user_all_info
3 | from portrayal.user_profile import user_profile
4 | 
5 | 
6 | def main():
7 |     # pass
8 |     user = get_user_all_info(screen_name = 'David_Cameron')
9 |     user = user_profile(user)
10 |     del user['tweets']
11 |     print user
12 | 
13 | 
14 | 
15 | if __name__ == '__main__':
16 |     main()
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | 
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
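Note on configuration: the crawler package imports APP_INFO, MYSQL, MONGO_DB, NEO4J and THREAD_NUM from a config module, and the portrayal package imports PROJECT_PATH from its own config, but config.py is listed in .gitignore and is therefore absent from this listing. The sketch below is only an illustrative template reconstructed from those imports: the key names come from the code in this dump, while every value (credentials, hosts, ports, paths) is a placeholder, not the author's actual settings.

# config.py -- illustrative template only (the real file is gitignored)
# crawler/config.py: Twitter app credentials and database settings
APP_INFO = [
    {
        'consumer_key': 'YOUR_CONSUMER_KEY',          # placeholder
        'consumer_secret': 'YOUR_CONSUMER_SECRET',    # placeholder
        'access_token_key': 'YOUR_ACCESS_TOKEN',      # placeholder
        'access_token_secret': 'YOUR_ACCESS_SECRET'   # placeholder
    },
    # append more app credentials here; the Api class rotates through them
]

THREAD_NUM = 10  # assumed thread count used by the crawler classes

MYSQL = {
    'DB_HOST': 'localhost', 'DB_USER': 'root',
    'DB_PASSWORD': '******', 'DB_DATABASE': 'twitter'
}

MONGO_DB = {
    'DB_HOST': '127.0.0.1', 'DB_PORT': 27017,
    'DB_USER': 'user', 'DB_PASSWORD': '******',
    'DB_DATABASE': 'twitter'
}

NEO4J = {
    'DB_HOST': 'http://localhost:7474',
    'DB_USER': 'neo4j', 'DB_PASSWORD': '******'
}

# portrayal/config.py: absolute project root with a trailing slash, since the
# portrayal modules build paths as PROJECT_PATH + "portrayal/resource/..."
PROJECT_PATH = '/path/to/user-portrait/'

With such a config in place, testing.py above is the intended entry point: it crawls one account via get_user_all_info and then builds its profile with user_profile.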
-------------------------------------------------------------------------------- /crawler/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from twitter import Api as TwitterAPI 3 | from config import APP_INFO 4 | 5 | API_LIST = [] 6 | API_COUNT = len(APP_INFO) 7 | 8 | for i in range(API_COUNT): 9 | API_LIST.append(TwitterAPI(consumer_key = APP_INFO[i]['consumer_key'], 10 | consumer_secret = APP_INFO[i]['consumer_secret'], 11 | access_token_key = APP_INFO[i]['access_token_key'], 12 | access_token_secret = APP_INFO[i]['access_token_secret'], 13 | cache = None)) 14 | 15 | class Api: 16 | def __init__(self): 17 | self.api_index = 0 18 | 19 | ''' 20 | 获取 twitter app,每次调用返回一个新的 app 21 | ''' 22 | def get_api(self): 23 | api_index = self.api_index 24 | api_index = (api_index + 1) % API_COUNT 25 | self.api_index = api_index 26 | 27 | return API_LIST[api_index] -------------------------------------------------------------------------------- /portrayal/interest_extract/tag_cloud.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from pytagcloud import create_tag_image, make_tags 5 | from pytagcloud.lang.counter import get_tag_counts 6 | 7 | from .. config import PROJECT_PATH 8 | 9 | file_path = PROJECT_PATH + "portrayal/resource/tag_cloud/" 10 | 11 | 12 | def generate_tag_cloud(text, user_id): 13 | word_count = [] 14 | word_list = text.split(',') 15 | length = len(word_list) * 2 16 | 17 | for word in word_list: 18 | word_count.append((word, length / 10)) 19 | length -= 1 20 | 21 | tags = make_tags(word_count, maxsize = 48) 22 | 23 | for item in tags: 24 | item['tag'] = re.sub(r'label(\w+)label',r'#\1', item['tag']) 25 | 26 | file_name = file_path + '%d.png' % user_id 27 | create_tag_image(tags, file_name, size = (999, 688), fontname = 'Lobster', background=(0, 0, 0, 0)) 28 | 29 | return file_name -------------------------------------------------------------------------------- /crawler/decorator.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import time 4 | 5 | from twitter import error 6 | from api import API_COUNT 7 | 8 | ''' 9 | 生成装饰器 10 | ''' 11 | def generate_decorator(sleep_time = 700): 12 | 13 | # 处理Twitter异常装饰器 14 | def handle_exception(func): 15 | def wrapper(*args, **kw): 16 | sleep_count = 0 17 | 18 | while True: 19 | try: 20 | return func(*args, **kw) 21 | except error.TwitterError as te: 22 | try: 23 | if te.message[0]['code'] == 88: 24 | sleep_count += 1 25 | 26 | if sleep_count >= API_COUNT: 27 | print "sleeping..." 
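# Twitter error code 88 means "rate limit exceeded"; once it has been raised
# API_COUNT times in a row (i.e. every registered app appears to be exhausted),
# the counter is reset and the wrapper sleeps for sleep_time seconds before
# retrying the wrapped call.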
28 | sleep_count = 0 29 | time.sleep(sleep_time) 30 | continue 31 | 32 | else: 33 | print te 34 | return None 35 | except Exception as ee: 36 | print ee 37 | return None 38 | 39 | except Exception as e: 40 | print e 41 | return None 42 | 43 | return wrapper 44 | 45 | return handle_exception -------------------------------------------------------------------------------- /crawler/database.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import MySQLdb 4 | 5 | from py2neo import Graph 6 | from pymongo import MongoClient 7 | from config import MYSQL, MONGO_DB, NEO4J 8 | 9 | 10 | class Mysql: 11 | def connect(self): 12 | db = MySQLdb.connect(MYSQL['DB_HOST'], MYSQL['DB_USER'], MYSQL['DB_PASSWORD'], MYSQL['DB_DATABASE']) 13 | cursor = db.cursor() 14 | self.cursor = cursor 15 | self.db = db 16 | return db 17 | 18 | def execute(self, sql): 19 | self.cursor.execute(sql) 20 | self.db.commit() 21 | 22 | def fetchall(self, sql): 23 | self.cursor.execute(sql) 24 | res = self.cursor.fetchall() 25 | 26 | return res 27 | 28 | def close(self): 29 | self.db.close() 30 | 31 | 32 | class MongoDB: 33 | def connect(self, db_name = MONGO_DB['DB_DATABASE']): 34 | client = MongoClient(MONGO_DB['DB_HOST'], MONGO_DB['DB_PORT']) 35 | db = client[db_name] 36 | db.authenticate(MONGO_DB['DB_USER'], MONGO_DB['DB_PASSWORD']) 37 | self.db = db 38 | 39 | return db 40 | 41 | 42 | class Neo4j: 43 | def connect(self): 44 | graph = Graph(NEO4J['DB_HOST'], 45 | username = NEO4J['DB_USER'], 46 | password = NEO4J['DB_PASSWORD']) 47 | 48 | return graph -------------------------------------------------------------------------------- /portrayal/user_profile.py: -------------------------------------------------------------------------------- 1 | from sentiment_classify import sentiment_classify 2 | from career_classify.classify import exe_classify 3 | from interest_extract.interest_extract import extract_tags 4 | from influence.calculate_influence import calculate_influence, calc_activity_sequence 5 | 6 | 7 | def user_profile(user): 8 | tweets = user['tweets'] 9 | 10 | if not tweets or len(tweets) == 0: 11 | return user 12 | 13 | final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 = sentiment_classify.exe_sentiment_classify(tweets) 14 | user['psy'] = final_sentiment 15 | user['psy_with_time1'] = psy_with_time1 16 | user['psy_with_time2'] = psy_with_time2 17 | user['psy_with_count1'] = psy_with_count1 18 | user['psy_with_count2'] = psy_with_count2 19 | 20 | text = '' 21 | for tweet in tweets: 22 | text += tweet['text'] 23 | 24 | category, categories_score = exe_classify(text) 25 | user['category'] = category 26 | user['category_score'] = categories_score 27 | 28 | user['interest_tags'] = extract_tags(tweets, user['description']) 29 | 30 | influence_score, activity = calculate_influence(user['followers_count'], tweets) 31 | user['influence_score'] = influence_score 32 | user['activity'] = activity 33 | 34 | user['activity_list'] = calc_activity_sequence(tweets) 35 | 36 | return user -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 
19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | /portrayal/career_classify/data 49 | /portrayal/career_classify/data_category 50 | /portrayal/career_classify/data_processed 51 | /portrayal/career_classify/data_pruned 52 | /portrayal/career_classify/pickle 53 | /portrayal/career_classify/pickle_category 54 | /portrayal/career_classify/statistics 55 | /portrayal/interest_extract/data 56 | /portrayal/interest_extract/pickle 57 | /portrayal/sentiment_classify/data 58 | /portrayal/sentiment_classify/data1 59 | /portrayal/sentiment_classify/pickle 60 | /portrayal/resource/tag_cloud 61 | /portrayal/resource/users_xml 62 | *.pyc 63 | config.py -------------------------------------------------------------------------------- /portrayal/sentiment_classify/classify.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-09 14:29:43 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-09 14:29:43 7 | ''' 8 | import os 9 | import pickle 10 | import training 11 | from statistics import mode 12 | 13 | from .. tools.preprocess import preprocess_del_stopwords 14 | from .. config import PROJECT_PATH 15 | 16 | 17 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 18 | pickle_path = module_path + "pickle/" 19 | 20 | 21 | class VotingClassifier: 22 | classifier_list = [] 23 | words_feature = None 24 | 25 | def init(self): 26 | self.load_classifier() 27 | 28 | 29 | def load_classifier(self): 30 | classifier_names = [ 31 | 'naivebayes', 32 | 'mnb_classifier', 33 | 'bnb_classifier', 34 | 'lr_classifier', 35 | 'lsv_classifier', 36 | 'sgd_classifier' 37 | ] 38 | 39 | # 加载之前保存的训练好的分类器模型 40 | for name in classifier_names: 41 | if not os.path.exists(pickle_path + name + ".pickle"): 42 | training.training() 43 | 44 | classifier_file = open(pickle_path + name + ".pickle", "rb") 45 | classifier = pickle.load(classifier_file) 46 | classifier_file.close() 47 | 48 | self.classifier_list.append(classifier) 49 | 50 | 51 | def classify(self, tts): 52 | text = '' 53 | for item in tts: 54 | text += item['text'] + ' ' 55 | 56 | if len(self.classifier_list) == 0: 57 | self.load_classifier() 58 | 59 | feature = self.word2features(text) 60 | 61 | if not feature: 62 | return None 63 | 64 | votes = [] 65 | for classifier in self.classifier_list: 66 | vote = classifier.classify(feature) 67 | votes.append(vote) 68 | 69 | try: 70 | res= mode(votes) 71 | except Exception as e: 72 | print e 73 | return 0 74 | 75 | return res 76 | 77 | def word2features(self, document): 78 | if not self.words_feature: 79 | feature_file = open(pickle_path + "words_feature.pickle") 80 | self.words_feature = pickle.load(feature_file) 81 | feature_file.close() 82 | 83 | word_list = set(preprocess_del_stopwords(document)) 84 | 85 | if not word_list: 86 | return None 87 | 88 | feature = {} 89 | for w in self.words_feature: 90 | feature[w] = w in word_list 91 | 92 | return feature 93 | 94 | 95 | voting_classifier = 
VotingClassifier() 96 | 97 | 98 | def classify(tweets): 99 | return voting_classifier.classify(tweets) -------------------------------------------------------------------------------- /portrayal/tools/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import re 3 | import sys 4 | import nltk 5 | 6 | from nltk.tokenize import word_tokenize 7 | 8 | from function import get_stop_words, get_slang_set 9 | 10 | reload(sys) 11 | sys.setdefaultencoding('utf-8') 12 | 13 | stop_words = get_stop_words() 14 | slang_set = get_slang_set() 15 | 16 | def data_cleaning(text): 17 | # clear @/#/链接/RT 18 | # 去除表达较口语化的语言时,经常使用重复的字符 19 | text = text.lower() 20 | text = re.sub(r"(\w)\1{2,}", r"\1\1", text) 21 | text = re.sub(r"(..)\1{2,}", r"\1\1", text) 22 | text = re.sub(r'(rt)?\s?@\w+:?|#|(ht|f)tp[^\s]+', " ", text) 23 | text = text.replace('wanna', 'want to').replace('gonna', 'will').replace('gotta', 'must').replace('have to', 'haveto').replace('hungrryy', 'hungry') 24 | 25 | return text.strip() 26 | 27 | 28 | def preprocess(text, return_type = "string"): 29 | text = text.lower() 30 | text = re.sub(r'rt @\w+:|@\w+|#|(ht|f)tp[^\s]+', " ", text) 31 | 32 | try: 33 | words = word_tokenize(text) 34 | except Exception as e: 35 | print e 36 | return None 37 | 38 | word_list = [w for w in words if w not in stop_words and w.isalpha()] 39 | 40 | return word_list if return_type == 'list' else ' '.join(word_list) 41 | 42 | 43 | def preprocess_del_stopwords(text): 44 | try: 45 | words = word_tokenize(text) 46 | except Exception as e: 47 | print e 48 | return None 49 | 50 | word_list = [w for w in words if w not in stop_words and w.isalpha()] 51 | 52 | return word_list 53 | 54 | 55 | def preprocess_postag(text): 56 | text = text.lower() 57 | text = re.sub(r"(\w)\1{2,}", r"\1\1", text) 58 | text = re.sub(r"(..)\1{2,}", r"\1\1", text) 59 | text = re.sub(r'(rt)?\s?@\w+:?|#|(ht|f)tp[^\s]+', " ", text) 60 | 61 | try: 62 | words = word_tokenize(text) 63 | word_tags = nltk.pos_tag(words) 64 | except Exception as e: 65 | print e 66 | return None 67 | 68 | res = [] 69 | for item in word_tags: 70 | if item[0] not in stop_words and item[0].isalpha(): 71 | res.append(item) 72 | 73 | return res 74 | 75 | 76 | def preprocess_postag_label(text): 77 | text = text.lower() 78 | text = re.sub(r"(\w)\1{2,}", r"\1\1", text) 79 | text = re.sub(r"(..)\1{2,}", r"\1\1", text) 80 | text = re.sub(r'#(\w+)', "label\g<1>label ", text) 81 | text = re.sub(r'(rt)?\s?@\w+:?|#|hahah\w*|(ht|f)tp[^\s]+', " ", text) 82 | text = text.replace('new york', "newyork") 83 | 84 | try: 85 | words = word_tokenize(text) 86 | word_tags = nltk.pos_tag(words) 87 | except Exception as e: 88 | print e 89 | return None 90 | 91 | res = [] 92 | for item in word_tags: 93 | if item[0] not in stop_words and item[0].isalpha() and item[0] not in slang_set: 94 | word = re.sub(r'label(\w+)label', r'#\1' , item[0]) 95 | res.append((word, item[1])) 96 | 97 | return res -------------------------------------------------------------------------------- /neo4j.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from py2neo import Graph, Node, Relationship 3 | from crawler.database import Neo4j, MongoDB 4 | 5 | 6 | ''' 7 | 创建neo4j节点及关系 8 | ''' 9 | def create_relation(): 10 | graph = Neo4j().connect() 11 | mongo = MongoDB().connect() 12 | 13 | # 清空neo4j数据库 14 | # graph = graph.delete_all() 15 | 16 | tus = mongo['typical'].find({}, {'_id': 1}) 17 | 18 | for item 
in tus: 19 | # 创建用户节点 20 | user = Node("Typical", user_id = item['_id']) 21 | graph.create(user) 22 | 23 | tus = mongo['relation'].find({}, {'_id': 1}) 24 | user_list = map(lambda item: item['_id'], tus) 25 | 26 | # 创建用户节点之间的关系 27 | for user_id in user_list: 28 | friends = mongo['relation'].find_one({'_id': user_id}) 29 | friends = set(friends['friends']) 30 | 31 | node1 = graph.find_one("Typical", 32 | property_key = "user_id", 33 | property_value = user_id) 34 | 35 | for user_id1 in user_list: 36 | if user_id1 == user_id: 37 | continue 38 | 39 | if user_id1 in friends: 40 | node2 = graph.find_one("Typical", 41 | property_key = "user_id", 42 | property_value = user_id1) 43 | 44 | following = Relationship(node1, 'following', node2) 45 | graph.create(following) 46 | 47 | 48 | ''' 49 | 更新neo4j节点属性 50 | ''' 51 | def update_attr(): 52 | graph = Neo4j().connect() 53 | mongo = MongoDB().connect() 54 | 55 | tus = mongo['typical'].find({}, {'name': 1, 'category': 1, 'followers_count': 1, 'location': 1, 'utc_offset': 1, 56 | 'statuses_count': 1, 'description': 1, 'friends_count': 1, 'psy': 1, 'verified': 1, 'lang': 1, 'favourites_count': 1, 57 | 'screen_name': 1, 'influence_score': 1, 'created_at': 1, 'time_zone': 1, 'protected': 1, 'activity': 1}) 58 | 59 | for item in tus: 60 | node = graph.find_one("Typical", 61 | property_key = "user_id", 62 | property_value = item['_id']) 63 | node['name'] = item['name'] 64 | node['category'] = item['category'] 65 | node['followers_count'] = item['followers_count'] 66 | node['location'] = item['location'] 67 | node['utc_offset'] = item['utc_offset'] 68 | node['statuses_count'] = item['statuses_count'] 69 | node['description'] = item['description'] 70 | node['friends_count'] = item['friends_count'] 71 | node['psy'] = item['psy'] 72 | node['verified'] = item['verified'] 73 | node['lang'] = item['lang'] 74 | node['favourites_count'] = item['favourites_count'] 75 | node['screen_name'] = item['screen_name'] 76 | node['influence_score'] = item['influence_score'] 77 | node['created_at'] = item['created_at'] 78 | node['time_zone'] = item['time_zone'] 79 | node['protected'] = item['protected'] 80 | node['activity'] = item['activity'] 81 | 82 | graph.push(node) 83 | 84 | 85 | if __name__ == '__main__': 86 | # create_relation() 87 | update_attr() -------------------------------------------------------------------------------- /portrayal/sentiment_classify/process_dict.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from .. 
config import PROJECT_PATH 3 | 4 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 5 | pickle_path = module_path + "pickle/" 6 | 7 | sentiment_dict = None 8 | 9 | def load_sentiment_dict(): 10 | if not sentiment_dict: 11 | if not os.path.exists(pickle_path + "sentiment_dict.pickle"): 12 | sentiment_dict = generate_sentiment_dict() 13 | 14 | 15 | def generate_new_dict(): 16 | new_sentiment_dict = {} 17 | 18 | if not sentiment_dict: 19 | load_sentiment_dict() 20 | 21 | for item in sentiment_dict: 22 | sl = item.split("#") 23 | if(sl[1] == 'a' or sl[1] == 'v' or sl[1] == 'r') and sl[0].isalpha() and (sl[0] not in sentiment_dict): 24 | score = sentiment_dict[item] * 30 / 5 25 | 26 | if int(score) > 5 or int(score) < -5: 27 | print sl 28 | print score 29 | else: 30 | if abs(score) >= 1: 31 | new_sentiment_dict[sl[0]] = int(score) 32 | 33 | elif abs(score) > 0.66: 34 | if score < 0: 35 | new_sentiment_dict[sl[0]] = -1 36 | if score > 0: 37 | new_sentiment_dict[sl[0]] = 1 38 | 39 | senti_file = open(module_path + "data/sentiment_temp.txt", 'w') 40 | for item in new_sentiment_dict: 41 | senti_file.write(item + "\t" + str(new_sentiment_dict[item]) + "\n") 42 | 43 | return 44 | 45 | 46 | def attr_change(word_tuple, score): 47 | word = word_tuple[0] 48 | 49 | if (word_tuple[1] == 'r' or word_tuple[1] == 'v') and (word + '#a' in sentiment_dict): 50 | score += sentiment_dict[word + '#a'] * rate 51 | print score 52 | elif (word_tuple[1] == 'n') and (word + '#v' in sentiment_dict): 53 | score += sentiment_dict[word + '#v'] * rate 54 | print score 55 | 56 | 57 | def generate_sentiment_dict(): 58 | sentiment_dict = {} 59 | file = open(module_path + 'data/sentiment_words.txt') 60 | 61 | data = [] 62 | while 1: 63 | lines = file.readlines(100000) 64 | if not lines: 65 | break 66 | 67 | for line in lines: 68 | if line.strip().startswith("#"): 69 | continue 70 | else: 71 | data = line.split("\t") 72 | if len(data) != 6: 73 | print line 74 | print 'invalid data' 75 | continue 76 | 77 | word_type = data[0] 78 | synset_score = float(data[2]) - float(data[3]) 79 | syn_terms_list = data[4].split(" ") 80 | 81 | for w in syn_terms_list: 82 | term_and_num = w.split("#") 83 | 84 | syn_term = term_and_num[0] + "#" + word_type 85 | term_num = int(term_and_num[1]) 86 | 87 | if sentiment_dict.has_key(syn_term): 88 | sentiment_dict[syn_term].append((term_num, synset_score)) 89 | 90 | else: 91 | sentiment_dict[syn_term] = [] 92 | sentiment_dict[syn_term].append((term_num, synset_score)) 93 | 94 | res = {} 95 | for key in sentiment_dict: 96 | score_sum = 0 97 | count = 0 98 | for word_tuple in sentiment_dict[key]: 99 | score_sum += word_tuple[1] * word_tuple[0] 100 | count += word_tuple[0] 101 | 102 | if score_sum / count != 0: 103 | res[key] = score_sum / count 104 | 105 | file = open(pickle_path + "sentiment_dict.pickle", 'w') 106 | pickle.dump(res, file) 107 | file.close() 108 | 109 | return res -------------------------------------------------------------------------------- /portrayal/tools/function.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import datetime 4 | 5 | from .. 
config import PROJECT_PATH 6 | 7 | slang = None 8 | stop_words = None 9 | slang_set = None 10 | 11 | ''' 12 | 读取停用词 13 | ''' 14 | def get_stop_words(file_path = PROJECT_PATH + "portrayal/resource/stop_words.txt"): 15 | global stop_words 16 | 17 | if not stop_words: 18 | stop_words = set() 19 | else: 20 | return stop_words 21 | 22 | file = open(file_path, "r") 23 | for line in file: 24 | stop_words.add(line[0 : -1]) 25 | 26 | file.close() 27 | 28 | return stop_words 29 | 30 | 31 | ''' 32 | 读取俚语 33 | ''' 34 | def get_slang(file_path = PROJECT_PATH + "portrayal/resource/slang.txt"): 35 | global slang 36 | 37 | if not slang: 38 | slang = {} 39 | else: 40 | return slang 41 | 42 | file = open(file_path, "r") 43 | for line in file: 44 | l_l = line.split(":") 45 | slang[l_l[0].strip()] = l_l[1].strip() 46 | 47 | file.close() 48 | 49 | return slang 50 | 51 | 52 | ''' 53 | 读取俚语 54 | ''' 55 | def get_slang_set(file_path = PROJECT_PATH + "portrayal/resource/slang.txt"): 56 | global slang_set 57 | 58 | if not slang_set: 59 | slang_set = set() 60 | else: 61 | return slang_set 62 | 63 | file = open(file_path, "r") 64 | for line in file: 65 | l_l = line.split(":") 66 | slang_set.add(l_l[0].strip()) 67 | 68 | file.close() 69 | 70 | return slang_set 71 | 72 | 73 | ''' 74 | 计算两个时间相差的天数 75 | ''' 76 | def calc_time_differ(t1, t2): 77 | t1 = time.strptime(t1, "%Y-%m-%d %H:%M:%S") 78 | t2 = time.strptime(t2, "%Y-%m-%d %H:%M:%S") 79 | t1 = datetime.datetime(t1[0], t1[1], t1[2], t1[3], t1[4], t1[5]) 80 | t2 = datetime.datetime(t2[0], t2[1], t2[2], t2[3], t2[4], t2[5]) 81 | 82 | return abs((t2 - t1).days) 83 | 84 | 85 | ''' 86 | 将推文分割为按月为单位的推文列表 87 | 返回: 88 | 二维推文列表 89 | ''' 90 | def split_tweets_same_time(tweets = [], period = 1): 91 | threshold = period * 30 92 | 93 | if len(tweets) == 0: 94 | return 95 | 96 | start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweets[0]['created_at'].replace('+0000 ',''))) 97 | start_time_temp = start_time 98 | 99 | tts = [] 100 | tweets_list = [] 101 | 102 | for tweet in tweets: 103 | time_temp = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'].replace('+0000 ',''))) 104 | 105 | if calc_time_differ(time_temp, start_time_temp) <= threshold: 106 | tts.append(tweet) 107 | else: 108 | start_time_temp = time_temp 109 | tweets_list.append(tts) 110 | tts = [] 111 | tts.append(tweet) 112 | 113 | if len(tts) != 0: 114 | tweets_list.append(tts) 115 | 116 | return tweets_list if len(tweets_list[-1]) > 20 else tweets_list[0 : -1] 117 | 118 | 119 | def split_tweets_same_count(tweets = [], count = 66): 120 | count = count if count <= 100 else 100 121 | count = count if count >= 40 else 40 122 | 123 | if len(tweets) < 1200: 124 | count = 40 125 | 126 | tts = [] 127 | tweets_list = [] 128 | 129 | i = 0 130 | for tweet in tweets: 131 | i += 1 132 | tts.append(tweet) 133 | 134 | if i > count: 135 | tweets_list.append(tts) 136 | tts = [] 137 | i = 0 138 | 139 | if len(tts) > 20: 140 | tweets_list.append(tts) 141 | 142 | return tweets_list -------------------------------------------------------------------------------- /portrayal/sentiment_classify/sentiment_classify.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import nltk 4 | 5 | from statistics import mode 6 | from classify import classify 7 | from nltk.tokenize import word_tokenize 8 | from sentiment_dict import calc_sentiment_score 9 | 10 | from .. config import PROJECT_PATH 11 | from .. tools.preprocess import data_cleaning 12 | from .. 
tools.function import split_tweets_same_time, split_tweets_same_count 13 | 14 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 15 | 16 | def replace_emotion(tweets): 17 | tweets_temp = [] 18 | emotion = { 19 | ":33": ". happy", 20 | "^_^": ". happy", 21 | ":-)": ". happy", 22 | ":)))": ". happy happy", 23 | ":)": ". happy", 24 | "(:": ". happy", 25 | "(-:": ". happy", 26 | "<3": ". happy", 27 | ":*": ". happy", 28 | ":-D": ". happy", 29 | ":D": ". happy", 30 | "X-D": ". happy happy", 31 | "XD": ". happy happy", 32 | "xD": ". happy happy", 33 | ";-)": ". happy", 34 | ";)": ". happy", 35 | ";-D": ". happy", 36 | ";D": ". happy", 37 | "(;": ". happy", 38 | "(-;": ". happy", 39 | ":-(": ". unhappy", 40 | ":((": ". sad", 41 | ":(": ". unhappy", 42 | "(:": ". unhappy", 43 | "(-:": ". unhappy", 44 | ":,(": ". sad", 45 | ":'(": ". sad", 46 | ":”(": ". sad" 47 | } 48 | 49 | for tweet in tweets: 50 | text = tweet['text'] 51 | 52 | for item in emotion: 53 | text = text.replace(item, emotion[item]) 54 | 55 | text = data_cleaning(text) 56 | 57 | tweets_temp.append({ 58 | 'text': text, 59 | 'created_at': tweet['created_at'] 60 | }) 61 | 62 | return tweets_temp 63 | 64 | 65 | def sentiment_with_time(tweets, time_span = 1): 66 | tweets_list = split_tweets_same_time(tweets, time_span) 67 | 68 | sequence1 = [] 69 | sequence2 = [] 70 | for tts in tweets_list: 71 | res = classify(tts) 72 | 73 | if res == 'pos': 74 | sequence1.append(1) 75 | elif res == 'neg': 76 | sequence1.append(-1) 77 | else: 78 | sequence1.append(0) 79 | 80 | score = calc_sentiment_score(tts) 81 | 82 | if not score: 83 | sequence2.append(0) 84 | else: 85 | sequence2.append(score) 86 | 87 | return sequence1, sequence2 88 | 89 | 90 | def sentiment_with_count(tweets, count = 66): 91 | tweets_list = split_tweets_same_count(tweets, count) 92 | 93 | sequence1 = [] 94 | sequence2 = [] 95 | for tts in tweets_list: 96 | res = classify(tts) 97 | 98 | if res == 'pos': 99 | sequence1.append(1) 100 | elif res == 'neg': 101 | sequence1.append(-1) 102 | else: 103 | sequence1.append(0) 104 | 105 | score = calc_sentiment_score(tts) 106 | 107 | if not score: 108 | sequence2.append(0) 109 | else: 110 | sequence2.append(score) 111 | 112 | return sequence1, sequence2 113 | 114 | 115 | def exe_sentiment_classify(tweets): 116 | if not tweets or len(tweets) == 0: 117 | return None 118 | 119 | tweets = replace_emotion(tweets) 120 | 121 | psy_with_time1, psy_with_time2 = sentiment_with_time(tweets) 122 | psy_with_count1, psy_with_count2 = sentiment_with_count(tweets) 123 | 124 | count = 0 125 | for item in psy_with_time2: 126 | if item > 0: 127 | count += 1 128 | elif item < 0: 129 | count -= 1 130 | 131 | if count > 0: 132 | final_sentiment = 1 133 | 134 | elif count < 0: 135 | final_sentiment = -1 136 | 137 | else: 138 | final_sentiment = 0 139 | 140 | return final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 -------------------------------------------------------------------------------- /portrayal/career_classify/classify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-05 16:18:19 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-05 16:18:19 7 | ''' 8 | import math 9 | import pickle 10 | 11 | from .. 
config import PROJECT_PATH 12 | 13 | module_path = PROJECT_PATH + "portrayal/career_classify/" 14 | 15 | 16 | def classify(text = '', pickle_path = module_path + "pickle/"): 17 | categories_path = pickle_path + "categories.pickle" 18 | count_vector_path = pickle_path + "count_vector.pickle" 19 | tf_idf_transformer_path = pickle_path + "tf_idf_transformer.pickle" 20 | 21 | if text == "": 22 | return None 23 | 24 | text = [text] 25 | 26 | count_vector_file = open(count_vector_path) 27 | count_vector = pickle.load(count_vector_file) 28 | count_vector_file.close() 29 | 30 | count_feature_matrix = count_vector.transform(text) 31 | 32 | tf_idf_transformer_file = open(tf_idf_transformer_path) 33 | tf_idf_transformer = pickle.load(tf_idf_transformer_file) 34 | tf_idf_transformer_file.close() 35 | 36 | tf_idf_feature_matrix = tf_idf_transformer.transform(count_feature_matrix) 37 | 38 | # 分类器 39 | classifier_path = pickle_path + 'multi_classifier.pickle' 40 | # classifier_path = pickle_path + 'bagging_classifier.pickle' 41 | 42 | # 分类 43 | multi_classifier_file = open(classifier_path) 44 | multi_classifier = pickle.load(multi_classifier_file) 45 | multi_classifier_file.close() 46 | 47 | categories_file = open(categories_path) 48 | target_names = pickle.load(categories_file) 49 | categories_file.close() 50 | 51 | category = target_names[multi_classifier.predict(tf_idf_feature_matrix.toarray())[0]] 52 | # return category 53 | 54 | score_list = multi_classifier._joint_log_likelihood(tf_idf_feature_matrix.toarray())[0] 55 | 56 | min_value = min(score_list) 57 | min_value = math.floor(min_value) 58 | 59 | categories_score = {} 60 | for i in range(len(score_list)): 61 | categories_score[target_names[i]] = round((score_list[i] - min_value) * 25, 2) 62 | 63 | return category, categories_score 64 | 65 | 66 | def classify_special_category(category_list, text = '', pickle_path = module_path + "pickle_category/"): 67 | category_list.sort() 68 | 69 | dir_name = '' 70 | for item in category_list: 71 | dir_name += '_' + item 72 | 73 | dir_name = dir_name[1 : ] 74 | 75 | return classify(text, pickle_path + dir_name + '/') 76 | 77 | 78 | def exe_classify(text = None): 79 | if not text: 80 | return None 81 | 82 | related_category_dict = { 83 | 'Politics': ["Education"], 84 | # 'Technology': ["Economy"] 85 | } 86 | 87 | category, categories_score = classify(text) 88 | 89 | if related_category_dict.has_key(category): 90 | for related_category in related_category_dict[category]: 91 | if (categories_score[category] < categories_score[related_category] * 2) and (categories_score[category] - categories_score[related_category]) < 36: 92 | category_temp, categories_score_temp = classify_special_category([category, related_category], text) 93 | 94 | if category_temp != category: 95 | # if categories_score_temp[category_temp] < categories_score_temp[category] * 1.2: 96 | # continue 97 | 98 | categories_score[category_temp] = categories_score[category] + (categories_score_temp[category_temp] - categories_score_temp[category]) * 5 99 | 100 | return category_temp, categories_score 101 | 102 | return category, categories_score -------------------------------------------------------------------------------- /portrayal/career_classify/training.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-05 14:07:32 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-05 14:07:32 7 | */ 8 | ''' 9 | import os 10 | import 
pickle 11 | 12 | from sklearn import datasets 13 | from sklearn.feature_extraction.text import CountVectorizer 14 | from sklearn.feature_extraction.text import TfidfTransformer 15 | 16 | from sklearn.naive_bayes import MultinomialNB 17 | from sklearn.ensemble import BaggingClassifier 18 | from .. config import PROJECT_PATH 19 | 20 | module_path = PROJECT_PATH + "portrayal/career_classify/" 21 | 22 | ''' 23 | BCC: business/entertainment/politics/sport/technology 24 | CNN: agriculture/economy/education/entertainment/military/politics/religion/sports/technology 25 | 26 | data: BCC新闻数据集 + 维基词条文章 + 部分CNN文本 27 | ''' 28 | def training(dataset_path = module_path + "data_processed", pickle_path = module_path + "pickle/"): 29 | print "读入训练数据..." 30 | training_dataset = datasets.load_files(dataset_path) 31 | 32 | # 类别标签 33 | categories = training_dataset.target_names 34 | print categories 35 | categories_path = pickle_path + "categories.pickle" 36 | categories_file = open(categories_path, "wb") 37 | pickle.dump(categories, categories_file) 38 | categories_file.close() 39 | 40 | # 词频统计 41 | count_vector = CountVectorizer(stop_words = "english", decode_error = "ignore") 42 | count_feature_matrix = count_vector.fit_transform(training_dataset.data) 43 | 44 | count_vector_path = pickle_path + "count_vector.pickle" 45 | count_vector_file = open(count_vector_path, "wb") 46 | pickle.dump(count_vector, count_vector_file) 47 | count_vector_file.close() 48 | 49 | # 计算词频-逆文档频率 50 | tf_idf_transformer = TfidfTransformer().fit(count_feature_matrix) 51 | 52 | # 持久化 tf_idf_transformer 53 | tf_idf_transformer_path = pickle_path + "tf_idf_transformer.pickle" 54 | tf_idf_transformer_file = open(tf_idf_transformer_path, "wb") 55 | pickle.dump(tf_idf_transformer, tf_idf_transformer_file) 56 | tf_idf_transformer_file.close() 57 | print "词频-逆文档频率已保存" 58 | 59 | tf_idf_feature_matrix = tf_idf_transformer.transform(count_feature_matrix) 60 | 61 | # bagging 62 | # bagging = BaggingClassifier(base_estimator=MultinomialNB(),max_features=0.5,n_estimators=60,n_jobs=-1) 63 | # bagging_classifier = bagging.fit(tf_idf_feature_matrix, training_dataset.target) 64 | # print "Bagging classifier has been trained" 65 | # bagging_classifier_path = pickle_path + "bagging_classifier.pickle" 66 | # bagging_classifier_file = open(bagging_classifier_path,'wb') 67 | # pickle.dump(bagging_classifier,bagging_classifier_file) 68 | # bagging_classifier_file.close() 69 | # print "bagging classifier has been saved" 70 | 71 | 72 | # 多项式贝叶斯分类器分类 73 | multi_classifier = MultinomialNB().fit(tf_idf_feature_matrix, training_dataset.target) 74 | print "多项式贝叶斯分类器训练完成" 75 | 76 | multi_classifier_path = pickle_path + "multi_classifier.pickle" 77 | multi_classifier_file = open(multi_classifier_path, "wb") 78 | pickle.dump(multi_classifier, multi_classifier_file) 79 | multi_classifier_file.close() 80 | print "多项式分类器已保存" 81 | 82 | 83 | def training_special_category(category_list, dataset_path = module_path + "data_category/", pickle_path = module_path + "pickle_category/"): 84 | category_list.sort() 85 | 86 | dir_name = '' 87 | for item in category_list: 88 | dir_name += '_' + item 89 | 90 | dir_name = dir_name[1 : ] 91 | 92 | pickle_category_path = pickle_path + dir_name + "/" 93 | 94 | if not os.path.exists(pickle_category_path): 95 | os.makedirs(pickle_category_path) 96 | 97 | training(dataset_path + dir_name, pickle_category_path) -------------------------------------------------------------------------------- /crawling.py: 
-------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import time 3 | 4 | from pymongo import MongoClient 5 | from crawler.basicinfo_crawler import BasicinfoCrawler 6 | from crawler.tweets_crawler import TweetsCrawler 7 | from crawler.relation_crawler import RelationCrawler 8 | 9 | tweets_crawler = TweetsCrawler() 10 | relation_crawler = RelationCrawler() 11 | basicinfo_crawler = BasicinfoCrawler() 12 | 13 | 14 | ''' 15 | 获取用户基础信息和推文信息,以字典形式返回 16 | ''' 17 | def get_user_all_info(user_id = None, screen_name = None): 18 | if not user_id and not screen_name: 19 | return None 20 | 21 | try: 22 | user = basicinfo_crawler.get_user(user_id = user_id, screen_name = screen_name) 23 | except Exception as e: 24 | print e 25 | return None 26 | 27 | if not user: 28 | return None 29 | 30 | tweets = [] 31 | if not user.protected: 32 | tweets = tweets_crawler.get_user_all_timeline_return(user_id = user_id, screen_name = screen_name) 33 | 34 | if not tweets: 35 | return None 36 | 37 | return { 38 | 'user_id': long(user.id), 39 | 'screen_name': user.screen_name, 40 | 'name': user.name, 41 | 'verified': user.verified, 42 | 'friends_count': user.friends_count, 43 | 'description': user.description, 44 | 'crawler_date': time.strftime('%Y-%m-%d',time.localtime(time.time())), 45 | 'followers_count': user.followers_count, 46 | 'location': user.location, 47 | 'statuses_count': user.statuses_count, 48 | 'favourites_count': user.favourites_count, 49 | 'lang': user.lang, 50 | 'utc_offset': user.utc_offset, 51 | 'protected': user.protected, 52 | 'profile_background_color': user.profile_background_color, 53 | 'default_profile_image': user.default_profile_image, 54 | 'created_at': user.created_at, 55 | 'time_zone': user.time_zone, 56 | 'profile_image_url': user.profile_image_url, 57 | 'listed_count': user.listed_count, 58 | 'geo_enabled': user.geo_enabled, 59 | 'profile_sidebar_fill_color': user.profile_sidebar_fill_color, 60 | 'profile_banner_url': user.profile_banner_url, 61 | 'tweets': tweets 62 | } 63 | 64 | 65 | # 同步典型人物的关系信息 66 | def get_friends(): 67 | client = MongoClient('127.0.0.1', 27017) 68 | db = client['twitter'] 69 | 70 | collect1 = db['typical'] 71 | collect2 = db['relation'] 72 | 73 | tus = collect1.find({}, {'_id': 1}) 74 | 75 | user_list = [] 76 | for item in tus: 77 | user_list.append(item['_id']) 78 | 79 | relation_list = [] 80 | tur = collect2.find({}, {'_id': 1}) 81 | 82 | for item in tur: 83 | if item['_id'] not in user_list: 84 | collect2.delete_one({'_id': item['_id']}) 85 | 86 | else: 87 | relation_list.append(item['_id']) 88 | 89 | for user_id in user_list: 90 | 91 | if user_id in relation_list: 92 | continue 93 | 94 | cursor = -1 95 | friends = [] 96 | 97 | while cursor != 0: 98 | out = relation_crawler.get_friendids_paged_sleep(user_id = user_id, 99 | cursor = cursor, 100 | count = 5000) 101 | if not out: 102 | break 103 | 104 | friends = friends + out[2] 105 | cursor = out[0] 106 | 107 | collect2.insert_one({ 108 | '_id': user_id, 109 | 'friends': friends 110 | }) 111 | 112 | 113 | # 获取所有用户的朋友信息 114 | def get_all_users_friends(user_list): 115 | client = MongoClient('127.0.0.1', 27017) 116 | db = client['twitter'] 117 | 118 | collect = db['1020_friends'] 119 | 120 | for user_id in user_list: 121 | 122 | cursor = -1 123 | friends = [] 124 | 125 | while cursor != 0: 126 | out = relation_crawler.get_friendids_paged_sleep(user_id = user_id, 127 | cursor = cursor, 128 | count = 5000) 129 | if not out: 130 | break 131 | 132 | friends = friends + 
out[2] 133 | cursor = out[0] 134 | 135 | collect.insert_one({ 136 | '_id': user_id, 137 | 'friends': friends 138 | }) -------------------------------------------------------------------------------- /portrayal/influence/calculate_influence.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-08-29 15:48:56 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-08-29 15:48:56 7 | ''' 8 | import re 9 | import math 10 | import time 11 | 12 | from .. tools.function import split_tweets_same_time, calc_time_differ 13 | 14 | 15 | ''' 16 | 参数计算 17 | ''' 18 | def calc_parameters(tweets): 19 | origin_count = rt_count = 0 # 原创推文和转发推文 20 | origin_retweet_count = origin_retweet_average = origin_retweet_max = 0 # 原创推文转发 总数、平均值、最大值 21 | origin_favorite_count = origin_favorite_average = origin_favorite_max = 0 # 原创推文点赞 总数、平均值、最大值 22 | 23 | if len(tweets) == 0: 24 | return 25 | 26 | tweet_start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweets[0]['created_at'].replace('+0000 ',''))) 27 | tweet_end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweets[-1]['created_at'].replace('+0000 ',''))) 28 | 29 | for tweet in tweets: 30 | # 转推 31 | if re.match(r"^RT @[\w|\d|_]+", tweet["text"]) != None: 32 | rt_count += 1 33 | 34 | # 非转推 35 | else: 36 | retweet_count = tweet["retweet_count"] 37 | favorite_count = tweet["favorite_count"] 38 | 39 | origin_count += 1 40 | origin_retweet_count += retweet_count 41 | origin_favorite_count += favorite_count 42 | 43 | if retweet_count > origin_retweet_max: 44 | origin_retweet_max = retweet_count 45 | 46 | if favorite_count > origin_favorite_max: 47 | origin_favorite_max = favorite_count 48 | 49 | origin_retweet_average = origin_retweet_count * 1.0 / origin_count if origin_count else 0 50 | origin_favorite_average = origin_favorite_count * 1.0 / origin_count if origin_count else 0 51 | 52 | return tweet_start_time, tweet_end_time, origin_count, rt_count, origin_retweet_count, \ 53 | origin_retweet_average, origin_retweet_max, origin_favorite_count, origin_favorite_average, origin_favorite_max 54 | 55 | 56 | ''' 57 | 参数计算:只返回原创推文数和转发推文数 58 | ''' 59 | def calc_parameters_4sequence(tweets): 60 | origin_count = rt_count = 0 # 原创推文和转发推文 61 | 62 | if len(tweets) == 0: 63 | return 64 | 65 | for tweet in tweets: 66 | if re.match(r"^RT @\w+", tweet["text"]) != None: 67 | rt_count += 1 68 | 69 | else: 70 | origin_count += 1 71 | 72 | return origin_count, rt_count 73 | 74 | 75 | ''' 76 | 计算活跃度 77 | ''' 78 | def calc_activity(origin_count, rt_count, time_span): 79 | time_span = time_span if time_span else 1 80 | rate = 1000.0 / time_span 81 | total = 0.65 * math.log(origin_count * rate + 1) + 0.35 * math.log(rt_count * rate + 1) 82 | 83 | return total 84 | 85 | 86 | ''' 87 | 活跃度序列计算 88 | 参数: 89 | period:时间跨度,默认为 1,表示每一个月计算一次活跃度 90 | ''' 91 | def calc_activity_sequence(tweets, period = 1): 92 | tweets_list = split_tweets_same_time(tweets, 1) 93 | 94 | res = [] 95 | for tts in tweets_list: 96 | origin_count, rt_count = calc_parameters_4sequence(tts) 97 | 98 | activity = calc_activity(origin_count, rt_count, period * 30) 99 | res.append(activity) 100 | 101 | return res 102 | 103 | 104 | ''' 105 | 计算推文影响力 106 | ''' 107 | def calc_tweet_influence(origin_retweet_count, origin_retweet_average, origin_retweet_max, \ 108 | origin_favorite_count, origin_favorite_average, origin_favorite_max): 109 | retweet_rate = 0.45 * math.log(origin_retweet_count + 1) + 0.35 * 
math.log(origin_retweet_average + 1) + 0.2 * math.log(origin_retweet_max + 1) 110 | favorite_rate = 0.45 * math.log(origin_favorite_count + 1) + 0.35 * math.log(origin_favorite_average + 1) + 0.2 * math.log(origin_favorite_max + 1) 111 | 112 | return 0.6 * retweet_rate + 0.4 * favorite_rate 113 | 114 | 115 | ''' 116 | 计算粉丝影响力 117 | ''' 118 | def calc_follower_influence(followers_count): 119 | return math.log(followers_count + 1) 120 | 121 | 122 | ''' 123 | 影响力计算 124 | ''' 125 | def calculate_influence(followers_count, tweets): 126 | tweet_start_time, tweet_end_time, origin_count, rt_count, origin_retweet_count, origin_retweet_average, \ 127 | origin_retweet_max, origin_favorite_count, origin_favorite_average, origin_favorite_max = calc_parameters(tweets) 128 | 129 | time_span = calc_time_differ(tweet_start_time, tweet_end_time) 130 | 131 | activity = calc_activity(origin_count, rt_count, time_span) 132 | tweet_influence = calc_tweet_influence(origin_retweet_count, origin_retweet_average, origin_retweet_max, \ 133 | origin_favorite_count, origin_favorite_average, origin_favorite_max) 134 | 135 | follower_influence = calc_follower_influence(followers_count) 136 | 137 | return (0.5 * tweet_influence + 0.2 * activity + 0.3 * follower_influence) * 10, activity -------------------------------------------------------------------------------- /portrayal/sentiment_classify/sentiment_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-08-30 14:16:44 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-08-30 14:16:44 7 | ''' 8 | import re 9 | import os 10 | import sys 11 | import math 12 | import nltk 13 | import pickle 14 | 15 | from classify import classify 16 | from nltk.tokenize import word_tokenize 17 | 18 | from .. config import PROJECT_PATH 19 | from .. tools.preprocess import preprocess_postag 20 | from .. tools.function import get_stop_words 21 | from .. 
tools.function import get_slang 22 | 23 | reload(sys) 24 | sys.setdefaultencoding('utf-8') 25 | 26 | slang = get_slang() 27 | stop_words = get_stop_words() 28 | 29 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 30 | pickle_path = module_path + "pickle/" 31 | 32 | 33 | class SentimentDict: 34 | sentiment_dict = None 35 | 36 | def preprocess(self, tweets): 37 | res = [] 38 | but_words = set(["but", "however"]) 39 | hope_words = set(["hope", "wish"]) 40 | deny_words = set(['not', "n't", 'no', 'never', 'none', 'hardly', 'isnt', 'doesnt']) 41 | degree_words = set(["fairly", "pretty", "quite", "very", "much", "too", "greatly", "highly", "really", "extremely", "so"]) 42 | filter_set = set(['affected', 'allow', 'allows', 'backed', 'backing', 'backs', 'best', 'better', 'big', 'certain', 'clear', 'clearly', 'good', 'greetings', 'ha', 'haa', 'hah', 'haha', 'hahaa', 'help', 'hid', 'hopefully', 'ignored', 'importance', 'important', 'kind', 'like', 'liked', 'lmao', 'matter', 'miss', 'novel', 'please', 'sorry', 'substantially', 'thk', 'thx', 'thank', 'thanks', 'thanx', 'thaanks', 'true', 'unfortunately', 'useful', 'usefully', 'usefulness', 'want', 'welcome', 'woohoo', 'yeah', 'yeahh', 'yes']) 43 | 44 | for tweet in tweets: 45 | text = tweet['text'].lower() 46 | 47 | if text == '': 48 | continue 49 | 50 | try: 51 | words = word_tokenize(text) 52 | 53 | for i in range(len(words)): 54 | if words[i] in slang: 55 | words[i] = slang[words[i]] 56 | 57 | word_tags = nltk.pos_tag(words) 58 | 59 | except Exception as e: 60 | print e 61 | continue 62 | 63 | deny = False 64 | degree = False 65 | but = False 66 | hope = False 67 | 68 | length = len(word_tags) 69 | 70 | for i in range(length): 71 | item = word_tags[i] 72 | word = item[0] 73 | 74 | if word in deny_words: 75 | deny = True 76 | degree = False 77 | continue 78 | elif word in but_words: 79 | but = True 80 | j = i - 1 81 | flag = True 82 | 83 | while j >= 0 and (flag or word_tags[j][0].isalpha()): 84 | if not word_tags[j][0].isalpha() or i - j > 2: 85 | flag = False 86 | 87 | w_t = word_tags[j][0] 88 | t_t = word_tags[j][1][0] 89 | if w_t not in stop_words: 90 | flag = False 91 | if t_t == 'J' or t_t == 'V' or t_t == 'R' or t_t == 'N': 92 | res.append("FOT_" + w_t) 93 | 94 | j -= 1 95 | continue 96 | elif word in degree_words: 97 | degree = True 98 | continue 99 | elif word in hope_words: 100 | hope = True 101 | continue 102 | 103 | if not word.isalpha() and not (item[1][0] == 'J' or item[1][0] == 'V' or item[1][0] == 'R' or item[1][0] == 'N'): 104 | deny = False 105 | degree = False 106 | hope = False 107 | 108 | if i == 0 or word_tags[i - 1] not in but_words: 109 | but = False 110 | 111 | elif word not in stop_words or word in filter_set: 112 | prefix = "" 113 | if deny: 114 | prefix += "NOT_" 115 | if hope: 116 | prefix += "HOP_" 117 | if degree and item[1][0] == 'J': 118 | prefix += "TWO_" 119 | 120 | if not word.isalpha(): 121 | temp_list = word.split(" ") 122 | for item_temp in temp_list: 123 | if item_temp.isalpha() and (item_temp not in stop_words or item_temp in filter_set): 124 | res.append(prefix + item_temp) 125 | 126 | elif item[1][0] == 'J' or item[1][0] == 'V' or item[1][0] == 'R' or item[1][0] == 'N': 127 | res.append(prefix + word) 128 | 129 | return res 130 | 131 | 132 | def calc_sentiment_score(self, tweets): 133 | if not self.sentiment_dict: 134 | self.sentiment_dict = {} 135 | 136 | senti_file = open(module_path + "data/sentiment_words1.txt").read() 137 | 138 | for line in senti_file.split("\n"): 139 | sp = 
line.split("\t") 140 | self.sentiment_dict[sp[0].strip()] = int(sp[1]) 141 | 142 | score = 0 143 | word_list = self.preprocess(tweets) 144 | 145 | if not word_list: 146 | return None 147 | 148 | for word in word_list: 149 | rate = 1 150 | 151 | if "FOT_" in word: 152 | rate *= -0.9 153 | word = word.replace("FOT_", '') 154 | 155 | if "NOT_" in word: 156 | if "TWO_" in word: 157 | rate *= -0.3 158 | word = word.replace("TWO_", '') 159 | else: 160 | rate *= -0.8 161 | 162 | if "HOP_" in word: 163 | rate *= -0.4 164 | word = word.replace("HOP_", '') 165 | 166 | word = word.replace("NOT_", '') 167 | else: 168 | if "TWO_" in word: 169 | rate *= 1.8 170 | word = word.replace("TWO_", '') 171 | 172 | if "HOP_" in word: 173 | rate *= 0.6 174 | word = word.replace("HOP_", '') 175 | 176 | if word in self.sentiment_dict: 177 | score += self.sentiment_dict[word] * rate 178 | 179 | return score 180 | 181 | 182 | sentiment_dict = SentimentDict() 183 | 184 | def calc_sentiment_score(tts): 185 | return sentiment_dict.calc_sentiment_score(tts) -------------------------------------------------------------------------------- /portrayal/career_classify/preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import pickle 4 | 5 | from nltk.tokenize import word_tokenize 6 | 7 | from .. tools.preprocess import preprocess 8 | from .. config import PROJECT_PATH 9 | 10 | 11 | module_path = PROJECT_PATH + "portrayal/career_classify" 12 | data_dir = module_path + '/data/' 13 | data_processed_dir = module_path + '/data_processed/' 14 | statistics_dir = module_path + '/statistics/' 15 | data_category_dir = module_path + '/data_category/' 16 | 17 | def process_training_data(): 18 | category_dirs = os.listdir(data_dir) 19 | 20 | for item in category_dirs: 21 | if not os.path.exists(data_processed_dir + item): 22 | os.makedirs(data_processed_dir + item) 23 | 24 | cat_files = os.listdir(data_dir + item) 25 | 26 | for cat_file in cat_files: 27 | file = open(data_dir + item + "/" + cat_file) 28 | file_processed = open(data_processed_dir + item + "/" + cat_file, 'w') 29 | 30 | for line in file: 31 | if line.strip() == '': 32 | continue 33 | 34 | line = preprocess(line.strip()) 35 | 36 | try: 37 | file_processed.write(line + '\n') 38 | except: 39 | continue 40 | 41 | 42 | def word_count(): 43 | count_dict = {} 44 | category_dirs = os.listdir(data_processed_dir) 45 | 46 | for item in category_dirs: 47 | cat_files = os.listdir(data_processed_dir + item) 48 | 49 | for cat_file in cat_files: 50 | file_processed = open(data_processed_dir + item + "/" + cat_file, 'r') 51 | 52 | for line in file_processed: 53 | word_list = line.split(" ") 54 | 55 | for word in word_list: 56 | word = word.strip() 57 | 58 | if count_dict.has_key(word): 59 | count_dict[word] += 1 60 | 61 | else: 62 | count_dict[word] = 1 63 | 64 | count_dict = sorted(count_dict.iteritems(), key = lambda i: i[1], reverse = True) 65 | 66 | file = open(statistics_dir + item + ".pickle", 'wb') 67 | pickle.dump(count_dict, file) 68 | file.close() 69 | 70 | count_dict = {} 71 | 72 | 73 | # def get_ambiguity_words(category_pair = None): 74 | # if not category_pair: 75 | # category_pair = [ 76 | # ("Entertainment", "Sports", 40), 77 | # ("Economy", "Agriculture", 40), 78 | # ("Politics", "Religion", 40), 79 | # ("Military", "Politics", 40), 80 | # ("Education", "Technology", 40), 81 | # ("Education", "Politics", 40), 82 | # ("Education", "Entertainment", 40), 83 | # ("Education", "Agriculture", 
40), 84 | # ("Technology", "Entertainment", 40), 85 | # ("Economy", "Technology", 40), 86 | # ("Economy", "Politics", 40) 87 | # ] 88 | 89 | # delete_set = set() 90 | # for pair in category_pair: 91 | # top_words0 = set() 92 | 93 | # file = open(statistics_dir + pair[0] + ".pickle") 94 | # count_dict = pickle.load(file) 95 | 96 | # for item in count_dict: 97 | # if item[1] >= pair[2]: 98 | # top_words0.add(item[0]) 99 | 100 | # else: 101 | # break 102 | 103 | # top_words1 = set() 104 | 105 | # file = open(statistics_dir + pair[1] + ".pickle") 106 | # count_dict = pickle.load(file) 107 | 108 | # for item in count_dict: 109 | # if item[1] >= pair[2]: 110 | # top_words1.add(item[0]) 111 | 112 | # else: 113 | # break 114 | 115 | # delete_set |= top_words0 & top_words1 116 | 117 | # print len(delete_set) 118 | # return delete_set 119 | 120 | 121 | # def delete_ambiguity(category_pair = None): 122 | # ambiguity_words = get_ambiguity_words(category_pair) 123 | 124 | # category_dirs = os.listdir(data_processed_dir) 125 | 126 | # for item in category_dirs: 127 | # if not os.path.exists(data_pruned_dir + item): 128 | # os.makedirs(data_pruned_dir + item) 129 | 130 | # cat_files = os.listdir(data_processed_dir + item) 131 | 132 | # for cat_file in cat_files: 133 | # file = open(data_processed_dir + item + "/" + cat_file) 134 | # file_pruned = open(data_pruned_dir + item + "/" + cat_file, 'w') 135 | 136 | # for line in file: 137 | # if line.strip() == '': 138 | # continue 139 | 140 | # word_list = line.strip().split(" ") 141 | # word_list = [w for w in word_list if w not in ambiguity_words] 142 | 143 | # try: 144 | # file_pruned.write(' '.join(word_list) + '\n') 145 | # except: 146 | # continue 147 | 148 | 149 | def get_ambiguity_words(category_list, count = 20): 150 | delete_set = None 151 | 152 | for category in category_list: 153 | top_words = set() 154 | 155 | file = open(statistics_dir + category + ".pickle") 156 | count_dict = pickle.load(file) 157 | file.close() 158 | 159 | for item in count_dict: 160 | if item[1] >= count: 161 | top_words.add(item[0]) 162 | 163 | else: 164 | break 165 | print len(top_words) 166 | if not delete_set: 167 | delete_set = top_words 168 | else: 169 | delete_set &= top_words 170 | 171 | print len(delete_set) 172 | return delete_set 173 | 174 | 175 | def delete_ambiguity(category_list, count = 20): 176 | category_list.sort() 177 | 178 | ambiguity_words = get_ambiguity_words(category_list) 179 | 180 | dir_name = '' 181 | for item in category_list: 182 | dir_name += '_' + item 183 | 184 | dir_name = dir_name[1 : ] 185 | 186 | category_path = data_category_dir + dir_name + "/" 187 | 188 | if not os.path.exists(category_path): 189 | os.makedirs(category_path) 190 | 191 | for category in category_list: 192 | 193 | if not os.path.exists(category_path + category): 194 | os.makedirs(category_path + category) 195 | 196 | cat_files = os.listdir(data_processed_dir + category) 197 | 198 | for cat_file in cat_files: 199 | file = open(data_processed_dir + category + "/" + cat_file) 200 | file_category = open(category_path + category + "/" + cat_file, 'w') 201 | 202 | for line in file: 203 | if line.strip() == '': 204 | continue 205 | 206 | word_list = line.strip().split(" ") 207 | word_list = [w for w in word_list if w not in ambiguity_words] 208 | 209 | try: 210 | file_category.write(' '.join(word_list) + '\n') 211 | except: 212 | continue -------------------------------------------------------------------------------- /portrayal/sentiment_classify/training.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-07 20:18:27 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-07 20:18:27 7 | ''' 8 | import os 9 | import nltk 10 | import pickle 11 | import random 12 | 13 | from sklearn.svm import LinearSVC 14 | from nltk.tokenize import word_tokenize 15 | from nltk.classify.scikitlearn import SklearnClassifier 16 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 17 | from sklearn.linear_model import LogisticRegression, SGDClassifier 18 | 19 | from .. tools.function import get_stop_words 20 | from .. tools.preprocess import preprocess, preprocess_postag 21 | from .. config import PROJECT_PATH 22 | 23 | stop_words = get_stop_words() 24 | module_path = PROJECT_PATH + "portrayal/sentiment_classify/" 25 | pickle_path = module_path + "pickle/" 26 | 27 | 28 | # 对一段文档建立特征 29 | def word2features(document, word_features): 30 | features = {} 31 | words = word_tokenize(document, language='english') 32 | 33 | for w in word_features: 34 | features[w] = w in words 35 | 36 | return features 37 | 38 | 39 | def save_feature_document(): 40 | # 加载语料库 41 | pos_corpus = open(module_path + "data/positive.txt", "r").read() 42 | neg_corpus = open(module_path + "data/negative.txt", "r").read() 43 | 44 | documents = [] 45 | words_feature = [] 46 | 47 | # J是代表形容词,R代表副词,V代表动词 48 | allowed_types = ['J'] 49 | 50 | n = 0 51 | p_temp = '' 52 | 53 | for p in pos_corpus.split("\n"): 54 | word_tags = preprocess_postag(p) 55 | 56 | if not word_tags: 57 | continue 58 | 59 | # 形容词对情感影响较大,所以选取形容词为特征 60 | for item in word_tags: 61 | p_temp += item[0] + " " 62 | 63 | if item[1][0] in allowed_types: 64 | words_feature.append(item[0]) 65 | 66 | n += 1 67 | if n % 15 == 0: 68 | documents.append((p_temp, "pos")) 69 | p_temp = '' 70 | 71 | if n > 7: 72 | documents.append((p_temp, "pos")) 73 | 74 | n = 0 75 | p_temp = '' 76 | 77 | for p in neg_corpus.split("\n"): 78 | word_tags = preprocess_postag(p) 79 | 80 | if not word_tags: 81 | continue 82 | 83 | for item in word_tags: 84 | p_temp += item[0] + " " 85 | 86 | if item[1][0] in allowed_types: 87 | words_feature.append(item[0]) 88 | 89 | n += 1 90 | if n % 15 == 0: 91 | documents.append((p_temp, "neg")) 92 | p_temp = '' 93 | 94 | if n > 7: 95 | documents.append((p_temp, "neg")) 96 | 97 | # 将处理好的文档持久化 98 | documents_file = open(pickle_path + "documents.pickle","wb") 99 | pickle.dump(documents, documents_file) 100 | documents_file.close() 101 | print "Documents saved!" 102 | 103 | # 统计 104 | words_feature = nltk.FreqDist(words_feature) 105 | words_feature_temp = sorted(words_feature.iteritems(), key = lambda i: i[1], reverse = True) 106 | 107 | words_feature = map(lambda tuple: tuple[0], words_feature_temp) 108 | words_feature = words_feature[0 : 5000] 109 | 110 | # 将特征属性持久化 111 | feature_file = open(pickle_path + "words_feature.pickle", "wb") 112 | pickle.dump(words_feature, feature_file) 113 | 114 | feature_file.close() 115 | print "words_feature saved!" 116 | 117 | feature_sets = [(word2features(p, words_feature), category) for (p, category) in documents] 118 | 119 | # 将特征属性持久化 120 | feature_file = open(pickle_path + "feature_sets.pickle", "wb") 121 | pickle.dump(feature_sets, feature_file) 122 | 123 | feature_file.close() 124 | print "feature_sets saved!" 
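# Feature engineering above: roughly every 15 corpus lines are merged into one
# labelled document, adjectives (POS tags starting with 'J') are collected as
# candidate features, and the 5000 most frequent ones become the boolean
# word-presence features consumed by training() below.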
125 | 126 | 127 | def training(): 128 | if not os.path.exists(pickle_path + "feature_sets.pickle"): 129 | save_feature_document() 130 | 131 | feature_file = open(pickle_path + "feature_sets.pickle") 132 | feature_sets = pickle.load(feature_file) 133 | 134 | # Shuffle so that training and testing sets can be drawn 135 | random.shuffle(feature_sets) 136 | print "Length of feature_sets: " 137 | print len(feature_sets) 138 | 139 | testing_set = feature_sets[150:] 140 | print("testing: %d" % len(testing_set)) 141 | training_set = feature_sets 142 | print("training: %d" % len(training_set)) 143 | 144 | # Classifier selection 145 | # Naive Bayes - NLTK's built-in classifier 146 | classifier = nltk.NaiveBayesClassifier.train(training_set) 147 | print "NaiveBayesClassifier accuracy:" 148 | print(nltk.classify.accuracy(classifier, testing_set)) 149 | # Persist the classifier 150 | classifier_file = open(pickle_path + "naivebayes.pickle", "wb") 151 | pickle.dump(classifier, classifier_file) 152 | classifier_file.close() 153 | 154 | # Multinomial Naive Bayes classifier - sklearn 155 | mnb_classifier = SklearnClassifier(MultinomialNB()) 156 | mnb_classifier.train(training_set) 157 | print "MultinomialNB accuracy:" 158 | print(nltk.classify.accuracy(mnb_classifier, testing_set)) 159 | # Persist the classifier 160 | classifier_file = open(pickle_path + "mnb_classifier.pickle", "wb") 161 | pickle.dump(mnb_classifier, classifier_file) 162 | classifier_file.close() 163 | 164 | # Bernoulli Naive Bayes classifier - sklearn 165 | bnb_classifier = SklearnClassifier(BernoulliNB()) 166 | bnb_classifier.train(training_set) 167 | print "BernoulliNB accuracy:" 168 | print(nltk.classify.accuracy(bnb_classifier, testing_set)) 169 | # Persist the classifier 170 | classifier_file = open(pickle_path + "bnb_classifier.pickle", "wb") 171 | pickle.dump(bnb_classifier, classifier_file) 172 | classifier_file.close() 173 | 174 | # Logistic regression classifier - sklearn 175 | lr_classifier = SklearnClassifier(LogisticRegression()) 176 | lr_classifier.train(training_set) 177 | print "LogisticRegression accuracy:" 178 | print(nltk.classify.accuracy(lr_classifier, testing_set)) 179 | # Persist the classifier 180 | classifier_file = open(pickle_path + "lr_classifier.pickle", "wb") 181 | pickle.dump(lr_classifier, classifier_file) 182 | classifier_file.close() 183 | 184 | # Linear support vector classifier - sklearn 185 | lsv_classifier = SklearnClassifier(LinearSVC()) 186 | lsv_classifier.train(training_set) 187 | print "LinearSVC accuracy:" 188 | print(nltk.classify.accuracy(lsv_classifier, testing_set)) 189 | # Persist the classifier 190 | classifier_file = open(pickle_path + "lsv_classifier.pickle", "wb") 191 | pickle.dump(lsv_classifier, classifier_file) 192 | classifier_file.close() 193 | 194 | # Stochastic gradient descent classifier - sklearn 195 | sgd_classifier = SklearnClassifier(SGDClassifier()) 196 | sgd_classifier.train(training_set) 197 | print "SGDClassifier accuracy:" 198 | print(nltk.classify.accuracy(sgd_classifier, testing_set)) 199 | # Persist the classifier 200 | classifier_file = open(pickle_path + "sgd_classifier.pickle","wb") 201 | pickle.dump(sgd_classifier, classifier_file) 202 | classifier_file.close() -------------------------------------------------------------------------------- /crawler/basicinfo_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import gc 3 | import time 4 | import threading 5 | 6 | from config import THREAD_NUM 7 | from twitter import error 8 | from api import API_COUNT, Api 9 | from database import Mysql 10 | from decorator import generate_decorator 11 | 12 | handle_exception = generate_decorator(400) 13 | 14 | class BasicinfoCrawler: 15 | get_api = Api().get_api 16 | 17 | ''' 18 | Fetch information about users related to term 19 | 20 | Parameters: 21 | term – Term to
search by. 22 | page – Page of results to return. Default is 1 [Optional] 23 | count – Number of results to return. Default is 20 [Optional] 24 | include_entities – If True, each tweet will include a node called “entities,”. 25 | This node offers a variety of metadata about the tweet in a discrete structure, 26 | including: user_mentions, urls, and hashtags. [Optional] 27 | Returns: 28 | A sequence of twitter.User instances, one for each message containing the term 29 | ''' 30 | def get_users_search(self, 31 | term = None, 32 | page = 1, 33 | count = 20, 34 | include_entities = True): 35 | 36 | if term == None: 37 | return None 38 | 39 | return self.get_api().GetUsersSearch(term = term, 40 | page = page, 41 | count = count, 42 | include_entities = include_entities) 43 | 44 | 45 | ''' 46 | 获取单个用户的信息 47 | 48 | Parameters: 49 | user_id (int, optional): 50 | The id of the user to retrieve. 51 | screen_name (str, optional): 52 | The screen name of the user for whom to return results for. 53 | Either a user_id or screen_name is required for this method. 54 | include_entities (bool, optional): 55 | The entities node will be omitted when set to False. 56 | Returns: 57 | A twitter.User instance representing that user 58 | ''' 59 | def get_user(self, 60 | user_id = None, 61 | screen_name = None, 62 | include_entities = True): 63 | 64 | if user_id == None and screen_name == None: 65 | return None 66 | 67 | return self.get_api().GetUser(user_id = user_id, 68 | screen_name = screen_name, 69 | include_entities = include_entities) 70 | 71 | 72 | ''' 73 | 获取单个用户的基础信息并返回,如果超时则休眠 400s 后返回 74 | ''' 75 | def get_user_sleep(self, 76 | user_id = None, 77 | screen_name = None, 78 | include_entities = True): 79 | 80 | if user_id == None and screen_name == None: 81 | return None 82 | 83 | wrapper_func = handle_exception(self.get_user) 84 | 85 | user = wrapper_func(user_id = user_id, screen_name = screen_name, include_entities = include_entities) 86 | 87 | return user 88 | 89 | ''' 90 | 获取单个用户的基础信息并保存(参考 get_user ) 91 | 92 | 参数: 93 | table_name (str, optional): 94 | 存储数据表名,默认 user_task (保证数据库中存在该表) 95 | 96 | 返回: 97 | None 98 | ''' 99 | def get_user_save(self, 100 | user_id = None, 101 | table_name = "user_task", 102 | screen_name = None, 103 | include_entities = True): 104 | 105 | wrapper_func = handle_exception(self.get_user) 106 | 107 | user = wrapper_func(user_id = user_id, screen_name = screen_name, include_entities = include_entities) 108 | 109 | user and self.save_user(user, table_name) 110 | 111 | 112 | ''' 113 | 获取多个用户的信息,并存入数据库中 114 | 115 | 参数: 116 | user_list (list, optional): 117 | 存放用户 user_id / screen_name 的列表 118 | table_name (str, optional): 119 | 存储数据表名,默认 user_task (保证数据库中存在该表) 120 | search_type (str, optional): 121 | 抓取方式,如果为 screen_name ,则认为 user_list 中 存放的是用户 screen_name, 122 | 否则认为 user_list 中 存放的是用户 user_id 123 | ''' 124 | def get_all_users(self, 125 | user_list = None, 126 | table_name = "user_task", 127 | search_type = "user_id"): 128 | 129 | if len(user_list) == 0: 130 | return None 131 | 132 | i = 0 133 | thread_pool = [] 134 | length = len(user_list) 135 | 136 | per_thread = length / THREAD_NUM 137 | 138 | while i < THREAD_NUM: 139 | if i + 1 == THREAD_NUM: 140 | crawler_thread = threading.Thread(target = self.get_users_thread, args = (user_list[i * per_thread : ], table_name, search_type,)) 141 | else: 142 | crawler_thread = threading.Thread(target = self.get_users_thread, args = (user_list[i * per_thread : (i + 1) * per_thread], table_name, search_type,)) 143 | 144 | 
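# How the work is partitioned (a sketch with hypothetical numbers, since THREAD_NUM comes from config): with Python 2 integer division, per_thread = length / THREAD_NUM; the first THREAD_NUM - 1 threads each take per_thread users and the final thread also takes the remainder. For example, 23 users with THREAD_NUM = 5 gives per_thread = 4, so the slices are [0:4], [4:8], [8:12], [12:16] and [16:23].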
crawler_thread.start() 145 | thread_pool.append(crawler_thread) 146 | 147 | i += 1 148 | 149 | for thread in thread_pool: 150 | thread.join() 151 | 152 | 153 | ''' 154 | 线程:获取多个用户信息(参考 get_all_users ) 155 | ''' 156 | def get_users_thread(self, 157 | user_list = None, 158 | table_name = "user_task", 159 | search_type = "user_id"): 160 | 161 | if search_type != "screen_name": 162 | while len(user_list) > 0: 163 | user_id = user_list.pop(0) 164 | self.get_user_save(user_id = user_id, table_name = table_name) 165 | 166 | else: 167 | while len(user_list) > 0: 168 | screen_name = user_list.pop(0) 169 | self.get_user_save(screen_name = screen_name, table_name = table_name) 170 | 171 | 172 | ''' 173 | 保存用户信息 174 | 175 | 参数: 176 | user(User, optional): 177 | 要保存的用户 178 | table_name (str, optional): 179 | 存储数据表名,默认 user_task (保证数据库中存在该表) 180 | ''' 181 | def save_user(self, 182 | user = None, 183 | table_name = "user_task"): 184 | 185 | if not user: 186 | return 187 | 188 | mysql = Mysql() 189 | mysql.connect() 190 | 191 | try: 192 | name = user.name.replace('\\','\\\\').replace("'","\\'").replace(':','\\:') 193 | location = user.location.replace('\\','\\\\').replace("'","\\'").replace(':','\\:') if user.location else '' 194 | description = user.description.replace('\\','\\\\').replace("'","\\'").replace(':','\\:') if user.description else '' 195 | 196 | time_zone = user.time_zone.replace("'","\\'") if user.time_zone else '' 197 | utc_offset = user.utc_offset if user.utc_offset else '' 198 | profile_banner_url = user.profile_banner_url if user.profile_banner_url else '' 199 | 200 | protected = 1 if user.protected else 0 201 | verified = 1 if user.verified else 0 202 | geo_enabled = 1 if user.geo_enabled else 0 203 | default_profile_image = 1 if user.default_profile_image else 0 204 | 205 | sql = """INSERT INTO %s (user_id, screen_name, name, location, created_at, description, statuses_count, friends_count, 206 | followers_count, favourites_count, lang, protected, time_zone, verified, utc_offset, geo_enabled, listed_count, 207 | default_profile_image, profile_background_color, profile_sidebar_fill_color, profile_image_url, profile_banner_url, crawler_date) VALUES 208 | ('%s', '%s', '%s', '%s', '%s', '%s', %d, %d, %d, %d, '%s', %d, '%s', %d, '%s', %d, %d, %d, 209 | '%s', '%s', '%s', '%s', '%s')""" % (table_name, user.id, user.screen_name, name, location, user.created_at, description, \ 210 | user.statuses_count, user.friends_count, user.followers_count, user.favourites_count, user.lang, protected, time_zone, verified, \ 211 | utc_offset, geo_enabled, user.listed_count, default_profile_image, user.profile_background_color, \ 212 | user.profile_sidebar_fill_color, user.profile_image_url, profile_banner_url, time.strftime('%Y-%m-%d',time.localtime(time.time()))) 213 | 214 | sql = sql.encode("utf-8").decode("latin1") 215 | except Exception as e: 216 | print e 217 | mysql.close() 218 | return 219 | 220 | try: 221 | mysql.execute(sql) 222 | except Exception as e: 223 | print e 224 | mysql.close() 225 | return 226 | 227 | mysql.close() 228 | 229 | del user 230 | gc.collect() 231 | 232 | 233 | if __name__ == '__main__': 234 | bc = BasicinfoCrawler() -------------------------------------------------------------------------------- /portrayal/interest_extract/interest_extract.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | * @Author: Marco 4 | * @Date: 2017-09-10 22:54:55 5 | * @Last Modified by: Marco 6 | * @Last Modified time: 2017-09-10 
22:54:55 7 | ''' 8 | import re 9 | import os 10 | import math 11 | import pickle 12 | 13 | from collections import Counter 14 | from nltk.stem import WordNetLemmatizer 15 | 16 | from .. tools.preprocess import preprocess_postag_label 17 | 18 | from .. config import PROJECT_PATH 19 | 20 | from .. tools.function import get_stop_words 21 | 22 | 23 | module_path = PROJECT_PATH + "portrayal/interest_extract/" 24 | data_path = module_path + "data/" 25 | pickle_path = module_path + "pickle/" 26 | 27 | corpus = None 28 | stop_words = get_stop_words() 29 | 30 | def generate_pickle(): 31 | files = os.listdir(data_path + 'corpus') 32 | corpus_list = [] 33 | 34 | for f in files: 35 | text = '' 36 | corpus_set = set() 37 | file = open(data_path + 'corpus/' + f, "r").read() 38 | for p in file.split("\n"): 39 | text += p 40 | 41 | word_tags = preprocess_postag_label(text) 42 | 43 | if not word_tags: 44 | continue 45 | 46 | for word in word_tags: 47 | if word[1][0] == 'N': 48 | corpus_set.add(word[0]) 49 | 50 | corpus_list.append(corpus_set) 51 | 52 | file = open(pickle_path + "corpus.pickle", 'w') 53 | pickle.dump(corpus_list, file) 54 | file.close() 55 | 56 | return corpus_list 57 | 58 | 59 | def import_corpus(): 60 | global corpus 61 | if not corpus: 62 | if not os.path.exists(pickle_path + "corpus.pickle"): 63 | corpus = generate_pickle() 64 | else: 65 | file = open(pickle_path + "corpus.pickle", 'r') 66 | corpus = pickle.load(file) 67 | file.close() 68 | 69 | 70 | def generate_candidate(word_tags): 71 | if len(word_tags) < 1: 72 | return [] 73 | 74 | candidate_list = [] 75 | phrase_list = [] 76 | 77 | lemmatizer = WordNetLemmatizer() 78 | 79 | for item in word_tags: 80 | if item[1][0] == 'N' or item[0][0] == '#': 81 | if item[0][0] == '#': 82 | candidate_list.append(item[0]) 83 | 84 | else: 85 | word = lemmatizer.lemmatize(item[0], 'n') 86 | if word not in stop_words: 87 | candidate_list.append(word) 88 | 89 | if len(word_tags) == 2 and (word_tags[0][1][0] == 'J' or word_tags[0][1][0] == 'V') and word_tags[1][1][0] == 'N': 90 | if word_tags[0][1][0] == 'J': 91 | prefix = lemmatizer.lemmatize(word_tags[0][0], 'a') 92 | 93 | if word_tags[0][1][0] == 'V': 94 | prefix = lemmatizer.lemmatize(word_tags[0][0], 'v') 95 | 96 | suffix = lemmatizer.lemmatize(word_tags[1][0], 'n') 97 | 98 | if prefix not in stop_words and suffix not in stop_words: 99 | phrase_list.append(prefix + " " + suffix) 100 | 101 | i = 0 102 | while(i < len(word_tags) - 2): 103 | if word_tags[i][0][0] == '#': 104 | i += 1 105 | continue 106 | 107 | if word_tags[i + 1][0][0] == '#': 108 | i += 2 109 | continue 110 | 111 | if word_tags[i][1][0] == 'V' and (word_tags[i + 1][1][0] == 'N' or (word_tags[i + 1][1][0] == 'J' and word_tags[i + 2][1][0] == 'N')): 112 | if word_tags[i + 1][1][0] == 'J': 113 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'a') + " " 114 | suffix += lemmatizer.lemmatize(word_tags[i + 2][0], 'n') 115 | phrase_list.append(lemmatizer.lemmatize(word_tags[i][0], 'v') + " " + suffix) 116 | i += 3 117 | else: 118 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'v') 119 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 120 | 121 | if prefix not in stop_words and suffix not in stop_words: 122 | phrase_list.append(prefix + " " + suffix) 123 | 124 | i += 2 125 | 126 | elif word_tags[i][1][0] == 'J' and word_tags[i + 1][1][0] == 'N': 127 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'a') 128 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 129 | 130 | if prefix not in stop_words and suffix not in 
stop_words: 131 | phrase_list.append(prefix + " " + suffix) 132 | 133 | i += 2 134 | 135 | else: 136 | i += 1 137 | 138 | if i != 0 and i == len(word_tags) - 2 and word_tags[i + 1][1][0] == 'N': 139 | if word_tags[i][1][0] == 'J': 140 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'a') 141 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 142 | 143 | if prefix not in stop_words and suffix not in stop_words: 144 | phrase_list.append(prefix + " " + suffix) 145 | 146 | elif word_tags[i][1][0] == 'V': 147 | prefix = lemmatizer.lemmatize(word_tags[i][0], 'v') 148 | suffix = lemmatizer.lemmatize(word_tags[i + 1][0], 'n') 149 | 150 | if prefix not in stop_words and suffix not in stop_words: 151 | phrase_list.append(prefix + " " + suffix) 152 | 153 | return candidate_list + phrase_list 154 | 155 | 156 | def calc_tf_idf(candidate_list): 157 | if corpus == None: 158 | import_corpus() 159 | 160 | count = Counter(candidate_list) 161 | common_word = count.most_common(150) 162 | 163 | tf_idf = {} 164 | corpus_len = len(corpus) 165 | for item in common_word: 166 | n = 1 167 | for corpus_set in corpus: 168 | if item[0] in corpus_set: 169 | n += 1 170 | 171 | idf = math.log(corpus_len * 1.0 / n, 10) 172 | tf_idf[item[0]] = item[1] * idf 173 | 174 | candidate_list = sorted(tf_idf.iteritems(), key = lambda item: item[1], reverse = True) 175 | 176 | return candidate_list[:90] 177 | 178 | 179 | def calc_weight(tweets, candidates): 180 | weight_dict = {} 181 | 182 | length = len(candidates) 183 | 184 | for item in candidates: 185 | weight_dict[item] = {} 186 | 187 | for sub_item in candidates: 188 | if item != sub_item: 189 | weight_dict[item][sub_item] = 0 190 | 191 | for i in range(length): 192 | item = candidates[i] 193 | j = i + 1 194 | 195 | while j < length: 196 | sub_item = candidates[j] 197 | j += 1 198 | 199 | for tweet in tweets: 200 | text = tweet['text'] 201 | 202 | if item in text and sub_item in text: 203 | weight_dict[item][sub_item] += 1 204 | weight_dict[sub_item][item] += 1 205 | 206 | o_vector = {} 207 | 208 | for item in candidates: 209 | o_vector[item] = 0 210 | 211 | for sub_item in candidates: 212 | if item != sub_item: 213 | o_vector[item] += weight_dict[item][sub_item] 214 | 215 | return weight_dict, o_vector 216 | 217 | 218 | def text_rank(tweets, candidates_list): 219 | candidates = {} 220 | for item in candidates_list: 221 | candidates[item[0]] = item[1] 222 | 223 | weight_dict, o_vector = calc_weight(tweets, candidates.keys()) 224 | 225 | alpha = 0.85 226 | score_vector = {} 227 | related_items = {} 228 | 229 | for item in candidates: 230 | score_vector[item] = 1 231 | related_items[item] = [] 232 | 233 | for sub_item in candidates: 234 | if item != sub_item and weight_dict[item][sub_item] != 0: 235 | related_items[item].append(sub_item) 236 | 237 | i = 0 238 | while i < 88: 239 | score_vector_temp = {} 240 | 241 | for item in candidates: 242 | score_temp = (1 - alpha) * candidates[item] 243 | 244 | for sub_item in related_items[item]: 245 | score_temp += weight_dict[item][sub_item] * 1.0 / o_vector[sub_item] * score_vector[sub_item] 246 | 247 | score_vector_temp[item] = alpha * score_temp 248 | 249 | if calc_differ(score_vector, score_vector_temp) < 1: 250 | score_vector = score_vector_temp 251 | break 252 | 253 | score_vector = score_vector_temp 254 | 255 | i += 1 256 | 257 | return sorted(score_vector.iteritems(), key = lambda item: item[1], reverse = True) 258 | 259 | 260 | def calc_differ(score_vector1, score_vector2): 261 | differ = 0 262 | 263 | for item in 
score_vector1: 264 | differ += abs(score_vector1[item] - score_vector2[item]) 265 | 266 | return differ 267 | 268 | 269 | def get_top_tags(candidate_tags, count, filter_set): 270 | interset_tags = map(lambda tag: tag[0], candidate_tags) 271 | 272 | res_tags = [] 273 | 274 | for item in interset_tags: 275 | if len(res_tags) >= count: 276 | break 277 | 278 | item_temp = item.replace('#', '') 279 | word_list = item.split(' ') 280 | 281 | if len(word_list) == 1: 282 | if item_temp not in filter_set and len(item) > 2: 283 | res_tags.append(item) 284 | filter_set.add(item_temp) 285 | else: 286 | for word in word_list: 287 | if word.strip() != '': 288 | filter_set.add(word) 289 | if word in res_tags: 290 | res_tags.remove(word) 291 | 292 | res_tags.append(item) 293 | 294 | return res_tags[:count] 295 | 296 | 297 | def join_top_tags(tfidf_tags, textrank_tags, count): 298 | final_set = tfidf_tags[:count * 3 / 5] 299 | 300 | for item in tfidf_tags[count * 3 / 5:]: 301 | if item[0] == '#': 302 | final_set.append(item) 303 | 304 | for item in textrank_tags: 305 | if item not in final_set: 306 | final_set.append(item) 307 | 308 | return final_set[:count] 309 | 310 | def extract_tags(tweets, description = '', count = 36): 311 | text = '' 312 | for tweet in tweets: 313 | text += tweet['text'] + " , " 314 | 315 | word_tags = preprocess_postag_label(description + text + description) 316 | candidate_list = generate_candidate(word_tags) 317 | 318 | filter_set = set(["dis", "fuck", "hell", "damn", "shit", "bitch", "wow", "cool", "fun", "glad", 319 | "joy", "luck", "laugh", "bless", "appreciate", "wish", "hope", "play", "set", "close", "talk", 320 | "change", "join", "move", "watch", "meet", "post", "wait", "live", "deal", "eat", "call", 321 | "pick", "start", "end", "kid", "boy", "home", "tweet", "video", "bang", "dope", 322 | "year", "month", "hour", "minute", "second", "moment", "morning", "afternoon", "evening"]) 323 | 324 | candidate_tags = calc_tf_idf(candidate_list) 325 | tfidf_tags = get_top_tags(candidate_tags, count, filter_set) 326 | 327 | candidate_tags = text_rank(tweets, candidate_tags) 328 | textrank_tags = get_top_tags(candidate_tags, count, filter_set) 329 | 330 | tfidf_tags = join_top_tags(tfidf_tags, textrank_tags, count) 331 | 332 | return ','.join(tfidf_tags) -------------------------------------------------------------------------------- /crawler/web_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import urllib 3 | import urllib2 4 | import MySQLdb 5 | import config 6 | import time 7 | import cookielib 8 | import random 9 | import re 10 | import socket 11 | from bs4 import BeautifulSoup 12 | from pybloom import BloomFilter 13 | 14 | ''' 15 | 采用 web 方式抓取 Twitter 的类(已经弃用) 16 | ''' 17 | class Crawler: 18 | def __init__(self): 19 | 20 | #获取一个保存cookie的对象 21 | cj = cookielib.LWPCookieJar() 22 | #将一个保存cookie对象,和一个HTTP的cookie的处理器绑定 23 | cookie_support = urllib2.HTTPCookieProcessor(cj) 24 | #创建一个opener,将保存了cookie的http处理器,还有设置一个handler用于处理http的URL的打开 25 | opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler) 26 | #将包含了cookie、http处理器、http的handler的资源和urllib2对象板顶在一起 27 | urllib2.install_opener(opener) 28 | 29 | headers = { 30 | 'User-Agent':config.USER_AGENT, 31 | 'referer':'https://twitter.com/login' 32 | } 33 | self.headers = [{ 34 | 'User-Agent':config.USER_AGENT, 35 | 'referer':'https://twitter.com' 36 | }, { 37 | 'User-Agent':config.USER_AGENT, 38 | 'referer':'https://twitter.com/login' 39 | }, { 40 | 
'User-Agent':config.USER_AGENT, 41 | 'referer':'https://twitter.com/mrmarcohan' 42 | }] 43 | request = urllib2.Request("https://twitter.com/login", headers = headers) 44 | response = urllib2.urlopen(request) 45 | pageHtml = response.read() 46 | soup = BeautifulSoup(pageHtml, 'html.parser') 47 | csrf = soup.find_all("input", attrs={"name": "authenticity_token"})[0]['value'] 48 | 49 | postdata = { 50 | 'session[username_or_email]':'mrmarcohan', 51 | 'session[password]':'han123456', 52 | 'authenticity_token':csrf, 53 | 'scribe_log':'', 54 | 'redirect_after_login':'' 55 | } 56 | 57 | req = urllib2.Request( 58 | url = 'https://twitter.com/sessions', 59 | data = urllib.urlencode(postdata), 60 | headers = headers 61 | ) 62 | 63 | res = urllib2.urlopen(req) 64 | page = res.read() 65 | 66 | socket.setdefaulttimeout(5) 67 | # request = urllib2.Request("https://twitter.com/taylorswift13/following", headers = headers) 68 | # response = urllib2.urlopen(request) 69 | # pageHtml = response.read() 70 | 71 | # file_obj = open('a.html','w') 72 | # file_obj.write(pageHtml) 73 | # file_obj.close() 74 | 75 | # cookie = cookielib.CookieJar() 76 | # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie)) 77 | # response = opener.open('https://twitter.com') 78 | # for item in cookie: 79 | # print item 80 | # # if item.name == 'some_cookie_item_name': 81 | # # print item.value 82 | # return 83 | 84 | self.urlList = [config.INITIAL_USER] 85 | self.months = dict(Jan = '1', Feb = '2', Mar = '3', Apr = '4', \ 86 | May = '5', Jun = '6', Jul = '7', Aug = '8', \ 87 | Sep = '9', Oct = '10', Nov = '11', Dec = '12') 88 | 89 | db = MySQLdb.connect(config.DB_HOST, config.DB_USER, config.DB_PASSWORD, config.DB_DATABASE) 90 | # 使用cursor()方法获取操作游标 91 | cursor = db.cursor() 92 | self.cursor = cursor 93 | self.db = db 94 | self.bf = BloomFilter(capacity=1000000, error_rate=0.001) 95 | self.bf.add(config.INITIAL_USER) 96 | self.restart() 97 | 98 | def getUsersInfo(self): 99 | urlList = self.urlList 100 | count = 0 101 | print "starting..." 102 | while len(urlList) > 0: 103 | count = count + 1 104 | if count % 3000 == 0: 105 | print count 106 | print "sleeping..." 
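# Back-off step: after every 3000 profile requests the crawler pauses for 400 s plus a random 200-1000 s (roughly 10-23 minutes) before continuing, to reduce the chance of the logged-in web session being throttled or blocked; the thresholds appear to be heuristic choices rather than values tied to a documented rate limit.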
107 | time.sleep(400 + random.randint(200,1000)) 108 | user = urlList.pop() 109 | url = "https://twitter.com/" + user 110 | print url 111 | self.currentUser = user 112 | time.sleep(1 + random.uniform(1, 4)) 113 | try: 114 | flag = 1 115 | flag = self.getBasicInfo() 116 | # print flag 117 | if flag != -1: 118 | time.sleep(1 + random.uniform(1, 4)) 119 | self.getFollowing() 120 | # time.sleep(1 + random.uniform(1, 3)) 121 | # self.getFollowers() 122 | except: 123 | print "something wrong" 124 | continue 125 | # if self.getBasicInfo() != -1: 126 | # time.sleep(2 + random.uniform(1, 3)) 127 | # self.getFollowing() 128 | 129 | def getBasicInfo(self): 130 | url = "https://twitter.com/" + self.currentUser 131 | 132 | try: 133 | request = urllib2.Request(url, headers = self.headers[0]) 134 | response = urllib2.urlopen(request, timeout = 5) 135 | pageHtml = response.read() 136 | # file_obj = open('a.html','w') 137 | # file_obj.write(pageHtml) 138 | # file_obj.close() 139 | except: 140 | print "basic info 请求超时" 141 | return -1 142 | 143 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 144 | 145 | name = soup.select_one(".ProfileHeaderCard-nameLink").text 146 | screenname = soup.select_one(".u-linkComplex-target").text 147 | bio = soup.select_one(".ProfileHeaderCard-bio").text 148 | jd = soup.select_one(".ProfileHeaderCard-joinDateText")['title'] 149 | location = soup.select_one(".ProfileHeaderCard-locationText").text 150 | try: 151 | jd = jd.split(' ')[2] 152 | joindate = re.sub('[^\d]+',"-",jd) 153 | joindate = joindate[0 : -1] 154 | except: 155 | joindate = "" 156 | 157 | try: 158 | tn = soup.select_one(".ProfileNav-item--tweets") \ 159 | .select_one(".ProfileNav-stat--link")['title'] 160 | tweetNum = tn.split(' ')[0].replace(',','') 161 | if int(tweetNum) < 60: 162 | return -1 163 | except: 164 | return -1 165 | 166 | try: 167 | fing = soup.select_one(".ProfileNav-item--following") \ 168 | .select_one(".ProfileNav-stat--link")['title'] 169 | following = fing.split(' ')[0].replace(',','') 170 | except: 171 | following = 0 172 | try: 173 | fers = soup.select_one(".ProfileNav-item--followers") \ 174 | .select_one(".ProfileNav-stat--link")['title'] 175 | followers = fers.split(' ')[0].replace(',','') 176 | except: 177 | followers = 0 178 | 179 | try: 180 | fates = soup.select_one(".ProfileNav-item--favorites") \ 181 | .select_one(".ProfileNav-stat--link")['title'] 182 | favorites = fates.split(' ')[0].replace(',','') 183 | except: 184 | favorites = 0 185 | 186 | # SQL 插入语句 187 | sql = """INSERT INTO user(screenname, name, location, joinDate, bio, tweetNum, watchNum, 188 | fansNum, likeNum, created_at) VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', 189 | '%s', '%s', '%s')""" % (screenname, name, location, joindate, bio, tweetNum, \ 190 | following, followers, favorites, time.strftime('%Y-%m-%d',time.localtime(time.time()))) 191 | try: 192 | # 执行sql语句 193 | self.cursor.execute(sql) 194 | # 提交到数据库执行 195 | self.db.commit() 196 | except: 197 | return -1 198 | 199 | tweets = soup.select(".js-stream-item") 200 | file_obj = open('tweet/' + self.currentUser + '.txt','a') 201 | for i in range(len(tweets)): 202 | try: 203 | tt = tweets[i].select_one(".js-tweet-text-container").text.replace(u'\xa0', u' ').replace('\n',' ') 204 | file_obj.write(tt.encode('utf-8')) 205 | file_obj.write("\n") 206 | except: 207 | continue 208 | try: 209 | timestamp = tweets[i].select_one(".stream-item-header").select_one(".js-short-timestamp")['data-time'] 210 | user = 
tweets[i].select_one(".stream-item-header").select_one(".username").select_one('b').text 211 | itemFooter = tweets[i].select_one(".stream-item-footer") 212 | reply = itemFooter.select_one(".ProfileTweet-action--reply").select_one(".ProfileTweet-actionCount")['data-tweet-stat-count'] 213 | retweet = itemFooter.select_one(".ProfileTweet-action--retweet").select_one(".ProfileTweet-actionCount ")['data-tweet-stat-count'] 214 | favorite = itemFooter.select_one(".ProfileTweet-action--favorite").select_one(".ProfileTweet-actionCount ")['data-tweet-stat-count'] 215 | except: 216 | print "tweets bottom error" 217 | file_obj.write(user + " " + timestamp + " " + reply + " " + retweet + " " + favorite) 218 | file_obj.write('\n') 219 | file_obj.close() 220 | 221 | def getTweet(self): 222 | return 223 | 224 | def getFollowing(self): 225 | url = "https://twitter.com/" + self.currentUser + "/following" 226 | 227 | try: 228 | request = urllib2.Request(url, headers = self.headers[1]) 229 | response = urllib2.urlopen(request, timeout = 5) 230 | pageHtml = response.read() 231 | except: 232 | print "following 请求超时" 233 | return 234 | 235 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 236 | pcList = soup.select(".ProfileCard") 237 | file_obj = open('following/' + self.currentUser + '.txt','a') 238 | for i in range(len(pcList)): 239 | pc = pcList[i].select_one(".ProfileCard-screennameLink").select_one(".u-linkComplex-target").text.replace(u'\xa0', u' ') 240 | if pc not in self.bf: 241 | self.bf.add(pc) 242 | self.urlList.append(pc) 243 | try: 244 | file_obj.write(pc + " ") 245 | except: 246 | print pc 247 | continue 248 | 249 | def getFollowers(self): 250 | url = "https://twitter.com/" + self.currentUser + "/following" 251 | 252 | try: 253 | request = urllib2.Request(url, headers = self.headers[2]) 254 | response = urllib2.urlopen(request, timeout = 5) 255 | pageHtml = response.read() 256 | 257 | except urllib2.URLError, e: 258 | if hasattr(e,"reason"): 259 | print e.reason 260 | return 261 | 262 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 263 | 264 | return 265 | 266 | def getFavorite(self): 267 | url = "https://twitter.com/" + self.currentUser + "/following" 268 | 269 | try: 270 | request = urllib2.Request(url, headers = self.headers[0]) 271 | response = urllib2.urlopen(request, timeout = 5) 272 | pageHtml = response.read() 273 | 274 | except urllib2.URLError, e: 275 | if hasattr(e,"reason"): 276 | print e.reason 277 | return 278 | 279 | soup = BeautifulSoup(pageHtml, 'html.parser', from_encoding="unicode") 280 | 281 | return 282 | 283 | def crawlerFinish(self): 284 | self.db.close() 285 | 286 | def restart(self): 287 | sql = "select screenname from user" 288 | try: 289 | # 执行sql语句 290 | self.cursor.execute(sql) 291 | info = self.cursor.fetchall() 292 | for ii in info: 293 | self.bf.add(ii[0]) 294 | except: 295 | return -1 296 | self.getUsersInfo() 297 | 298 | spider = Crawler() 299 | -------------------------------------------------------------------------------- /typical.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | # import nltk 4 | 5 | from crawler.database import MongoDB 6 | # from portrayal.config import PROJECT_PATH 7 | # from nltk.tokenize import word_tokenize 8 | # from portrayal.tools import preprocess 9 | # from portrayal.career_classify import training, classify 10 | # from portrayal.interest_extract import interest_extract 11 | # from 
portrayal.sentiment_classify import sentiment_classify 12 | # from portrayal.sentiment_classify import sentiment_dict as sentiment_dict_classifier 13 | # from portrayal.tools import preprocess 14 | # from portrayal.user_profile import user_profile 15 | 16 | import numpy as np 17 | import matplotlib as mpl 18 | import matplotlib.pyplot as plt 19 | from scipy.interpolate import spline 20 | 21 | 22 | 23 | # graph = Neo4j().connect() 24 | 25 | ''' 26 | 职业领域分类 27 | ''' 28 | def classify_career(): 29 | db = MongoDB().connect() 30 | users = db['typical'].find() 31 | 32 | n = 0 33 | err_dict = {} 34 | for user in users: 35 | tweets = user['tweets'] 36 | text = user['description'] 37 | for tweet in tweets: 38 | text += ' ' + tweet['text'] 39 | 40 | text = preprocess.preprocess(text) 41 | res = classify.exe_classify(text) 42 | 43 | if err_dict.has_key(res[0]): 44 | err_dict[res[0]] += 1 45 | else: 46 | err_dict[res[0]] = 1 47 | 48 | if res[0] == user['category']: 49 | n += 1 50 | 51 | print err_dict 52 | print n 53 | 54 | 55 | ''' 56 | 兴趣标签导出 57 | ''' 58 | def extract_interset(): 59 | db = MongoDB().connect() 60 | users = db['typical'].find() 61 | 62 | for u in users: 63 | text = '' 64 | for item in u['tweets']: 65 | text += item['text'] + ' ' 66 | 67 | try: 68 | tags = interest_extract.extract_tags(text, u['description']) 69 | except Exception as e: 70 | print u['_id'] 71 | print e 72 | continue 73 | 74 | db['typical'].update({'_id': u['_id']}, {"$set": {"interest_tags": tags}}) 75 | 76 | 77 | ''' 78 | 心理状态 79 | ''' 80 | def calc_sentiment(): 81 | db = MongoDB().connect() 82 | users = db['typical'].find() 83 | 84 | for user in users: 85 | try: 86 | final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 = sentiment_classify.exe_sentiment_classify(user['tweets']) 87 | except Exception as e: 88 | print user['_id'] 89 | print e 90 | continue 91 | 92 | db['typical'].update({'_id': user['_id']}, {"$set": {"psy": final_sentiment, "psy_with_time1": psy_with_time1, "psy_with_time2": psy_with_time2, "psy_with_count1": psy_with_count1, "psy_with_count2": psy_with_count2}}) 93 | 94 | 95 | def calc_sentiment_score(): 96 | sentiment_dict = dict(map(lambda (k,v): (k,int(v)), 97 | [ line.split('\t') for line in open("portrayal/sentiment_classify/data/sentiment_words1.txt") ])) 98 | 99 | db = MongoDB().connect() 100 | users = db['typical'].find({'screen_name': 'EP_Agriculture'}).limit(1) 101 | 102 | for user in users: 103 | final_sentiment, psy_with_time1, psy_with_time2, psy_with_count1, psy_with_count2 = sentiment_classify.exe_sentiment_classify(user['tweets']) 104 | # print tags 105 | db['typical'].update({'_id': user['_id']}, {"$set": {"psy": final_sentiment, "psy_with_time1": psy_with_time1, "psy_with_time2": psy_with_time2, "psy_with_count1": psy_with_count1, "psy_with_count2": psy_with_count2}}) 106 | 107 | 108 | def sentiment_dict_test(): 109 | # sentiment_dict = dict(map(lambda (k,v): (k,int(v)), 110 | # [ line.split('\t') for line in open("portrayal/sentiment_classify/data/sentiment_words1.txt") ])) 111 | text = sentiment_classify.replace_emotion([{'text': 'hope the er isnt to busy today but the nice weather doesnt keep people healthy or safe', 'created_at': '1'}]) 112 | print text 113 | print sentiment_dict_classifier.calc_sentiment_score(text) 114 | return 115 | n = 0 116 | tts = [] 117 | total = 0 118 | wrong = 0 119 | for line in open("portrayal/sentiment_classify/data/positive.txt"): 120 | n += 1 121 | tts.append({'text': line}) 122 | 123 | if n % 1 == 0: 124 | score = 
sentiment_dict_classifier.calc_sentiment_score(tts) 125 | total += 1 126 | if score < 0: 127 | wrong += 1 128 | # print score 129 | # if wrong == 7: 130 | # print tts 131 | # break 132 | tts = [] 133 | 134 | print wrong 135 | print total 136 | 137 | 138 | def update_user_category(): 139 | db = MongoDB().connect() 140 | users = db['typical'].find({}, {'_id': 1, 'screen_name': 1, 'category': 1, 'category_score': 1}) 141 | 142 | count = 0 143 | category_name = ['Politics', 'Religion', 'Military', 'Economy', 'Technology', 'Education', 'Agriculture', 'Entertainment', 'Sports'] 144 | 145 | users_temp = [] 146 | 147 | for item in users: 148 | sorted_list = sorted(item['category_score'].iteritems(), key = lambda asd:asd[1], reverse = True) 149 | 150 | if sorted_list[0][1] > 2 * sorted_list[1][1] or sorted_list[0][1] - sorted_list[1][1] > 50: 151 | if sorted_list[0][0] != item['category']: 152 | count += 1 153 | continue 154 | 155 | score_differ = (2 * sorted_list[0][1] - sorted_list[1][1] - sorted_list[-1][1]) / 2 156 | 157 | relation_dict = { 158 | sorted_list[0][0]: 0, 159 | sorted_list[1][0]: 0, 160 | sorted_list[2][0]: 0, 161 | sorted_list[3][0]: 0 162 | } 163 | # for name in category_name: 164 | # relation_dict[name] = 0 165 | 166 | cql = '''MATCH(a{user_id:%s})-[:following]->(f) return distinct f.user_id as user_id''' % (item['_id']) 167 | res = graph.data(cql) 168 | 169 | for f in res: 170 | user = db['typical'].find_one({'_id': f['user_id']}, {'category_score': 1}) 171 | category_score = user['category_score'] 172 | max_category = max(category_score, key = lambda x: category_score[x]) 173 | 174 | if max_category in relation_dict: 175 | relation_dict[max_category] += 1 176 | 177 | cql = '''MATCH(a{user_id:%s})<-[:following]-(f) return distinct f.user_id as user_id''' % (item['_id']) 178 | res = graph.data(cql) 179 | 180 | for f in res: 181 | user = db['typical'].find_one({'_id': f['user_id']}, {'category_score': 1}) 182 | category_score = user['category_score'] 183 | max_category = max(category_score, key = lambda x: category_score[x]) 184 | 185 | if max_category in relation_dict: 186 | relation_dict[max_category] += 1 187 | 188 | relation_total = 0 189 | 190 | for ri in relation_dict: 191 | relation_total += relation_dict[ri] 192 | 193 | if relation_total < 10: 194 | if sorted_list[0][0] != item['category']: 195 | count += 1 196 | continue 197 | 198 | for ri in relation_dict: 199 | item['category_score'][ri] += round(score_differ * relation_dict[ri] / relation_total, 2) 200 | 201 | users_temp.append({'_id':item['_id'], "category_score": item['category_score']}) 202 | 203 | s1 = sorted_list[0][0] 204 | 205 | sorted_list = sorted(item['category_score'].iteritems(), key = lambda asd:asd[1], reverse = True) 206 | 207 | # if sorted_list[0][0] == item['category'] and s1 != item['category']: 208 | # print item['screen_name'] 209 | 210 | if sorted_list[0][0] != item['category']: 211 | count += 1 212 | 213 | print count 214 | 215 | for item in users_temp: 216 | db['typical'].update({'_id': item['_id']}, {"$set": {"category_score": item['category_score']}}) 217 | 218 | 219 | 220 | 221 | 222 | if __name__ == "__main__": 223 | # update_user_category() 224 | # db = MongoDB().connect() 225 | # users = db['typical'].find_one({'_id': 4418090668}) 226 | 227 | # for t in users['tweets']: 228 | # try: 229 | # print t['text'] 230 | # except Exception as e: 231 | # continue 232 | # count = 0 233 | # for user in users: 234 | # # tags = interest_extract.extract_tags(user['tweets'], user['description']) 235 | # # 
print tags 236 | 237 | # max_score = sorted(user['category_score'].iteritems(), key = lambda asd:asd[1], reverse=True) 238 | # # print 239 | # if user['category'] != max_score[0][0]: 240 | # # print max_score[0][0] 241 | # count += 1 242 | # print user['screen_name'] 243 | # # break 244 | # print count 245 | # for tt in user['tweets']: 246 | # print tt['text'] 247 | # extract_interset() 248 | # calc_sentiment() 249 | # calc_sentiment_score() 250 | # sentiment_dict_test() 251 | 252 | # try: 253 | # words = word_tokenize("What a beautiful sunday . happy") 254 | # print nltk.pos_tag(words) 255 | # except Exception as e: 256 | # print e 257 | db = MongoDB().connect() 258 | users = db['typical'].find() 259 | 260 | count = 1 261 | data_set = { 262 | 'retweet_favorite_rate': [], 263 | 'fans_retweet_rate': [], 264 | 'fans_favorite_rate': [] 265 | } 266 | for user in users: 267 | tweets = user['tweets'] 268 | fans = user['followers_count'] 269 | 270 | # if fans > 2000000: 271 | # continue 272 | count += 1 273 | 274 | if count > 150: 275 | break 276 | tweet_count = 0 277 | retweet_count = 0 278 | favorite_count = 0 279 | for tweet in tweets: 280 | if 'RT @' not in tweet['text']: 281 | tweet_count += 1. 282 | 283 | retweet_count += tweet['retweet_count'] 284 | favorite_count += tweet['favorite_count'] 285 | 286 | fans_retweet_rate = fans / (retweet_count / tweet_count) 287 | if fans_retweet_rate > 600000 or fans_retweet_rate < 50: 288 | continue 289 | 290 | fans_favorite_rate = fans / (favorite_count / tweet_count) 291 | if fans_favorite_rate > 600000 or fans_favorite_rate < 50: 292 | continue 293 | 294 | retweet_favorite_rate = (retweet_count / tweet_count) / (favorite_count / tweet_count) 295 | # print fans, tweet_count, retweet_count, retweet_count / tweet_count 296 | if fans_retweet_rate < 0: 297 | print user['_id'] 298 | data_set['retweet_favorite_rate'].append(retweet_favorite_rate) 299 | data_set['fans_retweet_rate'].append(fans_retweet_rate) 300 | data_set['fans_favorite_rate'].append(fans_favorite_rate) 301 | 302 | x_axix = range(len(data_set['retweet_favorite_rate'])) 303 | x_axix = np.array(x_axix) 304 | x_axix_new = np.linspace(x_axix.min(), x_axix.max(), 4000) 305 | 306 | y_axix = data_set['retweet_favorite_rate'] 307 | y_axix_new = spline(x_axix ,y_axix, x_axix_new) 308 | 309 | plt.plot(x_axix_new, y_axix_new, color='green', label='Retweet Favorite Rate') 310 | 311 | y_axix = data_set['fans_retweet_rate'] 312 | y_axix_new = spline(x_axix ,y_axix, x_axix_new) 313 | 314 | plt.plot(x_axix_new, y_axix_new, color='red', label='Fans Retweet Rate') 315 | 316 | # y_axix = data_set['fans_favorite_rate'] 317 | # y_axix_new = spline(x_axix, y_axix, x_axix_new) 318 | plt.plot(x_axix, y_axix, color='blue', label='Fans Retweet Rate') 319 | # plt.plot(x_axix, train_pn_dis, color='skyblue', label='PN distance') 320 | # plt.plot(x_axix, thresholds, color='blue', label='threshold') 321 | plt.legend() # 显示图例 322 | 323 | # # print y_axix, x_axix_new 324 | # for i in x_axix_new: 325 | # print i 326 | plt.xlabel('Users') 327 | plt.ylabel('Rate') 328 | plt.show() 329 | 330 | -------------------------------------------------------------------------------- /portrayal/resource/stop_words.txt: -------------------------------------------------------------------------------- 1 | 'd 2 | 'll 3 | 'm 4 | 're 5 | 's 6 | 't 7 | 've 8 | ZT 9 | ZZ 10 | a 11 | a's 12 | able 13 | about 14 | above 15 | abst 16 | accordance 17 | according 18 | accordingly 19 | across 20 | act 21 | actually 22 | added 23 | adj 24 | adopted 25 | 
affected 26 | affecting 27 | affects 28 | after 29 | afterwards 30 | again 31 | against 32 | ah 33 | aha 34 | ahh 35 | ain't 36 | all 37 | allow 38 | allows 39 | almost 40 | along 41 | already 42 | also 43 | although 44 | always 45 | am 46 | among 47 | amongst 48 | amp 49 | an 50 | and 51 | announce 52 | another 53 | any 54 | anybody 55 | anyhow 56 | anymore 57 | anyone 58 | anything 59 | anyway 60 | anyways 61 | anywhere 62 | apart 63 | apparently 64 | appear 65 | approximately 66 | are 67 | area 68 | areas 69 | aren 70 | aren't 71 | arent 72 | arise 73 | around 74 | as 75 | aside 76 | ask 77 | asked 78 | asking 79 | asks 80 | associated 81 | at 82 | auth 83 | away 84 | b 85 | back 86 | backed 87 | backing 88 | backs 89 | be 90 | became 91 | because 92 | become 93 | becomes 94 | becoming 95 | been 96 | before 97 | beforehand 98 | began 99 | begin 100 | beginning 101 | beginnings 102 | begins 103 | behind 104 | being 105 | beings 106 | believe 107 | below 108 | beside 109 | besides 110 | best 111 | better 112 | between 113 | beyond 114 | big 115 | biol 116 | bit 117 | both 118 | brief 119 | briefly 120 | bro 121 | bruh 122 | but 123 | by 124 | c 125 | c'mon 126 | c's 127 | ca 128 | came 129 | can 130 | can't 131 | cannot 132 | cant 133 | case 134 | cases 135 | cause 136 | causes 137 | certain 138 | certainly 139 | changes 140 | clear 141 | clearly 142 | click 143 | co 144 | com 145 | come 146 | comes 147 | concerning 148 | consequently 149 | consider 150 | considering 151 | contain 152 | containing 153 | contains 154 | corresponding 155 | could 156 | couldn't 157 | couldnt 158 | course 159 | currently 160 | d 161 | date 162 | dawg 163 | day 164 | definitely 165 | describe 166 | described 167 | despite 168 | did 169 | didn 170 | didn't 171 | differ 172 | different 173 | differently 174 | discuss 175 | do 176 | does 177 | doesn 178 | doesn't 179 | doing 180 | don't 181 | done 182 | down 183 | downed 184 | downing 185 | downs 186 | downwards 187 | dub 188 | dude 189 | due 190 | during 191 | e 192 | each 193 | early 194 | ed 195 | edu 196 | effect 197 | eg 198 | eight 199 | eighty 200 | either 201 | else 202 | elsewhere 203 | end 204 | ended 205 | ending 206 | ends 207 | enough 208 | entirely 209 | especially 210 | et 211 | et-al 212 | etc 213 | even 214 | evenly 215 | ever 216 | every 217 | everybody 218 | everyone 219 | everything 220 | everywhere 221 | ex 222 | exactly 223 | example 224 | except 225 | f 226 | face 227 | faces 228 | fact 229 | facts 230 | far 231 | fella 232 | felt 233 | few 234 | ff 235 | fifth 236 | find 237 | finds 238 | first 239 | five 240 | fix 241 | followed 242 | following 243 | follows 244 | for 245 | former 246 | formerly 247 | forth 248 | found 249 | four 250 | from 251 | full 252 | fully 253 | further 254 | furthered 255 | furthering 256 | furthermore 257 | furthers 258 | g 259 | gave 260 | general 261 | generally 262 | get 263 | gets 264 | getting 265 | give 266 | given 267 | gives 268 | giving 269 | go 270 | goes 271 | going 272 | gon 273 | gone 274 | goo 275 | good 276 | goods 277 | got 278 | gotten 279 | greetings 280 | group 281 | grouped 282 | grouping 283 | groups 284 | guy 285 | God 286 | god 287 | h 288 | ha 289 | haa 290 | had 291 | hadn't 292 | hah 293 | haha 294 | hahaa 295 | happens 296 | hardly 297 | has 298 | hasn't 299 | have 300 | haven't 301 | having 302 | he 303 | he's 304 | hed 305 | heh 306 | hello 307 | help 308 | hence 309 | her 310 | here 311 | here's 312 | hereafter 313 | hereby 314 | herein 315 | heres 316 | hereupon 317 | hers 318 | 
herself 319 | hes 320 | hey 321 | hi 322 | hid 323 | hii 324 | high 325 | higher 326 | highest 327 | him 328 | himself 329 | his 330 | hither 331 | home 332 | homie 333 | homies 334 | hoo 335 | hopefully 336 | how 337 | howbeit 338 | however 339 | http 340 | https 341 | hundred 342 | i 343 | i'd 344 | i'll 345 | i'm 346 | i've 347 | id 348 | ie 349 | if 350 | ignored 351 | im 352 | immediate 353 | immediately 354 | importance 355 | important 356 | in 357 | inasmuch 358 | inc 359 | include 360 | indeed 361 | index 362 | indicate 363 | indicated 364 | indicates 365 | information 366 | inner 367 | insofar 368 | instead 369 | into 370 | inward 371 | is 372 | isn 373 | isn't 374 | it 375 | it'd 376 | it'll 377 | it's 378 | itd 379 | its 380 | itself 381 | j 382 | just 383 | k 384 | keep 385 | keeps 386 | kept 387 | keys 388 | kg 389 | kind 390 | km 391 | knew 392 | know 393 | known 394 | knows 395 | l 396 | large 397 | largely 398 | last 399 | lately 400 | later 401 | latest 402 | latter 403 | latterly 404 | least 405 | less 406 | lest 407 | let 408 | let's 409 | lets 410 | life 411 | like 412 | liked 413 | likely 414 | lil 415 | line 416 | link 417 | little 418 | lmao 419 | long 420 | longer 421 | longest 422 | look 423 | looking 424 | looks 425 | lot 426 | ltd 427 | m 428 | made 429 | mainly 430 | make 431 | makes 432 | making 433 | man 434 | many 435 | matter 436 | may 437 | maybe 438 | me 439 | mean 440 | means 441 | meantime 442 | meanwhile 443 | member 444 | members 445 | men 446 | merely 447 | mg 448 | might 449 | million 450 | miss 451 | ml 452 | more 453 | moreover 454 | most 455 | mostly 456 | mr 457 | mrs 458 | much 459 | mug 460 | must 461 | my 462 | myself 463 | n 464 | n't 465 | na 466 | name 467 | namely 468 | nay 469 | nd 470 | near 471 | nearly 472 | necessarily 473 | necessary 474 | need 475 | needed 476 | needing 477 | needs 478 | neither 479 | never 480 | nevertheless 481 | new 482 | newer 483 | newest 484 | next 485 | nine 486 | ninety 487 | night 488 | no 489 | nobody 490 | non 491 | none 492 | nonetheless 493 | noone 494 | nope 495 | nor 496 | normally 497 | nos 498 | not 499 | noted 500 | nothing 501 | novel 502 | now 503 | nowhere 504 | number 505 | numbers 506 | o 507 | obtain 508 | obtained 509 | obviously 510 | of 511 | off 512 | often 513 | oh 514 | ok 515 | okay 516 | old 517 | older 518 | oldest 519 | omitted 520 | on 521 | once 522 | one 523 | ones 524 | only 525 | onto 526 | open 527 | opened 528 | opening 529 | opens 530 | or 531 | ord 532 | order 533 | ordered 534 | ordering 535 | orders 536 | other 537 | others 538 | otherwise 539 | ought 540 | our 541 | ours 542 | ourselves 543 | out 544 | outside 545 | over 546 | overall 547 | owing 548 | own 549 | p 550 | page 551 | pages 552 | part 553 | parted 554 | particular 555 | particularly 556 | parting 557 | parts 558 | past 559 | per 560 | perhaps 561 | people 562 | place 563 | placed 564 | places 565 | please 566 | plus 567 | point 568 | pointed 569 | pointing 570 | points 571 | poorly 572 | possible 573 | possibly 574 | potentially 575 | pp 576 | predominantly 577 | present 578 | presented 579 | presenting 580 | presents 581 | presumably 582 | previously 583 | primarily 584 | probably 585 | promptly 586 | provides 587 | put 588 | puts 589 | q 590 | que 591 | quickly 592 | quite 593 | qv 594 | r 595 | ran 596 | rather 597 | rd 598 | re 599 | readily 600 | really 601 | recent 602 | recently 603 | ref 604 | refs 605 | regarding 606 | regardless 607 | regards 608 | related 609 | relatively 610 | research 611 | 
respectively 612 | resulted 613 | resulting 614 | results 615 | right 616 | room 617 | rooms 618 | run 619 | RT 620 | rt 621 | s 622 | said 623 | same 624 | saw 625 | say 626 | saying 627 | says 628 | sec 629 | second 630 | secondly 631 | seconds 632 | section 633 | see 634 | seeing 635 | seem 636 | seemed 637 | seeming 638 | seems 639 | seen 640 | sees 641 | self 642 | selves 643 | sensible 644 | sent 645 | serious 646 | seriously 647 | seven 648 | several 649 | shall 650 | she 651 | she'll 652 | shed 653 | shes 654 | should 655 | shouldn't 656 | show 657 | showed 658 | showing 659 | shown 660 | showns 661 | shows 662 | side 663 | sides 664 | similar 665 | similarly 666 | since 667 | sir 668 | six 669 | slightly 670 | small 671 | smaller 672 | smallest 673 | smh 674 | so 675 | some 676 | somebody 677 | somehow 678 | someone 679 | somethan 680 | something 681 | sometime 682 | sometimes 683 | somewhat 684 | somewhere 685 | soon 686 | sorry 687 | specifically 688 | specified 689 | specify 690 | specifying 691 | state 692 | states 693 | still 694 | stop 695 | strongly 696 | sub 697 | substantially 698 | successfully 699 | such 700 | suggest 701 | sup 702 | sure 703 | t 704 | t's 705 | take 706 | taken 707 | taking 708 | tell 709 | tends 710 | th 711 | thk 712 | thx 713 | than 714 | thank 715 | thaanks 716 | thanks 717 | thanx 718 | that 719 | that'll 720 | that's 721 | that've 722 | thats 723 | the 724 | their 725 | theirs 726 | them 727 | themselves 728 | then 729 | thence 730 | there 731 | there'll 732 | there's 733 | there've 734 | thereafter 735 | thereby 736 | thered 737 | therefore 738 | therein 739 | thereof 740 | therere 741 | theres 742 | thereto 743 | thereupon 744 | these 745 | they 746 | they'd 747 | they'll 748 | they're 749 | they've 750 | theyd 751 | theyre 752 | thing 753 | things 754 | think 755 | thinks 756 | third 757 | this 758 | tho 759 | thorough 760 | thoroughly 761 | those 762 | thou 763 | though 764 | thoughh 765 | thought 766 | thoughts 767 | thousand 768 | three 769 | throug 770 | through 771 | throughout 772 | thru 773 | thus 774 | til 775 | time 776 | tip 777 | to 778 | tday 779 | today 780 | together 781 | tomorrow 782 | tonight 783 | too 784 | took 785 | toward 786 | towards 787 | tried 788 | tries 789 | true 790 | truly 791 | try 792 | trying 793 | ts 794 | turn 795 | turned 796 | turning 797 | turns 798 | twice 799 | two 800 | u 801 | un 802 | under 803 | unfortunately 804 | unless 805 | unlike 806 | unlikely 807 | until 808 | unto 809 | up 810 | upon 811 | ups 812 | us 813 | use 814 | used 815 | useful 816 | usefully 817 | usefulness 818 | uses 819 | using 820 | usually 821 | uucp 822 | URL 823 | v 824 | value 825 | various 826 | very 827 | via 828 | viz 829 | vol 830 | vols 831 | vs 832 | w 833 | want 834 | wanted 835 | wanting 836 | wants 837 | was 838 | wasn't 839 | way 840 | ways 841 | we 842 | we'd 843 | we'll 844 | we're 845 | we've 846 | wed 847 | week 848 | welcome 849 | well 850 | wells 851 | went 852 | were 853 | weren't 854 | what 855 | what'll 856 | what's 857 | whatever 858 | whats 859 | when 860 | whence 861 | whenever 862 | where 863 | where's 864 | whereafter 865 | whereas 866 | whereby 867 | wherein 868 | wheres 869 | whereupon 870 | wherever 871 | whether 872 | which 873 | while 874 | whim 875 | whither 876 | who 877 | whoa 878 | who'll 879 | who's 880 | whod 881 | whoever 882 | whole 883 | whom 884 | whomever 885 | whos 886 | whose 887 | why 888 | widely 889 | will 890 | willing 891 | wish 892 | with 893 | within 894 | without 895 | won't 
896 | wonder 897 | woohoo 898 | words 899 | work 900 | worked 901 | working 902 | works 903 | world 904 | would 905 | wouldn't 906 | www 907 | x 908 | y 909 | yeah 910 | yeahh 911 | year 912 | years 913 | yes 914 | yet 915 | yesterday 916 | you 917 | you'd 918 | you'll 919 | you're 920 | you've 921 | youd 922 | young 923 | younger 924 | youngest 925 | your 926 | youre 927 | yours 928 | yourself 929 | yourselves 930 | yuu 931 | z 932 | zero 933 | zt 934 | zz -------------------------------------------------------------------------------- /portrayal/tools/generate_xml.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import sys 3 | import xml.dom.minidom 4 | 5 | from .. config import PROJECT_PATH, XML_PATH 6 | 7 | # reload(sys) 8 | # sys.setdefaultencoding('utf8') 9 | 10 | # 获取DOM树实现对象 11 | impl = xml.dom.minidom.getDOMImplementation() 12 | 13 | # 生成用户的XML文件 14 | def generate_user_xml(user): 15 | dom = impl.createDocument(None, 'TwitterUser', None) 16 | root = dom.documentElement 17 | 18 | # 创建子节点 19 | basic_info = dom.createElement('基础信息') 20 | implicit_info = dom.createElement('隐性属性') 21 | root.appendChild(basic_info) 22 | root.appendChild(implicit_info) 23 | 24 | 25 | # id 26 | id_ele = dom.createElement('用户ID') 27 | 28 | if user.has_key('_id'): 29 | id_text = dom.createTextNode(str(user['_id'])) 30 | else: 31 | id_text = dom.createTextNode(str(user['user_id'])) 32 | 33 | id_ele.appendChild(id_text) 34 | 35 | 36 | # screen_name 37 | sn_ele = dom.createElement('screen_name') 38 | sn_text = dom.createTextNode(user['screen_name']) 39 | sn_ele.appendChild(sn_text) 40 | 41 | 42 | # name 43 | name_ele = dom.createElement('name') 44 | # 标签增加属性,设置编码方式 45 | # name_ele.setAttribute("coding", "utf-8") 46 | _name = user['name'] if user['name'] else '' 47 | name_text = dom.createTextNode(_name) 48 | name_ele.appendChild(name_text) 49 | 50 | 51 | # 个人描述 52 | des_ele = dom.createElement('简介') 53 | _description = user['description'] if user['description'] else '' 54 | des_text = dom.createTextNode(_description) 55 | des_ele.appendChild(des_text) 56 | 57 | 58 | # 地理位置 59 | location_ele = dom.createElement('地理位置') 60 | _location = user['location'] if user['location'] else '' 61 | location_text = dom.createTextNode(_location) 62 | location_ele.appendChild(location_text) 63 | 64 | 65 | # 帐号创建日期 66 | create_ele = dom.createElement('帐号创建日期') 67 | create_text = dom.createTextNode(str(user['created_at'])) 68 | create_ele.appendChild(create_text) 69 | 70 | 71 | # 粉丝数 72 | follower_ele = dom.createElement('粉丝数') 73 | follower_text = dom.createTextNode(str(user['followers_count'])) 74 | follower_ele.appendChild(follower_text) 75 | 76 | 77 | # 朋友数 78 | friends_ele = dom.createElement('朋友数') 79 | friends_text = dom.createTextNode(str(user['friends_count'])) 80 | friends_ele.appendChild(friends_text) 81 | 82 | 83 | # 推文数 84 | status_ele = dom.createElement('推文数') 85 | status_text = dom.createTextNode(str(user['statuses_count'])) 86 | status_ele.appendChild(status_text) 87 | 88 | 89 | # 喜欢的推文数 90 | favourite_ele = dom.createElement('喜欢的推文数') 91 | favourite_text = dom.createTextNode(str(user['favourites_count'])) 92 | favourite_ele.appendChild(favourite_text) 93 | 94 | 95 | # 列表数量 96 | list_ele = dom.createElement('所属列表数') 97 | list_text = dom.createTextNode(str(user['listed_count'])) 98 | list_ele.appendChild(list_text) 99 | 100 | 101 | # 是否认证 102 | verified_ele = dom.createElement('官方认证') 103 | verified_text = dom.createTextNode(str(user['verified'])) 
104 | verified_ele.appendChild(verified_text) 105 | 106 | 107 | # 隐私保护 108 | pro_ele = dom.createElement('隐私保护') 109 | pro_text = dom.createTextNode(str(user['protected'])) 110 | pro_ele.appendChild(pro_text) 111 | 112 | 113 | #地理位置共享 114 | geo_enabled_ele = dom.createElement('地理位置共享') 115 | geo_enabled_text = dom.createTextNode(str(user['geo_enabled'])) 116 | geo_enabled_ele.appendChild(geo_enabled_text) 117 | 118 | 119 | # 使用语言 120 | lang_ele = dom.createElement('语言') 121 | _lang = user['lang'] if user['lang'] else '' 122 | lang_text = dom.createTextNode(_lang) 123 | lang_ele.appendChild(lang_text) 124 | 125 | 126 | # 时区 127 | time_zone = dom.createElement('时区') 128 | _time_zone = user['time_zone'] if user['time_zone'] else '' 129 | time_text = dom.createTextNode(_time_zone) 130 | time_zone.appendChild(time_text) 131 | 132 | 133 | # 国际协调时偏移量 134 | utc_ele = dom.createElement('国际协调时偏移量') 135 | utc_text = dom.createTextNode(str(user['utc_offset'])) 136 | utc_ele.appendChild(utc_text) 137 | 138 | 139 | # 是否使用默认头像 140 | default_ele = dom.createElement('是否使用默认头像') 141 | default_text = dom.createTextNode(str(user['default_profile_image'])) 142 | default_ele.appendChild(default_text) 143 | 144 | 145 | # 头像链接 146 | profile_ele = dom.createElement('头像链接') 147 | profile_text = dom.createTextNode(user['profile_image_url']) 148 | profile_ele.appendChild(profile_text) 149 | 150 | 151 | # 背景图片链接 152 | banner_ele = dom.createElement('背景图片链接') 153 | _profile_banner_url = user['profile_banner_url'] if user['profile_banner_url'] else '' 154 | banner_text = dom.createTextNode(_profile_banner_url) 155 | banner_ele.appendChild(banner_text) 156 | 157 | 158 | # 主页背景颜色 159 | bgcolor_ele = dom.createElement('主页背景颜色') 160 | bgcolor_text = dom.createTextNode(user['profile_background_color']) 161 | bgcolor_ele.appendChild(bgcolor_text) 162 | 163 | 164 | #侧边栏填充颜色 165 | profile_sidebar_ele = dom.createElement('侧边栏填充颜色') 166 | profile_sidebar_text = dom.createTextNode(user['profile_sidebar_fill_color']) 167 | profile_sidebar_ele.appendChild(profile_sidebar_text) 168 | 169 | 170 | #抓取到的推文数 171 | tweets_crawled_ele = dom.createElement('抓取到的推文数') 172 | tweets_crawled_text = dom.createTextNode(str(len(user['tweets']))) 173 | tweets_crawled_ele.appendChild(tweets_crawled_text) 174 | 175 | 176 | #已抓取推文开始时间 177 | tweets_crawled_start_ele = dom.createElement('已抓取推文开始时间') 178 | tweets_crawled_start_text = dom.createTextNode(user['tweets'][0]['created_at'] if len(user['tweets']) > 0 else '') 179 | tweets_crawled_start_ele.appendChild(tweets_crawled_start_text) 180 | 181 | 182 | #已抓取推文结束时间 183 | tweets_crawled_end_ele = dom.createElement('已抓取推文结束时间') 184 | tweets_crawled_end_text = dom.createTextNode(user['tweets'][-1]['created_at'] if len(user['tweets']) > 0 else '') 185 | tweets_crawled_end_ele.appendChild(tweets_crawled_end_text) 186 | 187 | 188 | # 抓取日期 189 | crawler_date_ele = dom.createElement('抓取日期') 190 | crawler_date_text = dom.createTextNode(str(user['crawler_date'])) 191 | crawler_date_ele.appendChild(crawler_date_text) 192 | 193 | 194 | # 把基本信息加入到basic_info节点中 195 | basic_info.appendChild(id_ele) 196 | basic_info.appendChild(sn_ele) 197 | basic_info.appendChild(name_ele) 198 | basic_info.appendChild(des_ele) 199 | basic_info.appendChild(location_ele) 200 | basic_info.appendChild(create_ele) 201 | basic_info.appendChild(follower_ele) 202 | basic_info.appendChild(friends_ele) 203 | basic_info.appendChild(status_ele) 204 | basic_info.appendChild(favourite_ele) 205 | basic_info.appendChild(list_ele) 206 | 
basic_info.appendChild(verified_ele) 207 | basic_info.appendChild(pro_ele) 208 | basic_info.appendChild(geo_enabled_ele) 209 | basic_info.appendChild(lang_ele) 210 | basic_info.appendChild(time_zone) 211 | basic_info.appendChild(utc_ele) 212 | basic_info.appendChild(default_ele) 213 | basic_info.appendChild(profile_ele) 214 | basic_info.appendChild(banner_ele) 215 | basic_info.appendChild(bgcolor_ele) 216 | basic_info.appendChild(profile_sidebar_ele) 217 | basic_info.appendChild(tweets_crawled_ele) 218 | basic_info.appendChild(tweets_crawled_start_ele) 219 | basic_info.appendChild(tweets_crawled_end_ele) 220 | basic_info.appendChild(crawler_date_ele) 221 | 222 | 223 | 224 | # 职业领域分类 225 | category_ele = dom.createElement("职业领域") 226 | category_text = dom.createTextNode(user['category']) 227 | category_ele.appendChild(category_text) 228 | 229 | 230 | # 职业领域得分 231 | category_score_ele = dom.createElement("职业领域得分") 232 | category_score_str = '' 233 | 234 | for item in user['category_score']: 235 | category_score_str += item + ": " + str(user['category_score'][item]) + "; " 236 | 237 | category_score_text = dom.createTextNode(category_score_str[0:-2]) 238 | category_score_ele.appendChild(category_score_text) 239 | 240 | 241 | # 用户社交影响力 242 | influence_ele = dom.createElement("影响力分数") 243 | influence_text = dom.createTextNode(str(user['influence_score'])) 244 | influence_ele.appendChild(influence_text) 245 | 246 | 247 | if user['influence_score'] >= 110: 248 | influence_rank = '高' 249 | elif user['influence_score'] >= 60: 250 | influence_rank = '中' 251 | else: 252 | influence_rank = '低' 253 | # 用户社交影响力大小 254 | influence_rank_ele = dom.createElement("影响力等级") 255 | influence_rank_text = dom.createTextNode(influence_rank) 256 | influence_rank_ele.appendChild(influence_rank_text) 257 | 258 | 259 | # 用户心里状态标签 260 | psy_ele = dom.createElement("心理状态") 261 | 262 | if user['psy'] == 1: 263 | psy_temp = '正面' 264 | elif user['psy'] == -1: 265 | psy_temp = '负面' 266 | else: 267 | psy_temp = '中性' 268 | 269 | psy_text = dom.createTextNode(psy_temp) 270 | psy_ele.appendChild(psy_text) 271 | 272 | 273 | # 用户兴趣爱好标签 274 | interest_ele = dom.createElement("兴趣爱好标签") 275 | interest_text = dom.createTextNode(user['interest_tags']) 276 | interest_ele.appendChild(interest_text) 277 | 278 | 279 | # 活跃度 280 | activity_ele = dom.createElement("活跃度") 281 | activity_text = dom.createTextNode(str(user['activity'])) 282 | activity_ele.appendChild(activity_text) 283 | 284 | 285 | # 活跃度变化 286 | activity_list_ele = dom.createElement("活跃度变化") 287 | activity_list_str = '' 288 | 289 | for item in user['activity_list']: 290 | activity_list_str += str(item) + ", " 291 | 292 | activity_list_text = dom.createTextNode(activity_list_str[0:-2]) 293 | activity_list_ele.appendChild(activity_list_text) 294 | 295 | 296 | # 心理状态变化(相同推文数,方法1) 297 | psy_with_count1_ele = dom.createElement("心理状态变化") 298 | psy_with_count1_ele.setAttribute("type", "相同推文数") 299 | psy_with_count1_ele.setAttribute("method", "分类器分类") 300 | psy_with_count1_str = '' 301 | 302 | for item in user['psy_with_count1']: 303 | psy_with_count1_str += str(item) + ", " 304 | 305 | psy_with_count1_text = dom.createTextNode(psy_with_count1_str[0:-2]) 306 | psy_with_count1_ele.appendChild(psy_with_count1_text) 307 | 308 | 309 | # 心理状态变化(相同推文数,方法2) 310 | psy_with_count2_ele = dom.createElement("心理状态变化") 311 | psy_with_count2_ele.setAttribute("type", "相同推文数") 312 | psy_with_count2_ele.setAttribute("method", "情感字典") 313 | psy_with_count2_str = '' 314 | 315 | for item in 
user['psy_with_count2']: 316 | psy_with_count2_str += str(item) + ", " 317 | 318 | psy_with_count2_text = dom.createTextNode(psy_with_count2_str[0:-2]) 319 | psy_with_count2_ele.appendChild(psy_with_count2_text) 320 | 321 | 322 | # 心理状态变化(相同时间间隔,方法1) 323 | psy_with_time1_ele = dom.createElement("心理状态变化") 324 | psy_with_time1_ele.setAttribute("type", "相同时间间隔") 325 | psy_with_time1_ele.setAttribute("method", "分类器分类") 326 | psy_with_time1_str = '' 327 | 328 | for item in user['psy_with_time1']: 329 | psy_with_time1_str += str(item) + ", " 330 | 331 | psy_with_time1_text = dom.createTextNode(psy_with_time1_str[0:-2]) 332 | psy_with_time1_ele.appendChild(psy_with_time1_text) 333 | 334 | 335 | # 心理状态变化(相同时间间隔,方法2) 336 | psy_with_time2_ele = dom.createElement("心理状态变化") 337 | psy_with_time2_ele.setAttribute("type", "相同时间间隔") 338 | psy_with_time2_ele.setAttribute("method", "情感字典") 339 | psy_with_time2_str = '' 340 | 341 | for item in user['psy_with_time2']: 342 | psy_with_time2_str += str(item) + ", " 343 | 344 | psy_with_time2_text = dom.createTextNode(psy_with_time2_str[0:-2]) 345 | psy_with_time2_ele.appendChild(psy_with_time2_text) 346 | 347 | 348 | # 将隐性属性标签加入到隐性标签中 349 | implicit_info.appendChild(category_ele) 350 | implicit_info.appendChild(category_score_ele) 351 | implicit_info.appendChild(influence_ele) 352 | implicit_info.appendChild(influence_rank_ele) 353 | implicit_info.appendChild(psy_ele) 354 | implicit_info.appendChild(interest_ele) 355 | implicit_info.appendChild(activity_ele) 356 | implicit_info.appendChild(activity_list_ele) 357 | implicit_info.appendChild(psy_with_count1_ele) 358 | implicit_info.appendChild(psy_with_count2_ele) 359 | implicit_info.appendChild(psy_with_time1_ele) 360 | implicit_info.appendChild(psy_with_time2_ele) 361 | 362 | 363 | # 将用户信息写入文件 364 | with open(XML_PATH + '%s.xml' % user['screen_name'], 'w') as f: 365 | dom.writexml(f, addindent=" ", newl='\n', encoding="utf-8") 366 | 367 | 368 | return XML_PATH + '%s.xml' % user['screen_name'] -------------------------------------------------------------------------------- /crawler/relation_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import time 3 | import threading 4 | 5 | from twitter import error 6 | from api import Api, API_COUNT 7 | from decorator import generate_decorator 8 | 9 | handle_exception = generate_decorator(720) 10 | 11 | class RelationCrawler: 12 | get_api = Api().get_api 13 | 14 | 15 | ''' 16 | Returns information about the relationship between the two users. 17 | 18 | Parameters: 19 | source_id – The user_id of the subject user [Optional] 20 | source_screen_name – The screen_name of the subject user [Optional] 21 | target_id – The user_id of the target user [Optional] 22 | target_screen_name – The screen_name of the target user [Optional] 23 | Returns: 24 | A Twitter Json structure. 
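        Example (an illustrative sketch only; 'alice' and 'bob' are placeholder
        screen names, and working credentials in APP_INFO, the list imported by
        crawler/api.py, are assumed):

            rc = RelationCrawler()
            relation = rc.show_friendship(source_screen_name = 'alice',
                                          target_screen_name = 'bob')
            # 'relation' mirrors Twitter's friendships/show payload, typically
            # {'relationship': {'source': {...}, 'target': {...}}}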
25 | ''' 26 | def show_friendship(self, 27 | source_user_id = None, 28 | source_screen_name = None, 29 | target_user_id = None, 30 | target_screen_name = None): 31 | 32 | if not source_user_id and not source_screen_name: 33 | return None 34 | 35 | if not target_user_id and not target_screen_name: 36 | return None 37 | 38 | return self.get_api().ShowFriendship(source_user_id, 39 | source_screen_name, 40 | target_user_id, 41 | target_screen_name) 42 | 43 | 44 | ''' 45 | 获取用户关系信息,如果超时则会休眠800s,然后返回关系信息(参考 show_friendship ) 46 | ''' 47 | def show_friendship_sleep(self, 48 | source_user_id = None, 49 | source_screen_name = None, 50 | target_user_id = None, 51 | target_screen_name = None): 52 | 53 | wrapper_func = handle_exception(self.show_friendship) 54 | relation = wrapper_func(source_user_id, source_screen_name, target_user_id, target_screen_name) 55 | 56 | return relation 57 | 58 | 59 | ''' 60 | Fetch a sequence of user ids, one for each friend. Returns a list of all the given user’s friends’ IDs. 61 | 62 | Parameters: 63 | user_id – The id of the user to retrieve the id list for. [Optional] 64 | screen_name – The screen_name of the user to retrieve the id list for. [Optional] 65 | cursor – Specifies the Twitter API Cursor location to start at. Note: there are pagination limits. [Optional] 66 | total_count – The total amount of UIDs to retrieve. Good if the account has many followers and you don’t want to get rate limited. 67 | 68 | Returns: 69 | A list of integers, one for each user id. 70 | ''' 71 | def get_friendids(self, 72 | user_id = None, 73 | screen_name = None, 74 | cursor = None, 75 | total_count = 60000): 76 | 77 | if user_id == None and screen_name == None: 78 | return None 79 | 80 | return self.get_api().GetFriendIDs(user_id = user_id, 81 | screen_name = screen_name, 82 | cursor = cursor, 83 | total_count = total_count) 84 | 85 | 86 | ''' 87 | Make a cursor driven call to return the list of all friends 88 | The caller is responsible for handling the cursor value and looping to gather all of the data 89 | 90 | Parameters: 91 | user_id – The twitter id of the user whose friends you are fetching. [Optional] 92 | screen_name – The twitter name of the user whose friends you are fetching. If not specified, defaults to the authenticated user. [Optional] 93 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 94 | stringify_ids – if True then twitter will return the ids as strings instead of integers. [Optional] 95 | count – The number of user id’s to retrieve per API request. Please be aware that this might get you rate-limited if set to a small number. 96 | By default Twitter will retrieve 5000 UIDs per call. 
[Optional] 97 | 98 | Returns: 99 | next_cursor, previous_cursor, data sequence of user ids, one for each friend 100 | ''' 101 | def get_friendids_paged(self, 102 | user_id = None, 103 | screen_name = None, 104 | cursor = -1, 105 | count = 5000, 106 | stringify_ids = False): 107 | 108 | if user_id == None and screen_name == None: 109 | return None 110 | 111 | return self.get_api().GetFriendIDsPaged(user_id = user_id, 112 | screen_name = screen_name, 113 | cursor = cursor, 114 | count = count, 115 | stringify_ids = stringify_ids) 116 | 117 | 118 | ''' 119 | 分页获取用户朋友id,如果超时则会休眠800s,然后返回朋友信息(参考 get_friendids_paged ) 120 | ''' 121 | def get_friendids_paged_sleep(self, 122 | user_id = None, 123 | screen_name = None, 124 | cursor = -1, 125 | stringify_ids = False, 126 | count = 5000): 127 | 128 | wrapper_func = handle_exception(self.get_friendids_paged) 129 | friendids = wrapper_func(user_id = user_id, 130 | screen_name = screen_name, 131 | cursor = cursor, 132 | stringify_ids = stringify_ids, 133 | count = count) 134 | 135 | return friendids 136 | 137 | 138 | ''' 139 | Fetch the sequence of twitter.User instances, one for each friend. 140 | If both user_id and screen_name are specified, this call will return the followers of the user specified by screen_name, 141 | however this behavior is undocumented by Twitter and may change without warning. 142 | 143 | Parameters: 144 | user_id – The twitter id of the user whose friends you are fetching. [Optional] 145 | screen_name – The twitter name of the user whose friends you are fetching. [Optional] 146 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 147 | total_count – The upper bound of number of users to return. 148 | skip_status – If True the statuses will not be returned in the user items. [Optional] 149 | include_user_entities – When True, the user entities will be included.
[Optional] 150 | 151 | Returns: 152 | A sequence of twitter.User instances, one for each friend 153 | ''' 154 | def get_friends(self, 155 | user_id = None, 156 | screen_name = None, 157 | cursor = None, 158 | total_count = 2500, 159 | skip_status = True, 160 | include_user_entities = True): 161 | 162 | if user_id == None and screen_name == None: 163 | return None 164 | 165 | return self.get_api().GetFriends(user_id = user_id, 166 | screen_name = screen_name, 167 | cursor = cursor, 168 | total_count = total_count, 169 | skip_status = skip_status, 170 | include_user_entities = include_user_entities) 171 | 172 | ''' 173 | 分页获取用户朋友信息(参考 get_friends ) 174 | ''' 175 | def get_friends_paged(self, 176 | user_id = None, 177 | screen_name = None, 178 | cursor = -1, 179 | count = 200, 180 | skip_status = True, 181 | include_user_entities = True): 182 | 183 | if user_id == None and screen_name == None: 184 | return None 185 | 186 | return self.get_api().GetFriendsPaged(user_id = user_id, 187 | screen_name = screen_name, 188 | cursor = cursor, 189 | count = count, 190 | skip_status = skip_status, 191 | include_user_entities = include_user_entities) 192 | 193 | ''' 194 | 获取用户所有朋友的id,并保存 195 | ''' 196 | # def get_all_friendids(self, 197 | # user_id = None, 198 | # screen_name = None): 199 | 200 | # cursor = -1 201 | # while cursor != 0: 202 | # out = self.get_friendids_paged_sleep(user_id = user_id, 203 | # screen_name = screen_name, 204 | # cursor = cursor, 205 | # count = 5000) 206 | # if not out: 207 | # return None 208 | 209 | # cursor = out[0] 210 | # friend_list = out[2] 211 | 212 | 213 | ''' 214 | Returns a list of twitter user id’s for every person that is following the specified user. 215 | 216 | Parameters: 217 | user_id – The id of the user to retrieve the id list for. [Optional] 218 | screen_name – The screen_name of the user to retrieve the id list for. [Optional] 219 | cursor – Specifies the Twitter API Cursor location to start at. Note: there are pagination limits. [Optional] 220 | total_count – The total amount of UIDs to retrieve. Good if the account has many followers and you don’t want to get rate limited. 221 | 222 | Returns: 223 | A list of integers, one for each user id. 224 | ''' 225 | def get_followerids(self, 226 | user_id = None, 227 | screen_name = None, 228 | cursor = None, 229 | total_count = 60000): 230 | 231 | if user_id == None and screen_name == None: 232 | return None 233 | 234 | return self.get_api().GetFollowerIDs(user_id = user_id, 235 | screen_name = screen_name, 236 | cursor = cursor, 237 | total_count = total_count) 238 | 239 | 240 | ''' 241 | Make a cursor driven call to return a list of one page followers. 242 | The caller is responsible for handling the cursor value and looping to gather all of the data 243 | 244 | Parameters: 245 | user_id – The twitter id of the user whose followers you are fetching. [Optional] 246 | screen_name – The twitter name of the user whose followers you are fetching. [Optional] 247 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 248 | stringify_ids – if True then twitter will return the ids as strings instead of integers. [Optional] 249 | count – The number of user id’s to retrieve per API request. Please be aware that this might get you rate-limited if set to a small number. 250 | By default Twitter will retrieve 5000 UIDs per call. 
[Optional] 251 | 252 | Returns: 253 | next_cursor, previous_cursor, data sequence of user ids, one for each follower 254 | ''' 255 | def get_followerids_paged(self, 256 | user_id = None, 257 | screen_name = None, 258 | cursor = -1, 259 | stringify_ids = False, 260 | count = 5000): 261 | 262 | if user_id == None and screen_name == None: 263 | return None 264 | 265 | return self.get_api().GetFollowerIDsPaged(user_id = user_id, 266 | screen_name = screen_name, 267 | cursor = cursor, 268 | count = count, 269 | stringify_ids = stringify_ids) 270 | 271 | 272 | ''' 273 | 分页获取用户粉丝id,如果超时则会休眠800s,然后返回粉丝信息(参考 get_followerids_page ) 274 | ''' 275 | def get_followerids_paged_sleep(self, 276 | user_id = None, 277 | screen_name = None, 278 | cursor = -1, 279 | stringify_ids = False, 280 | count = 5000): 281 | 282 | wrapper_func = handle_exception(self.get_followerids_paged) 283 | followerids = wrapper_func(user_id = user_id, 284 | screen_name = screen_name, 285 | cursor = cursor, 286 | stringify_ids = stringify_ids, 287 | count = count) 288 | 289 | return followerids 290 | 291 | 292 | ''' 293 | Fetch the sequence of twitter.User instances, one for each follower. 294 | If both user_id and screen_name are specified, this call will return the followers of the user specified by screen_name, 295 | however this behavior is undocumented by Twitter and may change without warning. 296 | 297 | Parameters: 298 | user_id – The twitter id of the user whose followers you are fetching. [Optional] 299 | screen_name – The twitter name of the user whose followers you are fetching. [Optional] 300 | cursor – Should be set to -1 for the initial call and then is used to control what result page Twitter returns. 301 | total_count – The upper bound of number of users to return, defaults to None. 302 | skip_status – If True the statuses will not be returned in the user items. [Optional] 303 | include_user_entities – When True, the user entities will be included. 
[Optional] 304 | 305 | Returns: 306 | A sequence of twitter.User instances, one for each follower 307 | ''' 308 | def get_followers(self, 309 | user_id = None, 310 | screen_name = None, 311 | cursor = None, 312 | total_count = 2500, 313 | skip_status = True, 314 | include_user_entities = True): 315 | 316 | if user_id == None and screen_name == None: 317 | return None 318 | 319 | return self.get_api().GetFollowers(user_id = user_id, 320 | screen_name = screen_name, 321 | cursor = cursor, 322 | total_count = total_count, 323 | skip_status = skip_status, 324 | include_user_entities = include_user_entities) 325 | 326 | 327 | ''' 328 | 分页获取用户粉丝信息(参考 get_followers ) 329 | ''' 330 | def get_followers_paged(self, 331 | user_id = None, 332 | screen_name = None, 333 | cursor = -1, 334 | count = 200, 335 | skip_status = True, 336 | include_user_entities = True): 337 | 338 | if user_id == None and screen_name == None: 339 | return None 340 | 341 | return self.get_api().GetFollowersPaged(user_id = user_id, 342 | screen_name = screen_name, 343 | cursor = cursor, 344 | count = count, 345 | skip_status = skip_status, 346 | include_user_entities = include_user_entities) 347 | 348 | 349 | ''' 350 | 获取用户所有粉丝的id,并保存 351 | ''' 352 | # def get_all_followersids(self, 353 | # user_id = None, 354 | # screen_name = None): 355 | 356 | # cursor = -1 357 | # while cursor != 0: 358 | # out = self.get_followerids_paged_sleep(user_id = user_id, 359 | # screen_name = screen_name, 360 | # cursor = cursor, 361 | # count = 5000) 362 | # if not out: 363 | # return None 364 | 365 | # cursor = out[0] 366 | # follower_list = out[2] 367 | 368 | 369 | if __name__ == '__main__': 370 | rc = RelationCrawler() 371 | print rc.get_followers_paged(screen_name='mrmarcohan') -------------------------------------------------------------------------------- /crawler/tweets_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import time 3 | import threading 4 | 5 | from config import THREAD_NUM 6 | from twitter import error 7 | from api import Api, API_COUNT 8 | from database import MongoDB 9 | from decorator import generate_decorator 10 | 11 | handle_exception = generate_decorator(300) 12 | 13 | class TweetsCrawler: 14 | get_api = Api().get_api 15 | 16 | 17 | ''' 18 | Fetch the sequence of public Status messages for a single user. 19 | 20 | Parameters: 21 | user_id (int, optional) – Specifies the ID of the user for whom to return the user_timeline. 22 | Helpful for disambiguating when a valid user ID is also a valid screen name. 23 | screen_name (str, optional) – Specifies the screen name of the user for whom to return the user_timeline. 24 | Helpful for disambiguating when a valid screen name is also a user ID. 25 | since_id (int, optional) – Returns results with an ID greater than (that is, more recent than) the specified ID. 26 | There are limits to the number of Tweets which can be accessed through the API. If the limit of Tweets has 27 | occurred since the since_id, the since_id will be forced to the oldest ID available. 28 | max_id (int, optional) – Returns only statuses with an ID less than (that is, older than) or equal to the specified ID. 29 | count (int, optional) – Specifies the number of statuses to retrieve. May not be greater than 200. 30 | include_rts (bool, optional) – If True, the timeline will contain native retweets (if they exist) in addition to the standard stream of tweets. 
31 | trim_user (bool, optional) – If True, statuses will only contain the numerical user ID only. Otherwise a full user object will be returned for each status. 32 | exclude_replies (bool, optional) – If True, this will prevent replies from appearing in the returned timeline. Using exclude_replies with the 33 | count parameter will mean you will receive up-to count tweets - this is because the count parameter retrieves that many tweets 34 | before filtering out retweets and replies. This parameter is only supported for JSON and XML responses. 35 | 36 | Returns: 37 | A sequence of Status instances, one for each message up to count 38 | ''' 39 | def get_user_timeline(self, 40 | user_id = None, 41 | screen_name = None, 42 | since_id = None, 43 | max_id = None, 44 | count = None, 45 | include_rts = True, 46 | trim_user = True, 47 | exclude_replies = False): 48 | 49 | if user_id == None and screen_name == None: 50 | return None 51 | 52 | return self.get_api().GetUserTimeline(user_id = user_id, 53 | screen_name = screen_name, 54 | since_id = since_id, 55 | max_id = max_id, 56 | count = count, 57 | include_rts = include_rts, 58 | trim_user = trim_user, 59 | exclude_replies = exclude_replies) 60 | 61 | 62 | ''' 63 | 获取用户所有推文信息,并保存在数据库(MongoDB)中(参考 get_user_timeline ) 64 | 65 | 参数: 66 | collect_name:数据库集合名,默认 tweets_task 67 | ''' 68 | def get_user_all_timeline(self, 69 | user_id = None, 70 | collect_name = "tweets_task", 71 | screen_name = None, 72 | include_rts = True, 73 | exclude_replies = False): 74 | 75 | if user_id == None and screen_name == None: 76 | return None 77 | 78 | if user_id: 79 | try: 80 | user_id = long(user_id) 81 | except Exception as e: 82 | print e 83 | return None 84 | 85 | flag = True 86 | tweets = [0] 87 | sleep_count = 0 88 | 89 | db = MongoDB().connect() 90 | collect = db[collect_name] 91 | get_api = self.get_api 92 | 93 | while len(tweets) > 0: 94 | try: 95 | if flag: 96 | tweets = get_api().GetUserTimeline(user_id = user_id, 97 | screen_name = screen_name, 98 | include_rts = include_rts, 99 | exclude_replies = exclude_replies, 100 | trim_user = True, 101 | count = 200) 102 | flag = False 103 | 104 | else: 105 | tweets = get_api().GetUserTimeline(user_id = user_id, 106 | screen_name = screen_name, 107 | include_rts = include_rts, 108 | exclude_replies = exclude_replies, 109 | trim_user = True, 110 | count = 200, 111 | max_id = tweets[-1].id - 1) 112 | 113 | except error.TwitterError as te: 114 | try: 115 | if te.message == 'Not authorized.': 116 | print 'Not authorized.' 117 | return 118 | 119 | if te.message[0]['code'] == 88: 120 | sleep_count += 1 121 | 122 | if sleep_count >= API_COUNT: 123 | print "sleeping..." 
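# At this point every registered app has answered with Twitter error code 88
# (rate limit exceeded), so the counter is reset and the crawler backs off for
# 300 seconds before retrying the same timeline request.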
124 | sleep_count = 0 125 | time.sleep(300) 126 | continue 127 | 128 | else: 129 | print te 130 | break 131 | except Exception as ee: 132 | print ee 133 | break 134 | except Exception as e: 135 | break 136 | 137 | for tt in tweets: 138 | tweet = self.tweetobj_to_dict(tt) 139 | 140 | if not tweet: 141 | continue 142 | 143 | try: 144 | collect.insert_one(tweet) 145 | except Exception as e: 146 | continue 147 | 148 | 149 | ''' 150 | 获取用户所有推文信息,并返回(参考 get_user_timeline ) 151 | ''' 152 | def get_user_all_timeline_return(self, 153 | user_id = None, 154 | screen_name = None, 155 | include_rts = True, 156 | exclude_replies = False): 157 | 158 | if user_id == None and screen_name == None: 159 | return None 160 | 161 | if user_id: 162 | try: 163 | user_id = long(user_id) 164 | except Exception as e: 165 | print e 166 | return None 167 | 168 | flag = True 169 | tweets = [0] 170 | sleep_count = 0 171 | 172 | tweet_list = [] 173 | 174 | get_api = self.get_api 175 | 176 | while len(tweets) > 0: 177 | try: 178 | if flag: 179 | tweets = get_api().GetUserTimeline(user_id = user_id, 180 | screen_name = screen_name, 181 | include_rts = include_rts, 182 | exclude_replies = exclude_replies, 183 | trim_user = True, 184 | count = 200) 185 | flag = False 186 | 187 | else: 188 | tweets = get_api().GetUserTimeline(user_id = user_id, 189 | screen_name = screen_name, 190 | include_rts = include_rts, 191 | exclude_replies = exclude_replies, 192 | trim_user = True, 193 | count = 200, 194 | max_id = tweets[-1].id - 1) 195 | 196 | except error.TwitterError as te: 197 | try: 198 | if te.message == 'Not authorized.': 199 | print 'Not authorized.' 200 | return None 201 | 202 | if te.message[0]['code'] == 88: 203 | sleep_count += 1 204 | 205 | if sleep_count >= API_COUNT: 206 | print "sleeping..." 
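# Same back-off as in get_user_all_timeline: all apps are rate limited (error
# code 88), so reset the counter and sleep 300 seconds before requesting the
# next page of tweets.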
207 | sleep_count = 0 208 | time.sleep(300) 209 | continue 210 | 211 | else: 212 | print te 213 | break 214 | except Exception as ee: 215 | print ee 216 | break 217 | except Exception as e: 218 | print e 219 | break 220 | 221 | for tt in tweets: 222 | tweet = self.tweetobj_to_dict(tt) 223 | 224 | if not tweet: 225 | continue 226 | 227 | try: 228 | tweet_list.append(tweet) 229 | except Exception as e: 230 | continue 231 | 232 | return tweet_list 233 | 234 | 235 | ''' 236 | 获取所有用户推文信息 237 | 238 | 参数: 239 | user_list (list, optional): 240 | 存放用户 user_id / screen_name 的列表 241 | collect_name (str, optional): 242 | 存储数据集合名,默认 tweets_task 243 | search_type (str, optional): 244 | 抓取方式,如果为 screen_name ,则认为 user_list 中 存放的是用户 screen_name, 245 | 否则认为 user_list 中 存放的是用户 user_id 246 | 247 | ''' 248 | def get_all_users_timeline(self, 249 | user_list = [], 250 | collect_name = "tweets_task", 251 | search_type = "user_id", 252 | include_rts = True, 253 | exclude_replies = False): 254 | 255 | if len(user_list) == 0: 256 | return 257 | 258 | i = 0 259 | thread_pool = [] 260 | length = len(user_list) 261 | per_thread = length / THREAD_NUM 262 | 263 | while i < THREAD_NUM: 264 | if i + 1 == THREAD_NUM: 265 | crawler_thread = threading.Thread(target = self.get_all_users_timeline_thread, 266 | args = (user_list[i * per_thread : ], collect_name, search_type, include_rts, exclude_replies,)) 267 | else: 268 | crawler_thread = threading.Thread(target = self.get_all_users_timeline_thread, 269 | args = (user_list[i * per_thread : (i + 1) * per_thread], collect_name, search_type, include_rts, exclude_replies,)) 270 | 271 | crawler_thread.start() 272 | thread_pool.append(crawler_thread) 273 | 274 | i += 1 275 | 276 | for t in thread_pool: 277 | t.join() 278 | 279 | 280 | ''' 281 | 线程:获取多个用户推文信息(参考 get_all_users_timeline ) 282 | ''' 283 | def get_all_users_timeline_thread(self, 284 | user_list = [], 285 | collect_name = "tweets_task", 286 | search_type = "user_id", 287 | include_rts = True, 288 | exclude_replies = False): 289 | 290 | if search_type != "screen_name": 291 | while len(user_list) > 0: 292 | user_id = user_list.pop(0) 293 | 294 | self.get_user_all_timeline(user_id = user_id, 295 | collect_name = collect_name, 296 | include_rts = include_rts, 297 | exclude_replies = exclude_replies) 298 | else: 299 | while len(user_list) > 0: 300 | screen_name = user_list.pop(0) 301 | 302 | self.get_user_all_timeline(screen_name = screen_name, 303 | collect_name = collect_name, 304 | include_rts = include_rts, 305 | exclude_replies = exclude_replies) 306 | 307 | 308 | ''' 309 | Returns a single status message, specified by the status_id parameter. 310 | 311 | Parameters: 312 | status_id – The numeric ID of the status you are trying to retrieve. 313 | trim_user – When set to True, each tweet returned in a timeline will include a user object including only the status authors numerical ID. 314 | Omit this parameter to receive the complete user object. [Optional] 315 | include_entities – If False, the entities node will be disincluded. This node offers a variety of metadata about the tweet in a 316 | discreet structure, including: user_mentions, urls, and hashtags. 
[Optional] 317 | 318 | Returns: 319 | A twitter.Status instance representing that status message 320 | ''' 321 | def get_status(self, 322 | status_id = None, 323 | trim_user = True, 324 | include_entities = True): 325 | 326 | if status_id == None: 327 | return None 328 | 329 | return self.get_api().GetStatus(status_id = status_id, 330 | trim_user = trim_user, 331 | include_my_retweet = False, 332 | include_entities = include_entities) 333 | 334 | 335 | ''' 336 | 根据推文ID获取所有推文信息(参考 get_status ) 337 | 338 | 参数: 339 | status_list (list, optional): 340 | 存放tweet id 的列表 341 | collect_name (str, optional): 342 | 存储数据集合名,默认 status 343 | ''' 344 | def get_all_status(self, 345 | status_list = [], 346 | collect_name = 'status', 347 | trim_user = True, 348 | include_entities = True): 349 | 350 | if len(status_list) == 0: 351 | return 352 | 353 | i = 0 354 | thread_pool = [] 355 | length = len(status_list) 356 | per_thread = length / THREAD_NUM 357 | 358 | while i < THREAD_NUM: 359 | if i + 1 == THREAD_NUM: 360 | crawler_thread = threading.Thread(target = self.get_all_status_thread, 361 | args = (status_list[i * per_thread : ], collect_name, trim_user, include_entities,)) 362 | else: 363 | crawler_thread = threading.Thread(target = self.get_all_status_thread, 364 | args = (status_list[i * per_thread : (i + 1) * per_thread], collect_name, trim_user, include_entities,)) 365 | 366 | crawler_thread.start() 367 | thread_pool.append(crawler_thread) 368 | 369 | i += 1 370 | 371 | for t in thread_pool: 372 | t.join() 373 | 374 | 375 | ''' 376 | 线程:根据推文ID获取所有推文信息(参考 get_all_status ) 377 | ''' 378 | def get_all_status_thread(self, 379 | status_list = [], 380 | collect_name = 'status', 381 | trim_user = True, 382 | include_entities = True): 383 | 384 | wrapper_func = handle_exception(self.get_status) 385 | 386 | db = MongoDB().connect() 387 | collect = db[collect_name] 388 | 389 | while len(status_list) > 0: 390 | status_id = status_list.pop(0) 391 | status_obj = wrapper_func(status_id) 392 | 393 | status = self.tweetobj_to_dict(status_obj) 394 | 395 | if not status: 396 | continue 397 | 398 | try: 399 | collect.insert_one(status) 400 | except Exception as e: 401 | continue 402 | 403 | 404 | ''' 405 | 将推文对象转换为字典类型 406 | ''' 407 | def tweetobj_to_dict(self, tt): 408 | if tt == None: 409 | return None 410 | 411 | try: 412 | tweet = { 413 | 'coordinates': tt.coordinates, # Coordinates 414 | 'created_at': tt.created_at, # String 415 | 'favorite_count': tt.favorite_count, # int 416 | 'filter_level': tt.filter_level if hasattr(tt, 'filter_level') else '', # String 417 | 'hashtags': map(lambda x: x.text, tt.hashtags), # {'0': ,'1':} 418 | '_id': tt.id_str, # String 419 | 'in_reply_to_status_id': tt.in_reply_to_status_id, 420 | 'in_reply_to_user_id': tt.in_reply_to_user_id, 421 | 'lang': tt.lang, # String 422 | 'place': tt.place, # Place 423 | 'possibly_sensitive': tt.possibly_sensitive, # Boolean 424 | 'retweet_count': tt.retweet_count, # int 425 | 'source': tt.source, # String 426 | 'text': tt.text, # String 427 | 'user_id': tt.user.id, # int 428 | 'user_mentions': map(lambda x: x.id, tt.user_mentions), # [] 429 | 'withheld_copyright': tt.withheld_copyright, # Boolean 430 | 'withheld_in_countries': tt.withheld_in_countries, # Array of String 431 | 'withheld_scope': tt.withheld_scope, #String 432 | } 433 | 434 | except Exception as e: 435 | print e 436 | return None 437 | 438 | return tweet 439 | 440 | 441 | if __name__ == '__main__': 442 | ts = TweetsCrawler() 443 | print ts.get_user_all_timeline(screen_name = 
'mrmarcohan') --------------------------------------------------------------------------------