├── .DS_Store
├── helpers
│   ├── cat_files.sh
│   ├── cummDistPlot.R
│   └── frequency_plot.py
├── user_vector_training
│   ├── word2vec
│   │   └── twitter_training_w2v.sh
│   ├── helpers
│   │   ├── remove_repeat_contexts.py
│   │   ├── node_frequency.py
│   │   ├── test_distance.py
│   │   └── feature_dist.py
│   ├── deg_dist_in_top_users.py
│   ├── nearest_neighbours
│   │   ├── plot_similarity_nearest_users.py
│   │   ├── query_nearest_users_sen.py
│   │   ├── test_kdtree_query.py
│   │   └── query_nearest_users.py
│   ├── distance_w2v.py
│   ├── filter_hashtag_sequence.py
│   └── sentence_creation
│       └── helpers
│           └── test_sentence.py
├── adopter_prediction
│   ├── helpers
│   │   └── measure.py
│   ├── prec_plot.py
│   ├── adopter_prediction.py
│   ├── adopter_prediction_parallel.py
│   ├── adopter_pred_cand_set_stat.py
│   ├── adopter_prediction_next_k.py
│   ├── adopter_prediction_next_k_weight_learning.py
│   └── adopter_prediction_single_topic.py
├── run_script.sh
├── filter_hashtag_sequence.py
├── train_test_split_hashtag_sequence.py
├── logs
│   └── notes.log
├── filter_follower_graph.py
├── tsne_plots
│   ├── tsne_word_visualisation.py
│   ├── tsne_user_visualisation.py
│   ├── tsne_hashtag_visualisation.py
│   ├── tsne.py
│   └── tsne_topic_adopters_visualisation.py
├── misc
│   ├── virality_prediction_features.py
│   └── test.py
├── README.md
├── Untitled
├── neighbourhood_experiments
│   ├── entropy_vs_spread.py
│   └── candidate_set_coverage
│       ├── cand_recall_plot.py
│       └── cand_cov_vs_spread.py
└── results.txt

-------------------------------------------------------------------------------- /helpers/cat_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | indir="sentences_files" 3 | outfile="sentences_files/userSentencesComb" 4 | for string in "$indir"/*.txt ; do 5 | cat "$string" >> "$outfile" 6 | echo "$string" 7 | # rm "$string" 8 | done -------------------------------------------------------------------------------- /user_vector_training/word2vec/twitter_training_w2v.sh: -------------------------------------------------------------------------------- 1 | make 2 | 3 | CORPUS=tweets_comb_processed 4 | SAVE_FILE=twitter_vectors.txt 5 | VOCAB_FILE=vocab.txt 6 | 7 | time ./word2vec -train $CORPUS -output $SAVE_FILE -cbow 0 -size 200 -window 8 -negative 5 -hs 0 -sample 1e-4 -threads 20 -binary 0 -iter 15 -save-vocab $VOCAB_FILE 8 | -------------------------------------------------------------------------------- /helpers/cummDistPlot.R: -------------------------------------------------------------------------------- 1 | # Cumulative distribution plot 2 | 3 | library(data.table) 4 | library(plyr) 5 | library(ggplot2) 6 | 7 | dat<-fread("G:/socialnetworks_project_log/degree_distribution/numFriendsPerUser.csv") 8 | dat<-as.data.frame(dat) 9 | 10 | plot(ecdf(dat$V2)) 11 | 12 | ggplot(dat,aes(x = V2)) + stat_ecdf() + 13 | scale_x_log10() + 14 | scale_y_continuous(expand = c(0,0)) + ylab("Cumulative distribution") + xlab("Value") + theme_bw(16) 15 | 16 | dat<-fread("G:/socialnetworks_project_log/degree_distribution/numTweetsPerAuthor.csv") 17 | dat<-as.data.frame(dat) 18 | plot(ecdf(dat$V2)) 19 | 20 | dat<-fread("G:/socialnetworks_project_log/degree_distribution/featuresUserSubset.csv") 21 | dat<-as.data.frame(dat) 22 | ggplot(dat,aes(x = V2)) + stat_ecdf() + 23 | scale_x_log10() + 24 | scale_y_continuous(expand = c(0,0)) + ylab("Cumulative distribution") + xlab("Value") + theme_bw(16) 25 | ggplot(dat,aes(x = V3)) + 
stat_ecdf() + 26 | scale_x_log10() + 27 | scale_y_continuous(expand = c(0,0)) + ylab("Cumulative distribution") + xlab("Value") + theme_bw(16) 28 | -------------------------------------------------------------------------------- /adopter_prediction/helpers/measure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os.path 4 | from string import * 5 | from sys import argv 6 | from subr import * 7 | 8 | if len(argv) < 4: 9 | print "Usage: %s testing_file testing_output_file training_class" % (argv[0]) 10 | sys.exit(1) 11 | 12 | def main(): 13 | original = read_first_column(argv[1]) 14 | test_output = read_first_column(argv[2]) 15 | train_new_class = read_first_column(argv[3]) 16 | 17 | predict = [] 18 | for i in range(len(test_output)): 19 | idx = atoi(test_output[i][0]) 20 | predict.append(train_new_class[idx]) 21 | 22 | if(len(predict) != len(original)): 23 | print "Error: lines of %s and %s are different." % (argv[1],argv[2]) 24 | sys.exit(1) 25 | 26 | labels = [] 27 | for i in range(len(train_new_class)): 28 | for lab in train_new_class[i]: 29 | if (lab not in labels): 30 | labels.append(lab) 31 | 32 | print "number of labels = %s" % len(labels) 33 | 34 | result = measure(original,predict,labels) 35 | 36 | print "Exact match ratio: %s" % result[0] 37 | print "Microaverage F-measure: %s" % result[1] 38 | print "Macroaverage F-measure: %s" % result[2] 39 | 40 | main() 41 | -------------------------------------------------------------------------------- /run_script.sh: -------------------------------------------------------------------------------- 1 | ## Basic workflow 2 | 3 | cd /dbresearch2/word2vec/degree_distribution 4 | 5 | ### pre-process hashtag adoption sequences ### 6 | # filter users 7 | python "deg_dist_in_top_users.py" 8 | 9 | # create hashtag sequences. Output to sentences_files_timeonly/ 10 | python "filter_hashtag_sequence.py" 11 | 12 | # filter follower graph 13 | python "filter_follower_graph.py" 14 | 15 | # get hashtags tweeted by these users 16 | python "train_test_split_hashtag_sequence.py" 17 | 18 | ### corpus creation ### 19 | # convert to sentences 20 | python "sentence_creation/sentence_hashtag_adoption.py" 21 | 22 | # concatenate sentence files. Output userSentencesComb file. 23 | bash cat_files.sh 24 | 25 | ### train vectors using word2vec ### 26 | # word2vec to get user vectors. 
Output node_vectors_1hr_bfsr.txt and node_vocab_1hr_bfsr.txt 27 | bash "node_vector_training.sh" 28 | 29 | ### adopter prediction task ### 30 | # frequency and exposure rank baselines 31 | python "adopter_prediction_baseline.py" 32 | 33 | # user vector averaging method 34 | python "adopter_prediction_multiple_prec_plot.py" 35 | 36 | ### geolocation prediction task ### 37 | # classification method and baselines 38 | python "user_vector_cluster_geography.py" -------------------------------------------------------------------------------- /helpers/frequency_plot.py: -------------------------------------------------------------------------------- 1 | # file to plot frequency distribution using matplotlib 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | rec_links = [] 7 | same_tags_tweets = [] 8 | with open("featuresUserSubset.csv","rb") as fr: 9 | for line in fr: 10 | line = line.rstrip() 11 | u = line.split(',') 12 | id,rec,tags = int(u[0]),int(u[1]),int(u[2]) 13 | rec_links.append(rec) 14 | same_tags_tweets.append(tags) 15 | rec_links = np.array(rec_links) 16 | same_tags_tweets = np.array(same_tags_tweets) 17 | 18 | num_bin = 100000 19 | def freq_plot(data,xlab): 20 | values, base = np.histogram(data, bins=num_bin) 21 | cumulative = np.cumsum(values) 22 | plt.plot(base[:-1], values, c='red') #frequency 23 | # plt.plot(base[:-1], cumulative/float(len(data)), c='red') #normalised 24 | # plt.plot(base[:-1], len(data)-cumulative, c='red') #inverse, greater than 25 | # plt.plot(base[:-1], len(data)-np.append(0,cumulative)[:-1], c='red') #inverse, greater than or equal to 26 | # plt.yscale('log') 27 | plt.xscale('log') 28 | plt.xlabel(xlab) 29 | plt.xlim(xmin=0) 30 | plt.ylabel('cumulative frequency') 31 | plt.title('cumulative frequency distribution (greater than or equal to)') 32 | plt.grid() 33 | plt.show() 34 | 35 | freq_plot(rec_links,'Users with reciprocal links') 36 | freq_plot(same_tags_tweets,'Users with tweets on same hashtags') -------------------------------------------------------------------------------- /filter_hashtag_sequence.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | 8 | min_tweets_sequence = 2 9 | selected_users = set() 10 | with open("userSubset.csv","r") as fr: 11 | for line in fr: 12 | line = line.rstrip() 13 | u = line.split(',') 14 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 15 | selected_users.add(id) 16 | 17 | m = dict() 18 | fr = open("/twitterSimulations/graph/map.txt") 19 | for line in fr: 20 | line = line.rstrip() 21 | u = line.split(' ') 22 | m[int(u[0])] = int(u[1]) 23 | fr.close() 24 | print 'Map Read' 25 | 26 | adoption_sequence = dict() 27 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 28 | for line in fr: 29 | line = line.rstrip() 30 | u = line.split('\t') 31 | tag = u[0] 32 | time = int(u[1]) 33 | author = m[int(u[2])] 34 | if author not in selected_users: 35 | continue 36 | try: 37 | adoption_sequence[tag].append((time,author)) 38 | except KeyError: 39 | adoption_sequence[tag]=[(time,author)] 40 | print len(adoption_sequence) 41 | 42 | with open('hashtagAdoptionSequences.txt','wb') as fd: # 'hashtagAdoptionSequences_filter.txt' 43 | for tag in adoption_sequence.keys(): 44 | if len(adoption_sequence[tag])>=min_tweets_sequence: 45 | fd.write(tag) 46 | for t,a in adoption_sequence[tag]: 47 | fd.write(' 
'+str(t)+','+str(a)) #author is of type str for using join 48 | fd.write('\n') -------------------------------------------------------------------------------- /train_test_split_hashtag_sequence.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | 8 | #separate sequences into training (80%) and test sequences (20%) 9 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 10 | adoption_sequence = [] 11 | large_tag_id = [] 12 | count=0 13 | with open(adoption_sequence_filename, 'r') as fr: 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | #tag = u[0] 18 | sequence = [] 19 | if len(u)-1>=100000: 20 | large_tag_id.append(count) 21 | for i in range(1, len(u)): 22 | #timestamp = int(u[i][0:u[i].index(',')]) 23 | author = int(u[i][u[i].index(',')+1 : ]) 24 | sequence.append(author) 25 | adoption_sequence.append(sequence) 26 | count+=1 27 | 28 | num_lines = len(adoption_sequence) #3617312 29 | print num_lines 30 | seq_random_index=range(0,num_lines) 31 | random.shuffle(seq_random_index) 32 | num_train = int(0.8*num_lines) 33 | print num_train 34 | train_seq_id = seq_random_index[:num_train] 35 | test_seq_id = seq_random_index[num_train:] 36 | with open("sequence_file_split_indices.pickle","wb") as fd: 37 | pickle.dump(train_seq_id,fd) 38 | pickle.dump(test_seq_id,fd) 39 | users_train=set() 40 | for i in train_seq_id: 41 | for u in adoption_sequence[i]: 42 | users_train.add(u) 43 | users_test=set() 44 | overlap = set() 45 | for i in test_seq_id: 46 | for u in adoption_sequence[i]: 47 | users_test.add(u) 48 | if u in users_train: 49 | overlap.add(u) 50 | print len(users_train), len(users_test), len(overlap) 51 | with open("sequence_file_split_users.pickle","wb") as fd: 52 | pickle.dump(users_train,fd) 53 | pickle.dump(users_test,fd) 54 | 55 | with open("sequence_large_hashtags.pickle","wb") as fd: 56 | pickle.dump(large_tag_id,fd) -------------------------------------------------------------------------------- /user_vector_training/helpers/remove_repeat_contexts.py: -------------------------------------------------------------------------------- 1 | #filter sentences with users repeating in the same context 2 | 3 | from collections import defaultdict 4 | import cPickle as pickle 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from multiprocessing import Pool, cpu_count 8 | 9 | adoption_sentence_filename = "/mnt/filer01/word2vec/degree_distribution/sentences_files/userSentencesComb_12hr" 10 | out_file = adoption_sentence_filename+"_filter" 11 | num_files = 13 12 | NUM_PROCESSES = num_files 13 | 14 | linecount=0 15 | with open(adoption_sentence_filename, 'r') as fr, open(out_file, 'w') as fd: 16 | for line in fr: 17 | line = line.rstrip() 18 | u = line.split(' ') 19 | s=set(u) 20 | if len(s)>1: 21 | fd.write(line+"\n") 22 | linecount+=1 23 | if linecount%1000000==0: 24 | print "path count", linecount 25 | print "Sequence file read" 26 | 27 | """ 28 | #run count_sentence_file on different adoption sentences files in parallel processes 29 | num_workers = min(NUM_PROCESSES,cpu_count()) 30 | pool = Pool(processes=num_workers) 31 | process_num=0 32 | for i in range(0,num_files): 33 | pool.apply_async(count_sentence_file, args=(i,process_num)) 34 | process_num+=1 35 | pool.close() 36 | pool.join() 37 | 38 | #combine counts 
from different pickle file 39 | count=defaultdict(int) 40 | context_count=defaultdict(int) 41 | for file_num in range(0,num_files): 42 | presentc = pickle.load(open(out_dir+"/frequencyNodes_1hr"+str(file_num)+".pickle","rb")) 43 | presentcc = pickle.load(open(out_dir+"/frequencyContextLength_1hr"+str(file_num)+".pickle","rb")) 44 | for i in presentc: 45 | count[i]+=presentc[i] 46 | for i in presentcc: 47 | context_count[i]+=presentcc[i] 48 | pickle.dump(count,open(out_dir+"/comb_frequencyNodes_1hr_timeonly.pickle","wb")) 49 | pickle.dump(context_count,open(out_dir+"/comb_frequencyContextLength_1hr_timeonly.pickle","wb")) 50 | """ 51 | -------------------------------------------------------------------------------- /logs/notes.log: -------------------------------------------------------------------------------- 1 | checking sparsity of number of contexts available for users in terms of number of users with reciprocal links and number of users tweeting on same hashtag 2 | extracting sentences using paths in hashtag graphs with time and geography based edges and training word2vec for getting user vectors, plots for user count and sentence length frequency 3 | comparing top 10 similar users obtained using user vectors and using counts of users from tweet data 4 | splitting sequence file into train and test for training user vectors for adopter prediction task, using breadth-first search and path-based approach for generating sentences from hashtag graph, using sequences with atleast k adopters in training user vectors ? 5 | tSNE visualisation of user vectors trained using sentences with geography and time or time criteria 6 | querying for nearest neighbors of user vectors using kd-tree or brute force approach and ranking them using distance metrics (min, average, weighted average) for different values of n and k 7 | plotting change in precision and map at k with different values of k 8 | learning weights for combining distances, in sorted order, of candidate users from initial adopters, with candidate set as nearest neighbors of user vectors of initial adopters, taking only subset of test sequences for training weights 9 | learning weights for individual topics by querying a set of candidates first using nearest neighbour or distance-based methods, and then training and testing classifier to predict adopters from the candidate set 10 | tSNE visualisation of candidate set for individual topics along with predicted labels 11 | stepwise prediction of adopters from the candidate set, predicting adopters based on a prediction probability threshold? 
12 | plotting candidate set coverage with increasing size of candidate set in terms of number of nearest neighbours queried or it’s radius, plotting total spread of topics with the candidate set coverage in first 1000 adoptions 13 | training and testing adopter prediction task for particular size of candidate set 14 | predicting geography of users from user vectors, comparing with network-based baselines -------------------------------------------------------------------------------- /user_vector_training/deg_dist_in_top_users.py: -------------------------------------------------------------------------------- 1 | #filter users according to number of tweets with hashtags and number of following, plot degree distribution of this subset of users to check if there is senough context available for each user 2 | import time 3 | import re 4 | import datetime 5 | import dateutil.tz 6 | import calendar 7 | import sys 8 | import os 9 | import cPickle as pickle 10 | 11 | 12 | m = dict() 13 | fr = open("/twitterSimulations/graph/map.txt") 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | m[int(u[0])] = int(u[1]) 18 | fr.close() 19 | 20 | num_tagtweets_per_user = dict() 21 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 22 | for line in fr: 23 | line = line.rstrip() 24 | u = line.split('\t') 25 | author = m[int(u[2])] 26 | if author not in num_tagtweets_per_user: 27 | num_tagtweets_per_user[author]=0 28 | num_tagtweets_per_user[author]+=1 29 | print len(num_tagtweets_per_user) 30 | selected_tagtweets_users = set(num_tagtweets_per_user.keys()) 31 | """ 32 | with open("numTweetsPerAuthor.csv","w") as fd: 33 | for i in num_tagtweets_per_user: 34 | fd.write(str(i)+","+str(num_tagtweets_per_user[i])+"\n") 35 | """ 36 | node_nbh = pickle.load(open( "/twitterSimulations/friends_count_user.pickle", "rb" ) ) 37 | print len(node_nbh) 38 | 39 | selected_friends_users = set(node_nbh.keys()) 40 | """ 41 | with open("numFriendsPerUser.csv","w") as fd: 42 | for i in node_nbh: 43 | fd.write(str(i)+","+str(node_nbh[i])+"\n") 44 | """ 45 | common_users = set.intersection(selected_tagtweets_users, selected_friends_users) 46 | print len(common_users) 47 | 48 | def get_subset(d,t): 49 | s = set() 50 | for i in d: 51 | if d[i]>=t: 52 | s.add(i) 53 | return s 54 | sel_tagtweets = get_subset(num_tagtweets_per_user,15) 55 | sel_friends = get_subset(node_nbh,200) 56 | 57 | common_users = set.intersection(sel_tagtweets, sel_friends) #1001525 58 | print len(common_users) 59 | 60 | with open("userSubset.csv","w") as fd: 61 | for i in common_users: 62 | fd.write(str(i)+","+str(num_tagtweets_per_user[i])+","+str(node_nbh[i])+"\n") -------------------------------------------------------------------------------- /filter_follower_graph.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | 8 | # filter follower files for users in adoption sequence 9 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 10 | adoption_sequence_users = set() 11 | count=0 12 | with open(adoption_sequence_filename, 'r') as fr: 13 | for line in fr: 14 | line = line.rstrip() 15 | u = line.split(' ') 16 | #tag = u[0] 17 | for i in range(1, len(u)): 18 | #timestamp = int(u[i][0:u[i].index(',')]) 19 | author = int(u[i][u[i].index(',')+1 : ]) 20 | 
adoption_sequence_users.add(author) 21 | count+=1 22 | print len(adoption_sequence_users), count 23 | 24 | m = dict() 25 | fr = open("/twitterSimulations/graph/map.txt") 26 | for line in fr: 27 | line = line.rstrip() 28 | u = line.split(' ') 29 | m[int(u[0])] = int(u[1]) 30 | fr.close() 31 | print 'Map Read' 32 | 33 | # arr = ["user_followers_bigger_graph_recrawl_3.txt"] 34 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 35 | 36 | follower_adj = [ [] for i in xrange(0, 7697889) ] 37 | 38 | for i in arr: 39 | fr = open("/twitterSimulations/graph/" + i,'r') 40 | for line in fr: 41 | line = line.rstrip() 42 | u = line.split(' ') 43 | if(int(u[0]) > 7697889): 44 | continue 45 | if len(u) > 2: 46 | for j in range(2,len(u)): 47 | follower_adj[m[int(u[1])]].append(m[int(u[j])]) 48 | fr.close() 49 | print i 50 | 51 | print 'Graph Read\n' 52 | 53 | # for i in range(0, 7697889): 54 | # follower_adj[i] = set(follower_adj[i]) 55 | 56 | print 'Graph Set\n' 57 | 58 | with open("graph_files/follower_graph_tweeters","wb") as fd: 59 | for i in follower_adj: 60 | if i in adoption_sequence_users: 61 | fol = set(follower_adj[i])&adoption_sequence_users 62 | fol = map(str,list(fol)) 63 | fd.write(str(len(fol))+" "+str(i)+" "+" ".join(fol)+"\n") 64 | -------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/plot_similarity_nearest_users.py: -------------------------------------------------------------------------------- 1 | #plot scatterplot of similarity between nearest users for a query users obtained from user vectors and from hashtag sequence file 2 | 3 | import cPickle as pickle 4 | from distance_w2v import * 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import pylab as Plot 8 | from numpy import array 9 | 10 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr.txt" 11 | nearest_users_pickle = "/mnt/filer01/word2vec/degree_distribution/nearest_users_compare1hr_5.pickle" 12 | 13 | def save_scatterplot(X,overlap,fname): 14 | posX,posY,color = zip(*X) 15 | #max_d = max(color) 16 | #min_d = min(color) 17 | #color_norm = [(x-min_d)/float(max_d-min_d) for x in color] 18 | fig = Plot.figure() 19 | Plot.scatter(posX, posY, s=20, c=color) 20 | #Plot.axis('off') 21 | Plot.xlim([0,100]) 22 | Plot.ylim([0,100]) 23 | Plot.xlabel('User vectors') 24 | Plot.ylabel('Counts') 25 | Plot.colorbar() 26 | fig.suptitle('Overlap '+str(overlap)) 27 | fig.savefig(fname, dpi=100, bbox_inches='tight') 28 | 29 | with open(nearest_users_pickle,"rb") as fr: 30 | sample_users = pickle.load(fr) 31 | overlap_count = pickle.load(fr) 32 | nearest_users_seq = pickle.load(fr) 33 | nearest_users_w2v = pickle.load(fr) 34 | 35 | max_overlap = overlap_count.index(max(overlap_count)) 36 | min_overlap = overlap_count.index(min(overlap_count)) 37 | overlap_query_users = [max(overlap_count),min(overlap_count)] 38 | 39 | vec,vocab,_ = read_vector_file(vec_file) 40 | 41 | query_users = [sample_users[max_overlap],sample_users[min_overlap]] 42 | count=0 43 | for query_user in query_users: 44 | count+=1 45 | users_seq = nearest_users_seq[query_user] 46 | users_w2v = nearest_users_w2v[query_user] 47 | print len(users_seq),len(users_w2v) 48 | X = [] 49 | for i in range(0,len(users_w2v)): 50 | vec1=vec[vocab.index(users_w2v[i])] 51 | for j in range(0,len(users_seq)): 52 | 
vec2=vec[vocab.index(users_seq[j])] 53 | dist = 0.0 54 | for d in range(0,len(vec1)): 55 | dist+=vec1[d]*vec2[d] 56 | X.append((i+1,j+1,dist)) 57 | save_scatterplot(X,overlap_query_users[count-1],fname='nearest_users_scatterplot'+str(count)) -------------------------------------------------------------------------------- /user_vector_training/distance_w2v.py: -------------------------------------------------------------------------------- 1 | #same as distance.c file in word2vec for use in query_nearest_users.py 2 | 3 | from math import sqrt 4 | 5 | def read_vector_file(path_vectors_file): 6 | vocab = [] 7 | vectors = [] 8 | with open(path_vectors_file,"rb") as fr: 9 | _,dim = next(fr).rstrip().split(' ') 10 | word_vector_dim = int(dim) 11 | next(fr) 12 | for line in fr: 13 | line = line.rstrip() 14 | u = line.split(' ') 15 | if len(u) != word_vector_dim+1: 16 | print "vector length error" 17 | word = int(u[0]) 18 | vec = [] 19 | length = 0.0 20 | for d in u[1:]: 21 | num=float(d) 22 | vec.append(num) 23 | length+=num**2 24 | #vec = map(float,u[1:]) 25 | #length = sum(x**2 for x in vec) 26 | length = sqrt(length) 27 | vec_norm = [x/length for x in vec] 28 | vocab.append(word) 29 | vectors.append(vec_norm) 30 | return vectors, vocab, word_vector_dim 31 | 32 | def get_Nnearest(query,vec,vocab,N): 33 | wordN = [0]*N 34 | distN = [0.0]*N 35 | try: 36 | voc_ind = vocab.index(query) 37 | except ValueError: 38 | print "query word not present" 39 | return 40 | query_vec = vec[voc_ind] 41 | dim = len(query_vec) 42 | for i in range(0,len(vec)): 43 | if i==voc_ind: 44 | continue 45 | pres_word = vocab[i] 46 | pres_vec = vec[i] 47 | dist = 0.0 48 | for x in range(0,dim): 49 | dist+=query_vec[x]*pres_vec[x] 50 | #dist = sum(query_vec[x]*pres_vec[x] for x in range(0,dim)) 51 | for j in range(0,N): 52 | if dist>distN[j]: 53 | for k in range(N-1,j,-1): 54 | distN[k] = distN[k-1] 55 | wordN[k] = wordN[k-1] 56 | distN[j] = dist 57 | wordN[j] = pres_word 58 | break 59 | return wordN #zip(wordN,distN) 60 | 61 | def get_distance(query1,query2,vec,vocab): 62 | dist=0.0 63 | try: 64 | vec1=vec[vocab.index(query1)] 65 | vec2=vec[vocab.index(query2)] 66 | except ValueError: 67 | print "query word not present" 68 | return 69 | for i in range(0,len(vec1)): 70 | dist+=vec1[i]*vec2[i] 71 | return dist 72 | 73 | #vec,vocab,_ = read_vector_file("/mnt/filer01/word2vec/node_vectors_1hr.txt") 74 | #print get_Nnearest(17,vec,vocab,N=1) 75 | #print get_distance(17,1145375,vec,vocab) 76 | #print get_distance(1,1145375,vec,vocab)==None -------------------------------------------------------------------------------- /tsne_plots/tsne_word_visualisation.py: -------------------------------------------------------------------------------- 1 | #visualising top 100 most, least and mid frequent words using t-SNE 2 | 3 | from tsne import * 4 | from numpy import array 5 | 6 | word_vectors = [] 7 | path_vec_file = '/mnt/filer01/word2vec/twitter_vectors.txt' 8 | word_vector_dim = 200 9 | labels = dict() 10 | X_word = [] 11 | windex=0 12 | with open(path_vec_file, 'rb') as fr: 13 | next(fr) 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | if len(u) != word_vector_dim+1: 18 | print "vector length error" 19 | word = u[0].decode('latin-1') 20 | vec = map(float,u[1:]) 21 | labels[word]=windex 22 | # word_vectors.append([word]+vec) 23 | X_word.append(vec) 24 | windex+=1 25 | # labels = [x[0] for x in word_vectors] 26 | # X_word = [x[1:] for x in word_vectors] 27 | 28 | word_freq_sorted = [] 29 | path_vocab_file = 
'/mnt/filer01/word2vec/vocab.txt' 30 | with open(path_vocab_file, 'rb') as fr: 31 | for line in fr: 32 | line = line.rstrip() 33 | u = line.split(' ') 34 | word_freq_sorted.append(u[0].decode('latin-1')) 35 | 36 | def get_word_vectors(wlist): 37 | vectors = [] 38 | for w in wlist: 39 | vectors.append(X_word[labels[w]]) 40 | return array(vectors) 41 | 42 | most_freq = word_freq_sorted[0:1000] 43 | mid_freq = word_freq_sorted[-1000:] 44 | half_num_words = int(len(word_freq_sorted)/2.0) 45 | least_freq = word_freq_sorted[half_num_words-500:half_num_words+499] 46 | 47 | def save_embed_plot(X,labels,fname): 48 | Y = tsne(X, 2, word_vector_dim, 20.0); 49 | fig = Plot.figure() 50 | Plot.scatter(Y[:,0], Y[:,1], 1); 51 | for label, x, y in zip(labels, Y[:,0], Y[:,1]): 52 | Plot.annotate(label, xy = (x, y), xytext = (0, 0), textcoords = 'offset points', size=5) 53 | fig.savefig(fname, dpi=1200) 54 | 55 | if __name__ == "__main__": 56 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 57 | # print "Running example on 2,500 MNIST digits..." 58 | # X = Math.loadtxt("mnist2500_X.txt"); 59 | # labels = Math.loadtxt("mnist2500_labels.txt"); 60 | 61 | # save_embed_plot(get_word_vectors(most_freq),array(most_freq),'embed_mostfreq.png') 62 | # save_embed_plot(get_word_vectors(mid_freq),array(mid_freq),'embed_midfreq.png') 63 | # save_embed_plot(get_word_vectors(least_freq),array(least_freq),'embed_leastfreq.png') 64 | save_embed_plot(get_word_vectors(word_freq_sorted),array(word_freq_sorted),'embed_all.png') 65 | -------------------------------------------------------------------------------- /misc/virality_prediction_features.py: -------------------------------------------------------------------------------- 1 | #write features for hashtags in virality prediction using user vectors 2 | 3 | import cPickle as pickle 4 | import time 5 | from distance_w2v import * 6 | 7 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr.txt" 8 | timeline_file = "/twitterSimulations/timeline_data/timeline_weng" 9 | feature_file = "/mnt/filer01/word2vec/degree_distribution/feature_file.csv" 10 | 11 | vec,vocab,dim = read_vector_file(vec_file) 12 | vocab_index=dict() 13 | for i in range(0,len(vocab)): 14 | vocab_index[vocab[i]]=i 15 | 16 | m = dict() 17 | fr = open("/twitterSimulations/graph/map.txt") 18 | for line in fr: 19 | line = line.rstrip() 20 | u = line.split(' ') 21 | m[int(u[0])] = int(u[1]) 22 | fr.close() 23 | print 'Map Read' 24 | 25 | not_found_vocab=[] 26 | pred_thr = 1500 27 | with open(timeline_file, "rb") as fr, open(feature_file, "wb") as fd: 28 | feature_names = ','.join(["max_"+str(x) for x in range(0,dim)])+','+','.join(["min_"+str(x) for x in range(0,dim)])+','+','.join(["avg_"+str(x) for x in range(0,dim)]) 29 | fd.write("TagName,"+feature_names+",Class\n") 30 | for line in fr: 31 | line = line.rstrip() 32 | u = line.split(' ') 33 | if len(u) <= pred_thr: 34 | continue 35 | numTweets = 0 36 | not_found=0 37 | user_vectors = [] 38 | for i in range(1, len(u)): 39 | #timestamp = int(u[i][0:u[i].index(',')]) 40 | numTweets = i 41 | if(numTweets > pred_thr): 42 | break 43 | author = int(u[i][u[i].index(',')+1 : ]) 44 | author = m[author] 45 | if author in vocab_index: 46 | user_vec=vec[vocab_index[author]] 47 | else: 48 | not_found+=1 49 | continue 50 | user_vectors.append(user_vec) 51 | if user_vectors==[]: 52 | max_vec = [0.0]*dim 53 | min_vec = [0.0]*dim 54 | avg_vec = [0.0]*dim 55 | print u[0] 56 | else: 57 | aggr_vec = zip(*user_vectors) 58 | max_vec = [] 59 | min_vec = 
[] 60 | avg_vec = [] 61 | for i in range(0,len(aggr_vec)): 62 | d = aggr_vec[i] 63 | max_vec.append(max(d)) 64 | min_vec.append(min(d)) 65 | avg_vec.append(sum(d)/float(len(d))) 66 | if len(u) > 10000: 67 | class_label = '1' 68 | else: 69 | class_label = '0' 70 | fd.write(str(u[0])+','+','.join(map(str,max_vec))+','+','.join(map(str,min_vec))+','+','.join(map(str,avg_vec))+','+class_label+'\n') 71 | not_found_vocab.append(not_found) 72 | 73 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 74 | #pickle.dump(not_found_vocab,open("not_found_vocab.pickle","wb")) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Social Network Embeddings 2 | Code for methods to embed social network users based on their topic activity, described in the [paper](https://arxiv.org/abs/1710.07622). This experiment is performed on a large-scale social network extracted from Twitter, consisting of about 7.7 million users and their activity on around 3.6 million topics over a month long period to predict the most likely future adopters of a topic and the geo-location of users by training a word2vec (Skip-Gram Model) model in the context of text mining to compute representations of users. 3 | 4 | #### Abstract 5 | 6 | ``` 7 | This article presents a novel approach for learning low-dimensional distributed representations of users in online social networks. Existing methods rely on the network structure formed by the social relationships among users to extract these representations. 8 | However, the network information can be obsolete, incomplete or dynamically changing. In addition, in some cases, it can be prohibitively expensive to get the network information. Therefore, we propose an alternative approach based on observations from topics being talked on in social networks. 9 | We utilise the time information of users adopting topics in order to embed them in a real-valued vector space. Through extensive experiments, we investigate the properties of the representations learned and their efficacy in preserving information about link structure among users. 10 | We also evaluate the representations in two different prediction tasks, namely, predicting most likely future adopters of a topic and predicting the geo-location of users. Experiments to validate the proposed methods are performed on a large-scale social network extracted from Twitter, consisting of about 7.7 million users and their activity on around 3.6 million topics over a month-long period. 11 | ``` 12 | 13 | #### Note 14 | Adventurers beware! This repository is meant for version control of scripts used for experiments in the paper. So, not heavily commented and not heavily tested. 15 | 16 | ## Basic workflow 17 | 1. "user_vector_training/deg_dist_in_top_users.py" -> filter users 18 | 2. "user_vector_training/filter_hashtag_sequence.py" -> get hashtags tweeted by these users 19 | 3. "user_vector_training/sentence_creation/sentence_hashtag_adoption.py" -> convert to sentences 20 | 4. "user_vector_training/word2vec/twitter_training_w2v.sh" -> word2vec to get user vectors 21 | 5. "adopter_prediction/adopter_prediction.py" -> next adopter prediction 22 | 23 | ## Contact 24 | If you are interested in knowing more or have any questions on the code, feel free to contact me at & Harvineet at . 
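
#### Example: training user vectors with gensim

The repository trains vectors with the original C implementation of word2vec (see `user_vector_training/word2vec/twitter_training_w2v.sh` and `run_script.sh`). The sketch below is **not** part of the repository: it only illustrates how a roughly equivalent Skip-Gram model could be fit with `gensim` on the concatenated adoption-sentence file produced by `helpers/cat_files.sh`, reusing the same hyperparameters; the input and output paths are placeholders.

```python
# Illustrative sketch only -- assumes gensim >= 4.0 and a corpus file with one
# space-separated sequence of user IDs per line (e.g. sentences_files/userSentencesComb).
from gensim.models import Word2Vec

model = Word2Vec(
    corpus_file="sentences_files/userSentencesComb",
    vector_size=200,  # -size 200
    window=8,         # -window 8
    sg=1,             # -cbow 0 (Skip-Gram)
    hs=0,             # -hs 0
    negative=5,       # -negative 5
    sample=1e-4,      # -sample 1e-4
    workers=20,       # -threads 20
    epochs=15,        # -iter 15
)
# Plain-text output comparable to the -binary 0 vector files read by distance_w2v.py.
model.wv.save_word2vec_format("node_vectors.txt", binary=False)
```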
25 | -------------------------------------------------------------------------------- /adopter_prediction/prec_plot.py: -------------------------------------------------------------------------------- 1 | #plot MAP, precision, recall at k for different k in single tag case and averaged over 100 tags 2 | #plot precision at k and R for individual topics with learned weights 3 | 4 | from collections import defaultdict 5 | import cPickle as pickle 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | """ 9 | top_k = [25,50]+range(100,1001,100) #[100,200,300]+range(500,5001,500) 10 | 11 | def eval_plot(vec,nbapp,fol,ylab,title): 12 | plt.plot(top_k, vec) 13 | plt.plot(top_k, nbapp) 14 | plt.plot(top_k, fol) 15 | plt.legend(['User vectors','Frequency','Followers']) 16 | plt.xlabel('k') 17 | # plt.xlim(xmin=top_k[0]) 18 | plt.ylabel(ylab) 19 | plt.title(title) 20 | plt.grid() 21 | plt.show() 22 | 23 | #single tag plot 24 | with open("prec_plot_single1.pickle","rb") as fr: 25 | mapk = pickle.load(fr) 26 | preck = pickle.load(fr) 27 | reck = pickle.load(fr) 28 | vec,nbapp,fol = zip(*preck) 29 | eval_plot(list(vec),list(nbapp),list(fol),'Precision at k','Precision@k values at different k') 30 | 31 | def average(eval_list): 32 | avg_vec = [0.0]*len(top_k) 33 | avg_nbapp = [0.0]*len(top_k) 34 | avg_fol = [0.0]*len(top_k) 35 | for l in eval_list: 36 | vec,nbapp,fol = zip(*l) 37 | for i,v in enumerate(vec): 38 | avg_vec[i]+=v 39 | for i,v in enumerate(nbapp): 40 | avg_nbapp[i]+=v 41 | for i,v in enumerate(fol): 42 | avg_fol[i]+=v 43 | num_tags = len(eval_list) 44 | avg_vec = [v*1.0/num_tags for v in avg_vec] 45 | avg_nbapp = [v*1.0/num_tags for v in avg_nbapp] 46 | avg_fol = [v*1.0/num_tags for v in avg_fol] 47 | return avg_vec,avg_nbapp,avg_fol 48 | 49 | top_k = [1,2,5]+range(10,101,10) 50 | #100 tags 51 | with open("prec_plot_k.pickle","rb") as fr: 52 | mapk = pickle.load(fr) 53 | preck = pickle.load(fr) 54 | reck = pickle.load(fr) 55 | vec,nbapp,fol = average(mapk) 56 | eval_plot(vec,nbapp,fol,'Precision at k','Precision@k values at different k') 57 | """ 58 | num_bin = 50 59 | def eval_plot(eval,rec,xlab,title): 60 | plt.hist(eval, num_bin) 61 | # plt.bar(range(1,len(eval)+1), eval) 62 | # plt.bar(range(1,len(rec)+1), rec) 63 | plt.xlabel(xlab) 64 | plt.ylabel('Frequency') 65 | plt.title(title) 66 | plt.grid() 67 | plt.show() 68 | 69 | # with open("mean_precision_n10_rf_prec10.pickle","rb") as fr: 70 | # prec_k_total = pickle.load(fr) 71 | # cand_set_recall = pickle.load(fr) 72 | # cand_set_size_list = pickle.load(fr) 73 | 74 | with open("eval_n10_lr.pickle","rb") as fr: 75 | ap_total = pickle.load(fr) 76 | prec_k_total = pickle.load(fr) 77 | _ = pickle.load(fr) 78 | cand_set_recall = pickle.load(fr) 79 | 80 | # print sum(cand_set_size_list) 81 | # eval_plot(list(prec_k_total),list(cand_set_size_list),'Precision@k','Histogram of Prec@10 for 100 topics') 82 | user,_,_ = zip(*prec_k_total) 83 | eval_plot(list(user),[],'Precision@k','Histogram of Precision@500') -------------------------------------------------------------------------------- /user_vector_training/helpers/node_frequency.py: -------------------------------------------------------------------------------- 1 | #count frequency of nodes occurring in sentences 2 | 3 | from collections import defaultdict 4 | import cPickle as pickle 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from multiprocessing import Pool, cpu_count 8 | """ 9 | in_dir = "/mnt/filer01/word2vec/degree_distribution/sentences_files_timeonly/" 10 | 
out_dir = "/mnt/filer01/word2vec/degree_distribution/count_files/" 11 | num_files = 8 12 | NUM_PROCESSES = num_files 13 | 14 | def count_sentence_file(file_num,process_num): 15 | count=defaultdict(int) 16 | context_count=defaultdict(int) 17 | with open(in_dir+'/hashtagAdoptionSentences'+str(file_num)+'.txt','rb') as fr: 18 | for line in fr: 19 | line = line.rstrip() 20 | u = line.split(' ') 21 | s = len(u) 22 | context_count[s]+=1 23 | for id in range(0, s): 24 | author = int(u[id]) 25 | count[author]+=1 26 | print "process", process_num, "file complete", file_num 27 | pickle.dump(count,open(out_dir+"/frequencyNodes_1hr_bfsr_timeonly"+str(file_num)+".pickle","wb")) 28 | pickle.dump(context_count,open(out_dir+"/frequencyContextLength_1hr_bfsr_timeonly"+str(file_num)+".pickle","wb")) 29 | 30 | #run count_sentence_file on different adoption sentences files in parallel processes 31 | num_workers = min(NUM_PROCESSES,cpu_count()) 32 | pool = Pool(processes=num_workers) 33 | process_num=0 34 | for i in range(0,num_files): 35 | pool.apply_async(count_sentence_file, args=(i,process_num)) 36 | process_num+=1 37 | pool.close() 38 | pool.join() 39 | 40 | #combine counts from different pickle file 41 | count=defaultdict(int) 42 | context_count=defaultdict(int) 43 | for file_num in range(0,num_files): 44 | presentc = pickle.load(open(out_dir+"/frequencyNodes_1hr_bfsr_timeonly"+str(file_num)+".pickle","rb")) 45 | presentcc = pickle.load(open(out_dir+"/frequencyContextLength_1hr_bfsr_timeonly"+str(file_num)+".pickle","rb")) 46 | for i in presentc: 47 | count[i]+=presentc[i] 48 | for i in presentcc: 49 | context_count[i]+=presentcc[i] 50 | pickle.dump(count,open(out_dir+"/comb_frequencyNodes_1hr_bfsr_timeonly.pickle","wb")) 51 | pickle.dump(context_count,open(out_dir+"/comb_frequencyContextLength_1hr_bfsr_timeonly.pickle","wb")) 52 | """ 53 | #plot 54 | count = pickle.load(open("sentences_frequency_files/comb_frequencyNodes_1hr_bfsr_loc.pickle","rb")) 55 | node_freq = [] 56 | for i in count: 57 | node_freq.append(count[i]) 58 | 59 | node_freq = np.array(node_freq) 60 | 61 | num_bin = 100000 62 | def freq_plot(data,xlab): 63 | values, base = np.histogram(data, bins=num_bin) 64 | cumulative = np.cumsum(values) 65 | # plt.plot(base[:-1], values, c='red') #frequency 66 | # plt.plot(base[:-1], cumulative/float(len(data)), c='red') #normalised 67 | # plt.plot(base[:-1], len(data)-cumulative, c='red') #inverse, greater than 68 | plt.plot(base[:-1], len(data)-np.append(0,cumulative)[:-1], c='red') #inverse, greater than or equal to 69 | plt.yscale('log') 70 | plt.xscale('log') 71 | plt.xlabel(xlab) 72 | plt.xlim(xmin=0) 73 | plt.ylabel('number of users') 74 | plt.title('cumulative frequency distribution (greater than or equal to)') 75 | plt.grid() 76 | plt.show() 77 | 78 | freq_plot(node_freq,'Count of user occurrence in sentences') 79 | 80 | #frequency plot of path lengths 81 | ccount = pickle.load(open("sentences_frequency_files/comb_frequencyContextLength_1hr_bfsr_loc.pickle","rb")) 82 | clength_freq = [] 83 | for i in ccount: 84 | clength_freq.append((i,ccount[i])) 85 | 86 | def freq_plot_clength(data,xlab): 87 | x,y = zip(*data) 88 | x = [i-0.4 for i in x] #label at bar centre 89 | y = [i/float(sum(y)) for i in y] #normalised 90 | plt.bar(x, y) #frequency 91 | plt.xlabel(xlab) 92 | plt.xlim(xmin=0) 93 | plt.ylabel('Proportion of paths') 94 | plt.title('frequency distribution') 95 | plt.grid() 96 | plt.show() 97 | freq_plot_clength(clength_freq,'Path length') 98 | 99 | 
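# Usage sketch (illustrative addition, not part of the original script): after
# the combined node-frequency pickle has been loaded into `count` above, list
# the ten users that occur most often in the generated sentences.
top_users = sorted(count.items(), key=lambda x: x[1], reverse=True)[:10]
print "ten most frequent users (user id, count):", top_users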
-------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/query_nearest_users_sen.py: -------------------------------------------------------------------------------- 1 | #query users nearest to a given user using node vectors file from distance-filewrite.c file and compare with users in same path in sentences file 2 | 3 | import cPickle as pickle 4 | import random 5 | import os, sys, datetime 6 | from heapq import nlargest 7 | 8 | start_time = datetime.datetime.now() 9 | 10 | adoption_sentence_filename = "/mnt/filer01/word2vec/degree_distribution/sentences_files/userSentencesComb_12hr" #"sample_sequences" 11 | #time_diff_for_edge = 5*1*60*60 #5 context width for path in one direction 12 | 13 | m = dict() 14 | fr = open("/twitterSimulations/graph/map.txt") 15 | for line in fr: 16 | line = line.rstrip() 17 | u = line.split(' ') 18 | m[int(u[0])] = int(u[1]) 19 | fr.close() 20 | print 'Map Read' 21 | 22 | location_buckets = [-1] * 7697889 23 | fr = open('/twitterSimulations/known_locations.txt', 'r') 24 | for line in fr: 25 | line = line.rstrip() 26 | u = line.split('\t') 27 | try: 28 | location_buckets[m[int(u[0])]] = int(u[1]) 29 | except: 30 | pass 31 | fr.close() 32 | 33 | fr = open('/twitterSimulations/known_locations1.txt', 'r') 34 | for line in fr: 35 | line = line.rstrip() 36 | u = line.split('\t') 37 | try: 38 | location_buckets[m[int(u[0])]] = int(u[1]) 39 | except: 40 | pass 41 | fr.close() 42 | print "location file read" 43 | 44 | def call_distance(word): 45 | return os.system("./distance-filewrite ../node_vectors_12hr.bin query_output_temp12hrsen "+str(word)) 46 | 47 | def get_nearest(): 48 | nearest = [] 49 | with open("query_output_temp12hrsen","rb") as fr: 50 | for line in fr: 51 | line=line.rstrip().split('\t') 52 | nearest.append(int(line[0])) 53 | return nearest 54 | 55 | def compare_nearest(seq,w2v): 56 | return len(set(seq)&set(w2v)) 57 | 58 | vocab = [] 59 | with open("../node_vocab_12hr.txt","rb") as fr: 60 | next(fr) 61 | for line in fr: 62 | line=line.rstrip().split(' ') 63 | vocab.append(int(line[0])) 64 | print "Vocab read" 65 | 66 | rand_users = random.sample(vocab,100) 67 | rand_users_set = set(rand_users) 68 | vocab = set(vocab) 69 | print "Sample selected" 70 | 71 | near_count = [[0]*7697889 for i in xrange(0,100)] 72 | 73 | linecount=0 74 | with open(adoption_sentence_filename, 'r') as fr: 75 | for line in fr: 76 | line = line.rstrip() 77 | u = line.split(' ') 78 | sentence = map(int,u) 79 | for author in sentence: 80 | if author in rand_users_set: 81 | for j in sentence: 82 | near_count[rand_users.index(author)][j]+=1 83 | near_count[rand_users.index(author)][author]-=1 84 | linecount+=1 85 | if linecount%1000000==0: 86 | print "path count", linecount 87 | print "Sequence file read" 88 | 89 | near_users_seq = dict() 90 | for i in range(0,len(rand_users)): 91 | user_count = near_count[i] 92 | count = [] 93 | for l in vocab:#xrange(0,7697889): 94 | if user_count[l]!=0 and l!=rand_users[i]: 95 | count.append((l,user_count[l])) 96 | #count = zip(range(0,7697889),near_count[i]) 97 | #count_nz = [(a,b) for (a,b) in count if b!=0] 98 | #count_s = sorted(count_nz,key=lambda x: x[1],reverse=True)[0:100] 99 | #count_s = sorted(range(0,7697889),key=lambda x: user_count[x],reverse=True) 100 | count_s = nlargest(100,count,key=lambda x: x[1]) 101 | if len(count_s)==0: 102 | u,c = [], [] 103 | else: 104 | u,c = zip(*count_s) 105 | near_users_seq[rand_users[i]]=list(u) 106 | print "sel count", rand_users[i], 
len(u), "non zero", len(count) 107 | 108 | nearest_users_w2v_pickle = dict() 109 | count_pickle = [] 110 | for user in rand_users: 111 | a = call_distance(user) 112 | if a!=0: 113 | print "call error" 114 | sys.exit(0) 115 | nearest_users_w2v = get_nearest() 116 | comp_count = compare_nearest(near_users_seq[user][0:100],nearest_users_w2v[0:100]) 117 | print "common users", user, comp_count 118 | count_pickle.append(comp_count) 119 | nearest_users_w2v_pickle[user]=nearest_users_w2v 120 | 121 | with open("nearest_users_compare12hrsen.pickle","wb") as fd: 122 | pickle.dump(rand_users,fd) 123 | pickle.dump(count_pickle,fd) 124 | pickle.dump(near_users_seq,fd) 125 | pickle.dump(nearest_users_w2v_pickle,fd) 126 | pickle.dump(near_count,fd) 127 | 128 | print start_time, datetime.datetime.now() -------------------------------------------------------------------------------- /tsne_plots/tsne_user_visualisation.py: -------------------------------------------------------------------------------- 1 | #visualising users who adopted a hashtag using t-SNE on user vectors 2 | 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | from tsne import * 6 | from numpy import array 7 | import math, random 8 | 9 | word_vectors = [] 10 | path_vec_file = '/mnt/filer01/word2vec/node_vectors_1hr_pr.txt' 11 | word_vector_dim = 100 12 | labels = dict() 13 | X_word = [] 14 | windex=0 15 | with open(path_vec_file, 'rb') as fr: 16 | _,dim = next(fr).rstrip().split(' ') 17 | word_vector_dim = int(dim) 18 | next(fr) 19 | for line in fr: 20 | line = line.rstrip() 21 | u = line.split(' ') 22 | if len(u) != word_vector_dim+1: 23 | print "vector length error" 24 | word = int(u[0]) 25 | vec = map(float,u[1:]) 26 | labels[word]=windex 27 | windex+=1 28 | X_word.append(vec) 29 | 30 | word_freq_sorted = [] 31 | path_vocab_file = '/mnt/filer01/word2vec/node_vocab_1hr_pr.txt' 32 | with open(path_vocab_file, 'rb') as fr: 33 | next(fr) 34 | for line in fr: 35 | line = line.rstrip() 36 | u = line.split(' ') 37 | word_freq_sorted.append(int(u[0])) 38 | 39 | tag_seq = dict() 40 | users_ht = set() 41 | seq_file = '/mnt/filer01/word2vec/degree_distribution/sample_ht_sequences' 42 | with open(seq_file, 'rb') as fr: 43 | for line in fr: 44 | line = line.rstrip() 45 | u = line.split(' ') 46 | tag = u[0] 47 | sequence = [] 48 | for i in range(1, len(u)): 49 | author = int(u[i][u[i].index(',')+1 : ]) 50 | if author in labels: 51 | sequence.append(author) 52 | tag_seq[tag]=random.sample(sequence,500) 53 | for u in tag_seq[tag]: 54 | users_ht.add(u) 55 | print tag, len(sequence) 56 | print len(users_ht) 57 | 58 | m = dict() 59 | fr = open("/twitterSimulations/graph/map.txt") 60 | for line in fr: 61 | line = line.rstrip() 62 | u = line.split(' ') 63 | m[int(u[0])] = int(u[1]) 64 | fr.close() 65 | print 'Map Read' 66 | 67 | location_buckets = [-1] * 7697889 68 | # location_buckets = dict() #map to -1 for users not in location files 69 | fr = open('/twitterSimulations/known_locations.txt', 'r') 70 | for line in fr: 71 | line = line.rstrip() 72 | u = line.split('\t') 73 | try: 74 | location_buckets[m[int(u[0])]] = int(u[1]) 75 | except: 76 | pass 77 | fr.close() 78 | 79 | fr = open('/twitterSimulations/known_locations1.txt', 'r') 80 | for line in fr: 81 | line = line.rstrip() 82 | u = line.split('\t') 83 | try: 84 | location_buckets[m[int(u[0])]] = int(u[1]) 85 | except: 86 | pass 87 | fr.close() 88 | print "location file read" 89 | 90 | def get_word_vectors_ht(wlist): 91 | vectors = [] 92 | color = [] 93 | tags=tag_seq.keys() 94 | c1 = 
set(tag_seq[tags[0]]) #modikiadalat (dark blue) 95 | c2 = set(tag_seq[tags[1]]) #7millionandcounting (light blue) 96 | c3 = set(tag_seq[tags[2]]) #time100 (red) 97 | for w in wlist: 98 | vectors.append(X_word[labels[w]]) 99 | if w in c1: 100 | color.append(50) 101 | elif w in c2: 102 | color.append(100) 103 | elif w in c3: 104 | color.append(200) 105 | else: 106 | print "no tag" 107 | return array(vectors), color 108 | 109 | def get_word_vectors(wlist): 110 | vectors = [] 111 | color = [] 112 | for w in wlist: 113 | vectors.append(X_word[labels[w]]) 114 | color.append(location_buckets[w]) 115 | return array(vectors), color 116 | 117 | # most_freq = word_freq_sorted[0:2500] 118 | # least_freq = word_freq_sorted[-2500:] 119 | half_num_words = int(len(word_freq_sorted)/2.0) 120 | mid_freq = word_freq_sorted[half_num_words-1250:half_num_words+1249] 121 | all_random = random.sample(word_freq_sorted,1000) 122 | 123 | def save_embed_plot((X,color),fname): 124 | Y = tsne(X, no_dims = 2, initial_dims = 50, perplexity = 30.0); 125 | fig = Plot.figure() 126 | Plot.scatter(Y[:,0], Y[:,1], s=20, c=color, alpha=0.8, edgecolor='none'); 127 | Plot.axis('off') 128 | fig.savefig(fname, dpi=300, bbox_inches='tight') 129 | 130 | if __name__ == "__main__": 131 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 132 | # print "Running example on 2,500 MNIST digits..." 133 | # X = Math.loadtxt("mnist2500_X.txt"); 134 | # labels = Math.loadtxt("mnist2500_labels.txt"); 135 | # save_embed_plot(get_word_vectors(most_freq),'embed_users_mostfreq.png') 136 | save_embed_plot(get_word_vectors(all_random),'embed_users_random_1hr_pr.png') 137 | # save_embed_plot(get_word_vectors(mid_freq),'embed_users_midfreq_1hr_pr.png') 138 | save_embed_plot(get_word_vectors_ht(list(users_ht)),'embed_users_random_ht_1hr_pr.png') 139 | # save_embed_plot(get_word_vectors(least_freq),'embed_users_leastfreq.png') -------------------------------------------------------------------------------- /tsne_plots/tsne_hashtag_visualisation.py: -------------------------------------------------------------------------------- 1 | #visualising top 100 most, least and mid frequent hashtags using t-SNE on histogram-of-counts vectors from word classes 2 | 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | from tsne import * 6 | from numpy import array 7 | import math, random 8 | import cPickle as pickle 9 | from collections import Counter 10 | 11 | path_class_file = '/mnt/filer01/word2vec/twitter_vectors_classes.sorted.txt' 12 | word_to_cluster = dict() 13 | with open(path_class_file, 'rb') as fr: 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | word_to_cluster[u[0]]=int(u[1]) 18 | 19 | tag_labels = [] 20 | num_tags = 0 21 | with open('/mnt/filer01/tweets_repository/Nov2013/tag_tweets_bow.txt', 'rb') as fr: 22 | for line in fr: 23 | line = line.rstrip() 24 | u = line.split('\t') 25 | tag = u[0] 26 | tag_labels.append(tag) 27 | num_tags+=1 28 | 29 | word_doc_freq = dict() 30 | tag_bow = [] 31 | with open('/mnt/filer01/tweets_repository/Nov2013/tag_tweets_bow_processed.txt', 'rb') as fr: 32 | for line in fr: 33 | line = line.rstrip() 34 | u = line.split(' ') 35 | tag = u[0] 36 | words = u[1:] 37 | tag_bow.append(words) # remove duplicate words also 38 | doc_words = set() 39 | for w in words: 40 | if w not in doc_words: 41 | if w not in word_doc_freq: 42 | word_doc_freq[w]=0 43 | word_doc_freq[w]+=1 44 | doc_words.add(w) 45 | 46 | word_clusters_dim = 1000 47 | word_not_found=set() 48 | hist_feature 
= [] 49 | for tag_words in tag_bow: 50 | tag_feature = [0]*word_clusters_dim 51 | num_words = 0 52 | # word_term_freq = Counter(tag_words) 53 | for word in tag_words: 54 | try: #words from tag bow missing in word vector, may be because of min limit on word occurrence 5 55 | cluster_id = word_to_cluster[word] # cluster index from 0, and order of idx and labels same 56 | df = word_doc_freq[word] #document frequency of words from vocab file 57 | idf = math.log10(float(num_tags)/df) 58 | # if word=='dconcert': # count for cluster with 'dconcert' very high, causing nan value error in tsne P-value calculation 59 | # continue 60 | tag_feature[cluster_id]+=1*idf #using idf as word relevance 61 | num_words+=1*idf 62 | except: 63 | word_not_found.add(word) 64 | #normalise by total number of words 65 | # num_words = len(tag_words) 66 | if num_words==0: 67 | print "error, tag with no words" 68 | num_words = 0.1 69 | hist_feature.append([float(x)/num_words for x in tag_feature]) 70 | # with open('hashtag_vec_tfidf.pickle', 'wb') as fd: 71 | # pickle.dump(hist_feature,fd) 72 | print len(word_doc_freq), len(word_not_found) 73 | tag_freq = [] 74 | with open('tag_freq_1500.csv', 'rb') as fr: 75 | next(fr) 76 | for line in fr: 77 | line = line.rstrip() 78 | u = line.split(',') 79 | tag = u[0] 80 | tag_freq.append((tag,int(u[1]))) 81 | 82 | tag_freq_sorted = [t for t,_ in sorted(tag_freq,key=lambda x: x[1], reverse = True)] 83 | most_freq = set(tag_freq_sorted[0:150]) 84 | least_freq = set(tag_freq_sorted[-150:]) 85 | half_num_words = int(len(tag_freq_sorted)/2.0) 86 | mid_freq = set(tag_freq_sorted[half_num_words-75:half_num_words+74]) 87 | all_random = set(random.sample(tag_freq_sorted,150)) 88 | 89 | #set visibility of most, least and mid frequency hashtags by setting text size 90 | def get_tag_size_label(tlist): 91 | size = [] 92 | label = [] 93 | for t in tag_labels: 94 | if t in tlist: 95 | size.append(2) 96 | label.append(t.decode('latin-1')) 97 | else: 98 | size.append(0) 99 | label.append('') 100 | return size, array(label) 101 | 102 | X = array(hist_feature) 103 | Y = tsne(X, 2, 50, 30.0); 104 | 105 | def save_embed_plot((tag_sizes,labels),fname): 106 | fig = Plot.figure() 107 | Plot.scatter(Y[:,0], Y[:,1], 0); 108 | for label, x, y, s in zip(labels, Y[:,0], Y[:,1], tag_sizes): 109 | Plot.annotate(label, xy = (x, y), xytext = (0, 0), textcoords = 'offset points', size=s) 110 | Plot.axis('off') 111 | fig.savefig(fname, dpi=800, bbox_inches='tight') 112 | 113 | if __name__ == "__main__": 114 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 115 | # print "Running example on 2,500 MNIST digits..." 
116 | # X = Math.loadtxt("mnist2500_X.txt"); 117 | # labels = Math.loadtxt("mnist2500_labels.txt"); 118 | save_embed_plot(get_tag_size_label(most_freq),'embed_tag_mostfreq.png') 119 | save_embed_plot(get_tag_size_label(mid_freq),'embed_tag_midfreq.png') 120 | save_embed_plot(get_tag_size_label(least_freq),'embed_tag_leastfreq.png') 121 | save_embed_plot(get_tag_size_label(all_random),'embed_tag_random.png') -------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/test_kdtree_query.py: -------------------------------------------------------------------------------- 1 | from scipy.spatial import cKDTree as KDTree 2 | import time 3 | from math import sqrt 4 | import random 5 | from heapq import nsmallest 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_bfsr.txt" 9 | # M=100000 10 | def read_vector_file(path_vectors_file): 11 | vocab = [] 12 | vectors = [] 13 | count=0 14 | with open(path_vectors_file,"rb") as fr: 15 | _,dim = next(fr).rstrip().split(' ') 16 | word_vector_dim = int(dim) 17 | next(fr) 18 | for line in fr: 19 | # if count==M: 20 | # break 21 | line = line.rstrip() 22 | u = line.split(' ') 23 | if len(u) != word_vector_dim+1: 24 | print "vector length error" 25 | word = int(u[0]) 26 | #normalise to length 1 27 | # vec = [] 28 | # length = 0.0 29 | # for d in u[1:]: 30 | # num=float(d) 31 | # vec.append(num) 32 | # length+=num**2 33 | # length = sqrt(length) 34 | vec = map(float,u[1:]) 35 | length = sum(x**2 for x in vec) 36 | vec_norm = [x/length for x in vec] 37 | vocab.append(word) 38 | vectors.append(vec_norm) 39 | count+=1 40 | return vectors, vocab, word_vector_dim 41 | 42 | def get_Nranked_list(query_set_ind,N): 43 | # wordN = [0]*N 44 | # distN = [0.0]*N 45 | dist_total = [] 46 | set_size = len(query_set_ind) 47 | for i in xrange(0,len(vec)): 48 | if i in query_set_ind: 49 | continue 50 | pres_word = i 51 | pres_vec = vec[i] 52 | dist_k = [0.0]*set_size 53 | k=0 54 | dim=len(pres_vec) 55 | for voc_ind in query_set_ind: 56 | user_vec = vec[voc_ind] 57 | dist = sum( (user_vec[x]-pres_vec[x])**2 for x in xrange(0,dim) ) 58 | dist_k[k]= sqrt(dist) 59 | k+=1 60 | nearest_k = min(dist_k) # dist_k_sorted[0] # if sorted not needed 61 | dist_set=nearest_k 62 | dist_total.append((pres_word,dist_set)) 63 | wordN = [w for w,_ in nsmallest(N,dist_total,key=lambda x: x[1])] 64 | return wordN #zip(wordN,distN) 65 | 66 | t=0.0 67 | t1=0.0 68 | t2=0.0 69 | N=3 70 | k=500 71 | M= 2654594 #1000000 72 | D=10 73 | S=10 74 | eps = 0 75 | 76 | vec,vocab,dim = read_vector_file(vec_file) 77 | print "num points", len(vec), "dim", dim 78 | 79 | # vec = [v[:D] for v in vec[:M]] 80 | print len(vec),len(vec[0]), "eps", eps 81 | tic = time.clock() 82 | kd = KDTree(vec, leafsize=10) 83 | toc = time.clock() 84 | print "scipy tree built in", (toc-tic)*1000 85 | 86 | tic = time.clock() 87 | neigh = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='ball_tree', leaf_size=10, metric='minkowski', p=2) #'ball_tree', 'kd_tree', 'auto' 88 | kd_sklearn = neigh.fit(vec) 89 | toc = time.clock() 90 | print "sklearn tree built in", (toc-tic)*1000 91 | 92 | for _ in range(0,N): 93 | sample = random.sample(range(0,M),S) 94 | sample_vec = [vec[i] for i in sample] 95 | 96 | tic = time.clock() 97 | d_list,knn_list = kd.query(sample_vec,k=k+1) #, eps=eps) 98 | dist_n_list = [] 99 | for d,n in zip(d_list,knn_list): 100 | dist_n_list+=list(zip(n,d))[1:] 101 | knn= [w for w,_ in 
nsmallest(k,dist_n_list,key=lambda x: x[1])] 102 | toc = time.clock() 103 | print "scipy, tree query in", (toc-tic)*1000 104 | t+=(toc-tic)*1000 105 | 106 | 107 | tic1 = time.clock() 108 | knn_brute = get_Nranked_list(sample,k) 109 | toc1 = time.clock() 110 | print "brute, tree query in", (toc1-tic1)*1000 111 | if knn_brute!=knn: 112 | print "scipy, not same points", "same", len(set(knn_brute)&set(knn)), "out of", k 113 | else: 114 | print "same", len(set(knn_brute)&set(knn)), len(knn_brute) 115 | t1+=(toc1-tic1)*1000 116 | 117 | tic1 = time.clock() 118 | d_list,knn_list = neigh.kneighbors(X=sample_vec, n_neighbors=k+1, return_distance=True) 119 | dist_n_list = [] 120 | for d,n in zip(d_list,knn_list): 121 | dist_n_list+=list(zip(n,d))[1:] 122 | knn_sklearn= [w for w,_ in nsmallest(k,dist_n_list,key=lambda x: x[1])] 123 | toc1 = time.clock() 124 | print "sklearn, tree query in", (toc1-tic1)*1000 125 | if knn_sklearn!=knn_brute: 126 | print "sklearn, not same points", "same", len(set(knn_brute)&set(knn_sklearn)), "out of", k 127 | else: 128 | print "same", len(set(knn_brute)&set(knn_sklearn)) 129 | t2+=(toc1-tic1)*1000 130 | 131 | print "tree query in, avg, kdtree", t*1./N, "brute", t1*1./N, "sklearn", t2*1./N 132 | """ 133 | for i in random.sample(range(0,M),N): 134 | tic = time.clock() 135 | _,knn = kd.query(vec[i],k=k) #, eps=eps) 136 | toc = time.clock() 137 | # print i, knn 138 | # print "tree query in", (toc-tic)*1000 139 | t+=(toc-tic)*1000 140 | 141 | tic1 = time.clock() 142 | knn_brute = get_Nranked_list([i],k) 143 | toc1 = time.clock() 144 | # print i, knn_brute 145 | # print "tree query in", (toc1-tic1)*1000 146 | if knn_brute!=list(knn): 147 | print "not same points", "same", len(set(knn_brute)&set(list(knn))), "out of", k 148 | t1+=(toc1-tic1)*1000 149 | print "tree query in, avg, kdtree", t*1./N, "brute", t1*1./N 150 | """ -------------------------------------------------------------------------------- /user_vector_training/filter_hashtag_sequence.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | """ 8 | min_tweets_sequence = 2 9 | selected_users = set() 10 | with open("userSubset.csv","r") as fr: 11 | for line in fr: 12 | line = line.rstrip() 13 | u = line.split(',') 14 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 15 | selected_users.add(id) 16 | 17 | m = dict() 18 | fr = open("/twitterSimulations/graph/map.txt") 19 | for line in fr: 20 | line = line.rstrip() 21 | u = line.split(' ') 22 | m[int(u[0])] = int(u[1]) 23 | fr.close() 24 | print 'Map Read' 25 | 26 | adoption_sequence = dict() 27 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 28 | for line in fr: 29 | line = line.rstrip() 30 | u = line.split('\t') 31 | tag = u[0] 32 | time = int(u[1]) 33 | author = m[int(u[2])] 34 | if author not in selected_users: 35 | continue 36 | try: 37 | adoption_sequence[tag].append((time,author)) 38 | except KeyError: 39 | adoption_sequence[tag]=[(time,author)] 40 | print len(adoption_sequence) 41 | 42 | with open('hashtagAdoptionSequences_filter.txt','wb') as fd: 43 | for tag in adoption_sequence.keys(): 44 | if len(adoption_sequence[tag])>=min_tweets_sequence: 45 | fd.write(tag) 46 | for t,a in adoption_sequence[tag]: 47 | fd.write(' '+str(t)+','+str(a)) #author is of type str for using join 48 | fd.write('\n') 49 | """ 50 | #separate sequences into training (80%) and 
test sequences (20%) 51 | """ 52 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 53 | adoption_sequence = [] 54 | large_tag_id = [] 55 | count=0 56 | with open(adoption_sequence_filename, 'r') as fr: 57 | for line in fr: 58 | line = line.rstrip() 59 | u = line.split(' ') 60 | #tag = u[0] 61 | sequence = [] 62 | if len(u)-1>=100000: 63 | large_tag_id.append(count) 64 | for i in range(1, len(u)): 65 | #timestamp = int(u[i][0:u[i].index(',')]) 66 | author = int(u[i][u[i].index(',')+1 : ]) 67 | sequence.append(author) 68 | adoption_sequence.append(sequence) 69 | count+=1 70 | 71 | num_lines = len(adoption_sequence) #3617312 72 | print num_lines 73 | seq_random_index=range(0,num_lines) 74 | random.shuffle(seq_random_index) 75 | num_train = int(0.8*num_lines) 76 | print num_train 77 | train_seq_id = seq_random_index[:num_train] 78 | test_seq_id = seq_random_index[num_train:] 79 | with open("sequence_file_split_indices.pickle","wb") as fd: 80 | pickle.dump(train_seq_id,fd) 81 | pickle.dump(test_seq_id,fd) 82 | users_train=set() 83 | for i in train_seq_id: 84 | for u in adoption_sequence[i]: 85 | users_train.add(u) 86 | users_test=set() 87 | overlap = set() 88 | for i in test_seq_id: 89 | for u in adoption_sequence[i]: 90 | users_test.add(u) 91 | if u in users_train: 92 | overlap.add(u) 93 | print len(users_train), len(users_test), len(overlap) 94 | with open("sequence_file_split_users.pickle","wb") as fd: 95 | pickle.dump(users_train,fd) 96 | pickle.dump(users_test,fd) 97 | 98 | # with open("sequence_large_hashtags.pickle","wb") as fd: 99 | # pickle.dump(large_tag_id,fd) 100 | """ 101 | # filter follower files for users in adoption sequence 102 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 103 | adoption_sequence_users = set() 104 | count=0 105 | with open(adoption_sequence_filename, 'r') as fr: 106 | for line in fr: 107 | line = line.rstrip() 108 | u = line.split(' ') 109 | #tag = u[0] 110 | for i in range(1, len(u)): 111 | #timestamp = int(u[i][0:u[i].index(',')]) 112 | author = int(u[i][u[i].index(',')+1 : ]) 113 | adoption_sequence_users.add(author) 114 | count+=1 115 | print len(adoption_sequence_users), count 116 | 117 | m = dict() 118 | fr = open("/twitterSimulations/graph/map.txt") 119 | for line in fr: 120 | line = line.rstrip() 121 | u = line.split(' ') 122 | m[int(u[0])] = int(u[1]) 123 | fr.close() 124 | print 'Map Read' 125 | 126 | # arr = ["user_followers_bigger_graph_recrawl_3.txt"] 127 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 128 | 129 | follower_adj = [ [] for i in xrange(0, 7697889) ] 130 | 131 | for i in arr: 132 | fr = open("/twitterSimulations/graph/" + i,'r') 133 | for line in fr: 134 | line = line.rstrip() 135 | u = line.split(' ') 136 | if(int(u[0]) > 7697889): 137 | continue 138 | if len(u) > 2: 139 | for j in range(2,len(u)): 140 | follower_adj[m[int(u[1])]].append(m[int(u[j])]) 141 | fr.close() 142 | print i 143 | 144 | print 'Graph Read\n' 145 | 146 | # for i in range(0, 7697889): 147 | # follower_adj[i] = set(follower_adj[i]) 148 | 149 | print 'Graph Set\n' 150 | 151 | with open("graph_files/follower_graph_tweeters","wb") as fd: 152 | for i in follower_adj: 153 | if i in adoption_sequence_users: 154 | fol = set(follower_adj[i])&adoption_sequence_users 
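# NOTE: the enclosing loop iterates over the adjacency lists themselves, but
# follower_adj is a list indexed by mapped user id, so the filter is
# presumably meant to run over user indices. A minimal index-based sketch of
# the intended filtering, under that assumption (helper name is illustrative;
# the output format mirrors the write below):
def write_filtered_followers(follower_adj, adoption_sequence_users, out_path):
    with open(out_path, "wb") as fd:
        for uid in xrange(len(follower_adj)):
            if uid in adoption_sequence_users:
                fol = set(follower_adj[uid]) & adoption_sequence_users
                fol = map(str, fol)
                fd.write(str(len(fol)) + " " + str(uid) + " " + " ".join(fol) + "\n")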
155 | fol = map(str,list(fol)) 156 | fd.write(str(len(fol))+" "+str(i)+" "+" ".join(fol)+"\n") 157 | -------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/query_nearest_users.py: -------------------------------------------------------------------------------- 1 | #query users nearest to a given user using node vectors file from distance-filewrite.c file and compare with users nearby in hashtag sequence file 2 | 3 | import cPickle as pickle 4 | import random 5 | import os, sys, datetime 6 | from heapq import nlargest 7 | from distance_w2v import * 8 | 9 | start_time = datetime.datetime.now() 10 | 11 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 12 | time_diff_for_edge = 1*1*60*60 #5 context width for path in one direction 13 | vec_file = "../node_vectors_1hr_bfs_sgng.txt" 14 | vocab_file = "../node_vocab_1hr_bfs_sgng.txt" 15 | out_file = "nearest_users_compare1hr_bfs_sgng.pickle" 16 | vec,vocab_ind,_ = read_vector_file(vec_file) 17 | 18 | m = dict() 19 | fr = open("/twitterSimulations/graph/map.txt") 20 | for line in fr: 21 | line = line.rstrip() 22 | u = line.split(' ') 23 | m[int(u[0])] = int(u[1]) 24 | fr.close() 25 | print 'Map Read' 26 | 27 | location_buckets = [-1] * 7697889 28 | fr = open('/twitterSimulations/known_locations.txt', 'r') 29 | for line in fr: 30 | line = line.rstrip() 31 | u = line.split('\t') 32 | try: 33 | location_buckets[m[int(u[0])]] = int(u[1]) 34 | except: 35 | pass 36 | fr.close() 37 | 38 | fr = open('/twitterSimulations/known_locations1.txt', 'r') 39 | for line in fr: 40 | line = line.rstrip() 41 | u = line.split('\t') 42 | try: 43 | location_buckets[m[int(u[0])]] = int(u[1]) 44 | except: 45 | pass 46 | fr.close() 47 | print "location file read" 48 | 49 | # def call_distance(word): 50 | # return os.system("./distance-filewrite ../node_vectors_1hr_bfs_15.bin query_output_temp1hr_bfs_15 "+str(word)) 51 | 52 | # def get_nearest(): 53 | # nearest = [] 54 | # with open("query_output_temp1hr_bfs_15","rb") as fr: 55 | # for line in fr: 56 | # line=line.rstrip().split('\t') 57 | # nearest.append(int(line[0])) 58 | # return nearest 59 | 60 | def compare_nearest(seq,w2v): 61 | return len(set(seq)&set(w2v)) 62 | 63 | vocab = [] 64 | freq = dict() 65 | with open(vocab_file,"rb") as fr: 66 | next(fr) 67 | for line in fr: 68 | line=line.rstrip().split(' ') 69 | vocab.append(int(line[0])) 70 | freq[int(line[0])]=int(line[1]) 71 | print "Vocab read" 72 | 73 | sub_vocab=[] 74 | for v in vocab: 75 | if freq[v]>10000: 76 | sub_vocab.append(v) 77 | rand_users = random.sample(vocab,100) 78 | rand_users_set = set(rand_users) 79 | vocab = set(vocab) 80 | print "Sample selected" 81 | 82 | near_count = [[0]*7697889 for i in xrange(0,100)] 83 | 84 | tagcount=0 85 | with open(adoption_sequence_filename, 'r') as fr: 86 | for line in fr: 87 | line = line.rstrip() 88 | u = line.split(' ') 89 | for i in range(1, len(u)): 90 | timestamp = int(u[i][0:u[i].index(',')]) 91 | author = int(u[i][u[i].index(',')+1 : ]) 92 | location_user = location_buckets[author] 93 | if author in rand_users_set: 94 | for j in range(i+1, len(u)): 95 | t1 = int(u[j][0:u[j].index(',')]) 96 | a1 = int(u[j][u[j].index(',')+1 : ]) 97 | if t1-timestamp<=time_diff_for_edge: 98 | if location_buckets[a1]==location_user: 99 | near_count[rand_users.index(author)][a1]+=1 100 | else: 101 | break 102 | for j in range(i-1, 0, -1): 103 | t1 = int(u[j][0:u[j].index(',')]) 104 | a1 = 
int(u[j][u[j].index(',')+1 : ]) 105 | if timestamp-t1<=time_diff_for_edge: 106 | if location_buckets[a1]==location_user: 107 | near_count[rand_users.index(author)][a1]+=1 108 | else: 109 | break 110 | tagcount+=1 111 | if tagcount%100000==0: 112 | print "Hashtag count", tagcount 113 | print "Sequence file read" 114 | 115 | near_users_seq = dict() 116 | for i in range(0,len(rand_users)): 117 | user_count = near_count[i] 118 | count = [] 119 | for l in vocab:#xrange(0,7697889): 120 | if user_count[l]!=0 and l!=rand_users[i]: 121 | count.append((l,user_count[l])) 122 | #count = zip(range(0,7697889),near_count[i]) 123 | #count_nz = [(a,b) for (a,b) in count if b!=0] 124 | #count_s = sorted(count_nz,key=lambda x: x[1],reverse=True)[0:100] 125 | #count_s = sorted(range(0,7697889),key=lambda x: user_count[x],reverse=True) 126 | count_s = nlargest(100,count,key=lambda x: x[1]) 127 | if len(count_s)==0: 128 | u,c = [], [] 129 | else: 130 | u,c = zip(*count_s) 131 | near_users_seq[rand_users[i]]=list(u) 132 | print "sel count", rand_users[i], len(u), "non zero", len(count) 133 | 134 | nearest_users_w2v_pickle = dict() 135 | count_pickle = [] 136 | for user in rand_users: 137 | # a = call_distance(user) 138 | # if a!=0: 139 | # print "call error" 140 | # sys.exit(0) 141 | # nearest_users_w2v = get_nearest() 142 | nearest_users_w2v = get_Nnearest(user,vec,vocab_ind,100) 143 | comp_count = compare_nearest(near_users_seq[user][0:100],nearest_users_w2v[0:100]) 144 | print "common users", user, comp_count, "out of", min(len(near_users_seq[user]),len(nearest_users_w2v)) 145 | count_pickle.append(comp_count) 146 | nearest_users_w2v_pickle[user]=nearest_users_w2v 147 | 148 | with open(out_file,"wb") as fd: 149 | pickle.dump(rand_users,fd) 150 | pickle.dump(count_pickle,fd) 151 | pickle.dump(near_users_seq,fd) 152 | pickle.dump(nearest_users_w2v_pickle,fd) 153 | 154 | print start_time, datetime.datetime.now() -------------------------------------------------------------------------------- /user_vector_training/helpers/test_distance.py: -------------------------------------------------------------------------------- 1 | from heapq import nsmallest,nlargest 2 | from math import sqrt 3 | vec = [(1,1),(4,2),(2,2),(3,2),(3,3),(4,4),(2,3)] 4 | for i in range(0,7): 5 | a,b=vec[i] 6 | l=float(sqrt(a**2+b**2)) 7 | vec[i]=(a/l,b/l) 8 | vocab = [1,2,3,4,5,6,7] 9 | dim = 2 10 | par_m = 2 11 | vocab_index=dict() 12 | for i in xrange(0,len(vocab)): 13 | vocab_index[vocab[i]]=i 14 | query_set = [3,4] 15 | N=4 16 | 17 | def get_Nranked_list(query_set,N): 18 | # wordN = [0]*N 19 | # distN = [0.0]*N 20 | dist_total = [] 21 | set_size = len(query_set) 22 | try: 23 | query_set_ind = [ vocab_index[query] for query in query_set ] 24 | except KeyError: 25 | print "query word not present" 26 | return 27 | print query_set_ind 28 | for i in xrange(0,len(vec)): 29 | if i in query_set_ind: 30 | continue 31 | pres_word = vocab[i] 32 | pres_vec = vec[i] 33 | dist_k = [0.0]*set_size 34 | k=0 35 | for voc_ind in query_set_ind: 36 | user_vec = vec[voc_ind] 37 | #Euclidean distance, cosine similarity user_vec[x]*pres_vec[x], change to decreasing order of distance in sorted,distN 38 | print user_vec,pres_vec 39 | dist = 1- sum((user_vec[x]*pres_vec[x]) for x in xrange(0,dim)) 40 | dist_k[k]=sqrt(float(2*dist)) 41 | k+=1 42 | # dist = 0.0 43 | # for x in xrange(0,dim): 44 | # dist+=(user_vec[x]-pres_vec[x])**2 45 | #distance of a point from a set 46 | # dist_k_sorted = sorted(dist_k) 47 | print i,dist_k 48 | nearest_k = min(dist_k) # 
dist_k_sorted[0] # if sorted not needed 49 | if nearest_k!=0.0: 50 | dist_set=sum( (nearest_k/dist_k[p])**(par_m) for p in xrange(0,set_size) ) 51 | dist_set = nearest_k * (dist_set)**(1.0/set_size) 52 | else: 53 | dist_set=0.0 54 | print i,dist_set 55 | dist_total.append((pres_word,dist_set)) 56 | # for j in xrange(0,N): 57 | # if dist>distN[j]: 58 | # for k in xrange(N-1,j,-1): 59 | # distN[k] = distN[k-1] 60 | # wordN[k] = wordN[k-1] 61 | # distN[j] = dist 62 | # wordN[j] = pres_word 63 | # break 64 | print dist_total 65 | wordN = [w for w,_ in nsmallest(N,dist_total,key=lambda x: x[1])] 66 | return wordN #zip(wordN,distN) 67 | 68 | print get_Nranked_list(query_set,N) 69 | 70 | adj = {1:set([2,3]),2:set([1,3,5,6]),3:set([1]),4:set([5]),5:set([1,2]),6:set([1])} 71 | nb_seq_order = [3,4,5,1,2,6,7] 72 | def getadj(user): 73 | return adj[user] 74 | def get_Nranked_list_fol(query_set,N): 75 | friend_count = dict() 76 | init_adopters = query_set 77 | sec_hop = 2 78 | while (sec_hop>0): 79 | for a in init_adopters: 80 | followers = getadj(a) 81 | print a,followers 82 | for f in followers-set(query_set): 83 | try: 84 | friend_count[f]+=1 85 | except KeyError: 86 | friend_count[f]=1 87 | init_adopters = friend_count.keys() 88 | sec_hop-=1 89 | print friend_count 90 | friend_count_list = [(f,friend_count[f]) for f in friend_count] 91 | print friend_count_list 92 | ranked_list = [f for f,_ in nlargest(N,friend_count_list,key=lambda x: x[1])] 93 | print ranked_list 94 | if len(friend_count_list)>=N: 95 | return ranked_list 96 | else: 97 | print "followers ranked list short" 98 | users_left = N-len(friend_count_list) 99 | for i in nb_seq_order: 100 | if i not in friend_count and i not in query_set: 101 | ranked_list.append(i) 102 | users_left-=1 103 | if users_left==0: 104 | break 105 | return ranked_list 106 | 107 | print get_Nranked_list_fol(query_set,N) 108 | 109 | num_init_adopters=2 110 | N = 3 111 | seq_sample_vocab = [3,4,1,7,2] 112 | init_adopters=seq_sample_vocab[0:num_init_adopters] 113 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 114 | M = len(seq_sample_vocab) 115 | print M, "pred seq length" 116 | #precision, recall evaluation 117 | adopters_vec = get_Nranked_list(init_adopters,N) 118 | print adopters_vec 119 | precision_k = 0.0 120 | num_hits = 0.0 121 | for k,p in enumerate(adopters_vec): 122 | if p in seq_sample_vocab: 123 | num_hits+=1.0 124 | precision_k += num_hits/(k+1.0) 125 | average_precision = precision_k/min(M,N) 126 | # prec_r = num_hits/M 127 | prec_k = num_hits/N 128 | rec_k = num_hits/M 129 | print "Avg precision", average_precision, "adopters in seq", len(seq_sample_vocab) 130 | # print "RPrecision", prec_r 131 | print "Precision", prec_k, "Recall", rec_k 132 | 133 | adoption_sequence_filename="ab.txt" 134 | seq_len_threshold=3 135 | def read_adoption_sequence(adoption_sequence_filename, start, end,train_seq_id,large_tag_id): 136 | with open(adoption_sequence_filename, 'r') as fr: 137 | count=0 138 | for line in fr: 139 | if count < start: 140 | count+=1 141 | continue 142 | elif count >= end: 143 | return 144 | if count not in train_seq_id or count in large_tag_id: 145 | count+=1 146 | continue 147 | count+=1 148 | line = line.rstrip() 149 | u = line.split(' ') 150 | tag = u[0] 151 | sequence = [] 152 | adopters = set() 153 | for i in range(1, len(u)): 154 | timestamp = int(u[i][0:u[i].index(',')]) 155 | author = int(u[i][u[i].index(',')+1 : ]) 156 | sequence.append((timestamp,author)) 157 | adopters.add(author) 158 | if len(adopters) < 
seq_len_threshold: 159 | continue 160 | yield (tag,sequence) 161 | for i in read_adoption_sequence(adoption_sequence_filename, 0, 4,set([0,1,3]),[]): 162 | print i -------------------------------------------------------------------------------- /Untitled: -------------------------------------------------------------------------------- 1 | /dbresearch2/sahil/node_vectors_1hr_pr.txt 0.1 True 2 | num users in train sequences 2574807 3 | Map Read 4 | Follower file offset Read 5 | 6 | Friend file offset Read 7 | 8 | 2157571 2574807 100 9 | 100000 900000 10 | /usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples. 11 | 'precision', 'predicted', average, warn_for) 12 | /usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 13 | 'precision', 'predicted', average, warn_for) 14 | ((0.0023275735255466729, 0.19630666666666666), 176676, (0.0013922458628841607, 0.19630666666666666), (0.0070921985815602835, 0.19630666666666666)) 6 15 | ((0.11315118094706672, 0.4070988888888889), 366389, (0.15637680750854319, 0.4070988888888889), (0.11585482553372159, 0.4070988888888889)) 16 | Counter({6: 339200, 38: 126921, 2: 108161, 5: 101815, 50: 28857, 49: 26452, 27: 26401, 40: 14987, 52: 14302, 42: 13881, 16: 12945, 25: 11053, 129: 11022, 22: 10750, 19: 7019, 28: 4531, 104: 4220, 120: 4090, 68: 3577, 94: 3366, 35: 3239, 65: 3121, 93: 2907, 20: 2378, 76: 2361, 88: 1431, 62: 1340, 55: 1252, 64: 1079, 79: 754, 102: 596, 110: 507, 106: 504, 91: 495, 37: 465, 111: 415, 60: 316, 95: 314, 47: 301, 127: 263, 100: 215, 45: 182, 139: 172, 14: 104, 99: 100, 89: 99, 84: 97, 41: 91, 122: 87, 72: 86, 118: 85, 48: 78, 53: 77, 10: 74, 36: 70, 54: 67, 71: 53, 39: 50, 82: 43, 103: 43, 12: 41, 105: 38, 107: 37, 33: 33, 32: 31, 134: 31, 108: 30, 75: 28, 132: 28, 109: 22, 80: 21, 67: 20, 101: 20, 58: 19, 87: 15, 128: 15, 9: 12, 96: 12, 123: 11, 116: 10, 114: 9, 66: 8, 140: 8, 30: 6, 44: 6, 135: 6, 21: 3, 86: 3, 113: 3, 0: 2, 83: 2, 125: 2, 136: 2, 138: 2, 4: 1, 133: 1, 137: 1}) 97 17 | ((0.13520330600572122, 0.40737222222222225), 366635, (0.21112793377240058, 0.40737222222222225), (0.12320217398439315, 0.40737222222222225)) 18 | Counter({6: 355749, 38: 117134, 5: 109981, 2: 106147, 50: 24001, 49: 22807, 27: 22636, 40: 15325, 52: 12153, 42: 11663, 16: 11388, 22: 9930, 25: 9828, 129: 7313, 19: 6326, 28: 4241, 60: 3487, 120: 3362, 104: 3152, 94: 3145, 35: 3029, 68: 2946, 65: 2617, 127: 2487, 20: 2288, 93: 2087, 91: 1973, 76: 1866, 62: 1514, 106: 1400, 45: 1354, 30: 1180, 55: 1122, 102: 1092, 88: 1054, 79: 1013, 64: 901, 110: 855, 111: 802, 123: 761, 36: 702, 37: 611, 108: 565, 47: 547, 14: 544, 0: 512, 139: 510, 66: 403, 4: 380, 112: 328, 134: 311, 89: 298, 58: 278, 77: 219, 73: 206, 95: 130, 116: 129, 41: 118, 67: 112, 81: 109, 43: 96, 119: 72, 61: 71, 113: 71, 56: 67, 63: 59, 74: 59, 48: 55, 26: 51, 100: 50, 24: 48, 3: 45, 7: 37, 18: 24, 1: 16, 92: 13, 121: 9, 98: 8, 31: 6, 126: 6, 17: 4, 124: 3, 15: 2, 21: 2, 57: 2, 131: 2, 78: 1}) 87 19 | fol 100000 20 | fol 200000 21 | fol 300000 22 | fol 400000 23 | fol 500000 24 | fol 600000 25 | fol 700000 26 | fol 800000 27 | fol 900000 28 | fol based pred 1738513.587 29 | ((0.18500498224648235, 0.38802111111111109), 349219, (0.27468649543097362, 0.38802111111111109), (0.15741696575059896, 0.38802111111111109)) 30 | Counter({6: 369040, 2: 
111260, 5: 91384, 38: 73751, 40: 21811, 49: 19016, 50: 18591, 27: 14546, 21: 12674, 4: 11865, 16: 10763, 25: 10510, 42: 9603, 0: 9439, 52: 8556, 129: 8479, 22: 7131, 3: 5310, 19: 4940, 35: 4713, 1: 4597, 23: 3886, 60: 3772, 34: 3657, 120: 3590, 127: 3131, 30: 3078, 123: 2518, 68: 2444, 45: 2288, 20: 2281, 28: 2181, 36: 1966, 93: 1713, 56: 1591, 65: 1561, 91: 1525, 112: 1477, 64: 1442, 76: 1428, 94: 1381, 55: 1290, 104: 1174, 47: 1112, 106: 998, 139: 982, 108: 974, 43: 937, 111: 909, 37: 890, 81: 887, 73: 878, 14: 805, 134: 795, 79: 789, 62: 748, 24: 696, 121: 687, 74: 625, 88: 609, 110: 583, 51: 524, 61: 399, 66: 395, 102: 355, 67: 343, 48: 340, 89: 325, 41: 324, 78: 294, 57: 258, 58: 252, 95: 249, 17: 242, 26: 239, 77: 233, 131: 233, 113: 219, 8: 208, 96: 183, 92: 179, 7: 178, 63: 160, 119: 152, 124: 134, 31: 111, 18: 106, 11: 92, 59: 92, 83: 91, 15: 90, 126: 73, 29: 54, 116: 53, 98: 49, 100: 45, 117: 31, 72: 29, 46: 28, 70: 27, 33: 24, 122: 24, 87: 21, 105: 20, 32: 18, 44: 18, 75: 17, 53: 16, 69: 16, 10: 14, 54: 12, 71: 12, 80: 12, 128: 12, 84: 11, 82: 10, 103: 10, 12: 9, 13: 9, 101: 9, 140: 8, 99: 7, 107: 7, 135: 6, 9: 4, 39: 4, 125: 4, 86: 3, 114: 3, 118: 3, 132: 3, 136: 3, 138: 2, 97: 1, 133: 1, 137: 1}) 136 161031 31 | limit pred ((0.21452624247316587, 0.43708464089833271), 322992, (0.27552783828129274, 0.43708464089833265), (0.19807627055156674, 0.43708464089833265)) cov 0.821076666667 32 | fr 100000 33 | fr 200000 34 | fr 300000 35 | fr 400000 36 | fr 500000 37 | fr 600000 38 | fr 700000 39 | fr 800000 40 | fr 900000 41 | ((0.17216425039620847, 0.41254777777777779), 371293, (0.3000193087788961, 0.41254777777777779), (0.14673950069699471, 0.41254777777777779)) 42 | Counter({6: 352261, 2: 166024, 38: 88391, 5: 74008, 49: 22190, 50: 22092, 27: 17790, 40: 13809, 16: 10948, 52: 10829, 42: 9951, 25: 9719, 21: 9543, 129: 8532, 22: 7597, 4: 6247, 19: 5832, 0: 5617, 35: 3601, 120: 2973, 1: 2933, 28: 2498, 93: 2460, 20: 2339, 65: 2308, 68: 1985, 76: 1941, 3: 1932, 127: 1805, 30: 1796, 45: 1656, 104: 1466, 112: 1366, 94: 1303, 60: 1241, 55: 1234, 123: 1208, 23: 1125, 62: 1103, 56: 1093, 64: 1074, 36: 983, 34: 975, 106: 970, 110: 846, 134: 708, 43: 679, 81: 676, 91: 662, 89: 661, 47: 658, 37: 594, 139: 576, 14: 560, 111: 558, 108: 525, 79: 512, 24: 401, 73: 375, 57: 303, 61: 284, 121: 249, 41: 246, 48: 238, 74: 193, 95: 189, 88: 186, 78: 179, 8: 178, 66: 170, 102: 170, 124: 136, 113: 130, 58: 120, 26: 109, 77: 108, 119: 95, 51: 90, 17: 88, 29: 70, 67: 70, 63: 62, 126: 52, 70: 46, 31: 44, 92: 44, 15: 43, 18: 42, 131: 34, 7: 30, 11: 29, 83: 29, 59: 22, 117: 21, 96: 18, 32: 16, 116: 16, 98: 13, 87: 10, 33: 9, 69: 9, 100: 9, 46: 7, 105: 7, 103: 6, 75: 5, 71: 4, 39: 3, 54: 3, 72: 3, 122: 3, 128: 3, 132: 3, 12: 2, 44: 2, 114: 2, 10: 1, 13: 1, 53: 1, 82: 1, 84: 1, 97: 1, 99: 1, 101: 1, 140: 1}) 125 90000 43 | limit pred ((0.18509957355080509, 0.43739382716049385), 354289, (0.30040341757656785, 0.43739382716049385), (0.16387609625493194, 0.43739382716049385)) cov 0.9 44 | fol 18 77 fr 59087 4784 59 45 | /dbresearch2/sahil/node_vectors_1hr_pr.txt 0.1 True 46 | -------------------------------------------------------------------------------- /tsne_plots/tsne.py: -------------------------------------------------------------------------------- 1 | # 2 | # tsne.py 3 | # 4 | # Implementation of t-SNE in Python. The implementation was tested on Python 2.5.1, and it requires a working 5 | # installation of NumPy. The implementation comes with an example on the MNIST dataset. 
In order to plot the 6 | # results of this example, a working installation of matplotlib is required. 7 | # The example can be run by executing: ipython tsne.py -pylab 8 | # 9 | # 10 | # Created by Laurens van der Maaten on 20-12-08. 11 | # Copyright (c) 2008 Tilburg University. All rights reserved. 12 | 13 | import numpy as Math 14 | import pylab as Plot 15 | 16 | def Hbeta(D = Math.array([]), beta = 1.0): 17 | """Compute the perplexity and the P-row for a specific value of the precision of a Gaussian distribution.""" 18 | 19 | # Compute P-row and corresponding perplexity 20 | P = Math.exp(-D.copy() * beta); 21 | sumP = sum(P); 22 | H = Math.log(sumP) + beta * Math.sum(D * P) / sumP; 23 | P = P / sumP; 24 | return H, P; 25 | 26 | 27 | def x2p(X = Math.array([]), tol = 1e-5, perplexity = 30.0): 28 | """Performs a binary search to get P-values in such a way that each conditional Gaussian has the same perplexity.""" 29 | 30 | # Initialize some variables 31 | print "Computing pairwise distances..." 32 | (n, d) = X.shape; 33 | sum_X = Math.sum(Math.square(X), 1); 34 | D = Math.add(Math.add(-2 * Math.dot(X, X.T), sum_X).T, sum_X); 35 | P = Math.zeros((n, n)); 36 | beta = Math.ones((n, 1)); 37 | logU = Math.log(perplexity); 38 | 39 | # Loop over all datapoints 40 | for i in range(n): 41 | 42 | # Print progress 43 | if i % 500 == 0: 44 | print "Computing P-values for point ", i, " of ", n, "..." 45 | 46 | # Compute the Gaussian kernel and entropy for the current precision 47 | betamin = -Math.inf; 48 | betamax = Math.inf; 49 | Di = D[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))]; 50 | (H, thisP) = Hbeta(Di, beta[i]); 51 | 52 | # Evaluate whether the perplexity is within tolerance 53 | Hdiff = H - logU; 54 | tries = 0; 55 | while Math.abs(Hdiff) > tol and tries < 50: 56 | 57 | # If not, increase or decrease precision 58 | if Hdiff > 0: 59 | betamin = beta[i]; 60 | if betamax == Math.inf or betamax == -Math.inf: 61 | beta[i] = beta[i] * 2; 62 | else: 63 | beta[i] = (beta[i] + betamax) / 2; 64 | else: 65 | betamax = beta[i]; 66 | if betamin == Math.inf or betamin == -Math.inf: 67 | beta[i] = beta[i] / 2; 68 | else: 69 | beta[i] = (beta[i] + betamin) / 2; 70 | 71 | # Recompute the values 72 | (H, thisP) = Hbeta(Di, beta[i]); 73 | Hdiff = H - logU; 74 | tries = tries + 1; 75 | 76 | # Set the final row of P 77 | P[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))] = thisP; 78 | 79 | # Return final P-matrix 80 | print "Mean value of sigma: ", Math.mean(Math.sqrt(1 / beta)) 81 | return P; 82 | 83 | 84 | def pca(X = Math.array([]), no_dims = 50): 85 | """Runs PCA on the NxD array X in order to reduce its dimensionality to no_dims dimensions.""" 86 | 87 | print "Preprocessing the data using PCA..." 88 | (n, d) = X.shape; 89 | X = X - Math.tile(Math.mean(X, 0), (n, 1)); 90 | (l, M) = Math.linalg.eig(Math.dot(X.T, X)); 91 | Y = Math.dot(X, M[:,0:no_dims]); 92 | return Y; 93 | 94 | 95 | def tsne(X = Math.array([]), no_dims = 2, initial_dims = 50, perplexity = 30.0): 96 | """Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions. 97 | The syntaxis of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array.""" 98 | 99 | # Check inputs 100 | if X.dtype != "float64": 101 | print "Error: array X should have type float64."; 102 | return -1; 103 | #if no_dims.__class__ != "": # doesn't work yet! 
104 | # print "Error: number of dimensions should be an integer."; 105 | # return -1; 106 | 107 | # Initialize variables 108 | X = pca(X, initial_dims); 109 | (n, d) = X.shape; 110 | max_iter = 1000; 111 | initial_momentum = 0.5; 112 | final_momentum = 0.8; 113 | eta = 500; 114 | min_gain = 0.01; 115 | Y = Math.random.randn(n, no_dims); 116 | dY = Math.zeros((n, no_dims)); 117 | iY = Math.zeros((n, no_dims)); 118 | gains = Math.ones((n, no_dims)); 119 | 120 | # Compute P-values 121 | P = x2p(X, 1e-5, perplexity); 122 | P = P + Math.transpose(P); 123 | P = P / Math.sum(P); 124 | P = P * 4; # early exaggeration 125 | P = Math.maximum(P, 1e-12); 126 | 127 | # Run iterations 128 | for iter in range(max_iter): 129 | 130 | # Compute pairwise affinities 131 | sum_Y = Math.sum(Math.square(Y), 1); 132 | num = 1 / (1 + Math.add(Math.add(-2 * Math.dot(Y, Y.T), sum_Y).T, sum_Y)); 133 | num[range(n), range(n)] = 0; 134 | Q = num / Math.sum(num); 135 | Q = Math.maximum(Q, 1e-12); 136 | 137 | # Compute gradient 138 | PQ = P - Q; 139 | for i in range(n): 140 | dY[i,:] = Math.sum(Math.tile(PQ[:,i] * num[:,i], (no_dims, 1)).T * (Y[i,:] - Y), 0); 141 | 142 | # Perform the update 143 | if iter < 20: 144 | momentum = initial_momentum 145 | else: 146 | momentum = final_momentum 147 | gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * ((dY > 0) == (iY > 0)); 148 | gains[gains < min_gain] = min_gain; 149 | iY = momentum * iY - eta * (gains * dY); 150 | Y = Y + iY; 151 | Y = Y - Math.tile(Math.mean(Y, 0), (n, 1)); 152 | 153 | # Compute current value of cost function 154 | if (iter + 1) % 10 == 0: 155 | C = Math.sum(P * Math.log(P / Q)); 156 | print "Iteration ", (iter + 1), ": error is ", C 157 | 158 | # Stop lying about P-values 159 | if iter == 100: 160 | P = P / 4; 161 | 162 | # Return solution 163 | return Y; 164 | 165 | 166 | if __name__ == "__main__": 167 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 168 | print "Running example on 2,500 MNIST digits..." 
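# The example below expects mnist2500_X.txt / mnist2500_labels.txt in the
# working directory. If they are not available, the same call pattern can be
# smoke-tested first on a small synthetic matrix (an illustrative fallback,
# not part of the original example; Math is the numpy import at the top of
# this file):
import os
if not os.path.exists("mnist2500_X.txt"):
    X_demo = Math.random.randn(300, 50)   # 300 points, 50 dimensions, float64
    Y_demo = tsne(X_demo, 2, 50, 30.0)    # tsne(X, no_dims, initial_dims, perplexity)
    print "synthetic demo embedding shape:", Y_demo.shape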
169 | X = Math.loadtxt("mnist2500_X.txt"); 170 | labels = Math.loadtxt("mnist2500_labels.txt"); 171 | Y = tsne(X, 2, 50, 20.0); 172 | fig = Plot.figure() 173 | Plot.scatter(Y[:,0], Y[:,1], 20, labels); 174 | fig.savefig('foo.png') 175 | -------------------------------------------------------------------------------- /misc/test.py: -------------------------------------------------------------------------------- 1 | #TODO 2 | #distribution of number of tweets sharing the same hashtags as the ones used by each user, number of users with atleast 20% common following users and number of reciprocal relations with users in the subset of selected users 3 | #use these features as sentences in word2vec for node representations or try deepwalk on adjacency list of subset of users or use sequence of authors adopting a hashtag for hashtags with atleast 10 adoptions as sentences 4 | import time 5 | import sys 6 | import os 7 | import cPickle as pickle 8 | import random 9 | 10 | 11 | m = dict() 12 | fr = open("/twitterSimulations/graph/map.txt") 13 | for line in fr: 14 | line = line.rstrip() 15 | u = line.split(' ') 16 | m[int(u[0])] = int(u[1]) 17 | fr.close() 18 | 19 | tags_for_user = dict() 20 | num_tweets_per_tag = dict() 21 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 22 | for line in fr: 23 | line = line.rstrip() 24 | u = line.split('\t') 25 | tag = u[0] 26 | author = m[int(u[2])] 27 | if author not in tags_for_user: 28 | tags_for_user[author]=set() 29 | tags_for_user[author].add(tag) 30 | if tag not in num_tweets_per_tag: 31 | num_tweets_per_tag[tag]=0 32 | num_tweets_per_tag[tag]+=1 33 | print len(tags_for_user) 34 | 35 | selected_users = set() 36 | with open("userSubset.csv","r") as fr: 37 | for line in fr: 38 | line = line.rstrip() 39 | u = line.split(',') 40 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 41 | selected_users.add(id) 42 | 43 | #subset follower and friend adjacency list 44 | """ 45 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 46 | follower = dict() 47 | for i in arr: 48 | fr = open("/twitterSimulations/graph/" + i,'r') 49 | for line in fr: 50 | line = line.rstrip() 51 | u = line.split(' ') 52 | if(int(u[0]) > 7697889): 53 | continue 54 | node = m[int(u[1])] 55 | if node not in selected_users: 56 | continue 57 | follower[node] = [] 58 | if len(u) > 2: 59 | for j in range(2,len(u)): 60 | snode = m[int(u[j])] 61 | if snode in selected_users: 62 | follower[node].append(snode) 63 | fr.close() 64 | print i 65 | pickle.dump( follower, open( "subset_follower_graph.pickle", "wb" ) ) 66 | 67 | arr_friend = ["user_friends_bigger_graph.txt","user_friends_bigger_graph_2.txt", "user_friends_bigger_graph_i.txt","user_friends_bigger_graph_recrawl.txt"] 68 | friend = dict() 69 | num_friend_id_not_found=0 70 | friend_id_not_found=set() 71 | for i in arr_friend: 72 | fr = open("/twitterSimulations/graph/" + i,'r') 73 | for line in fr: 74 | line = line.rstrip() 75 | u = line.split(' ') 76 | if(int(u[0]) > 7697889): 77 | continue 78 | try: 79 | node = m[int(u[1])] 80 | except: 81 | num_friend_id_not_found += 1 82 | friend_id_not_found.add(int(u[1])) 83 | continue 84 | if node not in selected_users: 85 | continue 86 | friend[node] = [] 87 | if len(u) > 2: 88 | for j in range(2,len(u)): 89 | try: 90 | snode = m[int(u[j])] 91 | except: 92 | num_friend_id_not_found += 1 93 | 
friend_id_not_found.add(int(u[1])) 94 | continue 95 | if snode in selected_users: 96 | friend[node].append(snode) 97 | fr.close() 98 | print i 99 | pickle.dump( friend, open( "subset_friend_graph.pickle", "wb" ) ) 100 | print num_friend_id_not_found 101 | pickle.dump( friend_id_not_found, open( "friend_id_not_found.pickle", "wb" ) ) 102 | """ 103 | """ 104 | follower = pickle.load( open( "subset_follower_graph.pickle", "rb" ) ) 105 | print "Follower file loaded" 106 | """ 107 | friend = pickle.load( open( "subset_friend_graph.pickle", "rb" ) ) 108 | print "Friend file loaded" 109 | 110 | #number of users with reciprocal links 111 | """ 112 | num_rec = dict() 113 | count=0 114 | for node in selected_users: 115 | count+=1 116 | if count%10000==0: 117 | print count," Users processed" 118 | num_rec[node]=0 119 | # for nbh in friend[node]: 120 | # if node in friend[nbh]: 121 | # num_rec[node]+=1 122 | try: 123 | incoming = set(friend[node]) 124 | outgoing = set(follower[node]) 125 | reciprocal = set.intersection(incoming, outgoing) 126 | num_rec[node]+=len(reciprocal) 127 | except: 128 | pass 129 | pickle.dump( num_rec, open( "num_reciprocal_links.pickle", "wb" ) ) 130 | """ 131 | 132 | num_rec = pickle.load( open( "num_reciprocal_links.pickle", "rb" ) ) 133 | 134 | 135 | def get_intersection(list1,list2): 136 | s = set(list2) 137 | count=0 138 | for i in list1: 139 | if i in s: 140 | count+=1 141 | return count 142 | 143 | #users with more than 20% common friends 144 | num_common_friends = dict() 145 | count=0 146 | num_common_friends_thr = dict() 147 | selected_users_list = random.sample(selected_users,500) 148 | for i in range(0,len(selected_users_list)): 149 | count+=1 150 | if count%1000==0: 151 | print count," Users processed" 152 | node = selected_users_list[i] 153 | adj_nodes = friend[node] 154 | thr = .20*len(adj_nodes) 155 | 156 | # for j in range(i+1,len(selected_users_list)): 157 | # snode = selected_users_list[j] 158 | for snode in selected_users: 159 | 160 | nbh_adj_nodes = friend[snode] 161 | thr_s = .20*len(nbh_adj_nodes) 162 | common = get_intersection(adj_nodes, nbh_adj_nodes) 163 | if common>=thr: 164 | if node not in num_common_friends_thr: 165 | num_common_friends_thr[node]=0 166 | num_common_friends_thr[node]+=1 167 | # if common>=thr_s: 168 | # if snode not in num_common_friends_thr: 169 | # num_common_friends_thr[snode]=0 170 | # num_common_friends_thr[snode]+=1 171 | pickle.dump( num_common_friends_thr, open( "num_common_friends_thr_test.pickle", "wb" ) ) 172 | 173 | 174 | #tweets with same hashtags 175 | num_tweets_with_same_tags = dict() 176 | for node in selected_users: 177 | num_tweets_with_same_tags[node] = sum([num_tweets_per_tag[x] for x in tags_for_user[node]]) 178 | 179 | with open("featuresUserSubset_test.csv","w") as fd: 180 | for i in selected_users_list: 181 | fd.write(str(i)+","+str(num_common_friends_thr[i])+","+str(num_rec[i])+","+str(num_tweets_with_same_tags[i])+"\n") 182 | -------------------------------------------------------------------------------- /user_vector_training/helpers/feature_dist.py: -------------------------------------------------------------------------------- 1 | #TODO 2 | #distribution of number of tweets sharing the same hashtags as the ones used by each user, number of users with atleast 20% common following users and number of reciprocal relations with users in the subset of selected users 3 | #use these features as sentences in word2vec for node representations or try deepwalk on adjacency list of subset of users or use 
sequence of authors adopting a hashtag for hashtags with atleast 10 adoptions as sentences 4 | import time 5 | import sys 6 | import os 7 | import cPickle as pickle 8 | 9 | 10 | m = dict() 11 | fr = open("/twitterSimulations/graph/map.txt") 12 | for line in fr: 13 | line = line.rstrip() 14 | u = line.split(' ') 15 | m[int(u[0])] = int(u[1]) 16 | fr.close() 17 | """ 18 | tags_for_user = dict() 19 | num_tweets_per_tag = dict() 20 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 21 | for line in fr: 22 | line = line.rstrip() 23 | u = line.split('\t') 24 | tag = u[0] 25 | author = m[int(u[2])] 26 | if author not in tags_for_user: 27 | tags_for_user[author]=set() 28 | tags_for_user[author].add(tag) 29 | if tag not in num_tweets_per_tag: 30 | num_tweets_per_tag[tag]=0 31 | num_tweets_per_tag[tag]+=1 32 | print len(tags_for_user) 33 | """ 34 | selected_users = set() 35 | with open("userSubset.csv","r") as fr: 36 | for line in fr: 37 | line = line.rstrip() 38 | u = line.split(',') 39 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 40 | selected_users.add(id) 41 | 42 | #subset follower and friend adjacency list 43 | """ 44 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 45 | follower = dict() 46 | for i in arr: 47 | fr = open("/twitterSimulations/graph/" + i,'r') 48 | for line in fr: 49 | line = line.rstrip() 50 | u = line.split(' ') 51 | if(int(u[0]) > 7697889): 52 | continue 53 | node = m[int(u[1])] 54 | if node not in selected_users: 55 | continue 56 | follower[node] = [] 57 | if len(u) > 2: 58 | for j in range(2,len(u)): 59 | snode = m[int(u[j])] 60 | if snode in selected_users: 61 | follower[node].append(snode) 62 | fr.close() 63 | print i 64 | pickle.dump( follower, open( "subset_follower_graph.pickle", "wb" ) ) 65 | 66 | arr_friend = ["user_friends_bigger_graph.txt","user_friends_bigger_graph_2.txt", "user_friends_bigger_graph_i.txt","user_friends_bigger_graph_recrawl.txt"] 67 | friend = dict() 68 | num_friend_id_not_found=0 69 | friend_id_not_found=set() 70 | for i in arr_friend: 71 | fr = open("/twitterSimulations/graph/" + i,'r') 72 | for line in fr: 73 | line = line.rstrip() 74 | u = line.split(' ') 75 | if(int(u[0]) > 7697889): 76 | continue 77 | try: 78 | node = m[int(u[1])] 79 | except: 80 | num_friend_id_not_found += 1 81 | friend_id_not_found.add(int(u[1])) 82 | continue 83 | if node not in selected_users: 84 | continue 85 | friend[node] = [] 86 | if len(u) > 2: 87 | for j in range(2,len(u)): 88 | try: 89 | snode = m[int(u[j])] 90 | except: 91 | num_friend_id_not_found += 1 92 | friend_id_not_found.add(int(u[1])) 93 | continue 94 | if snode in selected_users: 95 | friend[node].append(snode) 96 | fr.close() 97 | print i 98 | pickle.dump( friend, open( "subset_friend_graph.pickle", "wb" ) ) 99 | print num_friend_id_not_found 100 | pickle.dump( friend_id_not_found, open( "friend_id_not_found.pickle", "wb" ) ) 101 | """ 102 | 103 | follower = pickle.load( open( "subset_follower_graph.pickle", "rb" ) ) 104 | print "Follower file loaded" 105 | """ 106 | friend = pickle.load( open( "subset_friend_graph.pickle", "rb" ) ) 107 | print "Friend file loaded" 108 | """ 109 | #number of users with reciprocal links 110 | """ 111 | num_rec = dict() 112 | count=0 113 | for node in selected_users: 114 | count+=1 115 | if count%10000==0: 116 | print count," Users processed" 117 | 
num_rec[node]=0 118 | # for nbh in friend[node]: 119 | # if node in friend[nbh]: 120 | # num_rec[node]+=1 121 | try: 122 | incoming = set(friend[node]) 123 | outgoing = set(follower[node]) 124 | reciprocal = set.intersection(incoming, outgoing) 125 | num_rec[node]+=len(reciprocal) 126 | except: 127 | pass 128 | pickle.dump( num_rec, open( "num_reciprocal_links.pickle", "wb" ) ) 129 | """ 130 | """ 131 | num_rec = pickle.load( open( "num_reciprocal_links.pickle", "rb" ) ) 132 | """ 133 | #users with more than 20% common friends 134 | num_common_friends = dict() 135 | count=0 136 | for node in selected_users: 137 | count+=1 138 | if count%1000==0: 139 | print count," Users processed" 140 | try: 141 | out_nodes = follower[node] 142 | except: 143 | continue 144 | num_out = len(out_nodes)#len(out_nodes) 145 | for i in range(0,num_out): 146 | out = out_nodes[i] 147 | for j in range(i+1,num_out): 148 | sout = out_nodes[j] 149 | if out>sout: 150 | (out,sout) = (sout,out) 151 | if out not in num_common_friends: 152 | num_common_friends[out]=dict() 153 | num_common_friends[out][sout]=1 154 | elif sout not in num_common_friends[out]: 155 | num_common_friends[out][sout]=1 156 | else: 157 | num_common_friends[out][sout]+=1 158 | 159 | pickle.dump( num_common_friends, open( "num_common_friends.pickle", "wb" ) ) 160 | """ 161 | num_common_friends_thr = dict() 162 | for out in num_common_friends: 163 | for sout in num_common_friends[out]: 164 | thr = .20*len(friend[out]) 165 | thr_s = .20*len(friend[sout]) 166 | common = num_common_friends[out][sout] 167 | if common>=thr: 168 | if out not in num_common_friends_thr: 169 | num_common_friends_thr[out]=0 170 | num_common_friends_thr[out]+=1 171 | if common>=thr_s: 172 | if sout not in num_common_friends_thr: 173 | num_common_friends_thr[sout]=0 174 | num_common_friends_thr[sout]+=1 175 | 176 | pickle.dump( num_common_friends_thr, open( "num_common_friends_thr.pickle", "wb" ) ) 177 | """ 178 | """ 179 | #tweets with same hashtags 180 | num_tweets_with_same_tags = dict() 181 | for node in selected_users: 182 | num_tweets_with_same_tags[node] = sum([num_tweets_per_tag[x] for x in tags_for_user[node]]) 183 | 184 | with open("featuresUserSubset.csv","w") as fd: 185 | for i in selected_users: 186 | fd.write(str(i)+","+str(num_common_friends_thr[i])+","+str(num_rec[i])+","+str(num_tweets_with_same_tags[i])+"\n") 187 | """ -------------------------------------------------------------------------------- /adopter_prediction/adopter_prediction.py: -------------------------------------------------------------------------------- 1 | #get nearest users of the source of a hashtag sequence in test sequences using user vectors and compare with actual adopters in the sequence 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt 6 | import random 7 | 8 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_bfsr.txt" 9 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 10 | with open("sequence_file_split_indices.pickle","rb") as fr: 11 | _ = pickle.load(fr) 12 | test_seq_id = pickle.load(fr) 13 | test_seq_id = set(test_seq_id) 14 | 15 | with open("sequence_file_split_users.pickle","rb") as fr: 16 | users_train = pickle.load(fr) 17 | users_test = pickle.load(fr) 18 | users_test = set(users_test) 19 | 20 | def read_vector_file(path_vectors_file,users_test): 21 | vocab = [] 22 | vectors = [] 23 | with open(path_vectors_file,"rb") as fr: 24 | _,dim = next(fr).rstrip().split(' ') 25 | 
word_vector_dim = int(dim) 26 | next(fr) 27 | for line in fr: 28 | line = line.rstrip() 29 | u = line.split(' ') 30 | if len(u) != word_vector_dim+1: 31 | print "vector length error" 32 | word = int(u[0]) 33 | if word in users_test: 34 | vec = [] 35 | length = 0.0 36 | for d in u[1:]: 37 | num=float(d) 38 | vec.append(num) 39 | length+=num**2 40 | #vec = map(float,u[1:]) 41 | #length = sum(x**2 for x in vec) 42 | length = sqrt(length) 43 | vec_norm = [x/length for x in vec] 44 | vocab.append(word) 45 | vectors.append(vec_norm) 46 | return vectors, vocab, word_vector_dim 47 | 48 | vec,vocab,dim = read_vector_file(vec_file,users_test) 49 | vocab_index=dict() 50 | for i in xrange(0,len(vocab)): 51 | vocab_index[vocab[i]]=i 52 | num_users_test = len(vocab) 53 | # print "num users in test sequences", num_users_test 54 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 55 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 56 | 57 | #Peter Norvig's code for memo 58 | # def memo(f): 59 | # "Memoize function f." 60 | # table = {} 61 | # def fmemo(*args): 62 | # if args not in table: 63 | # table[args] = f(*args) 64 | # return table[args] 65 | # fmemo.memo = table 66 | # return fmemo 67 | # dist_memo = dict() 68 | 69 | # @memo 70 | def get_Nranked_list(query,N): 71 | wordN = [0]*N 72 | distN = [0.0]*N 73 | try: 74 | voc_ind = vocab_index[query] 75 | except KeyError: 76 | print "query word not present" 77 | return 78 | query_vec = vec[voc_ind] 79 | for i in xrange(0,len(vec)): 80 | if i==voc_ind: 81 | continue 82 | pres_word = vocab[i] 83 | pres_vec = vec[i] 84 | dist = 0.0 85 | for x in xrange(0,dim): 86 | dist+=query_vec[x]*pres_vec[x] 87 | #dist = sum(query_vec[x]*pres_vec[x] for x in range(0,dim)) 88 | for j in xrange(0,N): 89 | if dist>distN[j]: 90 | for k in xrange(N-1,j,-1): 91 | distN[k] = distN[k-1] 92 | wordN[k] = wordN[k-1] 93 | distN[j] = dist 94 | wordN[j] = pres_word 95 | break 96 | return wordN #zip(wordN,distN) 97 | 98 | not_found_vocab=[] 99 | # source_thr = 1395858601 + 7*24*60*60 100 | tag_seq = [] 101 | count=0 102 | # nb_seq = dict() 103 | with open(adoption_sequence_filename, "rb") as fr: 104 | for line in fr: 105 | line = line.rstrip() 106 | u = line.split(' ') 107 | not_found=0 108 | # first_timestamp = int(u[1][0:u[1].index(',')]) 109 | # if first_timestamp>=source_thr 110 | if count in test_seq_id: 111 | seq=[] 112 | for i in xrange(1, len(u)): 113 | #timestamp = int(u[i][0:u[i].index(',')]) 114 | author = int(u[i][u[i].index(',')+1 : ]) 115 | if author in vocab_index: 116 | seq.append(author) 117 | else: 118 | not_found+=1 119 | if len(seq)>1: 120 | tag_seq.append(seq) 121 | not_found_vocab.append(not_found) 122 | # else: 123 | # adop=[] 124 | # for i in xrange(1, len(u)): 125 | # author = int(u[i][u[i].index(',')+1 : ]) 126 | # if author in vocab_index: 127 | # adop.append(author) 128 | # for author in set(adop): 129 | # try: 130 | # nb_seq[author]+=1 131 | # except KeyError: 132 | # nb_seq[author]=1 133 | count+=1 134 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 135 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 136 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 137 | # pickle.dump(nb_seq_order,open("adopter_pred_files/baseline_user_order_bfsr.pickle","wb")) 138 | nb_seq_order = pickle.load(open("adopter_pred_files/baseline_user_order_bfsr.pickle","rb")) 139 | print len(nb_seq_order) 140 | print len(tag_seq),len(test_seq_id),count 141 | print 
sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 142 | seq_count_limit=100 143 | num_seqs=0 144 | mean_ap=0 145 | mean_prec_r=0 146 | mean_ap_nbapp=0 147 | mean_prec_r_nbapp=0 148 | # N=100 149 | seq_random_index=range(0,len(tag_seq)) 150 | random.shuffle(seq_random_index) 151 | for i in seq_random_index: 152 | seq_sample_vocab = tag_seq[i] 153 | # source_user=seq_sample[0] 154 | # if source_user not in vocab_index: 155 | # continue 156 | # seq_sample_vocab = [x for x in seq_sample if x in vocab_index] 157 | # if len(seq_sample_vocab)<2:#2 158 | # continue 159 | source_user=seq_sample_vocab[0] 160 | seq_sample_vocab = set(seq_sample_vocab[1:]) 161 | M = len(seq_sample_vocab) 162 | N = M #1000 #num_users_test 163 | # if M<1000: 164 | # continue 165 | not_found=not_found_vocab[i] 166 | #source_vec=vec[vocab_index[source_user]] 167 | 168 | adopters_vec = get_Nranked_list(source_user,N) 169 | precision_k = 0.0 170 | num_hits = 0.0 171 | for k,p in enumerate(adopters_vec): 172 | if p in seq_sample_vocab: 173 | num_hits+=1.0 174 | precision_k += num_hits/(k+1.0) 175 | average_precision = precision_k/min(M,N) 176 | prec_r = num_hits/M 177 | print "Avg precision", average_precision, "num of users not found", not_found, "num of adopters in seq", len(seq_sample_vocab) 178 | print "RPrecision", prec_r 179 | # print "Precision", num_hits/N, "Recall", num_hits/M 180 | mean_ap+=average_precision 181 | mean_prec_r+=prec_r 182 | num_seqs+=1 183 | print "MAP", mean_ap/float(num_seqs), "MRP", mean_prec_r/float(num_seqs) 184 | 185 | nb_seq_order = nb_seq_order[:N] 186 | precision_k_nbapp = 0.0 187 | num_hits_nbapp = 0.0 188 | for k,p in enumerate(nb_seq_order): 189 | if p in seq_sample_vocab: 190 | num_hits_nbapp+=1.0 191 | precision_k_nbapp += num_hits_nbapp/(k+1.0) 192 | average_precision_nbapp = precision_k_nbapp/min(M,N) 193 | prec_r_nbapp = num_hits_nbapp/M 194 | print "Nb_App", "Avg precision", average_precision_nbapp 195 | print "Nb_App", "RPrecision", prec_r_nbapp 196 | # print "Precision", num_hits_nbapp/N, "Recall", num_hits_nbapp/M 197 | mean_ap_nbapp+=average_precision_nbapp 198 | mean_prec_r_nbapp+=prec_r_nbapp 199 | print "Nb_App", "MAP", mean_ap_nbapp/float(num_seqs), "MRP", mean_prec_r_nbapp/float(num_seqs) 200 | 201 | seq_count_limit-=1 202 | if seq_count_limit==0: 203 | break 204 | print num_seqs 205 | print "MAP", "user vectors", mean_ap/float(num_seqs), "Nb_App", mean_ap_nbapp/float(num_seqs) 206 | print "MRP", mean_prec_r/float(num_seqs), mean_prec_r_nbapp/float(num_seqs) 207 | #pickle.dump(source_time,open("source_time.pickle","wb")) 208 | -------------------------------------------------------------------------------- /adopter_prediction/adopter_prediction_parallel.py: -------------------------------------------------------------------------------- 1 | #get nearest users of the source of a hashtag sequence in test sequences using user vectors and compare with actual adopters in the sequence 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt 6 | import random 7 | from multiprocessing import Pool, cpu_count 8 | 9 | NUM_PROCESSES = 5 10 | 11 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 12 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 13 | with open("sequence_file_split_indices.pickle","rb") as fr: 14 | _ = pickle.load(fr) 15 | test_seq_id = pickle.load(fr) 16 | test_seq_id = set(test_seq_id) 17 | 18 | with 
open("sequence_file_split_users.pickle","rb") as fr: 19 | users_train = pickle.load(fr) 20 | users_test = pickle.load(fr) 21 | users_test = set(users_test) 22 | 23 | def read_vector_file(path_vectors_file,users_test): 24 | vocab = [] 25 | vectors = [] 26 | with open(path_vectors_file,"rb") as fr: 27 | _,dim = next(fr).rstrip().split(' ') 28 | word_vector_dim = int(dim) 29 | next(fr) 30 | for line in fr: 31 | line = line.rstrip() 32 | u = line.split(' ') 33 | if len(u) != word_vector_dim+1: 34 | print "vector length error" 35 | word = int(u[0]) 36 | if word in users_test: 37 | vec = [] 38 | length = 0.0 39 | for d in u[1:]: 40 | num=float(d) 41 | vec.append(num) 42 | length+=num**2 43 | #vec = map(float,u[1:]) 44 | #length = sum(x**2 for x in vec) 45 | length = sqrt(length) 46 | vec_norm = [x/length for x in vec] 47 | vocab.append(word) 48 | vectors.append(vec_norm) 49 | return vectors, vocab, word_vector_dim 50 | 51 | vec,vocab,dim = read_vector_file(vec_file,users_test) 52 | vocab_index=dict() 53 | for i in xrange(0,len(vocab)): 54 | vocab_index[vocab[i]]=i 55 | num_users_test = len(vocab) 56 | # print "num users in test sequences", num_users_test 57 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 58 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 59 | 60 | #Peter Norvig's code for memo 61 | # def memo(f): 62 | # "Memoize function f." 63 | # table = {} 64 | # def fmemo(*args): 65 | # if args not in table: 66 | # table[args] = f(*args) 67 | # return table[args] 68 | # fmemo.memo = table 69 | # return fmemo 70 | # dist_memo = dict() 71 | 72 | # @memo 73 | def get_Nranked_list(query,N): 74 | wordN = [0]*N 75 | distN = [0.0]*N 76 | try: 77 | voc_ind = vocab_index[query] 78 | except KeyError: 79 | print "query word not present" 80 | return 81 | query_vec = vec[voc_ind] 82 | for i in range(0,len(vec)): 83 | if i==voc_ind: 84 | continue 85 | pres_word = vocab[i] 86 | pres_vec = vec[i] 87 | dist = 0.0 88 | for x in range(0,dim): 89 | dist+=query_vec[x]*pres_vec[x] 90 | #dist = sum(query_vec[x]*pres_vec[x] for x in range(0,dim)) 91 | for j in range(0,N): 92 | if dist>distN[j]: 93 | for k in range(N-1,j,-1): 94 | distN[k] = distN[k-1] 95 | wordN[k] = wordN[k-1] 96 | distN[j] = dist 97 | wordN[j] = pres_word 98 | break 99 | return wordN #zip(wordN,distN) 100 | 101 | not_found_vocab=[] 102 | # source_thr = 1395858601 + 7*24*60*60 103 | tag_seq = [] 104 | count=0 105 | # nb_seq = dict() 106 | with open(adoption_sequence_filename, "rb") as fr: 107 | for line in fr: 108 | line = line.rstrip() 109 | u = line.split(' ') 110 | not_found=0 111 | # first_timestamp = int(u[1][0:u[1].index(',')]) 112 | # if first_timestamp>=source_thr 113 | if count in test_seq_id: 114 | seq=[] 115 | for i in xrange(1, len(u)): 116 | #timestamp = int(u[i][0:u[i].index(',')]) 117 | author = int(u[i][u[i].index(',')+1 : ]) 118 | if author in vocab_index: 119 | seq.append(author) 120 | else: 121 | not_found+=1 122 | if len(seq)>1: 123 | tag_seq.append(seq) 124 | not_found_vocab.append(not_found) 125 | # else: 126 | # adop=[] 127 | # for i in xrange(1, len(u)): 128 | # author = int(u[i][u[i].index(',')+1 : ]) 129 | # if author in vocab_index: 130 | # adop.append(author) 131 | # for author in set(adop): 132 | # try: 133 | # nb_seq[author]+=1 134 | # except KeyError: 135 | # nb_seq[author]=1 136 | count+=1 137 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 138 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 139 | # nb_seq_order = [a for a,_ in 
nb_seq_part_sorted] 140 | # pickle.dump(nb_seq_order,open("adopter_pred_files/baseline_user_order_bfsr.pickle","wb")) 141 | nb_seq_order = pickle.load(open("adopter_pred_files/baseline_user_order_pr.pickle","rb")) 142 | print len(nb_seq_order) 143 | # print len(tag_seq),len(test_seq_id),count 144 | # print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 145 | 146 | seq_random_index=range(0,len(tag_seq)) 147 | random.shuffle(seq_random_index) 148 | 149 | def adopter_prediction(process_num,start,end): 150 | seq_count_limit=100 151 | num_seqs=0 152 | mean_ap=0 153 | # mean_prec_r=0 154 | mean_ap_nbapp=0 155 | # mean_prec_r_nbapp=0 156 | # N=100 157 | for i in seq_random_index[start:end]: 158 | seq_sample_vocab = tag_seq[i] 159 | # source_user=seq_sample[0] 160 | # if source_user not in vocab_index: 161 | # continue 162 | # seq_sample_vocab = [x for x in seq_sample if x in vocab_index] 163 | # if len(seq_sample_vocab)<2:#2 164 | # continue 165 | source_user=seq_sample_vocab[0] 166 | seq_sample_vocab = set(seq_sample_vocab[1:]) 167 | M = len(seq_sample_vocab) 168 | N = num_users_test #M #1000 169 | # if M<1000: 170 | # continue 171 | not_found=not_found_vocab[i] 172 | #source_vec=vec[vocab_index[source_user]] 173 | 174 | adopters_vec = get_Nranked_list(source_user,N) 175 | precision_k = 0.0 176 | num_hits = 0.0 177 | for k,p in enumerate(adopters_vec): 178 | if p in seq_sample_vocab: 179 | num_hits+=1.0 180 | precision_k += num_hits/(k+1.0) 181 | average_precision = precision_k/min(M,N) 182 | # prec_r = num_hits/M 183 | print "Avg precision", average_precision, "num of users not found", not_found, "num of adopters in seq", len(seq_sample_vocab), "Process", process_num 184 | # print "Precision", num_hits/N, "Recall", num_hits/M 185 | mean_ap+=average_precision 186 | # mean_prec_r+=prec_r 187 | num_seqs+=1 188 | print "MAP", mean_ap/float(num_seqs), "Process", process_num#, "MRP", mean_prec_r/float(num_seqs) 189 | 190 | nb_seq_order = nb_seq_order[:N] 191 | precision_k_nbapp = 0.0 192 | num_hits_nbapp = 0.0 193 | for k,p in enumerate(nb_seq_order): 194 | if p in seq_sample_vocab: 195 | num_hits_nbapp+=1.0 196 | precision_k_nbapp += num_hits_nbapp/(k+1.0) 197 | average_precision_nbapp = precision_k_nbapp/min(M,N) 198 | # prec_r_nbapp = num_hits_nbapp/M 199 | print "Nb_App", "Avg precision", average_precision_nbapp, "Process", process_num 200 | # print "Precision", num_hits_nbapp/N, "Recall", num_hits_nbapp/M 201 | mean_ap_nbapp+=average_precision_nbapp 202 | # mean_prec_r_nbapp+=prec_r_nbapp 203 | print "Nb_App", "MAP", mean_ap_nbapp/float(num_seqs), "Process", process_num#, "MRP", mean_prec_r_nbapp/float(num_seqs) 204 | 205 | seq_count_limit-=1 206 | if seq_count_limit==0: 207 | break 208 | print num_seqs, mean_ap, mean_ap_nbapp, "Process", process_num 209 | print "user vectors", mean_ap/float(num_seqs), "Process", process_num 210 | print "Nb_App", mean_ap_nbapp/float(num_seqs), "Process", process_num 211 | # print mean_prec_r/float(num_seqs) 212 | #pickle.dump(source_time,open("source_time.pickle","wb")) 213 | 214 | num_workers = min(NUM_PROCESSES,cpu_count()) 215 | pool = Pool(processes=num_workers) 216 | process_num=0 217 | NUM_SEQ = len(seq_random_index) 218 | lines_per_process = int(NUM_SEQ/(2.0*num_workers)) 219 | for s,e in ( (i,min(i+lines_per_process,NUM_SEQ)) for i in xrange(0,NUM_SEQ,lines_per_process) ): 220 | pool.apply_async(adopter_prediction, args=(process_num,s,e)) 221 | process_num+=1 222 | pool.close() 223 | pool.join() 
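Both adopter-prediction scripts above score a ranked candidate list against the true adopter set with average precision (normalised by min(M, N)) and R-precision. A minimal self-contained sketch of those two metrics, with illustrative function names (not part of the original code):

def average_precision(ranked, relevant):
    # AP over the ranked list: precision at each hit, normalised by
    # min(number of true adopters, length of the ranked list).
    hits = 0.0
    precision_sum = 0.0
    for k, user in enumerate(ranked):
        if user in relevant:
            hits += 1.0
            precision_sum += hits / (k + 1.0)
    return precision_sum / min(len(relevant), len(ranked))

def r_precision(ranked, relevant):
    # Fraction of true adopters recovered by the ranked list.
    hits = sum(1.0 for user in ranked if user in relevant)
    return hits / len(relevant)

# Example: with relevant = {9, 2, 4} and ranked = [5, 9, 2, 7],
# average_precision -> (1/2 + 2/3) / 3 ~ 0.389 and r_precision -> 2/3.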
-------------------------------------------------------------------------------- /neighbourhood_experiments/entropy_vs_spread.py: -------------------------------------------------------------------------------- 1 | #get entropy of distribution of first 1000 adopters in different clusters for topics with atleast 1000 adopters and compare with eventual spread 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt, log 6 | import random 7 | from heapq import nsmallest, nlargest, merge 8 | import numpy 9 | # from scipy.spatial import cKDTree as KDTree 10 | from sklearn.neighbors import NearestNeighbors 11 | import sys 12 | from multiprocessing import Pool, cpu_count 13 | from collections import defaultdict 14 | import traceback 15 | 16 | NUM_PROCESSES = 1 17 | 18 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 19 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 20 | seq_len_threshold = 500 #top_k 21 | train_ex_limit = 100 22 | norm_vec = True 23 | 24 | print vec_file, seq_len_threshold, norm_vec 25 | 26 | with open("/mnt/filer01/word2vec/degree_distribution/sequence_file_split_indices.pickle","rb") as fr: 27 | _ = pickle.load(fr) 28 | test_seq_id = pickle.load(fr) 29 | test_seq_id = set(test_seq_id) 30 | 31 | def read_vector_file(path_vectors_file): 32 | vocab = [] 33 | vectors = [] 34 | with open(path_vectors_file,"rb") as fr: 35 | _,dim = next(fr).rstrip().split(' ') 36 | word_vector_dim = int(dim) 37 | next(fr) 38 | for line in fr: 39 | line = line.rstrip() 40 | u = line.split(' ') 41 | if len(u) != word_vector_dim+1: 42 | print "vector length error" 43 | word = int(u[0]) 44 | #normalise to length 1 45 | if norm_vec: 46 | vec = [] 47 | length = 0.0 48 | for d in u[1:]: 49 | num=float(d) 50 | vec.append(num) 51 | length+=num**2 52 | #vec = map(float,u[1:]) 53 | #length = sum(x**2 for x in vec) 54 | length = sqrt(length) 55 | vec_norm = [x/length for x in vec] 56 | vectors.append(vec_norm) 57 | else: 58 | vec = map(float,u[1:]) 59 | vectors.append(vec) 60 | vocab.append(word) 61 | return vectors, vocab, word_vector_dim 62 | 63 | vec,vocab,dim = read_vector_file(vec_file) 64 | vocab_index=dict() 65 | for i in xrange(0,len(vocab)): 66 | vocab_index[vocab[i]]=i 67 | num_users = len(vocab) 68 | print "num users in train sequences", num_users 69 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 70 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 71 | 72 | def print_stats(u): 73 | e,t,r = zip(*u) 74 | return [numpy.mean(e), numpy.std(e), numpy.median(e)], [numpy.mean(t), numpy.std(t), numpy.median(t)], [numpy.mean(r), numpy.std(r), numpy.median(r)] 75 | 76 | # reading test sequences 77 | not_found_vocab=[] 78 | # source_thr = 1395858601 + 12*60*60 79 | # non_emergent_tags = pickle.load(open("/mnt/filer01/word2vec/degree_distribution/nonEmergentHashtags.pickle","rb")) 80 | tag_seq = [] 81 | count=0 82 | # nb_seq = dict() 83 | # adlen = [] 84 | with open(adoption_sequence_filename, "rb") as fr: 85 | for line in fr: 86 | line = line.rstrip() 87 | u = line.split(' ') 88 | not_found = set() 89 | adopters = set() 90 | # first_timestamp = int(u[1][0:u[1].index(',')]) 91 | # first tweet only after source_thr timestamp 92 | # if first_timestamp>=source_thr 93 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 94 | # u[0] not in non_emergent_tags and 95 | if count in test_seq_id: 96 | seq=[] 97 | for i in xrange(1, len(u)): 
98 | #timestamp = int(u[i][0:u[i].index(',')]) 99 | author = int(u[i][u[i].index(',')+1 : ]) 100 | if author in vocab_index: 101 | # removing repeat adopters 102 | if author not in adopters: 103 | seq.append(author) 104 | adopters.add(author) 105 | else: 106 | not_found.add(author) 107 | if len(seq)>0: 108 | tag_seq.append(seq) 109 | not_found_vocab.append(len(not_found)) 110 | # adlen.append(len(seq)) 111 | # elif count not in test_seq_id: 112 | # adop=[] 113 | # for i in xrange(1, len(u)): 114 | # author = int(u[i][u[i].index(',')+1 : ]) 115 | # if author in vocab_index: 116 | # adop.append(author) 117 | # for author in set(adop): 118 | # try: 119 | # nb_seq[author]+=1 120 | # except KeyError: 121 | # nb_seq[author]=1 122 | count+=1 123 | #nb, number of training sequences participated in 124 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 125 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 126 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 127 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 128 | # pickle.dump(adlen,open("adlen.pickle","wb")) 129 | 130 | print len(tag_seq),len(test_seq_id),count 131 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 132 | 133 | """ 134 | #test sequences in random order 135 | seq_random_index=range(0,len(tag_seq)) 136 | random.shuffle(seq_random_index) 137 | 138 | seq_index_filter = [] 139 | for i in seq_random_index: 140 | seq_sample_vocab = tag_seq[i] 141 | M = len(seq_sample_vocab) 142 | if M 0: 167 | ent+= -1.0*p*log(p,2) 168 | return ent 169 | 170 | def init_adopt_stat(process_num,start,end): 171 | print process_num, start, end 172 | try: 173 | cand_set_recall_spread = [] 174 | l=0 175 | avg_num_adopters = 0 176 | count=0 177 | for i in seq_index_filter: 178 | if count < start: 179 | count+=1 180 | continue 181 | elif count >= end: 182 | break 183 | count+=1 184 | seq_sample_vocab = tag_seq[i] 185 | total_spread = len(seq_sample_vocab) 186 | avg_num_adopters+=total_spread 187 | 188 | init_adopters=seq_sample_vocab[0:seq_len_threshold] 189 | ent = get_entropy(init_adopters) 190 | 191 | #entropy of random sample of users 192 | random_adopters = random.sample(vocab,seq_len_threshold) 193 | ent_M1 = get_entropy(random_adopters) 194 | 195 | ent_rel = 0.0 196 | if ent_M1>0: 197 | ent_rel = ent*1./ent_M1 198 | 199 | cand_set_recall_spread.append((ent,total_spread,ent_rel)) 200 | 201 | l+=1 202 | if l%25==0: 203 | print "entropy", ent, "random", ent_M1, "total spread", total_spread, "rel", ent_rel, l 204 | # if l==train_ex_limit: 205 | # break 206 | print process_num, start, "num examples", l, "ent", print_stats(cand_set_recall_spread), avg_num_adopters*1./l 207 | with open("/mnt/filer01/word2vec/degree_distribution/candset_stat_files/entropy_vs_spread_seq"+str(seq_len_threshold)+"_ex"+str(start)+".pickle","wb") as fd: 208 | pickle.dump(cand_set_recall_spread,fd) 209 | except Exception as e: 210 | print traceback.format_exc() 211 | 212 | tic = time.clock() 213 | init_adopt_stat(0,0,1000) 214 | toc = time.clock() 215 | print "init adopt eval in", (toc-tic)*1000 216 | 217 | # NUM_LINES = len(seq_index_filter) 218 | 219 | # num_workers = min(NUM_PROCESSES,cpu_count()) 220 | # pool = Pool(processes=num_workers) 221 | # process_num=0 222 | # lines_per_process = int(NUM_LINES/(2*num_workers)) 223 | # for s,e in ( (i,min(i+lines_per_process,NUM_LINES)) for i in xrange(0,NUM_LINES,lines_per_process) ): 224 | # pool.apply_async(init_adopt_stat, args=(process_num,s,e)) 225 
| # process_num+=1 226 | # pool.close() 227 | # pool.join() 228 | 229 | print vec_file -------------------------------------------------------------------------------- /neighbourhood_experiments/candidate_set_coverage/cand_recall_plot.py: -------------------------------------------------------------------------------- 1 | #plot candidate set recall for different n,c averaged over 100 or 50 tags 2 | 3 | from collections import defaultdict 4 | import cPickle as pickle 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import sys 8 | 9 | num_init_adopters = [10,100] #range(10,101,10) 10 | top_k = np.arange(0.6,1.05,0.1) #np.arange(0.5,1.01,0.1) #[500,1000,2000]+range(3000,10001,1000) 11 | seq_len_threshold = 500 12 | 13 | def get_stats(u): 14 | return [np.mean(u), np.std(u)] #, np.median(u) 15 | 16 | def cc_plot(cc): 17 | cc_mean, cc_std = zip(*cc) 18 | plt.errorbar(num_init_adopters, cc_mean, yerr=cc_std, fmt='o') 19 | plt.xlabel('Number of initial adopters, n') 20 | plt.ylabel('Proportion of adopters in candidate set') 21 | plt.title('Candidate set coverage with varying value of n\nc=1000, Avg. over 100 topics') 22 | plt.grid() 23 | plt.show() 24 | 25 | def cc_boxplot(data,n,xlab,avg): 26 | plt.boxplot(data, labels=xlab, whis='range', showmeans=True, meanprops=dict(marker='D', markerfacecolor='red')) 27 | plt.xlabel('Number of nearest neighbours queried, c\n(with average neighbour set size)') 28 | # plt.ylabel('Neighbourhood coverage') 29 | # plt.title('Candidate set coverage with varying value of c\nn='+str(n)+', Average over 50 topics') 30 | 31 | # plt.xlabel('Radius for querying neighbour set, r\n(with average neighbour set size)') 32 | # plt.ylabel('Neighbourhood coverage') 33 | # plt.title('Candidate set coverage with varying value of r\nn='+str(n)+', Average over 50 topics') 34 | 35 | # plt.ylabel('Fraction of neighbouring users from same geography') 36 | # plt.title('Geography precision with radius based search\nAverage over '+str(n)+' users') 37 | 38 | plt.ylabel('Fraction of friends in neighbour set') 39 | plt.title('Following Coverage with nearest neighbour search\nAverage number of friends '+str(round(avg,2))+', Average over '+str(n)+' users') 40 | 41 | plt.tight_layout() 42 | plt.ylim(0,1.0) 43 | plt.grid() 44 | plt.show() 45 | 46 | def cc_scatterplot(cc,spread): 47 | plt.scatter(cc, spread) 48 | m, b = np.polyfit(cc,spread, 1) 49 | plt.plot(np.asarray(cc), m*np.asarray(cc) + b, 'r-', label = 'Linear fit') 50 | plt.ylim(-0.1,1.0) 51 | plt.xlim(-0.1,1.0) 52 | # plt.yscale('log') 53 | # plt.xscale('log') 54 | # plt.ylim(500,plt.ylim()[1]) 55 | # plt.xlabel('Proportion of first 1000 adopters present in candidate set') 56 | # plt.ylabel('Total spread') 57 | # plt.title('Candidate set coverage and eventual spread of topics\nCorr. coeff. = '+str(round(np.corrcoef(cc,spread)[0,1],4))) 58 | # plt.xlabel('Proportion of adopters in candidate set') 59 | # plt.ylabel('Precision@10') 60 | # plt.title('Candidate set coverage and Precision@10 of topics\nCorr. coeff. = '+str(round(np.corrcoef(cc,spread)[0,1],4))) 61 | plt.xlabel('Network neighbours (followers)') 62 | plt.ylabel('Vector space neighbours') 63 | # plt.title('Likelihood of co-adoption of users with different neighbourhoods\nCorr. coeff. 
= '+str(round(np.corrcoef(cc,spread)[0,1],4))) 64 | plt.grid() 65 | plt.legend(loc='upper left') 66 | plt.show() 67 | 68 | num_bin = 10 69 | def ent_histogram(val): 70 | x,bins,_=plt.hist(val, num_bin, rwidth=0.8, align='left') 71 | # plt.bar(range(1,len(val)+1), val) 72 | # plt.bar(range(1,len(rec)+1), rec) 73 | plt.xlim(-0.1,1.0) 74 | plt.xlabel('Precision@10') 75 | plt.ylabel('Frequency') 76 | # plt.title('Entropy of distribution of geo-locations in clusters') 77 | plt.tight_layout() 78 | plt.grid() 79 | plt.show() 80 | 81 | def freq_plot(y): 82 | time_bins = [(0,5),(6,8),(9,11),(12,14),(15,17),(18,20),(21,23)] 83 | x = [str(i)+'-'+str(j) for i,j in time_bins] 84 | plt.bar(range(len(y)), y, align='center') 85 | plt.xticks(range(len(y)), x, size='small') 86 | plt.xlabel('Time of day (in hour)') 87 | # plt.xlim(xmin=0) 88 | plt.ylabel('Proportion of tweets') 89 | plt.title('Frequency distribution of tweeting time') 90 | plt.grid() 91 | plt.show() 92 | #coverage box plots 93 | """ 94 | for n in num_init_adopters: 95 | # cc = [] 96 | cand_size = [] 97 | data = [] 98 | for i in top_k: 99 | # with open("candset_stat_files/candset_n"+str(n)+"_c"+str(i)+".pickle","rb") as fr: 100 | with open("candset_stat_files/candset_n"+str(n)+"_r"+str(i)+".pickle","rb") as fr: 101 | cand_set_recall = pickle.load(fr) 102 | cand_set_overlap = pickle.load(fr) 103 | cand_set_cr = pickle.load(fr) 104 | cand_set_size_list = pickle.load(fr) 105 | print n,i,len(cand_set_recall) 106 | # if i==1000: 107 | if n==10: 108 | cand_set_recall = cand_set_recall[:50] 109 | # cc.append(get_stats(cand_set_recall)) 110 | data.append(cand_set_recall) 111 | cand_size.append(np.mean(cand_set_size_list)) 112 | xlab = [str(x)+'\n('+str(y)+')' for x,y in zip(top_k,cand_size)] 113 | cc_boxplot(data,n,xlab) 114 | 115 | # cc_plot(cc) 116 | # cc_boxplot(data) 117 | """ 118 | #entropy histogram plots 119 | """ 120 | with open("user_vector_cluster_entropy.pickle","rb") as fr: 121 | _ = pickle.load(fr) 122 | c_ent = pickle.load(fr) 123 | _ = pickle.load(fr) 124 | l_ent = pickle.load(fr) 125 | ent_histogram(c_ent) 126 | ent_histogram(l_ent) 127 | """ 128 | 129 | #coverage vs spread scatter plots 130 | """ 131 | # with open("candset_stat_files/test_sequence_indices_thr1000.pickle","rb") as fr: 132 | # seq_index_filter = pickle.load(fr) 133 | NUM_LINES = 2000#len(seq_index_filter) 134 | 135 | num_workers = 9 136 | lines_per_process = int(NUM_LINES/(2*num_workers)) 137 | cand_set_recall_spread = [] 138 | for s,e in ( (i,min(i+lines_per_process,NUM_LINES)) for i in xrange(0,NUM_LINES,lines_per_process) ): 139 | print s,e 140 | with open("candset_stat_files/candset_vs_spread_n"+str(num_init_adopters)+"_c"+str(top_k)+"_seq"+str(seq_len_threshold)+"_ex"+str(s)+".pickle","rb") as fr: 141 | cc_subset = pickle.load(fr) 142 | cand_set_recall_spread += cc_subset 143 | # cand_set_recall_spread = sorted(cand_set_recall_spread, key=lambda x: x[0]) 144 | cc,spread = zip(*cand_set_recall_spread) 145 | print sum(cc)*1./len(cc), sum(spread)*1./len(spread), min(cc), max(cc), min(spread), max(spread) 146 | cc_scatterplot(cc,spread) 147 | """ 148 | 149 | #sliding window median plots 150 | """ 151 | window_length = 50 152 | median_spread_mw = [] 153 | median_cc_mw = [] 154 | K=2000 155 | for i in range(0,len(cand_set_recall_spread)): 156 | m = np.median(spread[i:i+window_length]) 157 | # m = 0 158 | # for s in spread[i:i+window_length]: 159 | # if s>=K: 160 | # m+=1 161 | # m = m*1./window_length 162 | mw = cc[i:i+window_length] 163 | c = mw[len(mw)//2] 164 | 
median_spread_mw.append(m) 165 | median_cc_mw.append(c) 166 | cc_scatterplot(median_cc_mw,median_spread_mw) 167 | """ 168 | 169 | #prec@10 vs coverage scatter plots 170 | 171 | with open("candset_stat_files/nbr_frac0.5_seq100.pickle","rb") as fr: 172 | cand_set_recall = pickle.load(fr) 173 | cand_set_overlap = pickle.load(fr) 174 | cand_set_size_list = pickle.load(fr) 175 | print np.mean(cand_set_recall,axis=0), len(cand_set_recall) 176 | print cand_set_recall[0:10], cand_set_overlap[0:10], cand_set_size_list[0:10] 177 | fol,vec = zip(*cand_set_recall) 178 | # nbh,_ = zip(*cand_set_size_list) 179 | # p_fol = [x*1./y for (x,y) in zip(fol,nbh)] 180 | # p_vec = [x*1./y for (x,y) in zip(vec,nbh)] 181 | # print np.mean(p_fol), np.mean(p_vec) 182 | cc_scatterplot(fol,vec) 183 | # ent_histogram(u) 184 | 185 | """ 186 | #entropy vs spread scatter plots 187 | with open("candset_stat_files/entropy_vs_spread_seq500_ex0.pickle","rb") as fr: 188 | ent_spread = pickle.load(fr) 189 | ent_spread = sorted(ent_spread, key=lambda x: x[1]) 190 | e,s,er = zip(*ent_spread) 191 | cc_scatterplot(er,s) 192 | """ 193 | """ 194 | #activity time histogram plots 195 | with open("candset_stat_files/sample_user_activity_time_uneven_bins.pickle","rb") as fr: 196 | sample_activity = pickle.load(fr) 197 | # total_tweets = pickle.load(fr) 198 | # total_activity_bins = [i*1./total_tweets for i in total_activity_bins] 199 | # print total_tweets, total_activity_bins 200 | print len(sample_activity) 201 | for i in sample_activity: 202 | c = sample_activity[i] 203 | freq = [0]*7 204 | for b in c: 205 | freq[b]+=c[b] 206 | freq_plot(freq) 207 | """ 208 | 209 | #geography, follower, following coverage box plots 210 | """ 211 | cand_size = [] 212 | avg=[] 213 | data = [] 214 | top_k = [1000,2000,5000,10000] #np.arange(0.6,1.05,0.1).tolist()+[1.2] #np.arange(0.6,1.15,0.1) 215 | n=10000 216 | for i in top_k: 217 | # with open("candset_stat_files/candset_loc_c"+str(i)+".pickle","rb") as fr: 218 | with open("candset_stat_files/candset_fol_fr_c"+str(i)+".pickle","rb") as fr: 219 | cand_set_recall = pickle.load(fr) 220 | cand_set_overlap = pickle.load(fr) 221 | num_nbr = pickle.load(fr) 222 | cand_set_size_list = pickle.load(fr) 223 | print i,len(cand_set_recall) 224 | # cc_geo,cc_prec_geo = zip(*cand_set_recall) 225 | cc_fol,cc_fr = zip(*cand_set_recall) 226 | num_fol,num_fr = zip(*num_nbr) 227 | avg_fol = np.mean(num_fol) 228 | avg_fr = np.mean(num_fr) 229 | avg.append(avg_fr) 230 | print avg_fol,avg_fr 231 | # cc.append(get_stats(cand_set_recall)) 232 | data.append(cc_fr) 233 | cand_size.append(np.mean(cand_set_size_list)) 234 | print np.mean(avg) 235 | xlab = [str(x)+'\n('+str(y)+')' for x,y in zip(top_k,cand_size)] 236 | cc_boxplot(data,n,xlab,np.mean(avg)) 237 | """ -------------------------------------------------------------------------------- /neighbourhood_experiments/candidate_set_coverage/cand_cov_vs_spread.py: -------------------------------------------------------------------------------- 1 | #get candidate set coverage in first 1000 adopters for topics with atleast 1000 adopters and compare with eventual spread 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt 6 | import random 7 | from heapq import nsmallest, nlargest, merge 8 | import numpy 9 | # from scipy.spatial import cKDTree as KDTree 10 | from sklearn.neighbors import NearestNeighbors 11 | import sys 12 | from multiprocessing import Pool, cpu_count 13 | 14 | NUM_PROCESSES = 9 15 | 16 | vec_file = 
"/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 17 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 18 | num_init_adopters = 100 19 | top_k = 4000 20 | seq_len_threshold = 500 #top_k 21 | cand_size_factor = 1 22 | train_ex_limit = 100 23 | norm_vec = True 24 | 25 | print vec_file, num_init_adopters, top_k, seq_len_threshold, norm_vec 26 | 27 | with open("/mnt/filer01/word2vec/degree_distribution/sequence_file_split_indices.pickle","rb") as fr: 28 | _ = pickle.load(fr) 29 | test_seq_id = pickle.load(fr) 30 | test_seq_id = set(test_seq_id) 31 | 32 | def read_vector_file(path_vectors_file): 33 | vocab = [] 34 | vectors = [] 35 | with open(path_vectors_file,"rb") as fr: 36 | _,dim = next(fr).rstrip().split(' ') 37 | word_vector_dim = int(dim) 38 | next(fr) 39 | for line in fr: 40 | line = line.rstrip() 41 | u = line.split(' ') 42 | if len(u) != word_vector_dim+1: 43 | print "vector length error" 44 | word = int(u[0]) 45 | #normalise to length 1 46 | if norm_vec: 47 | vec = [] 48 | length = 0.0 49 | for d in u[1:]: 50 | num=float(d) 51 | vec.append(num) 52 | length+=num**2 53 | #vec = map(float,u[1:]) 54 | #length = sum(x**2 for x in vec) 55 | length = sqrt(length) 56 | vec_norm = [x/length for x in vec] 57 | vectors.append(vec_norm) 58 | else: 59 | vec = map(float,u[1:]) 60 | vectors.append(vec) 61 | vocab.append(word) 62 | return vectors, vocab, word_vector_dim 63 | 64 | vec,vocab,dim = read_vector_file(vec_file) 65 | vocab_index=dict() 66 | for i in xrange(0,len(vocab)): 67 | vocab_index[vocab[i]]=i 68 | num_users = len(vocab) 69 | print "num users in train sequences", num_users 70 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 71 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 72 | 73 | # building kd-tree 74 | tic = time.clock() 75 | # kd = KDTree(vec, leafsize=10) 76 | neigh = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='ball_tree', leaf_size=100, metric='minkowski', p=2) #'ball_tree', 'kd_tree', 'auto' 77 | neigh.fit(vec) 78 | toc = time.clock() 79 | print "ball tree built in", (toc-tic)*1000 80 | 81 | def get_candidate_set(query_set,next_adopters,N): 82 | try: 83 | query_set_ind = [ vocab_index[query] for query in query_set ] 84 | except KeyError: 85 | print "query word not present" 86 | return 87 | query_vec = [vec[i] for i in query_set_ind] 88 | # query using scipy kdtree 89 | # d_list,knn_list = kd.query(query_vec,k=cand_size_factor*N+len(query_set_ind)) 90 | # query using sklearn 91 | d_list,knn_list = neigh.kneighbors(X=query_vec, n_neighbors=cand_size_factor*N+len(query_set_ind), return_distance=True) 92 | 93 | cand_set = set() 94 | for index_list in knn_list: 95 | filtered=[idx for idx in index_list if idx not in query_set_ind] 96 | cand_set.update(filtered) 97 | 98 | cand_set_size = len(cand_set) 99 | M = len(next_adopters) 100 | next_adopters_index = [vocab_index[a] for a in next_adopters] 101 | next_adopters_index = set(next_adopters_index) 102 | num_adopters = len(cand_set&next_adopters_index) 103 | 104 | # print "candidate set recall", num_adopters, "out of", len(next_adopters), "cand size", len(cand_user_idx) 105 | cr = num_adopters*1./cand_set_size 106 | cc = 0.0 107 | if M!=0: 108 | cc = num_adopters*1./M 109 | return num_adopters, cand_set_size, cc, cr, M 110 | 111 | def print_stats(u): 112 | return [numpy.mean(u), numpy.std(u), numpy.median(u)] 113 | 114 | # reading test sequences 115 | not_found_vocab=[] 116 | # source_thr = 
1395858601 + 12*60*60 117 | # non_emergent_tags = pickle.load(open("/mnt/filer01/word2vec/degree_distribution/nonEmergentHashtags.pickle","rb")) 118 | tag_seq = [] 119 | count=0 120 | # nb_seq = dict() 121 | # adlen = [] 122 | with open(adoption_sequence_filename, "rb") as fr: 123 | for line in fr: 124 | line = line.rstrip() 125 | u = line.split(' ') 126 | not_found = set() 127 | adopters = set() 128 | # first_timestamp = int(u[1][0:u[1].index(',')]) 129 | # first tweet only after source_thr timestamp 130 | # if first_timestamp>=source_thr 131 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 132 | # u[0] not in non_emergent_tags and 133 | if count in test_seq_id: 134 | seq=[] 135 | for i in xrange(1, len(u)): 136 | #timestamp = int(u[i][0:u[i].index(',')]) 137 | author = int(u[i][u[i].index(',')+1 : ]) 138 | if author in vocab_index: 139 | # removing repeat adopters 140 | if author not in adopters: 141 | seq.append(author) 142 | adopters.add(author) 143 | else: 144 | not_found.add(author) 145 | if len(seq)>0: 146 | tag_seq.append(seq) 147 | not_found_vocab.append(len(not_found)) 148 | # adlen.append(len(seq)) 149 | # elif count not in test_seq_id: 150 | # adop=[] 151 | # for i in xrange(1, len(u)): 152 | # author = int(u[i][u[i].index(',')+1 : ]) 153 | # if author in vocab_index: 154 | # adop.append(author) 155 | # for author in set(adop): 156 | # try: 157 | # nb_seq[author]+=1 158 | # except KeyError: 159 | # nb_seq[author]=1 160 | count+=1 161 | #nb, number of training sequences participated in 162 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 163 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 164 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 165 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 166 | # pickle.dump(adlen,open("adlen.pickle","wb")) 167 | 168 | print len(tag_seq),len(test_seq_id),count 169 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 170 | 171 | """ 172 | #test sequences in random order 173 | seq_random_index=range(0,len(tag_seq)) 174 | random.shuffle(seq_random_index) 175 | 176 | seq_index_filter = [] 177 | for i in seq_random_index: 178 | seq_sample_vocab = tag_seq[i] 179 | M = len(seq_sample_vocab) 180 | if M= end: 209 | break 210 | count+=1 211 | seq_sample_vocab = tag_seq[i] 212 | total_spread = len(seq_sample_vocab) 213 | avg_num_adopters+=total_spread 214 | init_adopters=seq_sample_vocab[0:num_init] 215 | next_adopters = seq_sample_vocab[num_init:seq_len_threshold] 216 | N = num_query #1000 #M #num_users 217 | 218 | op, cand_set_size, cc, cr, M = get_candidate_set(init_adopters, next_adopters, N) 219 | cand_cov+=cc 220 | cand_cr+=cr 221 | 222 | cand_set_recall_spread.append((cc,total_spread)) 223 | cand_set_overlap.append(op) 224 | cand_set_cr.append(cr) 225 | cand_set_size_list.append(cand_set_size) 226 | 227 | # print "cc", cc, "cand size", cand_set_size, "Avg", cand_cov*1./(l+1), sum(cand_set_size_list)*1./(l+1), "adop in cand", op, "total", M, total_spread, l 228 | l+=1 229 | # if l==train_ex_limit: 230 | # break 231 | print process_num, start, num_init, num_query, "num examples", l, "cc", print_stats(cand_set_recall_spread), avg_num_adopters*1./l 232 | with open("/mnt/filer01/word2vec/degree_distribution/candset_stat_files/candset_vs_spread_n"+str(num_init)+"_c"+str(num_query)+"_seq"+str(seq_len_threshold)+"_ex"+str(start)+".pickle","wb") as fd: 233 | pickle.dump(cand_set_recall_spread,fd) 234 | # pickle.dump(cand_set_overlap,fd) 
235 | # pickle.dump(cand_set_cr,fd) 236 | # pickle.dump(cand_set_size_list,fd) 237 | 238 | # tic = time.clock() 239 | # cand_set_stat(0,num_init_adopters,top_k) 240 | # toc = time.clock() 241 | # print "cand set eval in", (toc-tic)*1000 242 | 243 | NUM_LINES = len(seq_index_filter) 244 | 245 | num_workers = min(NUM_PROCESSES,cpu_count()) 246 | pool = Pool(processes=num_workers) 247 | process_num=0 248 | lines_per_process = int(NUM_LINES/(2*num_workers)) 249 | for s,e in ( (i,min(i+lines_per_process,NUM_LINES)) for i in xrange(0,NUM_LINES,lines_per_process) ): 250 | pool.apply_async(cand_set_stat, args=(process_num,s,e,num_init_adopters,top_k)) 251 | process_num+=1 252 | pool.close() 253 | pool.join() 254 | 255 | print vec_file, num_init_adopters, top_k -------------------------------------------------------------------------------- /adopter_prediction/adopter_pred_cand_set_stat.py: -------------------------------------------------------------------------------- 1 | #get nearest users to the initial adopters of a hashtag sequence in test sequences using user vectors and write candidate set size stats 2 | #for different values of init adopters, query size or query radius 3 | #changed index file of sequences with non-zero number of adopters (and those who are present in vocab) in sequence_file_split_indices.pickle 4 | 5 | import cPickle as pickle 6 | import time 7 | from math import sqrt 8 | import random 9 | from heapq import nsmallest, nlargest, merge 10 | import numpy 11 | # from scipy.spatial import cKDTree as KDTree 12 | from sklearn.neighbors import NearestNeighbors 13 | import sys 14 | from multiprocessing import Pool, cpu_count 15 | import traceback 16 | 17 | NUM_PROCESSES = 2 18 | 19 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 20 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 21 | num_init_adopters = [10] #range(10,101,10) 22 | top_k = [6000,10000] #range(4000,10001,2000) 23 | query_rad = numpy.arange(0.6,1.05,0.05) 24 | seq_len_threshold = 500 #top_k 25 | cand_size_factor = 1 26 | train_ex_limit = 50 27 | norm_vec = True 28 | 29 | print vec_file, num_init_adopters, top_k, train_ex_limit, query_rad, norm_vec 30 | 31 | with open("/mnt/filer01/word2vec/degree_distribution/sequence_file_split_indices.pickle","rb") as fr: 32 | _ = pickle.load(fr) 33 | test_seq_id = pickle.load(fr) 34 | test_seq_id = set(test_seq_id) 35 | 36 | def read_vector_file(path_vectors_file): 37 | vocab = [] 38 | vectors = [] 39 | with open(path_vectors_file,"rb") as fr: 40 | _,dim = next(fr).rstrip().split(' ') 41 | word_vector_dim = int(dim) 42 | next(fr) 43 | for line in fr: 44 | line = line.rstrip() 45 | u = line.split(' ') 46 | if len(u) != word_vector_dim+1: 47 | print "vector length error" 48 | word = int(u[0]) 49 | #normalise to length 1 50 | if norm_vec: 51 | vec = [] 52 | length = 0.0 53 | for d in u[1:]: 54 | num=float(d) 55 | vec.append(num) 56 | length+=num**2 57 | #vec = map(float,u[1:]) 58 | #length = sum(x**2 for x in vec) 59 | length = sqrt(length) 60 | vec_norm = [x/length for x in vec] 61 | vectors.append(vec_norm) 62 | else: 63 | vec = map(float,u[1:]) 64 | vectors.append(vec) 65 | vocab.append(word) 66 | return vectors, vocab, word_vector_dim 67 | 68 | vec,vocab,dim = read_vector_file(vec_file) 69 | vocab_index=dict() 70 | for i in xrange(0,len(vocab)): 71 | vocab_index[vocab[i]]=i 72 | num_users = len(vocab) 73 | print "num users in train sequences", num_users 74 | # print "users removed from vocab", 
len(set(users_train)-set(vocab)) 75 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 76 | 77 | # building kd-tree 78 | tic = time.clock() 79 | # kd = KDTree(vec, leafsize=10) 80 | neigh = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='ball_tree', leaf_size=100, metric='minkowski', p=2) #'ball_tree', 'kd_tree', 'auto' 81 | neigh.fit(vec) 82 | toc = time.clock() 83 | print "ball tree built in", (toc-tic)*1000 84 | 85 | def get_candidate_set(query_set,next_adopters,N): 86 | try: 87 | query_set_ind = [ vocab_index[query] for query in query_set ] 88 | except KeyError: 89 | print "query word not present" 90 | return 91 | query_vec = [vec[i] for i in query_set_ind] 92 | 93 | # query using scipy kdtree 94 | # d_list,knn_list = kd.query(query_vec,k=cand_size_factor*N+len(query_set_ind)) 95 | 96 | # query using sklearn 97 | d_list,knn_list = neigh.kneighbors(X=query_vec, n_neighbors=cand_size_factor*N+len(query_set_ind), return_distance=True) 98 | 99 | # get vectors within distance N 100 | # _,knn_list = neigh.radius_neighbors(X=query_vec, radius=N, return_distance=True) 101 | qresult_size = [] 102 | 103 | cand_set = set() 104 | for index_list in knn_list: 105 | qresult_size.append(len(index_list)) 106 | filtered=[idx for idx in index_list if idx not in query_set_ind] 107 | cand_set.update(filtered) 108 | 109 | med_qresult_size = numpy.median(qresult_size) 110 | cand_set_size = len(cand_set) 111 | M = len(next_adopters) 112 | next_adopters_index = [vocab_index[a] for a in next_adopters] 113 | next_adopters_index = set(next_adopters_index) 114 | num_adopters = len(cand_set&next_adopters_index) 115 | cand_adopters = cand_set&next_adopters_index 116 | 117 | # print "candidate set recall", num_adopters, "out of", len(next_adopters), "cand size", len(cand_user_idx) 118 | cr = 0.0 119 | if cand_set_size!=0: 120 | cr = num_adopters*1./cand_set_size 121 | cc = 0.0 122 | if M!=0: 123 | cc = num_adopters*1./M 124 | return num_adopters, cand_set_size, cc, cr, M, med_qresult_size, cand_adopters, cand_set, next_adopters_index 125 | 126 | def print_stats(u): 127 | return [numpy.mean(u), numpy.std(u), numpy.median(u)] 128 | 129 | # reading test sequences 130 | not_found_vocab=[] 131 | # source_thr = 1395858601 + 12*60*60 132 | # non_emergent_tags = pickle.load(open("/mnt/filer01/word2vec/degree_distribution/nonEmergentHashtags.pickle","rb")) 133 | tag_seq = [] 134 | count=0 135 | # nb_seq = dict() 136 | # adlen = [] 137 | with open(adoption_sequence_filename, "rb") as fr: 138 | for line in fr: 139 | line = line.rstrip() 140 | u = line.split(' ') 141 | not_found = set() 142 | adopters = set() 143 | # first_timestamp = int(u[1][0:u[1].index(',')]) 144 | # first tweet only after source_thr timestamp 145 | # if first_timestamp>=source_thr 146 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 147 | # u[0] not in non_emergent_tags and 148 | if count in test_seq_id: 149 | seq=[] 150 | for i in xrange(1, len(u)): 151 | #timestamp = int(u[i][0:u[i].index(',')]) 152 | author = int(u[i][u[i].index(',')+1 : ]) 153 | if author in vocab_index: 154 | # removing repeat adopters 155 | if author not in adopters: 156 | seq.append(author) 157 | adopters.add(author) 158 | else: 159 | not_found.add(author) 160 | if len(seq)>0: 161 | tag_seq.append(seq) 162 | not_found_vocab.append(len(not_found)) 163 | # adlen.append(len(seq)) 164 | # elif count not in test_seq_id: 165 | # adop=[] 166 | # for i in xrange(1, len(u)): 167 | # author = int(u[i][u[i].index(',')+1 : ]) 168 
| # if author in vocab_index: 169 | # adop.append(author) 170 | # for author in set(adop): 171 | # try: 172 | # nb_seq[author]+=1 173 | # except KeyError: 174 | # nb_seq[author]=1 175 | count+=1 176 | #nb, number of training sequences participated in 177 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 178 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 179 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 180 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 181 | # pickle.dump(adlen,open("adlen.pickle","wb")) 182 | 183 | print len(tag_seq),len(test_seq_id),count 184 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 185 | 186 | """ 187 | #test sequences in random order 188 | seq_random_index=range(0,len(tag_seq)) 189 | random.shuffle(seq_random_index) 190 | 191 | seq_index_filter = [] 192 | for i in seq_random_index: 193 | seq_sample_vocab = tag_seq[i] 194 | init_adopters=seq_sample_vocab[0:num_init_adopters] 195 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 196 | M = len(seq_sample_vocab) 197 | N = top_k #1000 #M #num_users 198 | if M=source_thr 190 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 191 | # u[0] not in non_emergent_tags and 192 | if count in test_seq_id: 193 | seq=[] 194 | for i in xrange(1, len(u)): 195 | #timestamp = int(u[i][0:u[i].index(',')]) 196 | author = int(u[i][u[i].index(',')+1 : ]) 197 | if author in vocab_index: 198 | # removing repeat adopters 199 | if author not in adopters: 200 | seq.append(author) 201 | adopters.add(author) 202 | else: 203 | not_found.add(author) 204 | if len(seq)>num_init_adopters: 205 | tag_seq.append(seq) 206 | not_found_vocab.append(len(not_found)) 207 | # adlen.append(len(seq)) 208 | # elif count not in test_seq_id: 209 | # adop=[] 210 | # for i in xrange(1, len(u)): 211 | # author = int(u[i][u[i].index(',')+1 : ]) 212 | # if author in vocab_index: 213 | # adop.append(author) 214 | # for author in set(adop): 215 | # try: 216 | # nb_seq[author]+=1 217 | # except KeyError: 218 | # nb_seq[author]=1 219 | count+=1 220 | #nb, number of training sequences participated in 221 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 222 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 223 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 224 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 225 | # pickle.dump(adlen,open("adlen.pickle","wb")) 226 | 227 | print len(tag_seq),len(test_seq_id),count 228 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 229 | 230 | prec_k_total = [] 231 | rec_k_total = [] 232 | 233 | """ 234 | #test sequences in random order 235 | seq_random_index=range(0,len(tag_seq)) 236 | random.shuffle(seq_random_index) 237 | 238 | seq_index_filter = [] 239 | for i in seq_random_index: 240 | seq_sample_vocab = tag_seq[i] 241 | init_adopters=seq_sample_vocab[0:num_init_adopters] 242 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 243 | M = len(seq_sample_vocab) 244 | N = top_k #1000 #M #num_users 245 | if M=source_thr 161 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 162 | # u[0] not in non_emergent_tags and 163 | if count in test_seq_id: 164 | seq=[] 165 | for i in xrange(1, len(u)): 166 | #timestamp = int(u[i][0:u[i].index(',')]) 167 | author = int(u[i][u[i].index(',')+1 : ]) 168 | if author in vocab_index: 169 | # removing repeat adopters 170 | if author not in 
adopters: 171 | seq.append(author) 172 | adopters.add(author) 173 | else: 174 | not_found.add(author) 175 | if len(seq)>num_init_adopters: 176 | tag_seq.append(seq) 177 | not_found_vocab.append(len(not_found)) 178 | # adlen.append(len(seq)) 179 | # elif count not in test_seq_id: 180 | # adop=[] 181 | # for i in xrange(1, len(u)): 182 | # author = int(u[i][u[i].index(',')+1 : ]) 183 | # if author in vocab_index: 184 | # adop.append(author) 185 | # for author in set(adop): 186 | # try: 187 | # nb_seq[author]+=1 188 | # except KeyError: 189 | # nb_seq[author]=1 190 | count+=1 191 | #nb, number of training sequences participated in 192 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 193 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 194 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 195 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 196 | # pickle.dump(adlen,open("adlen.pickle","wb")) 197 | 198 | print len(tag_seq),len(test_seq_id),count 199 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 200 | 201 | prec_k_total = [] 202 | rec_k_total = [] 203 | cand_set_recall = [] 204 | cand_set_cr = [] 205 | cand_set_size_list = [] 206 | 207 | """ 208 | #test sequences in random order 209 | seq_random_index=range(0,len(tag_seq)) 210 | random.shuffle(seq_random_index) 211 | 212 | seq_index_filter = [] 213 | for i in seq_random_index: 214 | seq_sample_vocab = tag_seq[i] 215 | init_adopters=seq_sample_vocab[0:num_init_adopters] 216 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 217 | M = len(seq_sample_vocab) 218 | N = top_k #1000 #M #num_users 219 | if M=source_thr 160 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 161 | # u[0] not in non_emergent_tags and 162 | if count in test_seq_id: 163 | seq=[] 164 | for i in xrange(1, len(u)): 165 | #timestamp = int(u[i][0:u[i].index(',')]) 166 | author = int(u[i][u[i].index(',')+1 : ]) 167 | if author in vocab_index: 168 | # removing repeat adopters 169 | if author not in adopters: 170 | seq.append(author) 171 | adopters.add(author) 172 | else: 173 | not_found.add(author) 174 | if len(seq)>num_init_adopters: 175 | tag_seq.append(seq) 176 | tag_name.append(u[0]) 177 | not_found_vocab.append(len(not_found)) 178 | # adlen.append(len(seq)) 179 | # elif count not in test_seq_id: 180 | # adop=[] 181 | # for i in xrange(1, len(u)): 182 | # author = int(u[i][u[i].index(',')+1 : ]) 183 | # if author in vocab_index: 184 | # adop.append(author) 185 | # for author in set(adop): 186 | # try: 187 | # nb_seq[author]+=1 188 | # except KeyError: 189 | # nb_seq[author]=1 190 | count+=1 191 | #nb, number of training sequences participated in 192 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 193 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 194 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 195 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 196 | # pickle.dump(adlen,open("adlen.pickle","wb")) 197 | 198 | print len(tag_seq),len(test_seq_id),count 199 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 200 | 201 | cand_cov = 0.0 202 | 203 | """ 204 | #test sequences in random order 205 | seq_random_index=range(0,len(tag_seq)) 206 | random.shuffle(seq_random_index) 207 | 208 | seq_index_filter = [] 209 | for i in seq_random_index: 210 | seq_sample_vocab = tag_seq[i] 211 | init_adopters=seq_sample_vocab[0:num_init_adopters] 212 | seq_sample_vocab 
= set(seq_sample_vocab[num_init_adopters:]) 213 | M = len(seq_sample_vocab) 214 | N = top_k #1000 #M #num_users 215 | if M=source_thr 165 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 166 | # u[0] not in non_emergent_tags and 167 | if count in test_seq_id: 168 | seq=[] 169 | for i in xrange(1, len(u)): 170 | #timestamp = int(u[i][0:u[i].index(',')]) 171 | author = int(u[i][u[i].index(',')+1 : ]) 172 | if author in vocab_index: 173 | # removing repeat adopters 174 | if author not in adopters: 175 | seq.append(author) 176 | adopters.add(author) 177 | else: 178 | not_found.add(author) 179 | if len(seq)>num_init_adopters: 180 | tag_seq.append(seq) 181 | not_found_vocab.append(len(not_found)) 182 | # adlen.append(len(seq)) 183 | # elif count not in test_seq_id: 184 | # adop=[] 185 | # for i in xrange(1, len(u)): 186 | # author = int(u[i][u[i].index(',')+1 : ]) 187 | # if author in vocab_index: 188 | # adop.append(author) 189 | # for author in set(adop): 190 | # try: 191 | # nb_seq[author]+=1 192 | # except KeyError: 193 | # nb_seq[author]=1 194 | count+=1 195 | #nb, number of training sequences participated in 196 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 197 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 198 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 199 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 200 | # pickle.dump(adlen,open("adlen.pickle","wb")) 201 | 202 | print len(tag_seq),len(test_seq_id),count 203 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 204 | 205 | prec_k_total = [] 206 | rec_k_total = [] 207 | cand_set_recall = [] 208 | cand_set_cr = [] 209 | cand_set_size_list = [] 210 | cand_cov = 0.0 211 | cand_cr = 0.0 212 | 213 | """ 214 | #test sequences in random order 215 | seq_random_index=range(0,len(tag_seq)) 216 | random.shuffle(seq_random_index) 217 | 218 | seq_index_filter = [] 219 | for i in seq_random_index: 220 | seq_sample_vocab = tag_seq[i] 221 | init_adopters=seq_sample_vocab[0:num_init_adopters] 222 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 223 | M = len(seq_sample_vocab) 224 | N = top_k #1000 #M #num_users 225 | if M num_remaining: 133 | path+=random.sample(next_level,num_remaining) #order of vertices changed by sample 134 | print path, "out of", next_level 135 | break 136 | else: 137 | path+=next_level 138 | print path, next_level, "all" 139 | break 140 | paths.append(path) 141 | return paths 142 | 143 | #sample neighbouring vertices to left and right of vertex from hashtag graph 144 | def sample_nbhs_bfs(adj,rev_adj,start): 145 | paths = [] 146 | for i in xrange(0,gamma): 147 | #left 148 | path=[] 149 | count=0 150 | queue=[start] 151 | visited=set() 152 | while counttime_diff_for_edge: 197 | segments.append(seg) 198 | seg = [] 199 | seg.append(i) 200 | prev_time = time 201 | if seg!=[]: 202 | segments.append(seg) 203 | return segments 204 | """ 205 | #get adjacency list of hashtag graph from a segment 206 | """ 207 | def get_hashtag_graph_adj(segment): 208 | num_nodes = len(segment) 209 | adj_list = init_adj_list(num_nodes) #adjacency list for directed graph 210 | if num_nodes==1: 211 | return adj_list 212 | for i in range(0,num_nodes): 213 | time_first,_ = segment[i] 214 | for j in range(i+1,num_nodes): 215 | time_second,_ = segment[j] 216 | if time_second-time_first<=time_diff_for_edge: # only time difference considered for an edge, check other conditions 217 | adj_list[i].append(j) 218 | else: 219 | 
break #tweets are arranged in increasing time, so no edges will be there with vertices past present node 220 | #location 221 | #follower relation 222 | #check if more than one connected components in a segment if single path is considered for each segment 223 | return adj_list 224 | """ 225 | 226 | def get_hashtag_graph_adj(segment): 227 | num_nodes = len(segment) 228 | # adj_list = init_adj_list(num_nodes) #adjacency list for directed graph 229 | adj_list = [[] for i in xrange(0, num_nodes)] 230 | rev_adj_list = [[] for i in xrange(0, num_nodes)] #defaultdict(list) 231 | # print "init", total_size(adj_list), total_size(rev_adj_list) 232 | # print "adj list init" 233 | if num_nodes==1: 234 | return adj_list, rev_adj_list 235 | location = [[] for i in xrange(0, max_locations)] #dict() 236 | for i in xrange(0,num_nodes): 237 | _,author = segment[i] 238 | author_loc = location_buckets[author] 239 | if author_loc!=-1: #no edges between users with unknown location 240 | location[author_loc].append(i) #time sorted order will change across locations, but not within location. order of vertices in adjacency list is still same 241 | print "location list", location 242 | count=0 243 | for same_loc_seq in location: 244 | num_loc = len(same_loc_seq) 245 | print count, "Count", len(same_loc_seq) 246 | count+=1 247 | for i in xrange(0,num_loc): 248 | vertex_index_first = same_loc_seq[i] 249 | time_first,_ = segment[vertex_index_first] 250 | for j in xrange(i+1,num_loc): 251 | vertex_index_second = same_loc_seq[j] 252 | time_second,_ = segment[vertex_index_second] 253 | if time_second-time_first<=time_diff_for_edge: # only time difference considered for an edge, check other conditions 254 | adj_list[vertex_index_first].append(vertex_index_second) 255 | rev_adj_list[vertex_index_second].append(vertex_index_first) 256 | # rev_adj_list[vertex_index_second].insert(0,vertex_index_first) #to make the order of vertices having edge to second vertex in decreasing order, i.e., closest vertex first 257 | else: 258 | break #tweets are arranged in increasing time, so no edges will be there with vertices past present node 259 | #follower relation 260 | #check if more than one connected components in a segment if single path is considered for each segment 261 | # print "assigned", total_size(adj_list), total_size(rev_adj_list) 262 | 263 | return adj_list, rev_adj_list 264 | """ 265 | #get adjacency list of hashtag graph from a segment, using only time diff 266 | def get_hashtag_graph_adj(segment): 267 | num_nodes = len(segment) 268 | adj_list = [[] for i in xrange(0, num_nodes)] 269 | rev_adj_list = [[] for i in xrange(0, num_nodes)] 270 | # print "adj list init" 271 | if num_nodes==1: 272 | return adj_list, rev_adj_list 273 | for i in xrange(0,num_nodes): 274 | time_first,author_first = segment[i] 275 | for j in xrange(i+1,num_nodes): 276 | time_second,author_second = segment[j] 277 | if time_second-time_first<=time_diff_for_edge: # only time difference considered for an edge, check other conditions 278 | adj_list[i].append(j) 279 | rev_adj_list[j].append(i) 280 | else: 281 | break #tweets are arranged in increasing time, so no edges will be there with vertices past present node 282 | #follower relation 283 | #check if more than one connected components in a segment if single path is considered for each segment 284 | return adj_list, rev_adj_list 285 | """ 286 | #get all paths of length m from hashtag graph 287 | def get_paths_from_graph(nodes, adj, rev_adj): 288 | if len(nodes)>=min_context_length: #only if less than 
m length paths are not taken 289 | for start in xrange(0,len(nodes)): 290 | # if len(nodes)-start-1=min_context_length: #only take paths above minimum context length 310 | yield (start,path_to_sentence(nodes,p)) 311 | 312 | #get sentences from hashtag sequences 313 | sentences=[] 314 | max_locations = 2 315 | adoption_sequence = dict() 316 | adoption_sequence['test']=[(4,0),(10,1),(15,2),(21,3),(23,4),(26,5),(28,6),(37,7),(40,8),(45,9)] 317 | location_buckets = [0,0,1,1,-1,1,1,1,1,1] 318 | 319 | def get_sentences(adoption_sequence): 320 | tag_count = 0 321 | for t in adoption_sequence: 322 | segment=adoption_sequence[t] 323 | tag_count+=1 324 | adj_list, rev_adj_list = get_hashtag_graph_adj(segment) 325 | print adj_list, rev_adj_list 326 | paths = get_paths_from_graph(segment, adj_list, rev_adj_list) 327 | for p in paths: #change if only one path generated from a hashtag graph 328 | yield p 329 | 330 | print(list(get_sentences(adoption_sequence))) --------------------------------------------------------------------------------
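
A condensed, self-contained sketch of the idea exercised by the test file above: adopters of a hashtag are connected when their adoption times fall within a window, and paths through that graph are read off as author "sentences" for word2vec-style training. The toy segment, the window value, and the helper names (build_adj, paths_as_sentences) are illustrative only, and a simple greedy walk stands in for the script's BFS/random-walk sampling and location filtering:

# toy (timestamp, author) adoption segment, analogous to the 'test' sequence above
segment = [(4, 0), (10, 1), (15, 2), (21, 3), (23, 4)]
time_diff_for_edge = 10  # assumed window size for this sketch

# directed edge i -> j whenever j adopts within time_diff_for_edge after i
def build_adj(segment, window):
    adj = [[] for _ in segment]
    for i in range(len(segment)):
        t_i, _ = segment[i]
        for j in range(i + 1, len(segment)):
            t_j, _ = segment[j]
            if t_j - t_i <= window:
                adj[i].append(j)
            else:
                break  # segment is time-sorted, so later nodes only get farther away
    return adj

# follow edges greedily to turn the graph into author sentences
def paths_as_sentences(segment, adj):
    for start in range(len(segment)):
        path, v = [start], start
        while adj[v]:
            v = adj[v][0]  # always step to the nearest-in-time neighbour
            path.append(v)
        yield [segment[i][1] for i in path]  # map vertex indices back to author ids

adj = build_adj(segment, time_diff_for_edge)
for sentence in paths_as_sentences(segment, adj):
    print(sentence)
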