├── .DS_Store
├── helpers
│   ├── cat_files.sh
│   ├── cummDistPlot.R
│   └── frequency_plot.py
├── user_vector_training
│   ├── word2vec
│   │   └── twitter_training_w2v.sh
│   ├── helpers
│   │   ├── remove_repeat_contexts.py
│   │   ├── node_frequency.py
│   │   ├── test_distance.py
│   │   └── feature_dist.py
│   ├── deg_dist_in_top_users.py
│   ├── nearest_neighbours
│   │   ├── plot_similarity_nearest_users.py
│   │   ├── query_nearest_users_sen.py
│   │   ├── test_kdtree_query.py
│   │   └── query_nearest_users.py
│   ├── distance_w2v.py
│   ├── filter_hashtag_sequence.py
│   └── sentence_creation
│       └── helpers
│           └── test_sentence.py
├── adopter_prediction
│   ├── helpers
│   │   └── measure.py
│   ├── prec_plot.py
│   ├── adopter_prediction.py
│   ├── adopter_prediction_parallel.py
│   ├── adopter_pred_cand_set_stat.py
│   ├── adopter_prediction_next_k.py
│   ├── adopter_prediction_next_k_weight_learning.py
│   └── adopter_prediction_single_topic.py
├── run_script.sh
├── filter_hashtag_sequence.py
├── train_test_split_hashtag_sequence.py
├── logs
│   └── notes.log
├── filter_follower_graph.py
├── tsne_plots
│   ├── tsne_word_visualisation.py
│   ├── tsne_user_visualisation.py
│   ├── tsne_hashtag_visualisation.py
│   ├── tsne.py
│   └── tsne_topic_adopters_visualisation.py
├── misc
│   ├── virality_prediction_features.py
│   └── test.py
├── README.md
├── Untitled
├── neighbourhood_experiments
│   ├── entropy_vs_spread.py
│   └── candidate_set_coverage
│       ├── cand_recall_plot.py
│       └── cand_cov_vs_spread.py
└── results.txt

-------------------------------------------------------------------------------- /helpers/cat_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | indir="sentences_files" 3 | outfile="sentences_files/userSentencesComb" 4 | for string in "$indir"/*.txt ; do 5 | cat "$string" >> "$outfile" 6 | echo "$string" 7 | # rm "$string" 8 | done -------------------------------------------------------------------------------- /user_vector_training/word2vec/twitter_training_w2v.sh: -------------------------------------------------------------------------------- 1 | make 2 | 3 | CORPUS=tweets_comb_processed 4 | SAVE_FILE=twitter_vectors.txt 5 | VOCAB_FILE=vocab.txt 6 | 7 | time ./word2vec -train $CORPUS -output $SAVE_FILE -cbow 0 -size 200 -window 8 -negative 5 -hs 0 -sample 1e-4 -threads 20 -binary 0 -iter 15 -save-vocab $VOCAB_FILE 8 | -------------------------------------------------------------------------------- /helpers/cummDistPlot.R: -------------------------------------------------------------------------------- 1 | # Cumulative distribution plot 2 | 3 | library(data.table) 4 | library(plyr) 5 | library(ggplot2) 6 | 7 | dat<-fread("G:/socialnetworks_project_log/degree_distribution/numFriendsPerUser.csv") 8 | dat<-as.data.frame(dat) 9 | 10 | plot(ecdf(dat$V2)) 11 | 12 | ggplot(dat,aes(x = V2)) + stat_ecdf() + 13 | scale_x_log10() + 14 | scale_y_continuous(expand = c(0,0)) + ylab("Cumulative distribution") + xlab("Value") + theme_bw(16) 15 | 16 | dat<-fread("G:/socialnetworks_project_log/degree_distribution/numTweetsPerAuthor.csv") 17 | dat<-as.data.frame(dat) 18 | plot(ecdf(dat$V2)) 19 | 20 | dat<-fread("G:/socialnetworks_project_log/degree_distribution/featuresUserSubset.csv") 21 | dat<-as.data.frame(dat) 22 | ggplot(dat,aes(x = V2)) + stat_ecdf() + 23 | scale_x_log10() + 24 | scale_y_continuous(expand = c(0,0)) + ylab("Cumulative distribution") + xlab("Value") + theme_bw(16) 25 | ggplot(dat,aes(x = V3)) + 
stat_ecdf() + 26 | scale_x_log10() + 27 | scale_y_continuous(expand = c(0,0)) + ylab("Cumulative distribution") + xlab("Value") + theme_bw(16) 28 | -------------------------------------------------------------------------------- /adopter_prediction/helpers/measure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os.path 4 | from string import * 5 | from sys import argv 6 | from subr import * 7 | 8 | if len(argv) < 4: 9 | print "Usage: %s testing_file testing_output_file training_class" % (argv[0]) 10 | sys.exit(1) 11 | 12 | def main(): 13 | original = read_first_column(argv[1]) 14 | test_output = read_first_column(argv[2]) 15 | train_new_class = read_first_column(argv[3]) 16 | 17 | predict = [] 18 | for i in range(len(test_output)): 19 | idx = atoi(test_output[i][0]) 20 | predict.append(train_new_class[idx]) 21 | 22 | if(len(predict) != len(original)): 23 | print "Error: lines of %s and %s are different." % (argv[1],argv[2]) 24 | sys.exit(1) 25 | 26 | labels = [] 27 | for i in range(len(train_new_class)): 28 | for lab in train_new_class[i]: 29 | if (lab not in labels): 30 | labels.append(lab) 31 | 32 | print "number of labels = %s" % len(labels) 33 | 34 | result = measure(original,predict,labels) 35 | 36 | print "Exact match ratio: %s" % result[0] 37 | print "Microaverage F-measure: %s" % result[1] 38 | print "Macroaverage F-measure: %s" % result[2] 39 | 40 | main() 41 | -------------------------------------------------------------------------------- /run_script.sh: -------------------------------------------------------------------------------- 1 | ## Basic workflow 2 | 3 | cd /dbresearch2/word2vec/degree_distribution 4 | 5 | ### pre-process hashtag adoption sequences ### 6 | # filter users 7 | python "deg_dist_in_top_users.py" 8 | 9 | # create hashtag sequences. Output to sentences_files_timeonly/ 10 | python "filter_hashtag_sequence.py" 11 | 12 | # filter follower graph 13 | python "filter_follower_graph.py" 14 | 15 | # get hashtags tweeted by these users 16 | python "train_test_split_hashtag_sequence.py" 17 | 18 | ### corpus creation ### 19 | # convert to sentences 20 | python "sentence_creation/sentence_hashtag_adoption.py" 21 | 22 | # concatenate sentence files. Output userSentencesComb file. 23 | bash cat_files.sh 24 | 25 | ### train vectors using word2vec ### 26 | # word2vec to get user vectors. 
Output node_vectors_1hr_bfsr.txt and node_vocab_1hr_bfsr.txt 27 | bash "node_vector_training.sh" 28 | 29 | ### adopter prediction task ### 30 | # frequency and exposure rank baselines 31 | python "adopter_prediction_baseline.py" 32 | 33 | # user vector averaging method 34 | python "adopter_prediction_multiple_prec_plot.py" 35 | 36 | ### geolocation prediction task ### 37 | # classification method and baselines 38 | python "user_vector_cluster_geography.py" -------------------------------------------------------------------------------- /helpers/frequency_plot.py: -------------------------------------------------------------------------------- 1 | # file to plot frequency distribution using matplotlib 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | rec_links = [] 7 | same_tags_tweets = [] 8 | with open("featuresUserSubset.csv","rb") as fr: 9 | for line in fr: 10 | line = line.rstrip() 11 | u = line.split(',') 12 | id,rec,tags = int(u[0]),int(u[1]),int(u[2]) 13 | rec_links.append(rec) 14 | same_tags_tweets.append(tags) 15 | rec_links = np.array(rec_links) 16 | same_tags_tweets = np.array(same_tags_tweets) 17 | 18 | num_bin = 100000 19 | def freq_plot(data,xlab): 20 | values, base = np.histogram(data, bins=num_bin) 21 | cumulative = np.cumsum(values) 22 | plt.plot(base[:-1], values, c='red') #frequency 23 | # plt.plot(base[:-1], cumulative/float(len(data)), c='red') #normalised 24 | # plt.plot(base[:-1], len(data)-cumulative, c='red') #inverse, greater than 25 | # plt.plot(base[:-1], len(data)-np.append(0,cumulative)[:-1], c='red') #inverse, greater than or equal to 26 | # plt.yscale('log') 27 | plt.xscale('log') 28 | plt.xlabel(xlab) 29 | plt.xlim(xmin=0) 30 | plt.ylabel('cumulative frequency') 31 | plt.title('cumulative frequency distribution (greater than or equal to)') 32 | plt.grid() 33 | plt.show() 34 | 35 | freq_plot(rec_links,'Users with reciprocal links') 36 | freq_plot(same_tags_tweets,'Users with tweets on same hashtags') -------------------------------------------------------------------------------- /filter_hashtag_sequence.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | 8 | min_tweets_sequence = 2 9 | selected_users = set() 10 | with open("userSubset.csv","r") as fr: 11 | for line in fr: 12 | line = line.rstrip() 13 | u = line.split(',') 14 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 15 | selected_users.add(id) 16 | 17 | m = dict() 18 | fr = open("/twitterSimulations/graph/map.txt") 19 | for line in fr: 20 | line = line.rstrip() 21 | u = line.split(' ') 22 | m[int(u[0])] = int(u[1]) 23 | fr.close() 24 | print 'Map Read' 25 | 26 | adoption_sequence = dict() 27 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 28 | for line in fr: 29 | line = line.rstrip() 30 | u = line.split('\t') 31 | tag = u[0] 32 | time = int(u[1]) 33 | author = m[int(u[2])] 34 | if author not in selected_users: 35 | continue 36 | try: 37 | adoption_sequence[tag].append((time,author)) 38 | except KeyError: 39 | adoption_sequence[tag]=[(time,author)] 40 | print len(adoption_sequence) 41 | 42 | with open('hashtagAdoptionSequences.txt','wb') as fd: # 'hashtagAdoptionSequences_filter.txt' 43 | for tag in adoption_sequence.keys(): 44 | if len(adoption_sequence[tag])>=min_tweets_sequence: 45 | fd.write(tag) 46 | for t,a in adoption_sequence[tag]: 47 | fd.write(' 
'+str(t)+','+str(a)) #author is of type str for using join 48 | fd.write('\n') -------------------------------------------------------------------------------- /train_test_split_hashtag_sequence.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | 8 | #separate sequences into training (80%) and test sequences (20%) 9 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 10 | adoption_sequence = [] 11 | large_tag_id = [] 12 | count=0 13 | with open(adoption_sequence_filename, 'r') as fr: 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | #tag = u[0] 18 | sequence = [] 19 | if len(u)-1>=100000: 20 | large_tag_id.append(count) 21 | for i in range(1, len(u)): 22 | #timestamp = int(u[i][0:u[i].index(',')]) 23 | author = int(u[i][u[i].index(',')+1 : ]) 24 | sequence.append(author) 25 | adoption_sequence.append(sequence) 26 | count+=1 27 | 28 | num_lines = len(adoption_sequence) #3617312 29 | print num_lines 30 | seq_random_index=range(0,num_lines) 31 | random.shuffle(seq_random_index) 32 | num_train = int(0.8*num_lines) 33 | print num_train 34 | train_seq_id = seq_random_index[:num_train] 35 | test_seq_id = seq_random_index[num_train:] 36 | with open("sequence_file_split_indices.pickle","wb") as fd: 37 | pickle.dump(train_seq_id,fd) 38 | pickle.dump(test_seq_id,fd) 39 | users_train=set() 40 | for i in train_seq_id: 41 | for u in adoption_sequence[i]: 42 | users_train.add(u) 43 | users_test=set() 44 | overlap = set() 45 | for i in test_seq_id: 46 | for u in adoption_sequence[i]: 47 | users_test.add(u) 48 | if u in users_train: 49 | overlap.add(u) 50 | print len(users_train), len(users_test), len(overlap) 51 | with open("sequence_file_split_users.pickle","wb") as fd: 52 | pickle.dump(users_train,fd) 53 | pickle.dump(users_test,fd) 54 | 55 | with open("sequence_large_hashtags.pickle","wb") as fd: 56 | pickle.dump(large_tag_id,fd) -------------------------------------------------------------------------------- /user_vector_training/helpers/remove_repeat_contexts.py: -------------------------------------------------------------------------------- 1 | #filter sentences with users repeating in the same context 2 | 3 | from collections import defaultdict 4 | import cPickle as pickle 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from multiprocessing import Pool, cpu_count 8 | 9 | adoption_sentence_filename = "/mnt/filer01/word2vec/degree_distribution/sentences_files/userSentencesComb_12hr" 10 | out_file = adoption_sentence_filename+"_filter" 11 | num_files = 13 12 | NUM_PROCESSES = num_files 13 | 14 | linecount=0 15 | with open(adoption_sentence_filename, 'r') as fr, open(out_file, 'w') as fd: 16 | for line in fr: 17 | line = line.rstrip() 18 | u = line.split(' ') 19 | s=set(u) 20 | if len(s)>1: 21 | fd.write(line+"\n") 22 | linecount+=1 23 | if linecount%1000000==0: 24 | print "path count", linecount 25 | print "Sequence file read" 26 | 27 | """ 28 | #run count_sentence_file on different adoption sentences files in parallel processes 29 | num_workers = min(NUM_PROCESSES,cpu_count()) 30 | pool = Pool(processes=num_workers) 31 | process_num=0 32 | for i in range(0,num_files): 33 | pool.apply_async(count_sentence_file, args=(i,process_num)) 34 | process_num+=1 35 | pool.close() 36 | pool.join() 37 | 38 | #combine counts 
from different pickle file 39 | count=defaultdict(int) 40 | context_count=defaultdict(int) 41 | for file_num in range(0,num_files): 42 | presentc = pickle.load(open(out_dir+"/frequencyNodes_1hr"+str(file_num)+".pickle","rb")) 43 | presentcc = pickle.load(open(out_dir+"/frequencyContextLength_1hr"+str(file_num)+".pickle","rb")) 44 | for i in presentc: 45 | count[i]+=presentc[i] 46 | for i in presentcc: 47 | context_count[i]+=presentcc[i] 48 | pickle.dump(count,open(out_dir+"/comb_frequencyNodes_1hr_timeonly.pickle","wb")) 49 | pickle.dump(context_count,open(out_dir+"/comb_frequencyContextLength_1hr_timeonly.pickle","wb")) 50 | """ 51 | -------------------------------------------------------------------------------- /logs/notes.log: -------------------------------------------------------------------------------- 1 | checking sparsity of number of contexts available for users in terms of number of users with reciprocal links and number of users tweeting on same hashtag 2 | extracting sentences using paths in hashtag graphs with time and geography based edges and training word2vec for getting user vectors, plots for user count and sentence length frequency 3 | comparing top 10 similar users obtained using user vectors and using counts of users from tweet data 4 | splitting sequence file into train and test for training user vectors for adopter prediction task, using breadth-first search and path-based approach for generating sentences from hashtag graph, using sequences with atleast k adopters in training user vectors ? 5 | tSNE visualisation of user vectors trained using sentences with geography and time or time criteria 6 | querying for nearest neighbors of user vectors using kd-tree or brute force approach and ranking them using distance metrics (min, average, weighted average) for different values of n and k 7 | plotting change in precision and map at k with different values of k 8 | learning weights for combining distances, in sorted order, of candidate users from initial adopters, with candidate set as nearest neighbors of user vectors of initial adopters, taking only subset of test sequences for training weights 9 | learning weights for individual topics by querying a set of candidates first using nearest neighbour or distance-based methods, and then training and testing classifier to predict adopters from the candidate set 10 | tSNE visualisation of candidate set for individual topics along with predicted labels 11 | stepwise prediction of adopters from the candidate set, predicting adopters based on a prediction probability threshold? 
12 | plotting candidate set coverage with increasing size of candidate set in terms of number of nearest neighbours queried or it’s radius, plotting total spread of topics with the candidate set coverage in first 1000 adoptions 13 | training and testing adopter prediction task for particular size of candidate set 14 | predicting geography of users from user vectors, comparing with network-based baselines -------------------------------------------------------------------------------- /user_vector_training/deg_dist_in_top_users.py: -------------------------------------------------------------------------------- 1 | #filter users according to number of tweets with hashtags and number of following, plot degree distribution of this subset of users to check if there is senough context available for each user 2 | import time 3 | import re 4 | import datetime 5 | import dateutil.tz 6 | import calendar 7 | import sys 8 | import os 9 | import cPickle as pickle 10 | 11 | 12 | m = dict() 13 | fr = open("/twitterSimulations/graph/map.txt") 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | m[int(u[0])] = int(u[1]) 18 | fr.close() 19 | 20 | num_tagtweets_per_user = dict() 21 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 22 | for line in fr: 23 | line = line.rstrip() 24 | u = line.split('\t') 25 | author = m[int(u[2])] 26 | if author not in num_tagtweets_per_user: 27 | num_tagtweets_per_user[author]=0 28 | num_tagtweets_per_user[author]+=1 29 | print len(num_tagtweets_per_user) 30 | selected_tagtweets_users = set(num_tagtweets_per_user.keys()) 31 | """ 32 | with open("numTweetsPerAuthor.csv","w") as fd: 33 | for i in num_tagtweets_per_user: 34 | fd.write(str(i)+","+str(num_tagtweets_per_user[i])+"\n") 35 | """ 36 | node_nbh = pickle.load(open( "/twitterSimulations/friends_count_user.pickle", "rb" ) ) 37 | print len(node_nbh) 38 | 39 | selected_friends_users = set(node_nbh.keys()) 40 | """ 41 | with open("numFriendsPerUser.csv","w") as fd: 42 | for i in node_nbh: 43 | fd.write(str(i)+","+str(node_nbh[i])+"\n") 44 | """ 45 | common_users = set.intersection(selected_tagtweets_users, selected_friends_users) 46 | print len(common_users) 47 | 48 | def get_subset(d,t): 49 | s = set() 50 | for i in d: 51 | if d[i]>=t: 52 | s.add(i) 53 | return s 54 | sel_tagtweets = get_subset(num_tagtweets_per_user,15) 55 | sel_friends = get_subset(node_nbh,200) 56 | 57 | common_users = set.intersection(sel_tagtweets, sel_friends) #1001525 58 | print len(common_users) 59 | 60 | with open("userSubset.csv","w") as fd: 61 | for i in common_users: 62 | fd.write(str(i)+","+str(num_tagtweets_per_user[i])+","+str(node_nbh[i])+"\n") -------------------------------------------------------------------------------- /filter_follower_graph.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | 8 | # filter follower files for users in adoption sequence 9 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 10 | adoption_sequence_users = set() 11 | count=0 12 | with open(adoption_sequence_filename, 'r') as fr: 13 | for line in fr: 14 | line = line.rstrip() 15 | u = line.split(' ') 16 | #tag = u[0] 17 | for i in range(1, len(u)): 18 | #timestamp = int(u[i][0:u[i].index(',')]) 19 | author = int(u[i][u[i].index(',')+1 : ]) 20 | 
adoption_sequence_users.add(author) 21 | count+=1 22 | print len(adoption_sequence_users), count 23 | 24 | m = dict() 25 | fr = open("/twitterSimulations/graph/map.txt") 26 | for line in fr: 27 | line = line.rstrip() 28 | u = line.split(' ') 29 | m[int(u[0])] = int(u[1]) 30 | fr.close() 31 | print 'Map Read' 32 | 33 | # arr = ["user_followers_bigger_graph_recrawl_3.txt"] 34 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 35 | 36 | follower_adj = [ [] for i in xrange(0, 7697889) ] 37 | 38 | for i in arr: 39 | fr = open("/twitterSimulations/graph/" + i,'r') 40 | for line in fr: 41 | line = line.rstrip() 42 | u = line.split(' ') 43 | if(int(u[0]) > 7697889): 44 | continue 45 | if len(u) > 2: 46 | for j in range(2,len(u)): 47 | follower_adj[m[int(u[1])]].append(m[int(u[j])]) 48 | fr.close() 49 | print i 50 | 51 | print 'Graph Read\n' 52 | 53 | # for i in range(0, 7697889): 54 | # follower_adj[i] = set(follower_adj[i]) 55 | 56 | print 'Graph Set\n' 57 | 58 | with open("graph_files/follower_graph_tweeters","wb") as fd: 59 | for i in follower_adj: 60 | if i in adoption_sequence_users: 61 | fol = set(follower_adj[i])&adoption_sequence_users 62 | fol = map(str,list(fol)) 63 | fd.write(str(len(fol))+" "+str(i)+" "+" ".join(fol)+"\n") 64 | -------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/plot_similarity_nearest_users.py: -------------------------------------------------------------------------------- 1 | #plot scatterplot of similarity between nearest users for a query users obtained from user vectors and from hashtag sequence file 2 | 3 | import cPickle as pickle 4 | from distance_w2v import * 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import pylab as Plot 8 | from numpy import array 9 | 10 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr.txt" 11 | nearest_users_pickle = "/mnt/filer01/word2vec/degree_distribution/nearest_users_compare1hr_5.pickle" 12 | 13 | def save_scatterplot(X,overlap,fname): 14 | posX,posY,color = zip(*X) 15 | #max_d = max(color) 16 | #min_d = min(color) 17 | #color_norm = [(x-min_d)/float(max_d-min_d) for x in color] 18 | fig = Plot.figure() 19 | Plot.scatter(posX, posY, s=20, c=color) 20 | #Plot.axis('off') 21 | Plot.xlim([0,100]) 22 | Plot.ylim([0,100]) 23 | Plot.xlabel('User vectors') 24 | Plot.ylabel('Counts') 25 | Plot.colorbar() 26 | fig.suptitle('Overlap '+str(overlap)) 27 | fig.savefig(fname, dpi=100, bbox_inches='tight') 28 | 29 | with open(nearest_users_pickle,"rb") as fr: 30 | sample_users = pickle.load(fr) 31 | overlap_count = pickle.load(fr) 32 | nearest_users_seq = pickle.load(fr) 33 | nearest_users_w2v = pickle.load(fr) 34 | 35 | max_overlap = overlap_count.index(max(overlap_count)) 36 | min_overlap = overlap_count.index(min(overlap_count)) 37 | overlap_query_users = [max(overlap_count),min(overlap_count)] 38 | 39 | vec,vocab,_ = read_vector_file(vec_file) 40 | 41 | query_users = [sample_users[max_overlap],sample_users[min_overlap]] 42 | count=0 43 | for query_user in query_users: 44 | count+=1 45 | users_seq = nearest_users_seq[query_user] 46 | users_w2v = nearest_users_w2v[query_user] 47 | print len(users_seq),len(users_w2v) 48 | X = [] 49 | for i in range(0,len(users_w2v)): 50 | vec1=vec[vocab.index(users_w2v[i])] 51 | for j in range(0,len(users_seq)): 52 | 
vec2=vec[vocab.index(users_seq[j])] 53 | dist = 0.0 54 | for d in range(0,len(vec1)): 55 | dist+=vec1[d]*vec2[d] 56 | X.append((i+1,j+1,dist)) 57 | save_scatterplot(X,overlap_query_users[count-1],fname='nearest_users_scatterplot'+str(count)) -------------------------------------------------------------------------------- /user_vector_training/distance_w2v.py: -------------------------------------------------------------------------------- 1 | #same as distance.c file in word2vec for use in query_nearest_users.py 2 | 3 | from math import sqrt 4 | 5 | def read_vector_file(path_vectors_file): 6 | vocab = [] 7 | vectors = [] 8 | with open(path_vectors_file,"rb") as fr: 9 | _,dim = next(fr).rstrip().split(' ') 10 | word_vector_dim = int(dim) 11 | next(fr) 12 | for line in fr: 13 | line = line.rstrip() 14 | u = line.split(' ') 15 | if len(u) != word_vector_dim+1: 16 | print "vector length error" 17 | word = int(u[0]) 18 | vec = [] 19 | length = 0.0 20 | for d in u[1:]: 21 | num=float(d) 22 | vec.append(num) 23 | length+=num**2 24 | #vec = map(float,u[1:]) 25 | #length = sum(x**2 for x in vec) 26 | length = sqrt(length) 27 | vec_norm = [x/length for x in vec] 28 | vocab.append(word) 29 | vectors.append(vec_norm) 30 | return vectors, vocab, word_vector_dim 31 | 32 | def get_Nnearest(query,vec,vocab,N): 33 | wordN = [0]*N 34 | distN = [0.0]*N 35 | try: 36 | voc_ind = vocab.index(query) 37 | except ValueError: 38 | print "query word not present" 39 | return 40 | query_vec = vec[voc_ind] 41 | dim = len(query_vec) 42 | for i in range(0,len(vec)): 43 | if i==voc_ind: 44 | continue 45 | pres_word = vocab[i] 46 | pres_vec = vec[i] 47 | dist = 0.0 48 | for x in range(0,dim): 49 | dist+=query_vec[x]*pres_vec[x] 50 | #dist = sum(query_vec[x]*pres_vec[x] for x in range(0,dim)) 51 | for j in range(0,N): 52 | if dist>distN[j]: 53 | for k in range(N-1,j,-1): 54 | distN[k] = distN[k-1] 55 | wordN[k] = wordN[k-1] 56 | distN[j] = dist 57 | wordN[j] = pres_word 58 | break 59 | return wordN #zip(wordN,distN) 60 | 61 | def get_distance(query1,query2,vec,vocab): 62 | dist=0.0 63 | try: 64 | vec1=vec[vocab.index(query1)] 65 | vec2=vec[vocab.index(query2)] 66 | except ValueError: 67 | print "query word not present" 68 | return 69 | for i in range(0,len(vec1)): 70 | dist+=vec1[i]*vec2[i] 71 | return dist 72 | 73 | #vec,vocab,_ = read_vector_file("/mnt/filer01/word2vec/node_vectors_1hr.txt") 74 | #print get_Nnearest(17,vec,vocab,N=1) 75 | #print get_distance(17,1145375,vec,vocab) 76 | #print get_distance(1,1145375,vec,vocab)==None -------------------------------------------------------------------------------- /tsne_plots/tsne_word_visualisation.py: -------------------------------------------------------------------------------- 1 | #visualising top 100 most, least and mid frequent words using t-SNE 2 | 3 | from tsne import * 4 | from numpy import array 5 | 6 | word_vectors = [] 7 | path_vec_file = '/mnt/filer01/word2vec/twitter_vectors.txt' 8 | word_vector_dim = 200 9 | labels = dict() 10 | X_word = [] 11 | windex=0 12 | with open(path_vec_file, 'rb') as fr: 13 | next(fr) 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | if len(u) != word_vector_dim+1: 18 | print "vector length error" 19 | word = u[0].decode('latin-1') 20 | vec = map(float,u[1:]) 21 | labels[word]=windex 22 | # word_vectors.append([word]+vec) 23 | X_word.append(vec) 24 | windex+=1 25 | # labels = [x[0] for x in word_vectors] 26 | # X_word = [x[1:] for x in word_vectors] 27 | 28 | word_freq_sorted = [] 29 | path_vocab_file = 
'/mnt/filer01/word2vec/vocab.txt' 30 | with open(path_vocab_file, 'rb') as fr: 31 | for line in fr: 32 | line = line.rstrip() 33 | u = line.split(' ') 34 | word_freq_sorted.append(u[0].decode('latin-1')) 35 | 36 | def get_word_vectors(wlist): 37 | vectors = [] 38 | for w in wlist: 39 | vectors.append(X_word[labels[w]]) 40 | return array(vectors) 41 | 42 | most_freq = word_freq_sorted[0:1000] 43 | mid_freq = word_freq_sorted[-1000:] 44 | half_num_words = int(len(word_freq_sorted)/2.0) 45 | least_freq = word_freq_sorted[half_num_words-500:half_num_words+499] 46 | 47 | def save_embed_plot(X,labels,fname): 48 | Y = tsne(X, 2, word_vector_dim, 20.0); 49 | fig = Plot.figure() 50 | Plot.scatter(Y[:,0], Y[:,1], 1); 51 | for label, x, y in zip(labels, Y[:,0], Y[:,1]): 52 | Plot.annotate(label, xy = (x, y), xytext = (0, 0), textcoords = 'offset points', size=5) 53 | fig.savefig(fname, dpi=1200) 54 | 55 | if __name__ == "__main__": 56 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 57 | # print "Running example on 2,500 MNIST digits..." 58 | # X = Math.loadtxt("mnist2500_X.txt"); 59 | # labels = Math.loadtxt("mnist2500_labels.txt"); 60 | 61 | # save_embed_plot(get_word_vectors(most_freq),array(most_freq),'embed_mostfreq.png') 62 | # save_embed_plot(get_word_vectors(mid_freq),array(mid_freq),'embed_midfreq.png') 63 | # save_embed_plot(get_word_vectors(least_freq),array(least_freq),'embed_leastfreq.png') 64 | save_embed_plot(get_word_vectors(word_freq_sorted),array(word_freq_sorted),'embed_all.png') 65 | -------------------------------------------------------------------------------- /misc/virality_prediction_features.py: -------------------------------------------------------------------------------- 1 | #write features for hashtags in virality prediction using user vectors 2 | 3 | import cPickle as pickle 4 | import time 5 | from distance_w2v import * 6 | 7 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr.txt" 8 | timeline_file = "/twitterSimulations/timeline_data/timeline_weng" 9 | feature_file = "/mnt/filer01/word2vec/degree_distribution/feature_file.csv" 10 | 11 | vec,vocab,dim = read_vector_file(vec_file) 12 | vocab_index=dict() 13 | for i in range(0,len(vocab)): 14 | vocab_index[vocab[i]]=i 15 | 16 | m = dict() 17 | fr = open("/twitterSimulations/graph/map.txt") 18 | for line in fr: 19 | line = line.rstrip() 20 | u = line.split(' ') 21 | m[int(u[0])] = int(u[1]) 22 | fr.close() 23 | print 'Map Read' 24 | 25 | not_found_vocab=[] 26 | pred_thr = 1500 27 | with open(timeline_file, "rb") as fr, open(feature_file, "wb") as fd: 28 | feature_names = ','.join(["max_"+str(x) for x in range(0,dim)])+','+','.join(["min_"+str(x) for x in range(0,dim)])+','+','.join(["avg_"+str(x) for x in range(0,dim)]) 29 | fd.write("TagName,"+feature_names+",Class\n") 30 | for line in fr: 31 | line = line.rstrip() 32 | u = line.split(' ') 33 | if len(u) <= pred_thr: 34 | continue 35 | numTweets = 0 36 | not_found=0 37 | user_vectors = [] 38 | for i in range(1, len(u)): 39 | #timestamp = int(u[i][0:u[i].index(',')]) 40 | numTweets = i 41 | if(numTweets > pred_thr): 42 | break 43 | author = int(u[i][u[i].index(',')+1 : ]) 44 | author = m[author] 45 | if author in vocab_index: 46 | user_vec=vec[vocab_index[author]] 47 | else: 48 | not_found+=1 49 | continue 50 | user_vectors.append(user_vec) 51 | if user_vectors==[]: 52 | max_vec = [0.0]*dim 53 | min_vec = [0.0]*dim 54 | avg_vec = [0.0]*dim 55 | print u[0] 56 | else: 57 | aggr_vec = zip(*user_vectors) 58 | max_vec = [] 59 | min_vec = 
[] 60 | avg_vec = [] 61 | for i in range(0,len(aggr_vec)): 62 | d = aggr_vec[i] 63 | max_vec.append(max(d)) 64 | min_vec.append(min(d)) 65 | avg_vec.append(sum(d)/float(len(d))) 66 | if len(u) > 10000: 67 | class_label = '1' 68 | else: 69 | class_label = '0' 70 | fd.write(str(u[0])+','+','.join(map(str,max_vec))+','+','.join(map(str,min_vec))+','+','.join(map(str,avg_vec))+','+class_label+'\n') 71 | not_found_vocab.append(not_found) 72 | 73 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 74 | #pickle.dump(not_found_vocab,open("not_found_vocab.pickle","wb")) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Social Network Embeddings 2 | Code for methods to embed social network users based on their topic activity, described in the [paper](https://arxiv.org/abs/1710.07622). This experiment is performed on a large-scale social network extracted from Twitter, consisting of about 7.7 million users and their activity on around 3.6 million topics over a month long period to predict the most likely future adopters of a topic and the geo-location of users by training a word2vec (Skip-Gram Model) model in the context of text mining to compute representations of users. 3 | 4 | #### Abstract 5 | 6 | ``` 7 | This article presents a novel approach for learning low-dimensional distributed representations of users in online social networks. Existing methods rely on the network structure formed by the social relationships among users to extract these representations. 8 | However, the network information can be obsolete, incomplete or dynamically changing. In addition, in some cases, it can be prohibitively expensive to get the network information. Therefore, we propose an alternative approach based on observations from topics being talked on in social networks. 9 | We utilise the time information of users adopting topics in order to embed them in a real-valued vector space. Through extensive experiments, we investigate the properties of the representations learned and their efficacy in preserving information about link structure among users. 10 | We also evaluate the representations in two different prediction tasks, namely, predicting most likely future adopters of a topic and predicting the geo-location of users. Experiments to validate the proposed methods are performed on a large-scale social network extracted from Twitter, consisting of about 7.7 million users and their activity on around 3.6 million topics over a month-long period. 11 | ``` 12 | 13 | #### Note 14 | Adventurers beware! This repository is meant for version control of scripts used for experiments in the paper. So, not heavily commented and not heavily tested. 15 | 16 | ## Basic workflow 17 | 1. "user_vector_training/deg_dist_in_top_users.py" -> filter users 18 | 2. "user_vector_training/filter_hashtag_sequence.py" -> get hashtags tweeted by these users 19 | 3. "user_vector_training/sentence_creation/sentence_hashtag_adoption.py" -> convert to sentences 20 | 4. "user_vector_training/word2vec/twitter_training_w2v.sh" -> word2vec to get user vectors 21 | 5. "adopter_prediction/adopter_prediction.py" -> next adopter prediction 22 | 23 | ## Contact 24 | If you are interested in knowing more or have any questions on the code, feel free to contact me at & Harvineet at . 
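
#### Example: training user vectors with gensim

The repository trains vectors with the original C implementation of word2vec (see `user_vector_training/word2vec/twitter_training_w2v.sh` and `run_script.sh`). The sketch below is **not** part of the repository: it only illustrates how a roughly equivalent Skip-Gram model could be fit with `gensim` on the concatenated adoption-sentence file produced by `helpers/cat_files.sh`, reusing the same hyperparameters; the input and output paths are placeholders.

```python
# Illustrative sketch only -- assumes gensim >= 4.0 and a corpus file with one
# space-separated sequence of user IDs per line (e.g. sentences_files/userSentencesComb).
from gensim.models import Word2Vec

model = Word2Vec(
    corpus_file="sentences_files/userSentencesComb",
    vector_size=200,  # -size 200
    window=8,         # -window 8
    sg=1,             # -cbow 0 (Skip-Gram)
    hs=0,             # -hs 0
    negative=5,       # -negative 5
    sample=1e-4,      # -sample 1e-4
    workers=20,       # -threads 20
    epochs=15,        # -iter 15
)
# Plain-text output comparable to the -binary 0 vector files read by distance_w2v.py.
model.wv.save_word2vec_format("node_vectors.txt", binary=False)
```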
25 | -------------------------------------------------------------------------------- /adopter_prediction/prec_plot.py: -------------------------------------------------------------------------------- 1 | #plot MAP, precision, recall at k for different k in single tag case and averaged over 100 tags 2 | #plot precision at k and R for individual topics with learned weights 3 | 4 | from collections import defaultdict 5 | import cPickle as pickle 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | """ 9 | top_k = [25,50]+range(100,1001,100) #[100,200,300]+range(500,5001,500) 10 | 11 | def eval_plot(vec,nbapp,fol,ylab,title): 12 | plt.plot(top_k, vec) 13 | plt.plot(top_k, nbapp) 14 | plt.plot(top_k, fol) 15 | plt.legend(['User vectors','Frequency','Followers']) 16 | plt.xlabel('k') 17 | # plt.xlim(xmin=top_k[0]) 18 | plt.ylabel(ylab) 19 | plt.title(title) 20 | plt.grid() 21 | plt.show() 22 | 23 | #single tag plot 24 | with open("prec_plot_single1.pickle","rb") as fr: 25 | mapk = pickle.load(fr) 26 | preck = pickle.load(fr) 27 | reck = pickle.load(fr) 28 | vec,nbapp,fol = zip(*preck) 29 | eval_plot(list(vec),list(nbapp),list(fol),'Precision at k','Precision@k values at different k') 30 | 31 | def average(eval_list): 32 | avg_vec = [0.0]*len(top_k) 33 | avg_nbapp = [0.0]*len(top_k) 34 | avg_fol = [0.0]*len(top_k) 35 | for l in eval_list: 36 | vec,nbapp,fol = zip(*l) 37 | for i,v in enumerate(vec): 38 | avg_vec[i]+=v 39 | for i,v in enumerate(nbapp): 40 | avg_nbapp[i]+=v 41 | for i,v in enumerate(fol): 42 | avg_fol[i]+=v 43 | num_tags = len(eval_list) 44 | avg_vec = [v*1.0/num_tags for v in avg_vec] 45 | avg_nbapp = [v*1.0/num_tags for v in avg_nbapp] 46 | avg_fol = [v*1.0/num_tags for v in avg_fol] 47 | return avg_vec,avg_nbapp,avg_fol 48 | 49 | top_k = [1,2,5]+range(10,101,10) 50 | #100 tags 51 | with open("prec_plot_k.pickle","rb") as fr: 52 | mapk = pickle.load(fr) 53 | preck = pickle.load(fr) 54 | reck = pickle.load(fr) 55 | vec,nbapp,fol = average(mapk) 56 | eval_plot(vec,nbapp,fol,'Precision at k','Precision@k values at different k') 57 | """ 58 | num_bin = 50 59 | def eval_plot(eval,rec,xlab,title): 60 | plt.hist(eval, num_bin) 61 | # plt.bar(range(1,len(eval)+1), eval) 62 | # plt.bar(range(1,len(rec)+1), rec) 63 | plt.xlabel(xlab) 64 | plt.ylabel('Frequency') 65 | plt.title(title) 66 | plt.grid() 67 | plt.show() 68 | 69 | # with open("mean_precision_n10_rf_prec10.pickle","rb") as fr: 70 | # prec_k_total = pickle.load(fr) 71 | # cand_set_recall = pickle.load(fr) 72 | # cand_set_size_list = pickle.load(fr) 73 | 74 | with open("eval_n10_lr.pickle","rb") as fr: 75 | ap_total = pickle.load(fr) 76 | prec_k_total = pickle.load(fr) 77 | _ = pickle.load(fr) 78 | cand_set_recall = pickle.load(fr) 79 | 80 | # print sum(cand_set_size_list) 81 | # eval_plot(list(prec_k_total),list(cand_set_size_list),'Precision@k','Histogram of Prec@10 for 100 topics') 82 | user,_,_ = zip(*prec_k_total) 83 | eval_plot(list(user),[],'Precision@k','Histogram of Precision@500') -------------------------------------------------------------------------------- /user_vector_training/helpers/node_frequency.py: -------------------------------------------------------------------------------- 1 | #count frequency of nodes occurring in sentences 2 | 3 | from collections import defaultdict 4 | import cPickle as pickle 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from multiprocessing import Pool, cpu_count 8 | """ 9 | in_dir = "/mnt/filer01/word2vec/degree_distribution/sentences_files_timeonly/" 10 | 
out_dir = "/mnt/filer01/word2vec/degree_distribution/count_files/" 11 | num_files = 8 12 | NUM_PROCESSES = num_files 13 | 14 | def count_sentence_file(file_num,process_num): 15 | count=defaultdict(int) 16 | context_count=defaultdict(int) 17 | with open(in_dir+'/hashtagAdoptionSentences'+str(file_num)+'.txt','rb') as fr: 18 | for line in fr: 19 | line = line.rstrip() 20 | u = line.split(' ') 21 | s = len(u) 22 | context_count[s]+=1 23 | for id in range(0, s): 24 | author = int(u[id]) 25 | count[author]+=1 26 | print "process", process_num, "file complete", file_num 27 | pickle.dump(count,open(out_dir+"/frequencyNodes_1hr_bfsr_timeonly"+str(file_num)+".pickle","wb")) 28 | pickle.dump(context_count,open(out_dir+"/frequencyContextLength_1hr_bfsr_timeonly"+str(file_num)+".pickle","wb")) 29 | 30 | #run count_sentence_file on different adoption sentences files in parallel processes 31 | num_workers = min(NUM_PROCESSES,cpu_count()) 32 | pool = Pool(processes=num_workers) 33 | process_num=0 34 | for i in range(0,num_files): 35 | pool.apply_async(count_sentence_file, args=(i,process_num)) 36 | process_num+=1 37 | pool.close() 38 | pool.join() 39 | 40 | #combine counts from different pickle file 41 | count=defaultdict(int) 42 | context_count=defaultdict(int) 43 | for file_num in range(0,num_files): 44 | presentc = pickle.load(open(out_dir+"/frequencyNodes_1hr_bfsr_timeonly"+str(file_num)+".pickle","rb")) 45 | presentcc = pickle.load(open(out_dir+"/frequencyContextLength_1hr_bfsr_timeonly"+str(file_num)+".pickle","rb")) 46 | for i in presentc: 47 | count[i]+=presentc[i] 48 | for i in presentcc: 49 | context_count[i]+=presentcc[i] 50 | pickle.dump(count,open(out_dir+"/comb_frequencyNodes_1hr_bfsr_timeonly.pickle","wb")) 51 | pickle.dump(context_count,open(out_dir+"/comb_frequencyContextLength_1hr_bfsr_timeonly.pickle","wb")) 52 | """ 53 | #plot 54 | count = pickle.load(open("sentences_frequency_files/comb_frequencyNodes_1hr_bfsr_loc.pickle","rb")) 55 | node_freq = [] 56 | for i in count: 57 | node_freq.append(count[i]) 58 | 59 | node_freq = np.array(node_freq) 60 | 61 | num_bin = 100000 62 | def freq_plot(data,xlab): 63 | values, base = np.histogram(data, bins=num_bin) 64 | cumulative = np.cumsum(values) 65 | # plt.plot(base[:-1], values, c='red') #frequency 66 | # plt.plot(base[:-1], cumulative/float(len(data)), c='red') #normalised 67 | # plt.plot(base[:-1], len(data)-cumulative, c='red') #inverse, greater than 68 | plt.plot(base[:-1], len(data)-np.append(0,cumulative)[:-1], c='red') #inverse, greater than or equal to 69 | plt.yscale('log') 70 | plt.xscale('log') 71 | plt.xlabel(xlab) 72 | plt.xlim(xmin=0) 73 | plt.ylabel('number of users') 74 | plt.title('cumulative frequency distribution (greater than or equal to)') 75 | plt.grid() 76 | plt.show() 77 | 78 | freq_plot(node_freq,'Count of user occurrence in sentences') 79 | 80 | #frequency plot of path lengths 81 | ccount = pickle.load(open("sentences_frequency_files/comb_frequencyContextLength_1hr_bfsr_loc.pickle","rb")) 82 | clength_freq = [] 83 | for i in ccount: 84 | clength_freq.append((i,ccount[i])) 85 | 86 | def freq_plot_clength(data,xlab): 87 | x,y = zip(*data) 88 | x = [i-0.4 for i in x] #label at bar centre 89 | y = [i/float(sum(y)) for i in y] #normalised 90 | plt.bar(x, y) #frequency 91 | plt.xlabel(xlab) 92 | plt.xlim(xmin=0) 93 | plt.ylabel('Proportion of paths') 94 | plt.title('frequency distribution') 95 | plt.grid() 96 | plt.show() 97 | freq_plot_clength(clength_freq,'Path length') 98 | 99 | 
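# Usage sketch (illustrative addition, not part of the original script): after
# the combined node-frequency pickle has been loaded into `count` above, list
# the ten users that occur most often in the generated sentences.
top_users = sorted(count.items(), key=lambda x: x[1], reverse=True)[:10]
print "ten most frequent users (user id, count):", top_users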
-------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/query_nearest_users_sen.py: -------------------------------------------------------------------------------- 1 | #query users nearest to a given user using node vectors file from distance-filewrite.c file and compare with users in same path in sentences file 2 | 3 | import cPickle as pickle 4 | import random 5 | import os, sys, datetime 6 | from heapq import nlargest 7 | 8 | start_time = datetime.datetime.now() 9 | 10 | adoption_sentence_filename = "/mnt/filer01/word2vec/degree_distribution/sentences_files/userSentencesComb_12hr" #"sample_sequences" 11 | #time_diff_for_edge = 5*1*60*60 #5 context width for path in one direction 12 | 13 | m = dict() 14 | fr = open("/twitterSimulations/graph/map.txt") 15 | for line in fr: 16 | line = line.rstrip() 17 | u = line.split(' ') 18 | m[int(u[0])] = int(u[1]) 19 | fr.close() 20 | print 'Map Read' 21 | 22 | location_buckets = [-1] * 7697889 23 | fr = open('/twitterSimulations/known_locations.txt', 'r') 24 | for line in fr: 25 | line = line.rstrip() 26 | u = line.split('\t') 27 | try: 28 | location_buckets[m[int(u[0])]] = int(u[1]) 29 | except: 30 | pass 31 | fr.close() 32 | 33 | fr = open('/twitterSimulations/known_locations1.txt', 'r') 34 | for line in fr: 35 | line = line.rstrip() 36 | u = line.split('\t') 37 | try: 38 | location_buckets[m[int(u[0])]] = int(u[1]) 39 | except: 40 | pass 41 | fr.close() 42 | print "location file read" 43 | 44 | def call_distance(word): 45 | return os.system("./distance-filewrite ../node_vectors_12hr.bin query_output_temp12hrsen "+str(word)) 46 | 47 | def get_nearest(): 48 | nearest = [] 49 | with open("query_output_temp12hrsen","rb") as fr: 50 | for line in fr: 51 | line=line.rstrip().split('\t') 52 | nearest.append(int(line[0])) 53 | return nearest 54 | 55 | def compare_nearest(seq,w2v): 56 | return len(set(seq)&set(w2v)) 57 | 58 | vocab = [] 59 | with open("../node_vocab_12hr.txt","rb") as fr: 60 | next(fr) 61 | for line in fr: 62 | line=line.rstrip().split(' ') 63 | vocab.append(int(line[0])) 64 | print "Vocab read" 65 | 66 | rand_users = random.sample(vocab,100) 67 | rand_users_set = set(rand_users) 68 | vocab = set(vocab) 69 | print "Sample selected" 70 | 71 | near_count = [[0]*7697889 for i in xrange(0,100)] 72 | 73 | linecount=0 74 | with open(adoption_sentence_filename, 'r') as fr: 75 | for line in fr: 76 | line = line.rstrip() 77 | u = line.split(' ') 78 | sentence = map(int,u) 79 | for author in sentence: 80 | if author in rand_users_set: 81 | for j in sentence: 82 | near_count[rand_users.index(author)][j]+=1 83 | near_count[rand_users.index(author)][author]-=1 84 | linecount+=1 85 | if linecount%1000000==0: 86 | print "path count", linecount 87 | print "Sequence file read" 88 | 89 | near_users_seq = dict() 90 | for i in range(0,len(rand_users)): 91 | user_count = near_count[i] 92 | count = [] 93 | for l in vocab:#xrange(0,7697889): 94 | if user_count[l]!=0 and l!=rand_users[i]: 95 | count.append((l,user_count[l])) 96 | #count = zip(range(0,7697889),near_count[i]) 97 | #count_nz = [(a,b) for (a,b) in count if b!=0] 98 | #count_s = sorted(count_nz,key=lambda x: x[1],reverse=True)[0:100] 99 | #count_s = sorted(range(0,7697889),key=lambda x: user_count[x],reverse=True) 100 | count_s = nlargest(100,count,key=lambda x: x[1]) 101 | if len(count_s)==0: 102 | u,c = [], [] 103 | else: 104 | u,c = zip(*count_s) 105 | near_users_seq[rand_users[i]]=list(u) 106 | print "sel count", rand_users[i], 
len(u), "non zero", len(count) 107 | 108 | nearest_users_w2v_pickle = dict() 109 | count_pickle = [] 110 | for user in rand_users: 111 | a = call_distance(user) 112 | if a!=0: 113 | print "call error" 114 | sys.exit(0) 115 | nearest_users_w2v = get_nearest() 116 | comp_count = compare_nearest(near_users_seq[user][0:100],nearest_users_w2v[0:100]) 117 | print "common users", user, comp_count 118 | count_pickle.append(comp_count) 119 | nearest_users_w2v_pickle[user]=nearest_users_w2v 120 | 121 | with open("nearest_users_compare12hrsen.pickle","wb") as fd: 122 | pickle.dump(rand_users,fd) 123 | pickle.dump(count_pickle,fd) 124 | pickle.dump(near_users_seq,fd) 125 | pickle.dump(nearest_users_w2v_pickle,fd) 126 | pickle.dump(near_count,fd) 127 | 128 | print start_time, datetime.datetime.now() -------------------------------------------------------------------------------- /tsne_plots/tsne_user_visualisation.py: -------------------------------------------------------------------------------- 1 | #visualising users who adopted a hashtag using t-SNE on user vectors 2 | 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | from tsne import * 6 | from numpy import array 7 | import math, random 8 | 9 | word_vectors = [] 10 | path_vec_file = '/mnt/filer01/word2vec/node_vectors_1hr_pr.txt' 11 | word_vector_dim = 100 12 | labels = dict() 13 | X_word = [] 14 | windex=0 15 | with open(path_vec_file, 'rb') as fr: 16 | _,dim = next(fr).rstrip().split(' ') 17 | word_vector_dim = int(dim) 18 | next(fr) 19 | for line in fr: 20 | line = line.rstrip() 21 | u = line.split(' ') 22 | if len(u) != word_vector_dim+1: 23 | print "vector length error" 24 | word = int(u[0]) 25 | vec = map(float,u[1:]) 26 | labels[word]=windex 27 | windex+=1 28 | X_word.append(vec) 29 | 30 | word_freq_sorted = [] 31 | path_vocab_file = '/mnt/filer01/word2vec/node_vocab_1hr_pr.txt' 32 | with open(path_vocab_file, 'rb') as fr: 33 | next(fr) 34 | for line in fr: 35 | line = line.rstrip() 36 | u = line.split(' ') 37 | word_freq_sorted.append(int(u[0])) 38 | 39 | tag_seq = dict() 40 | users_ht = set() 41 | seq_file = '/mnt/filer01/word2vec/degree_distribution/sample_ht_sequences' 42 | with open(seq_file, 'rb') as fr: 43 | for line in fr: 44 | line = line.rstrip() 45 | u = line.split(' ') 46 | tag = u[0] 47 | sequence = [] 48 | for i in range(1, len(u)): 49 | author = int(u[i][u[i].index(',')+1 : ]) 50 | if author in labels: 51 | sequence.append(author) 52 | tag_seq[tag]=random.sample(sequence,500) 53 | for u in tag_seq[tag]: 54 | users_ht.add(u) 55 | print tag, len(sequence) 56 | print len(users_ht) 57 | 58 | m = dict() 59 | fr = open("/twitterSimulations/graph/map.txt") 60 | for line in fr: 61 | line = line.rstrip() 62 | u = line.split(' ') 63 | m[int(u[0])] = int(u[1]) 64 | fr.close() 65 | print 'Map Read' 66 | 67 | location_buckets = [-1] * 7697889 68 | # location_buckets = dict() #map to -1 for users not in location files 69 | fr = open('/twitterSimulations/known_locations.txt', 'r') 70 | for line in fr: 71 | line = line.rstrip() 72 | u = line.split('\t') 73 | try: 74 | location_buckets[m[int(u[0])]] = int(u[1]) 75 | except: 76 | pass 77 | fr.close() 78 | 79 | fr = open('/twitterSimulations/known_locations1.txt', 'r') 80 | for line in fr: 81 | line = line.rstrip() 82 | u = line.split('\t') 83 | try: 84 | location_buckets[m[int(u[0])]] = int(u[1]) 85 | except: 86 | pass 87 | fr.close() 88 | print "location file read" 89 | 90 | def get_word_vectors_ht(wlist): 91 | vectors = [] 92 | color = [] 93 | tags=tag_seq.keys() 94 | c1 = 
set(tag_seq[tags[0]]) #modikiadalat (dark blue) 95 | c2 = set(tag_seq[tags[1]]) #7millionandcounting (light blue) 96 | c3 = set(tag_seq[tags[2]]) #time100 (red) 97 | for w in wlist: 98 | vectors.append(X_word[labels[w]]) 99 | if w in c1: 100 | color.append(50) 101 | elif w in c2: 102 | color.append(100) 103 | elif w in c3: 104 | color.append(200) 105 | else: 106 | print "no tag" 107 | return array(vectors), color 108 | 109 | def get_word_vectors(wlist): 110 | vectors = [] 111 | color = [] 112 | for w in wlist: 113 | vectors.append(X_word[labels[w]]) 114 | color.append(location_buckets[w]) 115 | return array(vectors), color 116 | 117 | # most_freq = word_freq_sorted[0:2500] 118 | # least_freq = word_freq_sorted[-2500:] 119 | half_num_words = int(len(word_freq_sorted)/2.0) 120 | mid_freq = word_freq_sorted[half_num_words-1250:half_num_words+1249] 121 | all_random = random.sample(word_freq_sorted,1000) 122 | 123 | def save_embed_plot((X,color),fname): 124 | Y = tsne(X, no_dims = 2, initial_dims = 50, perplexity = 30.0); 125 | fig = Plot.figure() 126 | Plot.scatter(Y[:,0], Y[:,1], s=20, c=color, alpha=0.8, edgecolor='none'); 127 | Plot.axis('off') 128 | fig.savefig(fname, dpi=300, bbox_inches='tight') 129 | 130 | if __name__ == "__main__": 131 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 132 | # print "Running example on 2,500 MNIST digits..." 133 | # X = Math.loadtxt("mnist2500_X.txt"); 134 | # labels = Math.loadtxt("mnist2500_labels.txt"); 135 | # save_embed_plot(get_word_vectors(most_freq),'embed_users_mostfreq.png') 136 | save_embed_plot(get_word_vectors(all_random),'embed_users_random_1hr_pr.png') 137 | # save_embed_plot(get_word_vectors(mid_freq),'embed_users_midfreq_1hr_pr.png') 138 | save_embed_plot(get_word_vectors_ht(list(users_ht)),'embed_users_random_ht_1hr_pr.png') 139 | # save_embed_plot(get_word_vectors(least_freq),'embed_users_leastfreq.png') -------------------------------------------------------------------------------- /tsne_plots/tsne_hashtag_visualisation.py: -------------------------------------------------------------------------------- 1 | #visualising top 100 most, least and mid frequent hashtags using t-SNE on histogram-of-counts vectors from word classes 2 | 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | from tsne import * 6 | from numpy import array 7 | import math, random 8 | import cPickle as pickle 9 | from collections import Counter 10 | 11 | path_class_file = '/mnt/filer01/word2vec/twitter_vectors_classes.sorted.txt' 12 | word_to_cluster = dict() 13 | with open(path_class_file, 'rb') as fr: 14 | for line in fr: 15 | line = line.rstrip() 16 | u = line.split(' ') 17 | word_to_cluster[u[0]]=int(u[1]) 18 | 19 | tag_labels = [] 20 | num_tags = 0 21 | with open('/mnt/filer01/tweets_repository/Nov2013/tag_tweets_bow.txt', 'rb') as fr: 22 | for line in fr: 23 | line = line.rstrip() 24 | u = line.split('\t') 25 | tag = u[0] 26 | tag_labels.append(tag) 27 | num_tags+=1 28 | 29 | word_doc_freq = dict() 30 | tag_bow = [] 31 | with open('/mnt/filer01/tweets_repository/Nov2013/tag_tweets_bow_processed.txt', 'rb') as fr: 32 | for line in fr: 33 | line = line.rstrip() 34 | u = line.split(' ') 35 | tag = u[0] 36 | words = u[1:] 37 | tag_bow.append(words) # remove duplicate words also 38 | doc_words = set() 39 | for w in words: 40 | if w not in doc_words: 41 | if w not in word_doc_freq: 42 | word_doc_freq[w]=0 43 | word_doc_freq[w]+=1 44 | doc_words.add(w) 45 | 46 | word_clusters_dim = 1000 47 | word_not_found=set() 48 | hist_feature 
= [] 49 | for tag_words in tag_bow: 50 | tag_feature = [0]*word_clusters_dim 51 | num_words = 0 52 | # word_term_freq = Counter(tag_words) 53 | for word in tag_words: 54 | try: #words from tag bow missing in word vector, may be because of min limit on word occurrence 5 55 | cluster_id = word_to_cluster[word] # cluster index from 0, and order of idx and labels same 56 | df = word_doc_freq[word] #document frequency of words from vocab file 57 | idf = math.log10(float(num_tags)/df) 58 | # if word=='dconcert': # count for cluster with 'dconcert' very high, causing nan value error in tsne P-value calculation 59 | # continue 60 | tag_feature[cluster_id]+=1*idf #using idf as word relevance 61 | num_words+=1*idf 62 | except: 63 | word_not_found.add(word) 64 | #normalise by total number of words 65 | # num_words = len(tag_words) 66 | if num_words==0: 67 | print "error, tag with no words" 68 | num_words = 0.1 69 | hist_feature.append([float(x)/num_words for x in tag_feature]) 70 | # with open('hashtag_vec_tfidf.pickle', 'wb') as fd: 71 | # pickle.dump(hist_feature,fd) 72 | print len(word_doc_freq), len(word_not_found) 73 | tag_freq = [] 74 | with open('tag_freq_1500.csv', 'rb') as fr: 75 | next(fr) 76 | for line in fr: 77 | line = line.rstrip() 78 | u = line.split(',') 79 | tag = u[0] 80 | tag_freq.append((tag,int(u[1]))) 81 | 82 | tag_freq_sorted = [t for t,_ in sorted(tag_freq,key=lambda x: x[1], reverse = True)] 83 | most_freq = set(tag_freq_sorted[0:150]) 84 | least_freq = set(tag_freq_sorted[-150:]) 85 | half_num_words = int(len(tag_freq_sorted)/2.0) 86 | mid_freq = set(tag_freq_sorted[half_num_words-75:half_num_words+74]) 87 | all_random = set(random.sample(tag_freq_sorted,150)) 88 | 89 | #set visibility of most, least and mid frequency hashtags by setting text size 90 | def get_tag_size_label(tlist): 91 | size = [] 92 | label = [] 93 | for t in tag_labels: 94 | if t in tlist: 95 | size.append(2) 96 | label.append(t.decode('latin-1')) 97 | else: 98 | size.append(0) 99 | label.append('') 100 | return size, array(label) 101 | 102 | X = array(hist_feature) 103 | Y = tsne(X, 2, 50, 30.0); 104 | 105 | def save_embed_plot((tag_sizes,labels),fname): 106 | fig = Plot.figure() 107 | Plot.scatter(Y[:,0], Y[:,1], 0); 108 | for label, x, y, s in zip(labels, Y[:,0], Y[:,1], tag_sizes): 109 | Plot.annotate(label, xy = (x, y), xytext = (0, 0), textcoords = 'offset points', size=s) 110 | Plot.axis('off') 111 | fig.savefig(fname, dpi=800, bbox_inches='tight') 112 | 113 | if __name__ == "__main__": 114 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 115 | # print "Running example on 2,500 MNIST digits..." 
116 | # X = Math.loadtxt("mnist2500_X.txt"); 117 | # labels = Math.loadtxt("mnist2500_labels.txt"); 118 | save_embed_plot(get_tag_size_label(most_freq),'embed_tag_mostfreq.png') 119 | save_embed_plot(get_tag_size_label(mid_freq),'embed_tag_midfreq.png') 120 | save_embed_plot(get_tag_size_label(least_freq),'embed_tag_leastfreq.png') 121 | save_embed_plot(get_tag_size_label(all_random),'embed_tag_random.png') -------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/test_kdtree_query.py: -------------------------------------------------------------------------------- 1 | from scipy.spatial import cKDTree as KDTree 2 | import time 3 | from math import sqrt 4 | import random 5 | from heapq import nsmallest 6 | from sklearn.neighbors import NearestNeighbors 7 | 8 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_bfsr.txt" 9 | # M=100000 10 | def read_vector_file(path_vectors_file): 11 | vocab = [] 12 | vectors = [] 13 | count=0 14 | with open(path_vectors_file,"rb") as fr: 15 | _,dim = next(fr).rstrip().split(' ') 16 | word_vector_dim = int(dim) 17 | next(fr) 18 | for line in fr: 19 | # if count==M: 20 | # break 21 | line = line.rstrip() 22 | u = line.split(' ') 23 | if len(u) != word_vector_dim+1: 24 | print "vector length error" 25 | word = int(u[0]) 26 | #normalise to length 1 27 | # vec = [] 28 | # length = 0.0 29 | # for d in u[1:]: 30 | # num=float(d) 31 | # vec.append(num) 32 | # length+=num**2 33 | # length = sqrt(length) 34 | vec = map(float,u[1:]) 35 | length = sum(x**2 for x in vec) 36 | vec_norm = [x/length for x in vec] 37 | vocab.append(word) 38 | vectors.append(vec_norm) 39 | count+=1 40 | return vectors, vocab, word_vector_dim 41 | 42 | def get_Nranked_list(query_set_ind,N): 43 | # wordN = [0]*N 44 | # distN = [0.0]*N 45 | dist_total = [] 46 | set_size = len(query_set_ind) 47 | for i in xrange(0,len(vec)): 48 | if i in query_set_ind: 49 | continue 50 | pres_word = i 51 | pres_vec = vec[i] 52 | dist_k = [0.0]*set_size 53 | k=0 54 | dim=len(pres_vec) 55 | for voc_ind in query_set_ind: 56 | user_vec = vec[voc_ind] 57 | dist = sum( (user_vec[x]-pres_vec[x])**2 for x in xrange(0,dim) ) 58 | dist_k[k]= sqrt(dist) 59 | k+=1 60 | nearest_k = min(dist_k) # dist_k_sorted[0] # if sorted not needed 61 | dist_set=nearest_k 62 | dist_total.append((pres_word,dist_set)) 63 | wordN = [w for w,_ in nsmallest(N,dist_total,key=lambda x: x[1])] 64 | return wordN #zip(wordN,distN) 65 | 66 | t=0.0 67 | t1=0.0 68 | t2=0.0 69 | N=3 70 | k=500 71 | M= 2654594 #1000000 72 | D=10 73 | S=10 74 | eps = 0 75 | 76 | vec,vocab,dim = read_vector_file(vec_file) 77 | print "num points", len(vec), "dim", dim 78 | 79 | # vec = [v[:D] for v in vec[:M]] 80 | print len(vec),len(vec[0]), "eps", eps 81 | tic = time.clock() 82 | kd = KDTree(vec, leafsize=10) 83 | toc = time.clock() 84 | print "scipy tree built in", (toc-tic)*1000 85 | 86 | tic = time.clock() 87 | neigh = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='ball_tree', leaf_size=10, metric='minkowski', p=2) #'ball_tree', 'kd_tree', 'auto' 88 | kd_sklearn = neigh.fit(vec) 89 | toc = time.clock() 90 | print "sklearn tree built in", (toc-tic)*1000 91 | 92 | for _ in range(0,N): 93 | sample = random.sample(range(0,M),S) 94 | sample_vec = [vec[i] for i in sample] 95 | 96 | tic = time.clock() 97 | d_list,knn_list = kd.query(sample_vec,k=k+1) #, eps=eps) 98 | dist_n_list = [] 99 | for d,n in zip(d_list,knn_list): 100 | dist_n_list+=list(zip(n,d))[1:] 101 | knn= [w for w,_ in 
nsmallest(k,dist_n_list,key=lambda x: x[1])] 102 | toc = time.clock() 103 | print "scipy, tree query in", (toc-tic)*1000 104 | t+=(toc-tic)*1000 105 | 106 | 107 | tic1 = time.clock() 108 | knn_brute = get_Nranked_list(sample,k) 109 | toc1 = time.clock() 110 | print "brute, tree query in", (toc1-tic1)*1000 111 | if knn_brute!=knn: 112 | print "scipy, not same points", "same", len(set(knn_brute)&set(knn)), "out of", k 113 | else: 114 | print "same", len(set(knn_brute)&set(knn)), len(knn_brute) 115 | t1+=(toc1-tic1)*1000 116 | 117 | tic1 = time.clock() 118 | d_list,knn_list = neigh.kneighbors(X=sample_vec, n_neighbors=k+1, return_distance=True) 119 | dist_n_list = [] 120 | for d,n in zip(d_list,knn_list): 121 | dist_n_list+=list(zip(n,d))[1:] 122 | knn_sklearn= [w for w,_ in nsmallest(k,dist_n_list,key=lambda x: x[1])] 123 | toc1 = time.clock() 124 | print "sklearn, tree query in", (toc1-tic1)*1000 125 | if knn_sklearn!=knn_brute: 126 | print "sklearn, not same points", "same", len(set(knn_brute)&set(knn_sklearn)), "out of", k 127 | else: 128 | print "same", len(set(knn_brute)&set(knn_sklearn)) 129 | t2+=(toc1-tic1)*1000 130 | 131 | print "tree query in, avg, kdtree", t*1./N, "brute", t1*1./N, "sklearn", t2*1./N 132 | """ 133 | for i in random.sample(range(0,M),N): 134 | tic = time.clock() 135 | _,knn = kd.query(vec[i],k=k) #, eps=eps) 136 | toc = time.clock() 137 | # print i, knn 138 | # print "tree query in", (toc-tic)*1000 139 | t+=(toc-tic)*1000 140 | 141 | tic1 = time.clock() 142 | knn_brute = get_Nranked_list([i],k) 143 | toc1 = time.clock() 144 | # print i, knn_brute 145 | # print "tree query in", (toc1-tic1)*1000 146 | if knn_brute!=list(knn): 147 | print "not same points", "same", len(set(knn_brute)&set(list(knn))), "out of", k 148 | t1+=(toc1-tic1)*1000 149 | print "tree query in, avg, kdtree", t*1./N, "brute", t1*1./N 150 | """ -------------------------------------------------------------------------------- /user_vector_training/filter_hashtag_sequence.py: -------------------------------------------------------------------------------- 1 | #filter only tweets by subset of users in hashtag sequence file 2 | import time 3 | import sys 4 | import os 5 | import cPickle as pickle 6 | import random 7 | """ 8 | min_tweets_sequence = 2 9 | selected_users = set() 10 | with open("userSubset.csv","r") as fr: 11 | for line in fr: 12 | line = line.rstrip() 13 | u = line.split(',') 14 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 15 | selected_users.add(id) 16 | 17 | m = dict() 18 | fr = open("/twitterSimulations/graph/map.txt") 19 | for line in fr: 20 | line = line.rstrip() 21 | u = line.split(' ') 22 | m[int(u[0])] = int(u[1]) 23 | fr.close() 24 | print 'Map Read' 25 | 26 | adoption_sequence = dict() 27 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 28 | for line in fr: 29 | line = line.rstrip() 30 | u = line.split('\t') 31 | tag = u[0] 32 | time = int(u[1]) 33 | author = m[int(u[2])] 34 | if author not in selected_users: 35 | continue 36 | try: 37 | adoption_sequence[tag].append((time,author)) 38 | except KeyError: 39 | adoption_sequence[tag]=[(time,author)] 40 | print len(adoption_sequence) 41 | 42 | with open('hashtagAdoptionSequences_filter.txt','wb') as fd: 43 | for tag in adoption_sequence.keys(): 44 | if len(adoption_sequence[tag])>=min_tweets_sequence: 45 | fd.write(tag) 46 | for t,a in adoption_sequence[tag]: 47 | fd.write(' '+str(t)+','+str(a)) #author is of type str for using join 48 | fd.write('\n') 49 | """ 50 | #separate sequences into training (80%) and 
test sequences (20%) 51 | """ 52 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 53 | adoption_sequence = [] 54 | large_tag_id = [] 55 | count=0 56 | with open(adoption_sequence_filename, 'r') as fr: 57 | for line in fr: 58 | line = line.rstrip() 59 | u = line.split(' ') 60 | #tag = u[0] 61 | sequence = [] 62 | if len(u)-1>=100000: 63 | large_tag_id.append(count) 64 | for i in range(1, len(u)): 65 | #timestamp = int(u[i][0:u[i].index(',')]) 66 | author = int(u[i][u[i].index(',')+1 : ]) 67 | sequence.append(author) 68 | adoption_sequence.append(sequence) 69 | count+=1 70 | 71 | num_lines = len(adoption_sequence) #3617312 72 | print num_lines 73 | seq_random_index=range(0,num_lines) 74 | random.shuffle(seq_random_index) 75 | num_train = int(0.8*num_lines) 76 | print num_train 77 | train_seq_id = seq_random_index[:num_train] 78 | test_seq_id = seq_random_index[num_train:] 79 | with open("sequence_file_split_indices.pickle","wb") as fd: 80 | pickle.dump(train_seq_id,fd) 81 | pickle.dump(test_seq_id,fd) 82 | users_train=set() 83 | for i in train_seq_id: 84 | for u in adoption_sequence[i]: 85 | users_train.add(u) 86 | users_test=set() 87 | overlap = set() 88 | for i in test_seq_id: 89 | for u in adoption_sequence[i]: 90 | users_test.add(u) 91 | if u in users_train: 92 | overlap.add(u) 93 | print len(users_train), len(users_test), len(overlap) 94 | with open("sequence_file_split_users.pickle","wb") as fd: 95 | pickle.dump(users_train,fd) 96 | pickle.dump(users_test,fd) 97 | 98 | # with open("sequence_large_hashtags.pickle","wb") as fd: 99 | # pickle.dump(large_tag_id,fd) 100 | """ 101 | # filter follower files for users in adoption sequence 102 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" 103 | adoption_sequence_users = set() 104 | count=0 105 | with open(adoption_sequence_filename, 'r') as fr: 106 | for line in fr: 107 | line = line.rstrip() 108 | u = line.split(' ') 109 | #tag = u[0] 110 | for i in range(1, len(u)): 111 | #timestamp = int(u[i][0:u[i].index(',')]) 112 | author = int(u[i][u[i].index(',')+1 : ]) 113 | adoption_sequence_users.add(author) 114 | count+=1 115 | print len(adoption_sequence_users), count 116 | 117 | m = dict() 118 | fr = open("/twitterSimulations/graph/map.txt") 119 | for line in fr: 120 | line = line.rstrip() 121 | u = line.split(' ') 122 | m[int(u[0])] = int(u[1]) 123 | fr.close() 124 | print 'Map Read' 125 | 126 | # arr = ["user_followers_bigger_graph_recrawl_3.txt"] 127 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 128 | 129 | follower_adj = [ [] for i in xrange(0, 7697889) ] 130 | 131 | for i in arr: 132 | fr = open("/twitterSimulations/graph/" + i,'r') 133 | for line in fr: 134 | line = line.rstrip() 135 | u = line.split(' ') 136 | if(int(u[0]) > 7697889): 137 | continue 138 | if len(u) > 2: 139 | for j in range(2,len(u)): 140 | follower_adj[m[int(u[1])]].append(m[int(u[j])]) 141 | fr.close() 142 | print i 143 | 144 | print 'Graph Read\n' 145 | 146 | # for i in range(0, 7697889): 147 | # follower_adj[i] = set(follower_adj[i]) 148 | 149 | print 'Graph Set\n' 150 | 151 | with open("graph_files/follower_graph_tweeters","wb") as fd: 152 | for i in follower_adj: 153 | if i in adoption_sequence_users: 154 | fol = set(follower_adj[i])&adoption_sequence_users 
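# NOTE: the enclosing loop iterates over the adjacency lists themselves, but
# follower_adj is a list indexed by mapped user id, so the filter is
# presumably meant to run over user indices. A minimal index-based sketch of
# the intended filtering, under that assumption (helper name is illustrative;
# the output format mirrors the write below):
def write_filtered_followers(follower_adj, adoption_sequence_users, out_path):
    with open(out_path, "wb") as fd:
        for uid in xrange(len(follower_adj)):
            if uid in adoption_sequence_users:
                fol = set(follower_adj[uid]) & adoption_sequence_users
                fol = map(str, fol)
                fd.write(str(len(fol)) + " " + str(uid) + " " + " ".join(fol) + "\n")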
155 | fol = map(str,list(fol)) 156 | fd.write(str(len(fol))+" "+str(i)+" "+" ".join(fol)+"\n") 157 | -------------------------------------------------------------------------------- /user_vector_training/nearest_neighbours/query_nearest_users.py: -------------------------------------------------------------------------------- 1 | #query users nearest to a given user using node vectors file from distance-filewrite.c file and compare with users nearby in hashtag sequence file 2 | 3 | import cPickle as pickle 4 | import random 5 | import os, sys, datetime 6 | from heapq import nlargest 7 | from distance_w2v import * 8 | 9 | start_time = datetime.datetime.now() 10 | 11 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 12 | time_diff_for_edge = 1*1*60*60 #5 context width for path in one direction 13 | vec_file = "../node_vectors_1hr_bfs_sgng.txt" 14 | vocab_file = "../node_vocab_1hr_bfs_sgng.txt" 15 | out_file = "nearest_users_compare1hr_bfs_sgng.pickle" 16 | vec,vocab_ind,_ = read_vector_file(vec_file) 17 | 18 | m = dict() 19 | fr = open("/twitterSimulations/graph/map.txt") 20 | for line in fr: 21 | line = line.rstrip() 22 | u = line.split(' ') 23 | m[int(u[0])] = int(u[1]) 24 | fr.close() 25 | print 'Map Read' 26 | 27 | location_buckets = [-1] * 7697889 28 | fr = open('/twitterSimulations/known_locations.txt', 'r') 29 | for line in fr: 30 | line = line.rstrip() 31 | u = line.split('\t') 32 | try: 33 | location_buckets[m[int(u[0])]] = int(u[1]) 34 | except: 35 | pass 36 | fr.close() 37 | 38 | fr = open('/twitterSimulations/known_locations1.txt', 'r') 39 | for line in fr: 40 | line = line.rstrip() 41 | u = line.split('\t') 42 | try: 43 | location_buckets[m[int(u[0])]] = int(u[1]) 44 | except: 45 | pass 46 | fr.close() 47 | print "location file read" 48 | 49 | # def call_distance(word): 50 | # return os.system("./distance-filewrite ../node_vectors_1hr_bfs_15.bin query_output_temp1hr_bfs_15 "+str(word)) 51 | 52 | # def get_nearest(): 53 | # nearest = [] 54 | # with open("query_output_temp1hr_bfs_15","rb") as fr: 55 | # for line in fr: 56 | # line=line.rstrip().split('\t') 57 | # nearest.append(int(line[0])) 58 | # return nearest 59 | 60 | def compare_nearest(seq,w2v): 61 | return len(set(seq)&set(w2v)) 62 | 63 | vocab = [] 64 | freq = dict() 65 | with open(vocab_file,"rb") as fr: 66 | next(fr) 67 | for line in fr: 68 | line=line.rstrip().split(' ') 69 | vocab.append(int(line[0])) 70 | freq[int(line[0])]=int(line[1]) 71 | print "Vocab read" 72 | 73 | sub_vocab=[] 74 | for v in vocab: 75 | if freq[v]>10000: 76 | sub_vocab.append(v) 77 | rand_users = random.sample(vocab,100) 78 | rand_users_set = set(rand_users) 79 | vocab = set(vocab) 80 | print "Sample selected" 81 | 82 | near_count = [[0]*7697889 for i in xrange(0,100)] 83 | 84 | tagcount=0 85 | with open(adoption_sequence_filename, 'r') as fr: 86 | for line in fr: 87 | line = line.rstrip() 88 | u = line.split(' ') 89 | for i in range(1, len(u)): 90 | timestamp = int(u[i][0:u[i].index(',')]) 91 | author = int(u[i][u[i].index(',')+1 : ]) 92 | location_user = location_buckets[author] 93 | if author in rand_users_set: 94 | for j in range(i+1, len(u)): 95 | t1 = int(u[j][0:u[j].index(',')]) 96 | a1 = int(u[j][u[j].index(',')+1 : ]) 97 | if t1-timestamp<=time_diff_for_edge: 98 | if location_buckets[a1]==location_user: 99 | near_count[rand_users.index(author)][a1]+=1 100 | else: 101 | break 102 | for j in range(i-1, 0, -1): 103 | t1 = int(u[j][0:u[j].index(',')]) 104 | a1 = 
int(u[j][u[j].index(',')+1 : ]) 105 | if timestamp-t1<=time_diff_for_edge: 106 | if location_buckets[a1]==location_user: 107 | near_count[rand_users.index(author)][a1]+=1 108 | else: 109 | break 110 | tagcount+=1 111 | if tagcount%100000==0: 112 | print "Hashtag count", tagcount 113 | print "Sequence file read" 114 | 115 | near_users_seq = dict() 116 | for i in range(0,len(rand_users)): 117 | user_count = near_count[i] 118 | count = [] 119 | for l in vocab:#xrange(0,7697889): 120 | if user_count[l]!=0 and l!=rand_users[i]: 121 | count.append((l,user_count[l])) 122 | #count = zip(range(0,7697889),near_count[i]) 123 | #count_nz = [(a,b) for (a,b) in count if b!=0] 124 | #count_s = sorted(count_nz,key=lambda x: x[1],reverse=True)[0:100] 125 | #count_s = sorted(range(0,7697889),key=lambda x: user_count[x],reverse=True) 126 | count_s = nlargest(100,count,key=lambda x: x[1]) 127 | if len(count_s)==0: 128 | u,c = [], [] 129 | else: 130 | u,c = zip(*count_s) 131 | near_users_seq[rand_users[i]]=list(u) 132 | print "sel count", rand_users[i], len(u), "non zero", len(count) 133 | 134 | nearest_users_w2v_pickle = dict() 135 | count_pickle = [] 136 | for user in rand_users: 137 | # a = call_distance(user) 138 | # if a!=0: 139 | # print "call error" 140 | # sys.exit(0) 141 | # nearest_users_w2v = get_nearest() 142 | nearest_users_w2v = get_Nnearest(user,vec,vocab_ind,100) 143 | comp_count = compare_nearest(near_users_seq[user][0:100],nearest_users_w2v[0:100]) 144 | print "common users", user, comp_count, "out of", min(len(near_users_seq[user]),len(nearest_users_w2v)) 145 | count_pickle.append(comp_count) 146 | nearest_users_w2v_pickle[user]=nearest_users_w2v 147 | 148 | with open(out_file,"wb") as fd: 149 | pickle.dump(rand_users,fd) 150 | pickle.dump(count_pickle,fd) 151 | pickle.dump(near_users_seq,fd) 152 | pickle.dump(nearest_users_w2v_pickle,fd) 153 | 154 | print start_time, datetime.datetime.now() -------------------------------------------------------------------------------- /user_vector_training/helpers/test_distance.py: -------------------------------------------------------------------------------- 1 | from heapq import nsmallest,nlargest 2 | from math import sqrt 3 | vec = [(1,1),(4,2),(2,2),(3,2),(3,3),(4,4),(2,3)] 4 | for i in range(0,7): 5 | a,b=vec[i] 6 | l=float(sqrt(a**2+b**2)) 7 | vec[i]=(a/l,b/l) 8 | vocab = [1,2,3,4,5,6,7] 9 | dim = 2 10 | par_m = 2 11 | vocab_index=dict() 12 | for i in xrange(0,len(vocab)): 13 | vocab_index[vocab[i]]=i 14 | query_set = [3,4] 15 | N=4 16 | 17 | def get_Nranked_list(query_set,N): 18 | # wordN = [0]*N 19 | # distN = [0.0]*N 20 | dist_total = [] 21 | set_size = len(query_set) 22 | try: 23 | query_set_ind = [ vocab_index[query] for query in query_set ] 24 | except KeyError: 25 | print "query word not present" 26 | return 27 | print query_set_ind 28 | for i in xrange(0,len(vec)): 29 | if i in query_set_ind: 30 | continue 31 | pres_word = vocab[i] 32 | pres_vec = vec[i] 33 | dist_k = [0.0]*set_size 34 | k=0 35 | for voc_ind in query_set_ind: 36 | user_vec = vec[voc_ind] 37 | #Euclidean distance, cosine similarity user_vec[x]*pres_vec[x], change to decreasing order of distance in sorted,distN 38 | print user_vec,pres_vec 39 | dist = 1- sum((user_vec[x]*pres_vec[x]) for x in xrange(0,dim)) 40 | dist_k[k]=sqrt(float(2*dist)) 41 | k+=1 42 | # dist = 0.0 43 | # for x in xrange(0,dim): 44 | # dist+=(user_vec[x]-pres_vec[x])**2 45 | #distance of a point from a set 46 | # dist_k_sorted = sorted(dist_k) 47 | print i,dist_k 48 | nearest_k = min(dist_k) # 
dist_k_sorted[0] # if sorted not needed 49 | if nearest_k!=0.0: 50 | dist_set=sum( (nearest_k/dist_k[p])**(par_m) for p in xrange(0,set_size) ) 51 | dist_set = nearest_k * (dist_set)**(1.0/set_size) 52 | else: 53 | dist_set=0.0 54 | print i,dist_set 55 | dist_total.append((pres_word,dist_set)) 56 | # for j in xrange(0,N): 57 | # if dist>distN[j]: 58 | # for k in xrange(N-1,j,-1): 59 | # distN[k] = distN[k-1] 60 | # wordN[k] = wordN[k-1] 61 | # distN[j] = dist 62 | # wordN[j] = pres_word 63 | # break 64 | print dist_total 65 | wordN = [w for w,_ in nsmallest(N,dist_total,key=lambda x: x[1])] 66 | return wordN #zip(wordN,distN) 67 | 68 | print get_Nranked_list(query_set,N) 69 | 70 | adj = {1:set([2,3]),2:set([1,3,5,6]),3:set([1]),4:set([5]),5:set([1,2]),6:set([1])} 71 | nb_seq_order = [3,4,5,1,2,6,7] 72 | def getadj(user): 73 | return adj[user] 74 | def get_Nranked_list_fol(query_set,N): 75 | friend_count = dict() 76 | init_adopters = query_set 77 | sec_hop = 2 78 | while (sec_hop>0): 79 | for a in init_adopters: 80 | followers = getadj(a) 81 | print a,followers 82 | for f in followers-set(query_set): 83 | try: 84 | friend_count[f]+=1 85 | except KeyError: 86 | friend_count[f]=1 87 | init_adopters = friend_count.keys() 88 | sec_hop-=1 89 | print friend_count 90 | friend_count_list = [(f,friend_count[f]) for f in friend_count] 91 | print friend_count_list 92 | ranked_list = [f for f,_ in nlargest(N,friend_count_list,key=lambda x: x[1])] 93 | print ranked_list 94 | if len(friend_count_list)>=N: 95 | return ranked_list 96 | else: 97 | print "followers ranked list short" 98 | users_left = N-len(friend_count_list) 99 | for i in nb_seq_order: 100 | if i not in friend_count and i not in query_set: 101 | ranked_list.append(i) 102 | users_left-=1 103 | if users_left==0: 104 | break 105 | return ranked_list 106 | 107 | print get_Nranked_list_fol(query_set,N) 108 | 109 | num_init_adopters=2 110 | N = 3 111 | seq_sample_vocab = [3,4,1,7,2] 112 | init_adopters=seq_sample_vocab[0:num_init_adopters] 113 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 114 | M = len(seq_sample_vocab) 115 | print M, "pred seq length" 116 | #precision, recall evaluation 117 | adopters_vec = get_Nranked_list(init_adopters,N) 118 | print adopters_vec 119 | precision_k = 0.0 120 | num_hits = 0.0 121 | for k,p in enumerate(adopters_vec): 122 | if p in seq_sample_vocab: 123 | num_hits+=1.0 124 | precision_k += num_hits/(k+1.0) 125 | average_precision = precision_k/min(M,N) 126 | # prec_r = num_hits/M 127 | prec_k = num_hits/N 128 | rec_k = num_hits/M 129 | print "Avg precision", average_precision, "adopters in seq", len(seq_sample_vocab) 130 | # print "RPrecision", prec_r 131 | print "Precision", prec_k, "Recall", rec_k 132 | 133 | adoption_sequence_filename="ab.txt" 134 | seq_len_threshold=3 135 | def read_adoption_sequence(adoption_sequence_filename, start, end,train_seq_id,large_tag_id): 136 | with open(adoption_sequence_filename, 'r') as fr: 137 | count=0 138 | for line in fr: 139 | if count < start: 140 | count+=1 141 | continue 142 | elif count >= end: 143 | return 144 | if count not in train_seq_id or count in large_tag_id: 145 | count+=1 146 | continue 147 | count+=1 148 | line = line.rstrip() 149 | u = line.split(' ') 150 | tag = u[0] 151 | sequence = [] 152 | adopters = set() 153 | for i in range(1, len(u)): 154 | timestamp = int(u[i][0:u[i].index(',')]) 155 | author = int(u[i][u[i].index(',')+1 : ]) 156 | sequence.append((timestamp,author)) 157 | adopters.add(author) 158 | if len(adopters) < 
seq_len_threshold: 159 | continue 160 | yield (tag,sequence) 161 | for i in read_adoption_sequence(adoption_sequence_filename, 0, 4,set([0,1,3]),[]): 162 | print i -------------------------------------------------------------------------------- /Untitled: -------------------------------------------------------------------------------- 1 | /dbresearch2/sahil/node_vectors_1hr_pr.txt 0.1 True 2 | num users in train sequences 2574807 3 | Map Read 4 | Follower file offset Read 5 | 6 | Friend file offset Read 7 | 8 | 2157571 2574807 100 9 | 100000 900000 10 | /usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples. 11 | 'precision', 'predicted', average, warn_for) 12 | /usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 13 | 'precision', 'predicted', average, warn_for) 14 | ((0.0023275735255466729, 0.19630666666666666), 176676, (0.0013922458628841607, 0.19630666666666666), (0.0070921985815602835, 0.19630666666666666)) 6 15 | ((0.11315118094706672, 0.4070988888888889), 366389, (0.15637680750854319, 0.4070988888888889), (0.11585482553372159, 0.4070988888888889)) 16 | Counter({6: 339200, 38: 126921, 2: 108161, 5: 101815, 50: 28857, 49: 26452, 27: 26401, 40: 14987, 52: 14302, 42: 13881, 16: 12945, 25: 11053, 129: 11022, 22: 10750, 19: 7019, 28: 4531, 104: 4220, 120: 4090, 68: 3577, 94: 3366, 35: 3239, 65: 3121, 93: 2907, 20: 2378, 76: 2361, 88: 1431, 62: 1340, 55: 1252, 64: 1079, 79: 754, 102: 596, 110: 507, 106: 504, 91: 495, 37: 465, 111: 415, 60: 316, 95: 314, 47: 301, 127: 263, 100: 215, 45: 182, 139: 172, 14: 104, 99: 100, 89: 99, 84: 97, 41: 91, 122: 87, 72: 86, 118: 85, 48: 78, 53: 77, 10: 74, 36: 70, 54: 67, 71: 53, 39: 50, 82: 43, 103: 43, 12: 41, 105: 38, 107: 37, 33: 33, 32: 31, 134: 31, 108: 30, 75: 28, 132: 28, 109: 22, 80: 21, 67: 20, 101: 20, 58: 19, 87: 15, 128: 15, 9: 12, 96: 12, 123: 11, 116: 10, 114: 9, 66: 8, 140: 8, 30: 6, 44: 6, 135: 6, 21: 3, 86: 3, 113: 3, 0: 2, 83: 2, 125: 2, 136: 2, 138: 2, 4: 1, 133: 1, 137: 1}) 97 17 | ((0.13520330600572122, 0.40737222222222225), 366635, (0.21112793377240058, 0.40737222222222225), (0.12320217398439315, 0.40737222222222225)) 18 | Counter({6: 355749, 38: 117134, 5: 109981, 2: 106147, 50: 24001, 49: 22807, 27: 22636, 40: 15325, 52: 12153, 42: 11663, 16: 11388, 22: 9930, 25: 9828, 129: 7313, 19: 6326, 28: 4241, 60: 3487, 120: 3362, 104: 3152, 94: 3145, 35: 3029, 68: 2946, 65: 2617, 127: 2487, 20: 2288, 93: 2087, 91: 1973, 76: 1866, 62: 1514, 106: 1400, 45: 1354, 30: 1180, 55: 1122, 102: 1092, 88: 1054, 79: 1013, 64: 901, 110: 855, 111: 802, 123: 761, 36: 702, 37: 611, 108: 565, 47: 547, 14: 544, 0: 512, 139: 510, 66: 403, 4: 380, 112: 328, 134: 311, 89: 298, 58: 278, 77: 219, 73: 206, 95: 130, 116: 129, 41: 118, 67: 112, 81: 109, 43: 96, 119: 72, 61: 71, 113: 71, 56: 67, 63: 59, 74: 59, 48: 55, 26: 51, 100: 50, 24: 48, 3: 45, 7: 37, 18: 24, 1: 16, 92: 13, 121: 9, 98: 8, 31: 6, 126: 6, 17: 4, 124: 3, 15: 2, 21: 2, 57: 2, 131: 2, 78: 1}) 87 19 | fol 100000 20 | fol 200000 21 | fol 300000 22 | fol 400000 23 | fol 500000 24 | fol 600000 25 | fol 700000 26 | fol 800000 27 | fol 900000 28 | fol based pred 1738513.587 29 | ((0.18500498224648235, 0.38802111111111109), 349219, (0.27468649543097362, 0.38802111111111109), (0.15741696575059896, 0.38802111111111109)) 30 | Counter({6: 369040, 2: 
111260, 5: 91384, 38: 73751, 40: 21811, 49: 19016, 50: 18591, 27: 14546, 21: 12674, 4: 11865, 16: 10763, 25: 10510, 42: 9603, 0: 9439, 52: 8556, 129: 8479, 22: 7131, 3: 5310, 19: 4940, 35: 4713, 1: 4597, 23: 3886, 60: 3772, 34: 3657, 120: 3590, 127: 3131, 30: 3078, 123: 2518, 68: 2444, 45: 2288, 20: 2281, 28: 2181, 36: 1966, 93: 1713, 56: 1591, 65: 1561, 91: 1525, 112: 1477, 64: 1442, 76: 1428, 94: 1381, 55: 1290, 104: 1174, 47: 1112, 106: 998, 139: 982, 108: 974, 43: 937, 111: 909, 37: 890, 81: 887, 73: 878, 14: 805, 134: 795, 79: 789, 62: 748, 24: 696, 121: 687, 74: 625, 88: 609, 110: 583, 51: 524, 61: 399, 66: 395, 102: 355, 67: 343, 48: 340, 89: 325, 41: 324, 78: 294, 57: 258, 58: 252, 95: 249, 17: 242, 26: 239, 77: 233, 131: 233, 113: 219, 8: 208, 96: 183, 92: 179, 7: 178, 63: 160, 119: 152, 124: 134, 31: 111, 18: 106, 11: 92, 59: 92, 83: 91, 15: 90, 126: 73, 29: 54, 116: 53, 98: 49, 100: 45, 117: 31, 72: 29, 46: 28, 70: 27, 33: 24, 122: 24, 87: 21, 105: 20, 32: 18, 44: 18, 75: 17, 53: 16, 69: 16, 10: 14, 54: 12, 71: 12, 80: 12, 128: 12, 84: 11, 82: 10, 103: 10, 12: 9, 13: 9, 101: 9, 140: 8, 99: 7, 107: 7, 135: 6, 9: 4, 39: 4, 125: 4, 86: 3, 114: 3, 118: 3, 132: 3, 136: 3, 138: 2, 97: 1, 133: 1, 137: 1}) 136 161031 31 | limit pred ((0.21452624247316587, 0.43708464089833271), 322992, (0.27552783828129274, 0.43708464089833265), (0.19807627055156674, 0.43708464089833265)) cov 0.821076666667 32 | fr 100000 33 | fr 200000 34 | fr 300000 35 | fr 400000 36 | fr 500000 37 | fr 600000 38 | fr 700000 39 | fr 800000 40 | fr 900000 41 | ((0.17216425039620847, 0.41254777777777779), 371293, (0.3000193087788961, 0.41254777777777779), (0.14673950069699471, 0.41254777777777779)) 42 | Counter({6: 352261, 2: 166024, 38: 88391, 5: 74008, 49: 22190, 50: 22092, 27: 17790, 40: 13809, 16: 10948, 52: 10829, 42: 9951, 25: 9719, 21: 9543, 129: 8532, 22: 7597, 4: 6247, 19: 5832, 0: 5617, 35: 3601, 120: 2973, 1: 2933, 28: 2498, 93: 2460, 20: 2339, 65: 2308, 68: 1985, 76: 1941, 3: 1932, 127: 1805, 30: 1796, 45: 1656, 104: 1466, 112: 1366, 94: 1303, 60: 1241, 55: 1234, 123: 1208, 23: 1125, 62: 1103, 56: 1093, 64: 1074, 36: 983, 34: 975, 106: 970, 110: 846, 134: 708, 43: 679, 81: 676, 91: 662, 89: 661, 47: 658, 37: 594, 139: 576, 14: 560, 111: 558, 108: 525, 79: 512, 24: 401, 73: 375, 57: 303, 61: 284, 121: 249, 41: 246, 48: 238, 74: 193, 95: 189, 88: 186, 78: 179, 8: 178, 66: 170, 102: 170, 124: 136, 113: 130, 58: 120, 26: 109, 77: 108, 119: 95, 51: 90, 17: 88, 29: 70, 67: 70, 63: 62, 126: 52, 70: 46, 31: 44, 92: 44, 15: 43, 18: 42, 131: 34, 7: 30, 11: 29, 83: 29, 59: 22, 117: 21, 96: 18, 32: 16, 116: 16, 98: 13, 87: 10, 33: 9, 69: 9, 100: 9, 46: 7, 105: 7, 103: 6, 75: 5, 71: 4, 39: 3, 54: 3, 72: 3, 122: 3, 128: 3, 132: 3, 12: 2, 44: 2, 114: 2, 10: 1, 13: 1, 53: 1, 82: 1, 84: 1, 97: 1, 99: 1, 101: 1, 140: 1}) 125 90000 43 | limit pred ((0.18509957355080509, 0.43739382716049385), 354289, (0.30040341757656785, 0.43739382716049385), (0.16387609625493194, 0.43739382716049385)) cov 0.9 44 | fol 18 77 fr 59087 4784 59 45 | /dbresearch2/sahil/node_vectors_1hr_pr.txt 0.1 True 46 | -------------------------------------------------------------------------------- /tsne_plots/tsne.py: -------------------------------------------------------------------------------- 1 | # 2 | # tsne.py 3 | # 4 | # Implementation of t-SNE in Python. The implementation was tested on Python 2.5.1, and it requires a working 5 | # installation of NumPy. The implementation comes with an example on the MNIST dataset. 
In order to plot the 6 | # results of this example, a working installation of matplotlib is required. 7 | # The example can be run by executing: ipython tsne.py -pylab 8 | # 9 | # 10 | # Created by Laurens van der Maaten on 20-12-08. 11 | # Copyright (c) 2008 Tilburg University. All rights reserved. 12 | 13 | import numpy as Math 14 | import pylab as Plot 15 | 16 | def Hbeta(D = Math.array([]), beta = 1.0): 17 | """Compute the perplexity and the P-row for a specific value of the precision of a Gaussian distribution.""" 18 | 19 | # Compute P-row and corresponding perplexity 20 | P = Math.exp(-D.copy() * beta); 21 | sumP = sum(P); 22 | H = Math.log(sumP) + beta * Math.sum(D * P) / sumP; 23 | P = P / sumP; 24 | return H, P; 25 | 26 | 27 | def x2p(X = Math.array([]), tol = 1e-5, perplexity = 30.0): 28 | """Performs a binary search to get P-values in such a way that each conditional Gaussian has the same perplexity.""" 29 | 30 | # Initialize some variables 31 | print "Computing pairwise distances..." 32 | (n, d) = X.shape; 33 | sum_X = Math.sum(Math.square(X), 1); 34 | D = Math.add(Math.add(-2 * Math.dot(X, X.T), sum_X).T, sum_X); 35 | P = Math.zeros((n, n)); 36 | beta = Math.ones((n, 1)); 37 | logU = Math.log(perplexity); 38 | 39 | # Loop over all datapoints 40 | for i in range(n): 41 | 42 | # Print progress 43 | if i % 500 == 0: 44 | print "Computing P-values for point ", i, " of ", n, "..." 45 | 46 | # Compute the Gaussian kernel and entropy for the current precision 47 | betamin = -Math.inf; 48 | betamax = Math.inf; 49 | Di = D[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))]; 50 | (H, thisP) = Hbeta(Di, beta[i]); 51 | 52 | # Evaluate whether the perplexity is within tolerance 53 | Hdiff = H - logU; 54 | tries = 0; 55 | while Math.abs(Hdiff) > tol and tries < 50: 56 | 57 | # If not, increase or decrease precision 58 | if Hdiff > 0: 59 | betamin = beta[i]; 60 | if betamax == Math.inf or betamax == -Math.inf: 61 | beta[i] = beta[i] * 2; 62 | else: 63 | beta[i] = (beta[i] + betamax) / 2; 64 | else: 65 | betamax = beta[i]; 66 | if betamin == Math.inf or betamin == -Math.inf: 67 | beta[i] = beta[i] / 2; 68 | else: 69 | beta[i] = (beta[i] + betamin) / 2; 70 | 71 | # Recompute the values 72 | (H, thisP) = Hbeta(Di, beta[i]); 73 | Hdiff = H - logU; 74 | tries = tries + 1; 75 | 76 | # Set the final row of P 77 | P[i, Math.concatenate((Math.r_[0:i], Math.r_[i+1:n]))] = thisP; 78 | 79 | # Return final P-matrix 80 | print "Mean value of sigma: ", Math.mean(Math.sqrt(1 / beta)) 81 | return P; 82 | 83 | 84 | def pca(X = Math.array([]), no_dims = 50): 85 | """Runs PCA on the NxD array X in order to reduce its dimensionality to no_dims dimensions.""" 86 | 87 | print "Preprocessing the data using PCA..." 88 | (n, d) = X.shape; 89 | X = X - Math.tile(Math.mean(X, 0), (n, 1)); 90 | (l, M) = Math.linalg.eig(Math.dot(X.T, X)); 91 | Y = Math.dot(X, M[:,0:no_dims]); 92 | return Y; 93 | 94 | 95 | def tsne(X = Math.array([]), no_dims = 2, initial_dims = 50, perplexity = 30.0): 96 | """Runs t-SNE on the dataset in the NxD array X to reduce its dimensionality to no_dims dimensions. 97 | The syntaxis of the function is Y = tsne.tsne(X, no_dims, perplexity), where X is an NxD NumPy array.""" 98 | 99 | # Check inputs 100 | if X.dtype != "float64": 101 | print "Error: array X should have type float64."; 102 | return -1; 103 | #if no_dims.__class__ != "": # doesn't work yet! 
104 | # print "Error: number of dimensions should be an integer."; 105 | # return -1; 106 | 107 | # Initialize variables 108 | X = pca(X, initial_dims); 109 | (n, d) = X.shape; 110 | max_iter = 1000; 111 | initial_momentum = 0.5; 112 | final_momentum = 0.8; 113 | eta = 500; 114 | min_gain = 0.01; 115 | Y = Math.random.randn(n, no_dims); 116 | dY = Math.zeros((n, no_dims)); 117 | iY = Math.zeros((n, no_dims)); 118 | gains = Math.ones((n, no_dims)); 119 | 120 | # Compute P-values 121 | P = x2p(X, 1e-5, perplexity); 122 | P = P + Math.transpose(P); 123 | P = P / Math.sum(P); 124 | P = P * 4; # early exaggeration 125 | P = Math.maximum(P, 1e-12); 126 | 127 | # Run iterations 128 | for iter in range(max_iter): 129 | 130 | # Compute pairwise affinities 131 | sum_Y = Math.sum(Math.square(Y), 1); 132 | num = 1 / (1 + Math.add(Math.add(-2 * Math.dot(Y, Y.T), sum_Y).T, sum_Y)); 133 | num[range(n), range(n)] = 0; 134 | Q = num / Math.sum(num); 135 | Q = Math.maximum(Q, 1e-12); 136 | 137 | # Compute gradient 138 | PQ = P - Q; 139 | for i in range(n): 140 | dY[i,:] = Math.sum(Math.tile(PQ[:,i] * num[:,i], (no_dims, 1)).T * (Y[i,:] - Y), 0); 141 | 142 | # Perform the update 143 | if iter < 20: 144 | momentum = initial_momentum 145 | else: 146 | momentum = final_momentum 147 | gains = (gains + 0.2) * ((dY > 0) != (iY > 0)) + (gains * 0.8) * ((dY > 0) == (iY > 0)); 148 | gains[gains < min_gain] = min_gain; 149 | iY = momentum * iY - eta * (gains * dY); 150 | Y = Y + iY; 151 | Y = Y - Math.tile(Math.mean(Y, 0), (n, 1)); 152 | 153 | # Compute current value of cost function 154 | if (iter + 1) % 10 == 0: 155 | C = Math.sum(P * Math.log(P / Q)); 156 | print "Iteration ", (iter + 1), ": error is ", C 157 | 158 | # Stop lying about P-values 159 | if iter == 100: 160 | P = P / 4; 161 | 162 | # Return solution 163 | return Y; 164 | 165 | 166 | if __name__ == "__main__": 167 | print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." 168 | print "Running example on 2,500 MNIST digits..." 
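# The example below expects mnist2500_X.txt / mnist2500_labels.txt in the
# working directory. If they are not available, the same call pattern can be
# smoke-tested first on a small synthetic matrix (an illustrative fallback,
# not part of the original example; Math is the numpy import at the top of
# this file):
import os
if not os.path.exists("mnist2500_X.txt"):
    X_demo = Math.random.randn(300, 50)   # 300 points, 50 dimensions, float64
    Y_demo = tsne(X_demo, 2, 50, 30.0)    # tsne(X, no_dims, initial_dims, perplexity)
    print "synthetic demo embedding shape:", Y_demo.shape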
169 | X = Math.loadtxt("mnist2500_X.txt"); 170 | labels = Math.loadtxt("mnist2500_labels.txt"); 171 | Y = tsne(X, 2, 50, 20.0); 172 | fig = Plot.figure() 173 | Plot.scatter(Y[:,0], Y[:,1], 20, labels); 174 | fig.savefig('foo.png') 175 | -------------------------------------------------------------------------------- /misc/test.py: -------------------------------------------------------------------------------- 1 | #TODO 2 | #distribution of number of tweets sharing the same hashtags as the ones used by each user, number of users with atleast 20% common following users and number of reciprocal relations with users in the subset of selected users 3 | #use these features as sentences in word2vec for node representations or try deepwalk on adjacency list of subset of users or use sequence of authors adopting a hashtag for hashtags with atleast 10 adoptions as sentences 4 | import time 5 | import sys 6 | import os 7 | import cPickle as pickle 8 | import random 9 | 10 | 11 | m = dict() 12 | fr = open("/twitterSimulations/graph/map.txt") 13 | for line in fr: 14 | line = line.rstrip() 15 | u = line.split(' ') 16 | m[int(u[0])] = int(u[1]) 17 | fr.close() 18 | 19 | tags_for_user = dict() 20 | num_tweets_per_tag = dict() 21 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 22 | for line in fr: 23 | line = line.rstrip() 24 | u = line.split('\t') 25 | tag = u[0] 26 | author = m[int(u[2])] 27 | if author not in tags_for_user: 28 | tags_for_user[author]=set() 29 | tags_for_user[author].add(tag) 30 | if tag not in num_tweets_per_tag: 31 | num_tweets_per_tag[tag]=0 32 | num_tweets_per_tag[tag]+=1 33 | print len(tags_for_user) 34 | 35 | selected_users = set() 36 | with open("userSubset.csv","r") as fr: 37 | for line in fr: 38 | line = line.rstrip() 39 | u = line.split(',') 40 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 41 | selected_users.add(id) 42 | 43 | #subset follower and friend adjacency list 44 | """ 45 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 46 | follower = dict() 47 | for i in arr: 48 | fr = open("/twitterSimulations/graph/" + i,'r') 49 | for line in fr: 50 | line = line.rstrip() 51 | u = line.split(' ') 52 | if(int(u[0]) > 7697889): 53 | continue 54 | node = m[int(u[1])] 55 | if node not in selected_users: 56 | continue 57 | follower[node] = [] 58 | if len(u) > 2: 59 | for j in range(2,len(u)): 60 | snode = m[int(u[j])] 61 | if snode in selected_users: 62 | follower[node].append(snode) 63 | fr.close() 64 | print i 65 | pickle.dump( follower, open( "subset_follower_graph.pickle", "wb" ) ) 66 | 67 | arr_friend = ["user_friends_bigger_graph.txt","user_friends_bigger_graph_2.txt", "user_friends_bigger_graph_i.txt","user_friends_bigger_graph_recrawl.txt"] 68 | friend = dict() 69 | num_friend_id_not_found=0 70 | friend_id_not_found=set() 71 | for i in arr_friend: 72 | fr = open("/twitterSimulations/graph/" + i,'r') 73 | for line in fr: 74 | line = line.rstrip() 75 | u = line.split(' ') 76 | if(int(u[0]) > 7697889): 77 | continue 78 | try: 79 | node = m[int(u[1])] 80 | except: 81 | num_friend_id_not_found += 1 82 | friend_id_not_found.add(int(u[1])) 83 | continue 84 | if node not in selected_users: 85 | continue 86 | friend[node] = [] 87 | if len(u) > 2: 88 | for j in range(2,len(u)): 89 | try: 90 | snode = m[int(u[j])] 91 | except: 92 | num_friend_id_not_found += 1 93 | 
friend_id_not_found.add(int(u[1])) 94 | continue 95 | if snode in selected_users: 96 | friend[node].append(snode) 97 | fr.close() 98 | print i 99 | pickle.dump( friend, open( "subset_friend_graph.pickle", "wb" ) ) 100 | print num_friend_id_not_found 101 | pickle.dump( friend_id_not_found, open( "friend_id_not_found.pickle", "wb" ) ) 102 | """ 103 | """ 104 | follower = pickle.load( open( "subset_follower_graph.pickle", "rb" ) ) 105 | print "Follower file loaded" 106 | """ 107 | friend = pickle.load( open( "subset_friend_graph.pickle", "rb" ) ) 108 | print "Friend file loaded" 109 | 110 | #number of users with reciprocal links 111 | """ 112 | num_rec = dict() 113 | count=0 114 | for node in selected_users: 115 | count+=1 116 | if count%10000==0: 117 | print count," Users processed" 118 | num_rec[node]=0 119 | # for nbh in friend[node]: 120 | # if node in friend[nbh]: 121 | # num_rec[node]+=1 122 | try: 123 | incoming = set(friend[node]) 124 | outgoing = set(follower[node]) 125 | reciprocal = set.intersection(incoming, outgoing) 126 | num_rec[node]+=len(reciprocal) 127 | except: 128 | pass 129 | pickle.dump( num_rec, open( "num_reciprocal_links.pickle", "wb" ) ) 130 | """ 131 | 132 | num_rec = pickle.load( open( "num_reciprocal_links.pickle", "rb" ) ) 133 | 134 | 135 | def get_intersection(list1,list2): 136 | s = set(list2) 137 | count=0 138 | for i in list1: 139 | if i in s: 140 | count+=1 141 | return count 142 | 143 | #users with more than 20% common friends 144 | num_common_friends = dict() 145 | count=0 146 | num_common_friends_thr = dict() 147 | selected_users_list = random.sample(selected_users,500) 148 | for i in range(0,len(selected_users_list)): 149 | count+=1 150 | if count%1000==0: 151 | print count," Users processed" 152 | node = selected_users_list[i] 153 | adj_nodes = friend[node] 154 | thr = .20*len(adj_nodes) 155 | 156 | # for j in range(i+1,len(selected_users_list)): 157 | # snode = selected_users_list[j] 158 | for snode in selected_users: 159 | 160 | nbh_adj_nodes = friend[snode] 161 | thr_s = .20*len(nbh_adj_nodes) 162 | common = get_intersection(adj_nodes, nbh_adj_nodes) 163 | if common>=thr: 164 | if node not in num_common_friends_thr: 165 | num_common_friends_thr[node]=0 166 | num_common_friends_thr[node]+=1 167 | # if common>=thr_s: 168 | # if snode not in num_common_friends_thr: 169 | # num_common_friends_thr[snode]=0 170 | # num_common_friends_thr[snode]+=1 171 | pickle.dump( num_common_friends_thr, open( "num_common_friends_thr_test.pickle", "wb" ) ) 172 | 173 | 174 | #tweets with same hashtags 175 | num_tweets_with_same_tags = dict() 176 | for node in selected_users: 177 | num_tweets_with_same_tags[node] = sum([num_tweets_per_tag[x] for x in tags_for_user[node]]) 178 | 179 | with open("featuresUserSubset_test.csv","w") as fd: 180 | for i in selected_users_list: 181 | fd.write(str(i)+","+str(num_common_friends_thr[i])+","+str(num_rec[i])+","+str(num_tweets_with_same_tags[i])+"\n") 182 | -------------------------------------------------------------------------------- /user_vector_training/helpers/feature_dist.py: -------------------------------------------------------------------------------- 1 | #TODO 2 | #distribution of number of tweets sharing the same hashtags as the ones used by each user, number of users with atleast 20% common following users and number of reciprocal relations with users in the subset of selected users 3 | #use these features as sentences in word2vec for node representations or try deepwalk on adjacency list of subset of users or use 
sequence of authors adopting a hashtag for hashtags with atleast 10 adoptions as sentences 4 | import time 5 | import sys 6 | import os 7 | import cPickle as pickle 8 | 9 | 10 | m = dict() 11 | fr = open("/twitterSimulations/graph/map.txt") 12 | for line in fr: 13 | line = line.rstrip() 14 | u = line.split(' ') 15 | m[int(u[0])] = int(u[1]) 16 | fr.close() 17 | """ 18 | tags_for_user = dict() 19 | num_tweets_per_tag = dict() 20 | with open('/twitterSimulations/timeline_data/dif_timeline1s', 'r') as fr: 21 | for line in fr: 22 | line = line.rstrip() 23 | u = line.split('\t') 24 | tag = u[0] 25 | author = m[int(u[2])] 26 | if author not in tags_for_user: 27 | tags_for_user[author]=set() 28 | tags_for_user[author].add(tag) 29 | if tag not in num_tweets_per_tag: 30 | num_tweets_per_tag[tag]=0 31 | num_tweets_per_tag[tag]+=1 32 | print len(tags_for_user) 33 | """ 34 | selected_users = set() 35 | with open("userSubset.csv","r") as fr: 36 | for line in fr: 37 | line = line.rstrip() 38 | u = line.split(',') 39 | id,_,_ = int(u[0]),int(u[1]),int(u[2]) 40 | selected_users.add(id) 41 | 42 | #subset follower and friend adjacency list 43 | """ 44 | arr = ["user_followers_bigger_graph.txt","user_followers_bigger_graph_2.txt", "user_followers_bigger_graph_i.txt","user_followers_bigger_graph_recrawl_2.txt", "user_followers_bigger_graph_recrawl_3.txt","user_followers_bigger_graph_recrawl.txt"] 45 | follower = dict() 46 | for i in arr: 47 | fr = open("/twitterSimulations/graph/" + i,'r') 48 | for line in fr: 49 | line = line.rstrip() 50 | u = line.split(' ') 51 | if(int(u[0]) > 7697889): 52 | continue 53 | node = m[int(u[1])] 54 | if node not in selected_users: 55 | continue 56 | follower[node] = [] 57 | if len(u) > 2: 58 | for j in range(2,len(u)): 59 | snode = m[int(u[j])] 60 | if snode in selected_users: 61 | follower[node].append(snode) 62 | fr.close() 63 | print i 64 | pickle.dump( follower, open( "subset_follower_graph.pickle", "wb" ) ) 65 | 66 | arr_friend = ["user_friends_bigger_graph.txt","user_friends_bigger_graph_2.txt", "user_friends_bigger_graph_i.txt","user_friends_bigger_graph_recrawl.txt"] 67 | friend = dict() 68 | num_friend_id_not_found=0 69 | friend_id_not_found=set() 70 | for i in arr_friend: 71 | fr = open("/twitterSimulations/graph/" + i,'r') 72 | for line in fr: 73 | line = line.rstrip() 74 | u = line.split(' ') 75 | if(int(u[0]) > 7697889): 76 | continue 77 | try: 78 | node = m[int(u[1])] 79 | except: 80 | num_friend_id_not_found += 1 81 | friend_id_not_found.add(int(u[1])) 82 | continue 83 | if node not in selected_users: 84 | continue 85 | friend[node] = [] 86 | if len(u) > 2: 87 | for j in range(2,len(u)): 88 | try: 89 | snode = m[int(u[j])] 90 | except: 91 | num_friend_id_not_found += 1 92 | friend_id_not_found.add(int(u[1])) 93 | continue 94 | if snode in selected_users: 95 | friend[node].append(snode) 96 | fr.close() 97 | print i 98 | pickle.dump( friend, open( "subset_friend_graph.pickle", "wb" ) ) 99 | print num_friend_id_not_found 100 | pickle.dump( friend_id_not_found, open( "friend_id_not_found.pickle", "wb" ) ) 101 | """ 102 | 103 | follower = pickle.load( open( "subset_follower_graph.pickle", "rb" ) ) 104 | print "Follower file loaded" 105 | """ 106 | friend = pickle.load( open( "subset_friend_graph.pickle", "rb" ) ) 107 | print "Friend file loaded" 108 | """ 109 | #number of users with reciprocal links 110 | """ 111 | num_rec = dict() 112 | count=0 113 | for node in selected_users: 114 | count+=1 115 | if count%10000==0: 116 | print count," Users processed" 117 | 
num_rec[node]=0 118 | # for nbh in friend[node]: 119 | # if node in friend[nbh]: 120 | # num_rec[node]+=1 121 | try: 122 | incoming = set(friend[node]) 123 | outgoing = set(follower[node]) 124 | reciprocal = set.intersection(incoming, outgoing) 125 | num_rec[node]+=len(reciprocal) 126 | except: 127 | pass 128 | pickle.dump( num_rec, open( "num_reciprocal_links.pickle", "wb" ) ) 129 | """ 130 | """ 131 | num_rec = pickle.load( open( "num_reciprocal_links.pickle", "rb" ) ) 132 | """ 133 | #users with more than 20% common friends 134 | num_common_friends = dict() 135 | count=0 136 | for node in selected_users: 137 | count+=1 138 | if count%1000==0: 139 | print count," Users processed" 140 | try: 141 | out_nodes = follower[node] 142 | except: 143 | continue 144 | num_out = len(out_nodes)#len(out_nodes) 145 | for i in range(0,num_out): 146 | out = out_nodes[i] 147 | for j in range(i+1,num_out): 148 | sout = out_nodes[j] 149 | if out>sout: 150 | (out,sout) = (sout,out) 151 | if out not in num_common_friends: 152 | num_common_friends[out]=dict() 153 | num_common_friends[out][sout]=1 154 | elif sout not in num_common_friends[out]: 155 | num_common_friends[out][sout]=1 156 | else: 157 | num_common_friends[out][sout]+=1 158 | 159 | pickle.dump( num_common_friends, open( "num_common_friends.pickle", "wb" ) ) 160 | """ 161 | num_common_friends_thr = dict() 162 | for out in num_common_friends: 163 | for sout in num_common_friends[out]: 164 | thr = .20*len(friend[out]) 165 | thr_s = .20*len(friend[sout]) 166 | common = num_common_friends[out][sout] 167 | if common>=thr: 168 | if out not in num_common_friends_thr: 169 | num_common_friends_thr[out]=0 170 | num_common_friends_thr[out]+=1 171 | if common>=thr_s: 172 | if sout not in num_common_friends_thr: 173 | num_common_friends_thr[sout]=0 174 | num_common_friends_thr[sout]+=1 175 | 176 | pickle.dump( num_common_friends_thr, open( "num_common_friends_thr.pickle", "wb" ) ) 177 | """ 178 | """ 179 | #tweets with same hashtags 180 | num_tweets_with_same_tags = dict() 181 | for node in selected_users: 182 | num_tweets_with_same_tags[node] = sum([num_tweets_per_tag[x] for x in tags_for_user[node]]) 183 | 184 | with open("featuresUserSubset.csv","w") as fd: 185 | for i in selected_users: 186 | fd.write(str(i)+","+str(num_common_friends_thr[i])+","+str(num_rec[i])+","+str(num_tweets_with_same_tags[i])+"\n") 187 | """ -------------------------------------------------------------------------------- /adopter_prediction/adopter_prediction.py: -------------------------------------------------------------------------------- 1 | #get nearest users of the source of a hashtag sequence in test sequences using user vectors and compare with actual adopters in the sequence 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt 6 | import random 7 | 8 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_bfsr.txt" 9 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 10 | with open("sequence_file_split_indices.pickle","rb") as fr: 11 | _ = pickle.load(fr) 12 | test_seq_id = pickle.load(fr) 13 | test_seq_id = set(test_seq_id) 14 | 15 | with open("sequence_file_split_users.pickle","rb") as fr: 16 | users_train = pickle.load(fr) 17 | users_test = pickle.load(fr) 18 | users_test = set(users_test) 19 | 20 | def read_vector_file(path_vectors_file,users_test): 21 | vocab = [] 22 | vectors = [] 23 | with open(path_vectors_file,"rb") as fr: 24 | _,dim = next(fr).rstrip().split(' ') 25 | 
word_vector_dim = int(dim) 26 | next(fr) 27 | for line in fr: 28 | line = line.rstrip() 29 | u = line.split(' ') 30 | if len(u) != word_vector_dim+1: 31 | print "vector length error" 32 | word = int(u[0]) 33 | if word in users_test: 34 | vec = [] 35 | length = 0.0 36 | for d in u[1:]: 37 | num=float(d) 38 | vec.append(num) 39 | length+=num**2 40 | #vec = map(float,u[1:]) 41 | #length = sum(x**2 for x in vec) 42 | length = sqrt(length) 43 | vec_norm = [x/length for x in vec] 44 | vocab.append(word) 45 | vectors.append(vec_norm) 46 | return vectors, vocab, word_vector_dim 47 | 48 | vec,vocab,dim = read_vector_file(vec_file,users_test) 49 | vocab_index=dict() 50 | for i in xrange(0,len(vocab)): 51 | vocab_index[vocab[i]]=i 52 | num_users_test = len(vocab) 53 | # print "num users in test sequences", num_users_test 54 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 55 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 56 | 57 | #Peter Norvig's code for memo 58 | # def memo(f): 59 | # "Memoize function f." 60 | # table = {} 61 | # def fmemo(*args): 62 | # if args not in table: 63 | # table[args] = f(*args) 64 | # return table[args] 65 | # fmemo.memo = table 66 | # return fmemo 67 | # dist_memo = dict() 68 | 69 | # @memo 70 | def get_Nranked_list(query,N): 71 | wordN = [0]*N 72 | distN = [0.0]*N 73 | try: 74 | voc_ind = vocab_index[query] 75 | except KeyError: 76 | print "query word not present" 77 | return 78 | query_vec = vec[voc_ind] 79 | for i in xrange(0,len(vec)): 80 | if i==voc_ind: 81 | continue 82 | pres_word = vocab[i] 83 | pres_vec = vec[i] 84 | dist = 0.0 85 | for x in xrange(0,dim): 86 | dist+=query_vec[x]*pres_vec[x] 87 | #dist = sum(query_vec[x]*pres_vec[x] for x in range(0,dim)) 88 | for j in xrange(0,N): 89 | if dist>distN[j]: 90 | for k in xrange(N-1,j,-1): 91 | distN[k] = distN[k-1] 92 | wordN[k] = wordN[k-1] 93 | distN[j] = dist 94 | wordN[j] = pres_word 95 | break 96 | return wordN #zip(wordN,distN) 97 | 98 | not_found_vocab=[] 99 | # source_thr = 1395858601 + 7*24*60*60 100 | tag_seq = [] 101 | count=0 102 | # nb_seq = dict() 103 | with open(adoption_sequence_filename, "rb") as fr: 104 | for line in fr: 105 | line = line.rstrip() 106 | u = line.split(' ') 107 | not_found=0 108 | # first_timestamp = int(u[1][0:u[1].index(',')]) 109 | # if first_timestamp>=source_thr 110 | if count in test_seq_id: 111 | seq=[] 112 | for i in xrange(1, len(u)): 113 | #timestamp = int(u[i][0:u[i].index(',')]) 114 | author = int(u[i][u[i].index(',')+1 : ]) 115 | if author in vocab_index: 116 | seq.append(author) 117 | else: 118 | not_found+=1 119 | if len(seq)>1: 120 | tag_seq.append(seq) 121 | not_found_vocab.append(not_found) 122 | # else: 123 | # adop=[] 124 | # for i in xrange(1, len(u)): 125 | # author = int(u[i][u[i].index(',')+1 : ]) 126 | # if author in vocab_index: 127 | # adop.append(author) 128 | # for author in set(adop): 129 | # try: 130 | # nb_seq[author]+=1 131 | # except KeyError: 132 | # nb_seq[author]=1 133 | count+=1 134 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 135 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 136 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 137 | # pickle.dump(nb_seq_order,open("adopter_pred_files/baseline_user_order_bfsr.pickle","wb")) 138 | nb_seq_order = pickle.load(open("adopter_pred_files/baseline_user_order_bfsr.pickle","rb")) 139 | print len(nb_seq_order) 140 | print len(tag_seq),len(test_seq_id),count 141 | print 
sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 142 | seq_count_limit=100 143 | num_seqs=0 144 | mean_ap=0 145 | mean_prec_r=0 146 | mean_ap_nbapp=0 147 | mean_prec_r_nbapp=0 148 | # N=100 149 | seq_random_index=range(0,len(tag_seq)) 150 | random.shuffle(seq_random_index) 151 | for i in seq_random_index: 152 | seq_sample_vocab = tag_seq[i] 153 | # source_user=seq_sample[0] 154 | # if source_user not in vocab_index: 155 | # continue 156 | # seq_sample_vocab = [x for x in seq_sample if x in vocab_index] 157 | # if len(seq_sample_vocab)<2:#2 158 | # continue 159 | source_user=seq_sample_vocab[0] 160 | seq_sample_vocab = set(seq_sample_vocab[1:]) 161 | M = len(seq_sample_vocab) 162 | N = M #1000 #num_users_test 163 | # if M<1000: 164 | # continue 165 | not_found=not_found_vocab[i] 166 | #source_vec=vec[vocab_index[source_user]] 167 | 168 | adopters_vec = get_Nranked_list(source_user,N) 169 | precision_k = 0.0 170 | num_hits = 0.0 171 | for k,p in enumerate(adopters_vec): 172 | if p in seq_sample_vocab: 173 | num_hits+=1.0 174 | precision_k += num_hits/(k+1.0) 175 | average_precision = precision_k/min(M,N) 176 | prec_r = num_hits/M 177 | print "Avg precision", average_precision, "num of users not found", not_found, "num of adopters in seq", len(seq_sample_vocab) 178 | print "RPrecision", prec_r 179 | # print "Precision", num_hits/N, "Recall", num_hits/M 180 | mean_ap+=average_precision 181 | mean_prec_r+=prec_r 182 | num_seqs+=1 183 | print "MAP", mean_ap/float(num_seqs), "MRP", mean_prec_r/float(num_seqs) 184 | 185 | nb_seq_order = nb_seq_order[:N] 186 | precision_k_nbapp = 0.0 187 | num_hits_nbapp = 0.0 188 | for k,p in enumerate(nb_seq_order): 189 | if p in seq_sample_vocab: 190 | num_hits_nbapp+=1.0 191 | precision_k_nbapp += num_hits_nbapp/(k+1.0) 192 | average_precision_nbapp = precision_k_nbapp/min(M,N) 193 | prec_r_nbapp = num_hits_nbapp/M 194 | print "Nb_App", "Avg precision", average_precision_nbapp 195 | print "Nb_App", "RPrecision", prec_r_nbapp 196 | # print "Precision", num_hits_nbapp/N, "Recall", num_hits_nbapp/M 197 | mean_ap_nbapp+=average_precision_nbapp 198 | mean_prec_r_nbapp+=prec_r_nbapp 199 | print "Nb_App", "MAP", mean_ap_nbapp/float(num_seqs), "MRP", mean_prec_r_nbapp/float(num_seqs) 200 | 201 | seq_count_limit-=1 202 | if seq_count_limit==0: 203 | break 204 | print num_seqs 205 | print "MAP", "user vectors", mean_ap/float(num_seqs), "Nb_App", mean_ap_nbapp/float(num_seqs) 206 | print "MRP", mean_prec_r/float(num_seqs), mean_prec_r_nbapp/float(num_seqs) 207 | #pickle.dump(source_time,open("source_time.pickle","wb")) 208 | -------------------------------------------------------------------------------- /adopter_prediction/adopter_prediction_parallel.py: -------------------------------------------------------------------------------- 1 | #get nearest users of the source of a hashtag sequence in test sequences using user vectors and compare with actual adopters in the sequence 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt 6 | import random 7 | from multiprocessing import Pool, cpu_count 8 | 9 | NUM_PROCESSES = 5 10 | 11 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 12 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 13 | with open("sequence_file_split_indices.pickle","rb") as fr: 14 | _ = pickle.load(fr) 15 | test_seq_id = pickle.load(fr) 16 | test_seq_id = set(test_seq_id) 17 | 18 | with 
open("sequence_file_split_users.pickle","rb") as fr: 19 | users_train = pickle.load(fr) 20 | users_test = pickle.load(fr) 21 | users_test = set(users_test) 22 | 23 | def read_vector_file(path_vectors_file,users_test): 24 | vocab = [] 25 | vectors = [] 26 | with open(path_vectors_file,"rb") as fr: 27 | _,dim = next(fr).rstrip().split(' ') 28 | word_vector_dim = int(dim) 29 | next(fr) 30 | for line in fr: 31 | line = line.rstrip() 32 | u = line.split(' ') 33 | if len(u) != word_vector_dim+1: 34 | print "vector length error" 35 | word = int(u[0]) 36 | if word in users_test: 37 | vec = [] 38 | length = 0.0 39 | for d in u[1:]: 40 | num=float(d) 41 | vec.append(num) 42 | length+=num**2 43 | #vec = map(float,u[1:]) 44 | #length = sum(x**2 for x in vec) 45 | length = sqrt(length) 46 | vec_norm = [x/length for x in vec] 47 | vocab.append(word) 48 | vectors.append(vec_norm) 49 | return vectors, vocab, word_vector_dim 50 | 51 | vec,vocab,dim = read_vector_file(vec_file,users_test) 52 | vocab_index=dict() 53 | for i in xrange(0,len(vocab)): 54 | vocab_index[vocab[i]]=i 55 | num_users_test = len(vocab) 56 | # print "num users in test sequences", num_users_test 57 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 58 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 59 | 60 | #Peter Norvig's code for memo 61 | # def memo(f): 62 | # "Memoize function f." 63 | # table = {} 64 | # def fmemo(*args): 65 | # if args not in table: 66 | # table[args] = f(*args) 67 | # return table[args] 68 | # fmemo.memo = table 69 | # return fmemo 70 | # dist_memo = dict() 71 | 72 | # @memo 73 | def get_Nranked_list(query,N): 74 | wordN = [0]*N 75 | distN = [0.0]*N 76 | try: 77 | voc_ind = vocab_index[query] 78 | except KeyError: 79 | print "query word not present" 80 | return 81 | query_vec = vec[voc_ind] 82 | for i in range(0,len(vec)): 83 | if i==voc_ind: 84 | continue 85 | pres_word = vocab[i] 86 | pres_vec = vec[i] 87 | dist = 0.0 88 | for x in range(0,dim): 89 | dist+=query_vec[x]*pres_vec[x] 90 | #dist = sum(query_vec[x]*pres_vec[x] for x in range(0,dim)) 91 | for j in range(0,N): 92 | if dist>distN[j]: 93 | for k in range(N-1,j,-1): 94 | distN[k] = distN[k-1] 95 | wordN[k] = wordN[k-1] 96 | distN[j] = dist 97 | wordN[j] = pres_word 98 | break 99 | return wordN #zip(wordN,distN) 100 | 101 | not_found_vocab=[] 102 | # source_thr = 1395858601 + 7*24*60*60 103 | tag_seq = [] 104 | count=0 105 | # nb_seq = dict() 106 | with open(adoption_sequence_filename, "rb") as fr: 107 | for line in fr: 108 | line = line.rstrip() 109 | u = line.split(' ') 110 | not_found=0 111 | # first_timestamp = int(u[1][0:u[1].index(',')]) 112 | # if first_timestamp>=source_thr 113 | if count in test_seq_id: 114 | seq=[] 115 | for i in xrange(1, len(u)): 116 | #timestamp = int(u[i][0:u[i].index(',')]) 117 | author = int(u[i][u[i].index(',')+1 : ]) 118 | if author in vocab_index: 119 | seq.append(author) 120 | else: 121 | not_found+=1 122 | if len(seq)>1: 123 | tag_seq.append(seq) 124 | not_found_vocab.append(not_found) 125 | # else: 126 | # adop=[] 127 | # for i in xrange(1, len(u)): 128 | # author = int(u[i][u[i].index(',')+1 : ]) 129 | # if author in vocab_index: 130 | # adop.append(author) 131 | # for author in set(adop): 132 | # try: 133 | # nb_seq[author]+=1 134 | # except KeyError: 135 | # nb_seq[author]=1 136 | count+=1 137 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 138 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 139 | # nb_seq_order = [a for a,_ in 
nb_seq_part_sorted] 140 | # pickle.dump(nb_seq_order,open("adopter_pred_files/baseline_user_order_bfsr.pickle","wb")) 141 | nb_seq_order = pickle.load(open("adopter_pred_files/baseline_user_order_pr.pickle","rb")) 142 | print len(nb_seq_order) 143 | # print len(tag_seq),len(test_seq_id),count 144 | # print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 145 | 146 | seq_random_index=range(0,len(tag_seq)) 147 | random.shuffle(seq_random_index) 148 | 149 | def adopter_prediction(process_num,start,end): 150 | seq_count_limit=100 151 | num_seqs=0 152 | mean_ap=0 153 | # mean_prec_r=0 154 | mean_ap_nbapp=0 155 | # mean_prec_r_nbapp=0 156 | # N=100 157 | for i in seq_random_index[start:end]: 158 | seq_sample_vocab = tag_seq[i] 159 | # source_user=seq_sample[0] 160 | # if source_user not in vocab_index: 161 | # continue 162 | # seq_sample_vocab = [x for x in seq_sample if x in vocab_index] 163 | # if len(seq_sample_vocab)<2:#2 164 | # continue 165 | source_user=seq_sample_vocab[0] 166 | seq_sample_vocab = set(seq_sample_vocab[1:]) 167 | M = len(seq_sample_vocab) 168 | N = num_users_test #M #1000 169 | # if M<1000: 170 | # continue 171 | not_found=not_found_vocab[i] 172 | #source_vec=vec[vocab_index[source_user]] 173 | 174 | adopters_vec = get_Nranked_list(source_user,N) 175 | precision_k = 0.0 176 | num_hits = 0.0 177 | for k,p in enumerate(adopters_vec): 178 | if p in seq_sample_vocab: 179 | num_hits+=1.0 180 | precision_k += num_hits/(k+1.0) 181 | average_precision = precision_k/min(M,N) 182 | # prec_r = num_hits/M 183 | print "Avg precision", average_precision, "num of users not found", not_found, "num of adopters in seq", len(seq_sample_vocab), "Process", process_num 184 | # print "Precision", num_hits/N, "Recall", num_hits/M 185 | mean_ap+=average_precision 186 | # mean_prec_r+=prec_r 187 | num_seqs+=1 188 | print "MAP", mean_ap/float(num_seqs), "Process", process_num#, "MRP", mean_prec_r/float(num_seqs) 189 | 190 | nb_seq_order = nb_seq_order[:N] 191 | precision_k_nbapp = 0.0 192 | num_hits_nbapp = 0.0 193 | for k,p in enumerate(nb_seq_order): 194 | if p in seq_sample_vocab: 195 | num_hits_nbapp+=1.0 196 | precision_k_nbapp += num_hits_nbapp/(k+1.0) 197 | average_precision_nbapp = precision_k_nbapp/min(M,N) 198 | # prec_r_nbapp = num_hits_nbapp/M 199 | print "Nb_App", "Avg precision", average_precision_nbapp, "Process", process_num 200 | # print "Precision", num_hits_nbapp/N, "Recall", num_hits_nbapp/M 201 | mean_ap_nbapp+=average_precision_nbapp 202 | # mean_prec_r_nbapp+=prec_r_nbapp 203 | print "Nb_App", "MAP", mean_ap_nbapp/float(num_seqs), "Process", process_num#, "MRP", mean_prec_r_nbapp/float(num_seqs) 204 | 205 | seq_count_limit-=1 206 | if seq_count_limit==0: 207 | break 208 | print num_seqs, mean_ap, mean_ap_nbapp, "Process", process_num 209 | print "user vectors", mean_ap/float(num_seqs), "Process", process_num 210 | print "Nb_App", mean_ap_nbapp/float(num_seqs), "Process", process_num 211 | # print mean_prec_r/float(num_seqs) 212 | #pickle.dump(source_time,open("source_time.pickle","wb")) 213 | 214 | num_workers = min(NUM_PROCESSES,cpu_count()) 215 | pool = Pool(processes=num_workers) 216 | process_num=0 217 | NUM_SEQ = len(seq_random_index) 218 | lines_per_process = int(NUM_SEQ/(2.0*num_workers)) 219 | for s,e in ( (i,min(i+lines_per_process,NUM_SEQ)) for i in xrange(0,NUM_SEQ,lines_per_process) ): 220 | pool.apply_async(adopter_prediction, args=(process_num,s,e)) 221 | process_num+=1 222 | pool.close() 223 | pool.join() 
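Both adopter-prediction scripts above score a ranked candidate list against the true adopter set with average precision (normalised by min(M, N)) and R-precision. A minimal self-contained sketch of those two metrics, with illustrative function names (not part of the original code):

def average_precision(ranked, relevant):
    # AP over the ranked list: precision at each hit, normalised by
    # min(number of true adopters, length of the ranked list).
    hits = 0.0
    precision_sum = 0.0
    for k, user in enumerate(ranked):
        if user in relevant:
            hits += 1.0
            precision_sum += hits / (k + 1.0)
    return precision_sum / min(len(relevant), len(ranked))

def r_precision(ranked, relevant):
    # Fraction of true adopters recovered by the ranked list.
    hits = sum(1.0 for user in ranked if user in relevant)
    return hits / len(relevant)

# Example: with relevant = {9, 2, 4} and ranked = [5, 9, 2, 7],
# average_precision -> (1/2 + 2/3) / 3 ~ 0.389 and r_precision -> 2/3.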
-------------------------------------------------------------------------------- /neighbourhood_experiments/entropy_vs_spread.py: -------------------------------------------------------------------------------- 1 | #get entropy of distribution of first 1000 adopters in different clusters for topics with atleast 1000 adopters and compare with eventual spread 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt, log 6 | import random 7 | from heapq import nsmallest, nlargest, merge 8 | import numpy 9 | # from scipy.spatial import cKDTree as KDTree 10 | from sklearn.neighbors import NearestNeighbors 11 | import sys 12 | from multiprocessing import Pool, cpu_count 13 | from collections import defaultdict 14 | import traceback 15 | 16 | NUM_PROCESSES = 1 17 | 18 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 19 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 20 | seq_len_threshold = 500 #top_k 21 | train_ex_limit = 100 22 | norm_vec = True 23 | 24 | print vec_file, seq_len_threshold, norm_vec 25 | 26 | with open("/mnt/filer01/word2vec/degree_distribution/sequence_file_split_indices.pickle","rb") as fr: 27 | _ = pickle.load(fr) 28 | test_seq_id = pickle.load(fr) 29 | test_seq_id = set(test_seq_id) 30 | 31 | def read_vector_file(path_vectors_file): 32 | vocab = [] 33 | vectors = [] 34 | with open(path_vectors_file,"rb") as fr: 35 | _,dim = next(fr).rstrip().split(' ') 36 | word_vector_dim = int(dim) 37 | next(fr) 38 | for line in fr: 39 | line = line.rstrip() 40 | u = line.split(' ') 41 | if len(u) != word_vector_dim+1: 42 | print "vector length error" 43 | word = int(u[0]) 44 | #normalise to length 1 45 | if norm_vec: 46 | vec = [] 47 | length = 0.0 48 | for d in u[1:]: 49 | num=float(d) 50 | vec.append(num) 51 | length+=num**2 52 | #vec = map(float,u[1:]) 53 | #length = sum(x**2 for x in vec) 54 | length = sqrt(length) 55 | vec_norm = [x/length for x in vec] 56 | vectors.append(vec_norm) 57 | else: 58 | vec = map(float,u[1:]) 59 | vectors.append(vec) 60 | vocab.append(word) 61 | return vectors, vocab, word_vector_dim 62 | 63 | vec,vocab,dim = read_vector_file(vec_file) 64 | vocab_index=dict() 65 | for i in xrange(0,len(vocab)): 66 | vocab_index[vocab[i]]=i 67 | num_users = len(vocab) 68 | print "num users in train sequences", num_users 69 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 70 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 71 | 72 | def print_stats(u): 73 | e,t,r = zip(*u) 74 | return [numpy.mean(e), numpy.std(e), numpy.median(e)], [numpy.mean(t), numpy.std(t), numpy.median(t)], [numpy.mean(r), numpy.std(r), numpy.median(r)] 75 | 76 | # reading test sequences 77 | not_found_vocab=[] 78 | # source_thr = 1395858601 + 12*60*60 79 | # non_emergent_tags = pickle.load(open("/mnt/filer01/word2vec/degree_distribution/nonEmergentHashtags.pickle","rb")) 80 | tag_seq = [] 81 | count=0 82 | # nb_seq = dict() 83 | # adlen = [] 84 | with open(adoption_sequence_filename, "rb") as fr: 85 | for line in fr: 86 | line = line.rstrip() 87 | u = line.split(' ') 88 | not_found = set() 89 | adopters = set() 90 | # first_timestamp = int(u[1][0:u[1].index(',')]) 91 | # first tweet only after source_thr timestamp 92 | # if first_timestamp>=source_thr 93 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 94 | # u[0] not in non_emergent_tags and 95 | if count in test_seq_id: 96 | seq=[] 97 | for i in xrange(1, len(u)): 
98 | #timestamp = int(u[i][0:u[i].index(',')]) 99 | author = int(u[i][u[i].index(',')+1 : ]) 100 | if author in vocab_index: 101 | # removing repeat adopters 102 | if author not in adopters: 103 | seq.append(author) 104 | adopters.add(author) 105 | else: 106 | not_found.add(author) 107 | if len(seq)>0: 108 | tag_seq.append(seq) 109 | not_found_vocab.append(len(not_found)) 110 | # adlen.append(len(seq)) 111 | # elif count not in test_seq_id: 112 | # adop=[] 113 | # for i in xrange(1, len(u)): 114 | # author = int(u[i][u[i].index(',')+1 : ]) 115 | # if author in vocab_index: 116 | # adop.append(author) 117 | # for author in set(adop): 118 | # try: 119 | # nb_seq[author]+=1 120 | # except KeyError: 121 | # nb_seq[author]=1 122 | count+=1 123 | #nb, number of training sequences participated in 124 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 125 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 126 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 127 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 128 | # pickle.dump(adlen,open("adlen.pickle","wb")) 129 | 130 | print len(tag_seq),len(test_seq_id),count 131 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 132 | 133 | """ 134 | #test sequences in random order 135 | seq_random_index=range(0,len(tag_seq)) 136 | random.shuffle(seq_random_index) 137 | 138 | seq_index_filter = [] 139 | for i in seq_random_index: 140 | seq_sample_vocab = tag_seq[i] 141 | M = len(seq_sample_vocab) 142 | if M 0: 167 | ent+= -1.0*p*log(p,2) 168 | return ent 169 | 170 | def init_adopt_stat(process_num,start,end): 171 | print process_num, start, end 172 | try: 173 | cand_set_recall_spread = [] 174 | l=0 175 | avg_num_adopters = 0 176 | count=0 177 | for i in seq_index_filter: 178 | if count < start: 179 | count+=1 180 | continue 181 | elif count >= end: 182 | break 183 | count+=1 184 | seq_sample_vocab = tag_seq[i] 185 | total_spread = len(seq_sample_vocab) 186 | avg_num_adopters+=total_spread 187 | 188 | init_adopters=seq_sample_vocab[0:seq_len_threshold] 189 | ent = get_entropy(init_adopters) 190 | 191 | #entropy of random sample of users 192 | random_adopters = random.sample(vocab,seq_len_threshold) 193 | ent_M1 = get_entropy(random_adopters) 194 | 195 | ent_rel = 0.0 196 | if ent_M1>0: 197 | ent_rel = ent*1./ent_M1 198 | 199 | cand_set_recall_spread.append((ent,total_spread,ent_rel)) 200 | 201 | l+=1 202 | if l%25==0: 203 | print "entropy", ent, "random", ent_M1, "total spread", total_spread, "rel", ent_rel, l 204 | # if l==train_ex_limit: 205 | # break 206 | print process_num, start, "num examples", l, "ent", print_stats(cand_set_recall_spread), avg_num_adopters*1./l 207 | with open("/mnt/filer01/word2vec/degree_distribution/candset_stat_files/entropy_vs_spread_seq"+str(seq_len_threshold)+"_ex"+str(start)+".pickle","wb") as fd: 208 | pickle.dump(cand_set_recall_spread,fd) 209 | except Exception as e: 210 | print traceback.format_exc() 211 | 212 | tic = time.clock() 213 | init_adopt_stat(0,0,1000) 214 | toc = time.clock() 215 | print "init adopt eval in", (toc-tic)*1000 216 | 217 | # NUM_LINES = len(seq_index_filter) 218 | 219 | # num_workers = min(NUM_PROCESSES,cpu_count()) 220 | # pool = Pool(processes=num_workers) 221 | # process_num=0 222 | # lines_per_process = int(NUM_LINES/(2*num_workers)) 223 | # for s,e in ( (i,min(i+lines_per_process,NUM_LINES)) for i in xrange(0,NUM_LINES,lines_per_process) ): 224 | # pool.apply_async(init_adopt_stat, args=(process_num,s,e)) 225 
| # process_num+=1 226 | # pool.close() 227 | # pool.join() 228 | 229 | print vec_file -------------------------------------------------------------------------------- /neighbourhood_experiments/candidate_set_coverage/cand_recall_plot.py: -------------------------------------------------------------------------------- 1 | #plot candidate set recall for different n,c averaged over 100 or 50 tags 2 | 3 | from collections import defaultdict 4 | import cPickle as pickle 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import sys 8 | 9 | num_init_adopters = [10,100] #range(10,101,10) 10 | top_k = np.arange(0.6,1.05,0.1) #np.arange(0.5,1.01,0.1) #[500,1000,2000]+range(3000,10001,1000) 11 | seq_len_threshold = 500 12 | 13 | def get_stats(u): 14 | return [np.mean(u), np.std(u)] #, np.median(u) 15 | 16 | def cc_plot(cc): 17 | cc_mean, cc_std = zip(*cc) 18 | plt.errorbar(num_init_adopters, cc_mean, yerr=cc_std, fmt='o') 19 | plt.xlabel('Number of initial adopters, n') 20 | plt.ylabel('Proportion of adopters in candidate set') 21 | plt.title('Candidate set coverage with varying value of n\nc=1000, Avg. over 100 topics') 22 | plt.grid() 23 | plt.show() 24 | 25 | def cc_boxplot(data,n,xlab,avg): 26 | plt.boxplot(data, labels=xlab, whis='range', showmeans=True, meanprops=dict(marker='D', markerfacecolor='red')) 27 | plt.xlabel('Number of nearest neighbours queried, c\n(with average neighbour set size)') 28 | # plt.ylabel('Neighbourhood coverage') 29 | # plt.title('Candidate set coverage with varying value of c\nn='+str(n)+', Average over 50 topics') 30 | 31 | # plt.xlabel('Radius for querying neighbour set, r\n(with average neighbour set size)') 32 | # plt.ylabel('Neighbourhood coverage') 33 | # plt.title('Candidate set coverage with varying value of r\nn='+str(n)+', Average over 50 topics') 34 | 35 | # plt.ylabel('Fraction of neighbouring users from same geography') 36 | # plt.title('Geography precision with radius based search\nAverage over '+str(n)+' users') 37 | 38 | plt.ylabel('Fraction of friends in neighbour set') 39 | plt.title('Following Coverage with nearest neighbour search\nAverage number of friends '+str(round(avg,2))+', Average over '+str(n)+' users') 40 | 41 | plt.tight_layout() 42 | plt.ylim(0,1.0) 43 | plt.grid() 44 | plt.show() 45 | 46 | def cc_scatterplot(cc,spread): 47 | plt.scatter(cc, spread) 48 | m, b = np.polyfit(cc,spread, 1) 49 | plt.plot(np.asarray(cc), m*np.asarray(cc) + b, 'r-', label = 'Linear fit') 50 | plt.ylim(-0.1,1.0) 51 | plt.xlim(-0.1,1.0) 52 | # plt.yscale('log') 53 | # plt.xscale('log') 54 | # plt.ylim(500,plt.ylim()[1]) 55 | # plt.xlabel('Proportion of first 1000 adopters present in candidate set') 56 | # plt.ylabel('Total spread') 57 | # plt.title('Candidate set coverage and eventual spread of topics\nCorr. coeff. = '+str(round(np.corrcoef(cc,spread)[0,1],4))) 58 | # plt.xlabel('Proportion of adopters in candidate set') 59 | # plt.ylabel('Precision@10') 60 | # plt.title('Candidate set coverage and Precision@10 of topics\nCorr. coeff. = '+str(round(np.corrcoef(cc,spread)[0,1],4))) 61 | plt.xlabel('Network neighbours (followers)') 62 | plt.ylabel('Vector space neighbours') 63 | # plt.title('Likelihood of co-adoption of users with different neighbourhoods\nCorr. coeff. 
= '+str(round(np.corrcoef(cc,spread)[0,1],4))) 64 | plt.grid() 65 | plt.legend(loc='upper left') 66 | plt.show() 67 | 68 | num_bin = 10 69 | def ent_histogram(val): 70 | x,bins,_=plt.hist(val, num_bin, rwidth=0.8, align='left') 71 | # plt.bar(range(1,len(val)+1), val) 72 | # plt.bar(range(1,len(rec)+1), rec) 73 | plt.xlim(-0.1,1.0) 74 | plt.xlabel('Precision@10') 75 | plt.ylabel('Frequency') 76 | # plt.title('Entropy of distribution of geo-locations in clusters') 77 | plt.tight_layout() 78 | plt.grid() 79 | plt.show() 80 | 81 | def freq_plot(y): 82 | time_bins = [(0,5),(6,8),(9,11),(12,14),(15,17),(18,20),(21,23)] 83 | x = [str(i)+'-'+str(j) for i,j in time_bins] 84 | plt.bar(range(len(y)), y, align='center') 85 | plt.xticks(range(len(y)), x, size='small') 86 | plt.xlabel('Time of day (in hour)') 87 | # plt.xlim(xmin=0) 88 | plt.ylabel('Proportion of tweets') 89 | plt.title('Frequency distribution of tweeting time') 90 | plt.grid() 91 | plt.show() 92 | #coverage box plots 93 | """ 94 | for n in num_init_adopters: 95 | # cc = [] 96 | cand_size = [] 97 | data = [] 98 | for i in top_k: 99 | # with open("candset_stat_files/candset_n"+str(n)+"_c"+str(i)+".pickle","rb") as fr: 100 | with open("candset_stat_files/candset_n"+str(n)+"_r"+str(i)+".pickle","rb") as fr: 101 | cand_set_recall = pickle.load(fr) 102 | cand_set_overlap = pickle.load(fr) 103 | cand_set_cr = pickle.load(fr) 104 | cand_set_size_list = pickle.load(fr) 105 | print n,i,len(cand_set_recall) 106 | # if i==1000: 107 | if n==10: 108 | cand_set_recall = cand_set_recall[:50] 109 | # cc.append(get_stats(cand_set_recall)) 110 | data.append(cand_set_recall) 111 | cand_size.append(np.mean(cand_set_size_list)) 112 | xlab = [str(x)+'\n('+str(y)+')' for x,y in zip(top_k,cand_size)] 113 | cc_boxplot(data,n,xlab) 114 | 115 | # cc_plot(cc) 116 | # cc_boxplot(data) 117 | """ 118 | #entropy histogram plots 119 | """ 120 | with open("user_vector_cluster_entropy.pickle","rb") as fr: 121 | _ = pickle.load(fr) 122 | c_ent = pickle.load(fr) 123 | _ = pickle.load(fr) 124 | l_ent = pickle.load(fr) 125 | ent_histogram(c_ent) 126 | ent_histogram(l_ent) 127 | """ 128 | 129 | #coverage vs spread scatter plots 130 | """ 131 | # with open("candset_stat_files/test_sequence_indices_thr1000.pickle","rb") as fr: 132 | # seq_index_filter = pickle.load(fr) 133 | NUM_LINES = 2000#len(seq_index_filter) 134 | 135 | num_workers = 9 136 | lines_per_process = int(NUM_LINES/(2*num_workers)) 137 | cand_set_recall_spread = [] 138 | for s,e in ( (i,min(i+lines_per_process,NUM_LINES)) for i in xrange(0,NUM_LINES,lines_per_process) ): 139 | print s,e 140 | with open("candset_stat_files/candset_vs_spread_n"+str(num_init_adopters)+"_c"+str(top_k)+"_seq"+str(seq_len_threshold)+"_ex"+str(s)+".pickle","rb") as fr: 141 | cc_subset = pickle.load(fr) 142 | cand_set_recall_spread += cc_subset 143 | # cand_set_recall_spread = sorted(cand_set_recall_spread, key=lambda x: x[0]) 144 | cc,spread = zip(*cand_set_recall_spread) 145 | print sum(cc)*1./len(cc), sum(spread)*1./len(spread), min(cc), max(cc), min(spread), max(spread) 146 | cc_scatterplot(cc,spread) 147 | """ 148 | 149 | #sliding window median plots 150 | """ 151 | window_length = 50 152 | median_spread_mw = [] 153 | median_cc_mw = [] 154 | K=2000 155 | for i in range(0,len(cand_set_recall_spread)): 156 | m = np.median(spread[i:i+window_length]) 157 | # m = 0 158 | # for s in spread[i:i+window_length]: 159 | # if s>=K: 160 | # m+=1 161 | # m = m*1./window_length 162 | mw = cc[i:i+window_length] 163 | c = mw[len(mw)//2] 164 | 
median_spread_mw.append(m) 165 | median_cc_mw.append(c) 166 | cc_scatterplot(median_cc_mw,median_spread_mw) 167 | """ 168 | 169 | #prec@10 vs coverage scatter plots 170 | 171 | with open("candset_stat_files/nbr_frac0.5_seq100.pickle","rb") as fr: 172 | cand_set_recall = pickle.load(fr) 173 | cand_set_overlap = pickle.load(fr) 174 | cand_set_size_list = pickle.load(fr) 175 | print np.mean(cand_set_recall,axis=0), len(cand_set_recall) 176 | print cand_set_recall[0:10], cand_set_overlap[0:10], cand_set_size_list[0:10] 177 | fol,vec = zip(*cand_set_recall) 178 | # nbh,_ = zip(*cand_set_size_list) 179 | # p_fol = [x*1./y for (x,y) in zip(fol,nbh)] 180 | # p_vec = [x*1./y for (x,y) in zip(vec,nbh)] 181 | # print np.mean(p_fol), np.mean(p_vec) 182 | cc_scatterplot(fol,vec) 183 | # ent_histogram(u) 184 | 185 | """ 186 | #entropy vs spread scatter plots 187 | with open("candset_stat_files/entropy_vs_spread_seq500_ex0.pickle","rb") as fr: 188 | ent_spread = pickle.load(fr) 189 | ent_spread = sorted(ent_spread, key=lambda x: x[1]) 190 | e,s,er = zip(*ent_spread) 191 | cc_scatterplot(er,s) 192 | """ 193 | """ 194 | #activity time histogram plots 195 | with open("candset_stat_files/sample_user_activity_time_uneven_bins.pickle","rb") as fr: 196 | sample_activity = pickle.load(fr) 197 | # total_tweets = pickle.load(fr) 198 | # total_activity_bins = [i*1./total_tweets for i in total_activity_bins] 199 | # print total_tweets, total_activity_bins 200 | print len(sample_activity) 201 | for i in sample_activity: 202 | c = sample_activity[i] 203 | freq = [0]*7 204 | for b in c: 205 | freq[b]+=c[b] 206 | freq_plot(freq) 207 | """ 208 | 209 | #geography, follower, following coverage box plots 210 | """ 211 | cand_size = [] 212 | avg=[] 213 | data = [] 214 | top_k = [1000,2000,5000,10000] #np.arange(0.6,1.05,0.1).tolist()+[1.2] #np.arange(0.6,1.15,0.1) 215 | n=10000 216 | for i in top_k: 217 | # with open("candset_stat_files/candset_loc_c"+str(i)+".pickle","rb") as fr: 218 | with open("candset_stat_files/candset_fol_fr_c"+str(i)+".pickle","rb") as fr: 219 | cand_set_recall = pickle.load(fr) 220 | cand_set_overlap = pickle.load(fr) 221 | num_nbr = pickle.load(fr) 222 | cand_set_size_list = pickle.load(fr) 223 | print i,len(cand_set_recall) 224 | # cc_geo,cc_prec_geo = zip(*cand_set_recall) 225 | cc_fol,cc_fr = zip(*cand_set_recall) 226 | num_fol,num_fr = zip(*num_nbr) 227 | avg_fol = np.mean(num_fol) 228 | avg_fr = np.mean(num_fr) 229 | avg.append(avg_fr) 230 | print avg_fol,avg_fr 231 | # cc.append(get_stats(cand_set_recall)) 232 | data.append(cc_fr) 233 | cand_size.append(np.mean(cand_set_size_list)) 234 | print np.mean(avg) 235 | xlab = [str(x)+'\n('+str(y)+')' for x,y in zip(top_k,cand_size)] 236 | cc_boxplot(data,n,xlab,np.mean(avg)) 237 | """ -------------------------------------------------------------------------------- /neighbourhood_experiments/candidate_set_coverage/cand_cov_vs_spread.py: -------------------------------------------------------------------------------- 1 | #get candidate set coverage in first 1000 adopters for topics with atleast 1000 adopters and compare with eventual spread 2 | 3 | import cPickle as pickle 4 | import time 5 | from math import sqrt 6 | import random 7 | from heapq import nsmallest, nlargest, merge 8 | import numpy 9 | # from scipy.spatial import cKDTree as KDTree 10 | from sklearn.neighbors import NearestNeighbors 11 | import sys 12 | from multiprocessing import Pool, cpu_count 13 | 14 | NUM_PROCESSES = 9 15 | 16 | vec_file = 
"/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 17 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 18 | num_init_adopters = 100 19 | top_k = 4000 20 | seq_len_threshold = 500 #top_k 21 | cand_size_factor = 1 22 | train_ex_limit = 100 23 | norm_vec = True 24 | 25 | print vec_file, num_init_adopters, top_k, seq_len_threshold, norm_vec 26 | 27 | with open("/mnt/filer01/word2vec/degree_distribution/sequence_file_split_indices.pickle","rb") as fr: 28 | _ = pickle.load(fr) 29 | test_seq_id = pickle.load(fr) 30 | test_seq_id = set(test_seq_id) 31 | 32 | def read_vector_file(path_vectors_file): 33 | vocab = [] 34 | vectors = [] 35 | with open(path_vectors_file,"rb") as fr: 36 | _,dim = next(fr).rstrip().split(' ') 37 | word_vector_dim = int(dim) 38 | next(fr) 39 | for line in fr: 40 | line = line.rstrip() 41 | u = line.split(' ') 42 | if len(u) != word_vector_dim+1: 43 | print "vector length error" 44 | word = int(u[0]) 45 | #normalise to length 1 46 | if norm_vec: 47 | vec = [] 48 | length = 0.0 49 | for d in u[1:]: 50 | num=float(d) 51 | vec.append(num) 52 | length+=num**2 53 | #vec = map(float,u[1:]) 54 | #length = sum(x**2 for x in vec) 55 | length = sqrt(length) 56 | vec_norm = [x/length for x in vec] 57 | vectors.append(vec_norm) 58 | else: 59 | vec = map(float,u[1:]) 60 | vectors.append(vec) 61 | vocab.append(word) 62 | return vectors, vocab, word_vector_dim 63 | 64 | vec,vocab,dim = read_vector_file(vec_file) 65 | vocab_index=dict() 66 | for i in xrange(0,len(vocab)): 67 | vocab_index[vocab[i]]=i 68 | num_users = len(vocab) 69 | print "num users in train sequences", num_users 70 | # print "users removed from vocab", len(set(users_train)-set(vocab)) 71 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 72 | 73 | # building kd-tree 74 | tic = time.clock() 75 | # kd = KDTree(vec, leafsize=10) 76 | neigh = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='ball_tree', leaf_size=100, metric='minkowski', p=2) #'ball_tree', 'kd_tree', 'auto' 77 | neigh.fit(vec) 78 | toc = time.clock() 79 | print "ball tree built in", (toc-tic)*1000 80 | 81 | def get_candidate_set(query_set,next_adopters,N): 82 | try: 83 | query_set_ind = [ vocab_index[query] for query in query_set ] 84 | except KeyError: 85 | print "query word not present" 86 | return 87 | query_vec = [vec[i] for i in query_set_ind] 88 | # query using scipy kdtree 89 | # d_list,knn_list = kd.query(query_vec,k=cand_size_factor*N+len(query_set_ind)) 90 | # query using sklearn 91 | d_list,knn_list = neigh.kneighbors(X=query_vec, n_neighbors=cand_size_factor*N+len(query_set_ind), return_distance=True) 92 | 93 | cand_set = set() 94 | for index_list in knn_list: 95 | filtered=[idx for idx in index_list if idx not in query_set_ind] 96 | cand_set.update(filtered) 97 | 98 | cand_set_size = len(cand_set) 99 | M = len(next_adopters) 100 | next_adopters_index = [vocab_index[a] for a in next_adopters] 101 | next_adopters_index = set(next_adopters_index) 102 | num_adopters = len(cand_set&next_adopters_index) 103 | 104 | # print "candidate set recall", num_adopters, "out of", len(next_adopters), "cand size", len(cand_user_idx) 105 | cr = num_adopters*1./cand_set_size 106 | cc = 0.0 107 | if M!=0: 108 | cc = num_adopters*1./M 109 | return num_adopters, cand_set_size, cc, cr, M 110 | 111 | def print_stats(u): 112 | return [numpy.mean(u), numpy.std(u), numpy.median(u)] 113 | 114 | # reading test sequences 115 | not_found_vocab=[] 116 | # source_thr = 
1395858601 + 12*60*60 117 | # non_emergent_tags = pickle.load(open("/mnt/filer01/word2vec/degree_distribution/nonEmergentHashtags.pickle","rb")) 118 | tag_seq = [] 119 | count=0 120 | # nb_seq = dict() 121 | # adlen = [] 122 | with open(adoption_sequence_filename, "rb") as fr: 123 | for line in fr: 124 | line = line.rstrip() 125 | u = line.split(' ') 126 | not_found = set() 127 | adopters = set() 128 | # first_timestamp = int(u[1][0:u[1].index(',')]) 129 | # first tweet only after source_thr timestamp 130 | # if first_timestamp>=source_thr 131 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 132 | # u[0] not in non_emergent_tags and 133 | if count in test_seq_id: 134 | seq=[] 135 | for i in xrange(1, len(u)): 136 | #timestamp = int(u[i][0:u[i].index(',')]) 137 | author = int(u[i][u[i].index(',')+1 : ]) 138 | if author in vocab_index: 139 | # removing repeat adopters 140 | if author not in adopters: 141 | seq.append(author) 142 | adopters.add(author) 143 | else: 144 | not_found.add(author) 145 | if len(seq)>0: 146 | tag_seq.append(seq) 147 | not_found_vocab.append(len(not_found)) 148 | # adlen.append(len(seq)) 149 | # elif count not in test_seq_id: 150 | # adop=[] 151 | # for i in xrange(1, len(u)): 152 | # author = int(u[i][u[i].index(',')+1 : ]) 153 | # if author in vocab_index: 154 | # adop.append(author) 155 | # for author in set(adop): 156 | # try: 157 | # nb_seq[author]+=1 158 | # except KeyError: 159 | # nb_seq[author]=1 160 | count+=1 161 | #nb, number of training sequences participated in 162 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 163 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 164 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 165 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 166 | # pickle.dump(adlen,open("adlen.pickle","wb")) 167 | 168 | print len(tag_seq),len(test_seq_id),count 169 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 170 | 171 | """ 172 | #test sequences in random order 173 | seq_random_index=range(0,len(tag_seq)) 174 | random.shuffle(seq_random_index) 175 | 176 | seq_index_filter = [] 177 | for i in seq_random_index: 178 | seq_sample_vocab = tag_seq[i] 179 | M = len(seq_sample_vocab) 180 | if M= end: 209 | break 210 | count+=1 211 | seq_sample_vocab = tag_seq[i] 212 | total_spread = len(seq_sample_vocab) 213 | avg_num_adopters+=total_spread 214 | init_adopters=seq_sample_vocab[0:num_init] 215 | next_adopters = seq_sample_vocab[num_init:seq_len_threshold] 216 | N = num_query #1000 #M #num_users 217 | 218 | op, cand_set_size, cc, cr, M = get_candidate_set(init_adopters, next_adopters, N) 219 | cand_cov+=cc 220 | cand_cr+=cr 221 | 222 | cand_set_recall_spread.append((cc,total_spread)) 223 | cand_set_overlap.append(op) 224 | cand_set_cr.append(cr) 225 | cand_set_size_list.append(cand_set_size) 226 | 227 | # print "cc", cc, "cand size", cand_set_size, "Avg", cand_cov*1./(l+1), sum(cand_set_size_list)*1./(l+1), "adop in cand", op, "total", M, total_spread, l 228 | l+=1 229 | # if l==train_ex_limit: 230 | # break 231 | print process_num, start, num_init, num_query, "num examples", l, "cc", print_stats(cand_set_recall_spread), avg_num_adopters*1./l 232 | with open("/mnt/filer01/word2vec/degree_distribution/candset_stat_files/candset_vs_spread_n"+str(num_init)+"_c"+str(num_query)+"_seq"+str(seq_len_threshold)+"_ex"+str(start)+".pickle","wb") as fd: 233 | pickle.dump(cand_set_recall_spread,fd) 234 | # pickle.dump(cand_set_overlap,fd) 
235 | # pickle.dump(cand_set_cr,fd) 236 | # pickle.dump(cand_set_size_list,fd) 237 | 238 | # tic = time.clock() 239 | # cand_set_stat(0,num_init_adopters,top_k) 240 | # toc = time.clock() 241 | # print "cand set eval in", (toc-tic)*1000 242 | 243 | NUM_LINES = len(seq_index_filter) 244 | 245 | num_workers = min(NUM_PROCESSES,cpu_count()) 246 | pool = Pool(processes=num_workers) 247 | process_num=0 248 | lines_per_process = int(NUM_LINES/(2*num_workers)) 249 | for s,e in ( (i,min(i+lines_per_process,NUM_LINES)) for i in xrange(0,NUM_LINES,lines_per_process) ): 250 | pool.apply_async(cand_set_stat, args=(process_num,s,e,num_init_adopters,top_k)) 251 | process_num+=1 252 | pool.close() 253 | pool.join() 254 | 255 | print vec_file, num_init_adopters, top_k -------------------------------------------------------------------------------- /adopter_prediction/adopter_pred_cand_set_stat.py: -------------------------------------------------------------------------------- 1 | #get nearest users to the initial adopters of a hashtag sequence in test sequences using user vectors and write candidate set size stats 2 | #for different values of init adopters, query size or query radius 3 | #changed index file of sequences with non-zero number of adopters (and those who are present in vocab) in sequence_file_split_indices.pickle 4 | 5 | import cPickle as pickle 6 | import time 7 | from math import sqrt 8 | import random 9 | from heapq import nsmallest, nlargest, merge 10 | import numpy 11 | # from scipy.spatial import cKDTree as KDTree 12 | from sklearn.neighbors import NearestNeighbors 13 | import sys 14 | from multiprocessing import Pool, cpu_count 15 | import traceback 16 | 17 | NUM_PROCESSES = 2 18 | 19 | vec_file = "/mnt/filer01/word2vec/node_vectors_1hr_pr.txt" 20 | adoption_sequence_filename = "/mnt/filer01/word2vec/degree_distribution/hashtagAdoptionSequences.txt" #"sample_sequences" 21 | num_init_adopters = [10] #range(10,101,10) 22 | top_k = [6000,10000] #range(4000,10001,2000) 23 | query_rad = numpy.arange(0.6,1.05,0.05) 24 | seq_len_threshold = 500 #top_k 25 | cand_size_factor = 1 26 | train_ex_limit = 50 27 | norm_vec = True 28 | 29 | print vec_file, num_init_adopters, top_k, train_ex_limit, query_rad, norm_vec 30 | 31 | with open("/mnt/filer01/word2vec/degree_distribution/sequence_file_split_indices.pickle","rb") as fr: 32 | _ = pickle.load(fr) 33 | test_seq_id = pickle.load(fr) 34 | test_seq_id = set(test_seq_id) 35 | 36 | def read_vector_file(path_vectors_file): 37 | vocab = [] 38 | vectors = [] 39 | with open(path_vectors_file,"rb") as fr: 40 | _,dim = next(fr).rstrip().split(' ') 41 | word_vector_dim = int(dim) 42 | next(fr) 43 | for line in fr: 44 | line = line.rstrip() 45 | u = line.split(' ') 46 | if len(u) != word_vector_dim+1: 47 | print "vector length error" 48 | word = int(u[0]) 49 | #normalise to length 1 50 | if norm_vec: 51 | vec = [] 52 | length = 0.0 53 | for d in u[1:]: 54 | num=float(d) 55 | vec.append(num) 56 | length+=num**2 57 | #vec = map(float,u[1:]) 58 | #length = sum(x**2 for x in vec) 59 | length = sqrt(length) 60 | vec_norm = [x/length for x in vec] 61 | vectors.append(vec_norm) 62 | else: 63 | vec = map(float,u[1:]) 64 | vectors.append(vec) 65 | vocab.append(word) 66 | return vectors, vocab, word_vector_dim 67 | 68 | vec,vocab,dim = read_vector_file(vec_file) 69 | vocab_index=dict() 70 | for i in xrange(0,len(vocab)): 71 | vocab_index[vocab[i]]=i 72 | num_users = len(vocab) 73 | print "num users in train sequences", num_users 74 | # print "users removed from vocab", 
len(set(users_train)-set(vocab)) 75 | # print "users in test sequences but not in vocab", len(users_test-set(vocab)) 76 | 77 | # building kd-tree 78 | tic = time.clock() 79 | # kd = KDTree(vec, leafsize=10) 80 | neigh = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='ball_tree', leaf_size=100, metric='minkowski', p=2) #'ball_tree', 'kd_tree', 'auto' 81 | neigh.fit(vec) 82 | toc = time.clock() 83 | print "ball tree built in", (toc-tic)*1000 84 | 85 | def get_candidate_set(query_set,next_adopters,N): 86 | try: 87 | query_set_ind = [ vocab_index[query] for query in query_set ] 88 | except KeyError: 89 | print "query word not present" 90 | return 91 | query_vec = [vec[i] for i in query_set_ind] 92 | 93 | # query using scipy kdtree 94 | # d_list,knn_list = kd.query(query_vec,k=cand_size_factor*N+len(query_set_ind)) 95 | 96 | # query using sklearn 97 | d_list,knn_list = neigh.kneighbors(X=query_vec, n_neighbors=cand_size_factor*N+len(query_set_ind), return_distance=True) 98 | 99 | # get vectors within distance N 100 | # _,knn_list = neigh.radius_neighbors(X=query_vec, radius=N, return_distance=True) 101 | qresult_size = [] 102 | 103 | cand_set = set() 104 | for index_list in knn_list: 105 | qresult_size.append(len(index_list)) 106 | filtered=[idx for idx in index_list if idx not in query_set_ind] 107 | cand_set.update(filtered) 108 | 109 | med_qresult_size = numpy.median(qresult_size) 110 | cand_set_size = len(cand_set) 111 | M = len(next_adopters) 112 | next_adopters_index = [vocab_index[a] for a in next_adopters] 113 | next_adopters_index = set(next_adopters_index) 114 | num_adopters = len(cand_set&next_adopters_index) 115 | cand_adopters = cand_set&next_adopters_index 116 | 117 | # print "candidate set recall", num_adopters, "out of", len(next_adopters), "cand size", len(cand_user_idx) 118 | cr = 0.0 119 | if cand_set_size!=0: 120 | cr = num_adopters*1./cand_set_size 121 | cc = 0.0 122 | if M!=0: 123 | cc = num_adopters*1./M 124 | return num_adopters, cand_set_size, cc, cr, M, med_qresult_size, cand_adopters, cand_set, next_adopters_index 125 | 126 | def print_stats(u): 127 | return [numpy.mean(u), numpy.std(u), numpy.median(u)] 128 | 129 | # reading test sequences 130 | not_found_vocab=[] 131 | # source_thr = 1395858601 + 12*60*60 132 | # non_emergent_tags = pickle.load(open("/mnt/filer01/word2vec/degree_distribution/nonEmergentHashtags.pickle","rb")) 133 | tag_seq = [] 134 | count=0 135 | # nb_seq = dict() 136 | # adlen = [] 137 | with open(adoption_sequence_filename, "rb") as fr: 138 | for line in fr: 139 | line = line.rstrip() 140 | u = line.split(' ') 141 | not_found = set() 142 | adopters = set() 143 | # first_timestamp = int(u[1][0:u[1].index(',')]) 144 | # first tweet only after source_thr timestamp 145 | # if first_timestamp>=source_thr 146 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 147 | # u[0] not in non_emergent_tags and 148 | if count in test_seq_id: 149 | seq=[] 150 | for i in xrange(1, len(u)): 151 | #timestamp = int(u[i][0:u[i].index(',')]) 152 | author = int(u[i][u[i].index(',')+1 : ]) 153 | if author in vocab_index: 154 | # removing repeat adopters 155 | if author not in adopters: 156 | seq.append(author) 157 | adopters.add(author) 158 | else: 159 | not_found.add(author) 160 | if len(seq)>0: 161 | tag_seq.append(seq) 162 | not_found_vocab.append(len(not_found)) 163 | # adlen.append(len(seq)) 164 | # elif count not in test_seq_id: 165 | # adop=[] 166 | # for i in xrange(1, len(u)): 167 | # author = int(u[i][u[i].index(',')+1 : ]) 168 
| # if author in vocab_index: 169 | # adop.append(author) 170 | # for author in set(adop): 171 | # try: 172 | # nb_seq[author]+=1 173 | # except KeyError: 174 | # nb_seq[author]=1 175 | count+=1 176 | #nb, number of training sequences participated in 177 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 178 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 179 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 180 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 181 | # pickle.dump(adlen,open("adlen.pickle","wb")) 182 | 183 | print len(tag_seq),len(test_seq_id),count 184 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 185 | 186 | """ 187 | #test sequences in random order 188 | seq_random_index=range(0,len(tag_seq)) 189 | random.shuffle(seq_random_index) 190 | 191 | seq_index_filter = [] 192 | for i in seq_random_index: 193 | seq_sample_vocab = tag_seq[i] 194 | init_adopters=seq_sample_vocab[0:num_init_adopters] 195 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 196 | M = len(seq_sample_vocab) 197 | N = top_k #1000 #M #num_users 198 | if M=source_thr 190 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 191 | # u[0] not in non_emergent_tags and 192 | if count in test_seq_id: 193 | seq=[] 194 | for i in xrange(1, len(u)): 195 | #timestamp = int(u[i][0:u[i].index(',')]) 196 | author = int(u[i][u[i].index(',')+1 : ]) 197 | if author in vocab_index: 198 | # removing repeat adopters 199 | if author not in adopters: 200 | seq.append(author) 201 | adopters.add(author) 202 | else: 203 | not_found.add(author) 204 | if len(seq)>num_init_adopters: 205 | tag_seq.append(seq) 206 | not_found_vocab.append(len(not_found)) 207 | # adlen.append(len(seq)) 208 | # elif count not in test_seq_id: 209 | # adop=[] 210 | # for i in xrange(1, len(u)): 211 | # author = int(u[i][u[i].index(',')+1 : ]) 212 | # if author in vocab_index: 213 | # adop.append(author) 214 | # for author in set(adop): 215 | # try: 216 | # nb_seq[author]+=1 217 | # except KeyError: 218 | # nb_seq[author]=1 219 | count+=1 220 | #nb, number of training sequences participated in 221 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 222 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 223 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 224 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 225 | # pickle.dump(adlen,open("adlen.pickle","wb")) 226 | 227 | print len(tag_seq),len(test_seq_id),count 228 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 229 | 230 | prec_k_total = [] 231 | rec_k_total = [] 232 | 233 | """ 234 | #test sequences in random order 235 | seq_random_index=range(0,len(tag_seq)) 236 | random.shuffle(seq_random_index) 237 | 238 | seq_index_filter = [] 239 | for i in seq_random_index: 240 | seq_sample_vocab = tag_seq[i] 241 | init_adopters=seq_sample_vocab[0:num_init_adopters] 242 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 243 | M = len(seq_sample_vocab) 244 | N = top_k #1000 #M #num_users 245 | if M=source_thr 161 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 162 | # u[0] not in non_emergent_tags and 163 | if count in test_seq_id: 164 | seq=[] 165 | for i in xrange(1, len(u)): 166 | #timestamp = int(u[i][0:u[i].index(',')]) 167 | author = int(u[i][u[i].index(',')+1 : ]) 168 | if author in vocab_index: 169 | # removing repeat adopters 170 | if author not in 
adopters: 171 | seq.append(author) 172 | adopters.add(author) 173 | else: 174 | not_found.add(author) 175 | if len(seq)>num_init_adopters: 176 | tag_seq.append(seq) 177 | not_found_vocab.append(len(not_found)) 178 | # adlen.append(len(seq)) 179 | # elif count not in test_seq_id: 180 | # adop=[] 181 | # for i in xrange(1, len(u)): 182 | # author = int(u[i][u[i].index(',')+1 : ]) 183 | # if author in vocab_index: 184 | # adop.append(author) 185 | # for author in set(adop): 186 | # try: 187 | # nb_seq[author]+=1 188 | # except KeyError: 189 | # nb_seq[author]=1 190 | count+=1 191 | #nb, number of training sequences participated in 192 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 193 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 194 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 195 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 196 | # pickle.dump(adlen,open("adlen.pickle","wb")) 197 | 198 | print len(tag_seq),len(test_seq_id),count 199 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 200 | 201 | prec_k_total = [] 202 | rec_k_total = [] 203 | cand_set_recall = [] 204 | cand_set_cr = [] 205 | cand_set_size_list = [] 206 | 207 | """ 208 | #test sequences in random order 209 | seq_random_index=range(0,len(tag_seq)) 210 | random.shuffle(seq_random_index) 211 | 212 | seq_index_filter = [] 213 | for i in seq_random_index: 214 | seq_sample_vocab = tag_seq[i] 215 | init_adopters=seq_sample_vocab[0:num_init_adopters] 216 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 217 | M = len(seq_sample_vocab) 218 | N = top_k #1000 #M #num_users 219 | if M=source_thr 160 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 161 | # u[0] not in non_emergent_tags and 162 | if count in test_seq_id: 163 | seq=[] 164 | for i in xrange(1, len(u)): 165 | #timestamp = int(u[i][0:u[i].index(',')]) 166 | author = int(u[i][u[i].index(',')+1 : ]) 167 | if author in vocab_index: 168 | # removing repeat adopters 169 | if author not in adopters: 170 | seq.append(author) 171 | adopters.add(author) 172 | else: 173 | not_found.add(author) 174 | if len(seq)>num_init_adopters: 175 | tag_seq.append(seq) 176 | tag_name.append(u[0]) 177 | not_found_vocab.append(len(not_found)) 178 | # adlen.append(len(seq)) 179 | # elif count not in test_seq_id: 180 | # adop=[] 181 | # for i in xrange(1, len(u)): 182 | # author = int(u[i][u[i].index(',')+1 : ]) 183 | # if author in vocab_index: 184 | # adop.append(author) 185 | # for author in set(adop): 186 | # try: 187 | # nb_seq[author]+=1 188 | # except KeyError: 189 | # nb_seq[author]=1 190 | count+=1 191 | #nb, number of training sequences participated in 192 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 193 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 194 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 195 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 196 | # pickle.dump(adlen,open("adlen.pickle","wb")) 197 | 198 | print len(tag_seq),len(test_seq_id),count 199 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 200 | 201 | cand_cov = 0.0 202 | 203 | """ 204 | #test sequences in random order 205 | seq_random_index=range(0,len(tag_seq)) 206 | random.shuffle(seq_random_index) 207 | 208 | seq_index_filter = [] 209 | for i in seq_random_index: 210 | seq_sample_vocab = tag_seq[i] 211 | init_adopters=seq_sample_vocab[0:num_init_adopters] 212 | seq_sample_vocab 
= set(seq_sample_vocab[num_init_adopters:]) 213 | M = len(seq_sample_vocab) 214 | N = top_k #1000 #M #num_users 215 | if M=source_thr 165 | # check if <5 tweets in 12 hours for emergent hashtags, not already popular 166 | # u[0] not in non_emergent_tags and 167 | if count in test_seq_id: 168 | seq=[] 169 | for i in xrange(1, len(u)): 170 | #timestamp = int(u[i][0:u[i].index(',')]) 171 | author = int(u[i][u[i].index(',')+1 : ]) 172 | if author in vocab_index: 173 | # removing repeat adopters 174 | if author not in adopters: 175 | seq.append(author) 176 | adopters.add(author) 177 | else: 178 | not_found.add(author) 179 | if len(seq)>num_init_adopters: 180 | tag_seq.append(seq) 181 | not_found_vocab.append(len(not_found)) 182 | # adlen.append(len(seq)) 183 | # elif count not in test_seq_id: 184 | # adop=[] 185 | # for i in xrange(1, len(u)): 186 | # author = int(u[i][u[i].index(',')+1 : ]) 187 | # if author in vocab_index: 188 | # adop.append(author) 189 | # for author in set(adop): 190 | # try: 191 | # nb_seq[author]+=1 192 | # except KeyError: 193 | # nb_seq[author]=1 194 | count+=1 195 | #nb, number of training sequences participated in 196 | # nb_seq_part = [(a,nb_seq[a]) for a in nb_seq] 197 | # nb_seq_part_sorted = sorted(nb_seq_part, key=lambda x: x[1], reverse=True) 198 | # nb_seq_order = [a for a,_ in nb_seq_part_sorted] 199 | # pickle.dump(nb_seq_order,open(nb_sorted_pickle,"wb")) 200 | # pickle.dump(adlen,open("adlen.pickle","wb")) 201 | 202 | print len(tag_seq),len(test_seq_id),count 203 | print sum(not_found_vocab)/float(len(not_found_vocab)),max(not_found_vocab),min(not_found_vocab) 204 | 205 | prec_k_total = [] 206 | rec_k_total = [] 207 | cand_set_recall = [] 208 | cand_set_cr = [] 209 | cand_set_size_list = [] 210 | cand_cov = 0.0 211 | cand_cr = 0.0 212 | 213 | """ 214 | #test sequences in random order 215 | seq_random_index=range(0,len(tag_seq)) 216 | random.shuffle(seq_random_index) 217 | 218 | seq_index_filter = [] 219 | for i in seq_random_index: 220 | seq_sample_vocab = tag_seq[i] 221 | init_adopters=seq_sample_vocab[0:num_init_adopters] 222 | seq_sample_vocab = set(seq_sample_vocab[num_init_adopters:]) 223 | M = len(seq_sample_vocab) 224 | N = top_k #1000 #M #num_users 225 | if M num_remaining: 133 | path+=random.sample(next_level,num_remaining) #order of vertices changed by sample 134 | print path, "out of", next_level 135 | break 136 | else: 137 | path+=next_level 138 | print path, next_level, "all" 139 | break 140 | paths.append(path) 141 | return paths 142 | 143 | #sample neighbouring vertices to left and right of vertex from hashtag graph 144 | def sample_nbhs_bfs(adj,rev_adj,start): 145 | paths = [] 146 | for i in xrange(0,gamma): 147 | #left 148 | path=[] 149 | count=0 150 | queue=[start] 151 | visited=set() 152 | while counttime_diff_for_edge: 197 | segments.append(seg) 198 | seg = [] 199 | seg.append(i) 200 | prev_time = time 201 | if seg!=[]: 202 | segments.append(seg) 203 | return segments 204 | """ 205 | #get adjacency list of hashtag graph from a segment 206 | """ 207 | def get_hashtag_graph_adj(segment): 208 | num_nodes = len(segment) 209 | adj_list = init_adj_list(num_nodes) #adjacency list for directed graph 210 | if num_nodes==1: 211 | return adj_list 212 | for i in range(0,num_nodes): 213 | time_first,_ = segment[i] 214 | for j in range(i+1,num_nodes): 215 | time_second,_ = segment[j] 216 | if time_second-time_first<=time_diff_for_edge: # only time difference considered for an edge, check other conditions 217 | adj_list[i].append(j) 218 | else: 219 | 
break #tweets are arranged in increasing time, so no edges will be there with vertices past present node 220 | #location 221 | #follower relation 222 | #check if more than one connected components in a segment if single path is considered for each segment 223 | return adj_list 224 | """ 225 | 226 | def get_hashtag_graph_adj(segment): 227 | num_nodes = len(segment) 228 | # adj_list = init_adj_list(num_nodes) #adjacency list for directed graph 229 | adj_list = [[] for i in xrange(0, num_nodes)] 230 | rev_adj_list = [[] for i in xrange(0, num_nodes)] #defaultdict(list) 231 | # print "init", total_size(adj_list), total_size(rev_adj_list) 232 | # print "adj list init" 233 | if num_nodes==1: 234 | return adj_list, rev_adj_list 235 | location = [[] for i in xrange(0, max_locations)] #dict() 236 | for i in xrange(0,num_nodes): 237 | _,author = segment[i] 238 | author_loc = location_buckets[author] 239 | if author_loc!=-1: #no edges between users with unknown location 240 | location[author_loc].append(i) #time sorted order will change across locations, but not within location. order of vertices in adjacency list is still same 241 | print "location list", location 242 | count=0 243 | for same_loc_seq in location: 244 | num_loc = len(same_loc_seq) 245 | print count, "Count", len(same_loc_seq) 246 | count+=1 247 | for i in xrange(0,num_loc): 248 | vertex_index_first = same_loc_seq[i] 249 | time_first,_ = segment[vertex_index_first] 250 | for j in xrange(i+1,num_loc): 251 | vertex_index_second = same_loc_seq[j] 252 | time_second,_ = segment[vertex_index_second] 253 | if time_second-time_first<=time_diff_for_edge: # only time difference considered for an edge, check other conditions 254 | adj_list[vertex_index_first].append(vertex_index_second) 255 | rev_adj_list[vertex_index_second].append(vertex_index_first) 256 | # rev_adj_list[vertex_index_second].insert(0,vertex_index_first) #to make the order of vertices having edge to second vertex in decreasing order, i.e., closest vertex first 257 | else: 258 | break #tweets are arranged in increasing time, so no edges will be there with vertices past present node 259 | #follower relation 260 | #check if more than one connected components in a segment if single path is considered for each segment 261 | # print "assigned", total_size(adj_list), total_size(rev_adj_list) 262 | 263 | return adj_list, rev_adj_list 264 | """ 265 | #get adjacency list of hashtag graph from a segment, using only time diff 266 | def get_hashtag_graph_adj(segment): 267 | num_nodes = len(segment) 268 | adj_list = [[] for i in xrange(0, num_nodes)] 269 | rev_adj_list = [[] for i in xrange(0, num_nodes)] 270 | # print "adj list init" 271 | if num_nodes==1: 272 | return adj_list, rev_adj_list 273 | for i in xrange(0,num_nodes): 274 | time_first,author_first = segment[i] 275 | for j in xrange(i+1,num_nodes): 276 | time_second,author_second = segment[j] 277 | if time_second-time_first<=time_diff_for_edge: # only time difference considered for an edge, check other conditions 278 | adj_list[i].append(j) 279 | rev_adj_list[j].append(i) 280 | else: 281 | break #tweets are arranged in increasing time, so no edges will be there with vertices past present node 282 | #follower relation 283 | #check if more than one connected components in a segment if single path is considered for each segment 284 | return adj_list, rev_adj_list 285 | """ 286 | #get all paths of length m from hashtag graph 287 | def get_paths_from_graph(nodes, adj, rev_adj): 288 | if len(nodes)>=min_context_length: #only if less than 
m length paths are not taken 289 | for start in xrange(0,len(nodes)): 290 | # if len(nodes)-start-1=min_context_length: #only take paths above minimum context length 310 | yield (start,path_to_sentence(nodes,p)) 311 | 312 | #get sentences from hashtag sequences 313 | sentences=[] 314 | max_locations = 2 315 | adoption_sequence = dict() 316 | adoption_sequence['test']=[(4,0),(10,1),(15,2),(21,3),(23,4),(26,5),(28,6),(37,7),(40,8),(45,9)] 317 | location_buckets = [0,0,1,1,-1,1,1,1,1,1] 318 | 319 | def get_sentences(adoption_sequence): 320 | tag_count = 0 321 | for t in adoption_sequence: 322 | segment=adoption_sequence[t] 323 | tag_count+=1 324 | adj_list, rev_adj_list = get_hashtag_graph_adj(segment) 325 | print adj_list, rev_adj_list 326 | paths = get_paths_from_graph(segment, adj_list, rev_adj_list) 327 | for p in paths: #change if only one path generated from a hashtag graph 328 | yield p 329 | 330 | print(list(get_sentences(adoption_sequence))) --------------------------------------------------------------------------------
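
A condensed, self-contained sketch of the idea exercised by the test file above: adopters of a hashtag are connected when their adoption times fall within a window, and paths through that graph are read off as author "sentences" for word2vec-style training. The toy segment, the window value, and the helper names (build_adj, paths_as_sentences) are illustrative only, and a simple greedy walk stands in for the script's BFS/random-walk sampling and location filtering:

# toy (timestamp, author) adoption segment, analogous to the 'test' sequence above
segment = [(4, 0), (10, 1), (15, 2), (21, 3), (23, 4)]
time_diff_for_edge = 10  # assumed window size for this sketch

# directed edge i -> j whenever j adopts within time_diff_for_edge after i
def build_adj(segment, window):
    adj = [[] for _ in segment]
    for i in range(len(segment)):
        t_i, _ = segment[i]
        for j in range(i + 1, len(segment)):
            t_j, _ = segment[j]
            if t_j - t_i <= window:
                adj[i].append(j)
            else:
                break  # segment is time-sorted, so later nodes only get farther away
    return adj

# follow edges greedily to turn the graph into author sentences
def paths_as_sentences(segment, adj):
    for start in range(len(segment)):
        path, v = [start], start
        while adj[v]:
            v = adj[v][0]  # always step to the nearest-in-time neighbour
            path.append(v)
        yield [segment[i][1] for i in path]  # map vertex indices back to author ids

adj = build_adj(segment, time_diff_for_edge)
for sentence in paths_as_sentences(segment, adj):
    print(sentence)
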