├── lp
│   ├── endmapper.py
│   ├── mapper.py
│   ├── reducer.py
│   └── run.py
├── pre
│   ├── mapper.py
│   ├── reducer.py
│   ├── reverse.py
│   └── run.sh
├── readme
└── twitter
    ├── exclude.txt
    └── run-twitter.sh

/lp/endmapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/python2.6
"""
Final-pass mapper for the Label Propagation algorithm using map-reduce.
Re-keys each record on its cluster label and, when a names file is supplied,
replaces numeric ids with user names.

Author:
    Name: Akshay Bhat
    WebSite: http://www.akshaybhat.com
"""
import sys

if __name__ == '__main__':
    filename = sys.argv[1]             # id -> name mapping file ('id name' per line)
    names = [''] * int(sys.argv[2])    # sys.argv[2] = maximum node id + 1
    for line in open(filename):
        try:
            entries = line.strip().split(' ')
            names[int(entries[0])] = entries[1]
        except (ValueError, IndexError):
            pass                       # skip malformed name entries
    for line in sys.stdin:
        try:
            entries = line.strip().split('\t')   # input: node \t label \t weight
            node = int(entries[0])
            label = int(entries[1])
            score = entries[2]
            if names:
                if names[node]:
                    node = names[node]
                if names[label]:
                    label = names[label]
            print str(label) + '\t' + str(node) + '\t' + score
        except:
            print line
            raise
--------------------------------------------------------------------------------

/lp/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/python2.6
"""
Mapper for the Label Propagation algorithm using map-reduce.

Author:
    Name: Akshay Bhat
    WebSite: http://www.akshaybhat.com
"""
import random, sys, collections


def ParseOptions(argv):
    """
    Parse the command line options:
    argv[1] = SizeHint (maximum node id + 1), argv[2] = optional labels file.
    """
    SizeHint = 0
    if len(argv) > 1:
        SizeHint = int(argv[1])
    if len(argv) > 2:
        Labels = argv[2]
    else:
        Labels = ''
    return [SizeHint, Labels]


def ApplyAndVote(line, delim='\t'):
    """
    Looks up the current label of each neighbor and calls maxVote.
    """
    entry = line.strip().split(delim)
    node = int(entry[0])
    if len(entry) != 1:   # the node has at least one neighbor
        nLabels = [Label[int(k)] for k in entry[1:]]
        newLabel, weight = maxVote(nLabels)
        return node, newLabel, weight
    else:                 # an isolated node keeps its own id as its label
        return node, node, 1


def maxVote(nLabels):
    """
    Used by the map function: given the list of neighbor labels, find the
    most frequent label(s) and randomly return one of them, together with
    the fraction of neighbors that voted for it.
    """
    cnt = collections.defaultdict(int)
    for i in nLabels:
        cnt[i] += 1
    maxv = max(cnt.itervalues())
    weight = float(maxv) / len(nLabels)
    return random.choice([k for k, v in cnt.iteritems() if v == maxv]), weight


if __name__ == '__main__':
    [SizeHint, Labels] = ParseOptions(sys.argv)   # parse the command line options
    # a plain list is faster than array.array here, according to a post on
    # stackoverflow and confirmed by the author; initially every node is
    # labeled with its own id
    Label = range(SizeHint)
    if Labels:
        for line in open(Labels):
            entries = line.strip().split('\t')
            Label[int(entries[0])] = int(entries[1])
    for line in sys.stdin:
        try:
            node, newLabel, weight = ApplyAndVote(line)
            print str(node) + '\t' + str(newLabel) + '\t' + str(weight)
        except:
            print line
            raise
--------------------------------------------------------------------------------
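A quick sanity check for mapper.py outside Hadoop is to pipe a toy adjacency
list through it (the graph below is made up for illustration, and a Python 2
interpreter is assumed to be on the PATH as python):

    # toy graph: 0-1, 0-2, 1-2, plus isolated node 3; SizeHint = 4
    printf '0\t1\t2\n1\t0\t2\n2\t0\t1\n3\n' | python mapper.py 4
    # each output line is: node \t newLabel \t weight, e.g. "0  2  0.5";
    # ties in maxVote break randomly, so repeated runs differ
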
""" 3 | 4 | Description: Reducer used to summarize the results 5 | 6 | Used for summarizing results into following format. Provide input only from endmapper.py, 7 | also sort only based on node as key, else a single cluster might get split into two different reducers. 8 | 9 | cluster-label \t num_nodes \t nodeid \t weight \t nodeid \t weight ...... 10 | """ 11 | import sys 12 | 13 | if __name__ == '__main__': 14 | oldcommunity = '' 15 | temp = '' 16 | count = 0 17 | community = '' 18 | for line in sys.stdin: # line must be output from end mapper sorted on label as key, 19 | entries = line.strip().split('\t') 20 | community = entries[0] 21 | node = entries[1] 22 | weight = entries[2] 23 | if community != oldcommunity or count==10000: 24 | if oldcommunity: 25 | print oldcommunity+'\t'+str(count)+'\t'+temp 26 | oldcommunity = community 27 | temp = node + '\t' + weight 28 | count = 1 29 | else: 30 | count += 1 31 | temp += '\t' + node + '\t' + weight 32 | if community: 33 | print community+'\t'+str(count)+'\t'+temp 34 | 35 | -------------------------------------------------------------------------------- /lp/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Author: Akshay U. Bhat 4 | Description: Python file for setting up and executing Label Propgation job 5 | example call ./run.py Net.txt 4 17000000 6 | TO DO: wriete a custom partitionar instead of setting reducer to 1 7 | """ 8 | import sys,os 9 | exec_init_string = 'hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input %s -output %s.Label%d.txt -mapper "mapper.py %d" -reducer NONE -file mapper.py' 10 | exec_string = 'hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input %s -output %s.Label%d.txt -mapper "mapper.py %d Label.txt" -reducer NONE -file mapper.py -file Label.txt' 11 | exec_final_string = 'hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input %s.Label%d.txt -output %s.Label.final.txt -mapper "endmapper.py users.txt 70000000" -reducer reducer.py -file reducer.py -file endmapper.py -file ../../users.txt' 12 | 13 | get_string = 'hadoop fs -getmerge &.Label#.txt Label.txt' 14 | 15 | if __name__ == '__main__': 16 | sizehint = 0 17 | infile = '' 18 | iterations = 0 19 | predict = False 20 | try: 21 | infile = sys.argv[1] 22 | iterations = int(sys.argv[2]) 23 | sizehint = int(sys.argv[3]) 24 | except: 25 | print "Please specify infile, number of iterations, maximum node index" 26 | raise 27 | # run the mapper with all nodes with labels same as node id 28 | os.system(exec_init_string%(infile,infile,0,sizehint)) 29 | # download the new Labels file 30 | os.system(get_string.replace('#','0').replace('&',infile)) 31 | 32 | for i in range(1,iterations): 33 | os.system(exec_string%(infile,infile,i,sizehint)) 34 | os.system('rm Label.txt') 35 | os.system(get_string.replace('#',str(i)).replace('&',infile)) 36 | os.system(exec_final_string%(infile,i,infile)) 37 | os.system('rm Label.txt') 38 | -------------------------------------------------------------------------------- /pre/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | """ 3 | Remove 2.5 from the line above, its a hack needed for it to work on Fedora 4 | 5 | Author: Akshay Bhat 6 | Desc: mapepr script which reverse id's in each pair, such that all lines will be sorted according to the id of the user following other user 7 | 8 | ToDo: clear up the naming, 
/pre/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2.5
"""
Remove 2.5 from the line above; it's a hack needed for this to work on Fedora.

Author: Akshay Bhat
Desc: mapper script which reverses the ids in each pair, so that all lines
can be sorted on the id of the user who is following the other user.

ToDo: clean up the naming; the current naming convention assumes Reverse to be true.

Usage: give
    -mapper "mapper.py Reverse"
for the twitter dataset from KAIST, and just
    -mapper mapper.py
for a
    SourceID\tTargetID\n
file.
"""
import sys

if __name__ == '__main__':
    Exclude = {}
    Reverse = False
    try:
        # exclude.txt is optional and is shipped to the cluster via -file
        for line in open('exclude.txt'):
            Exclude[line.strip().split('\t')[0]] = 1
    except IOError:
        pass
    if len(sys.argv) > 1:
        Reverse = True

    for line in sys.stdin:
        entries = line.strip().split('\t')
        Following = entries[0]
        User = entries[1]
        if User in Exclude or Following in Exclude:
            continue   # drop edges touching excluded ids
        if Reverse:
            print User + '\t' + Following
        else:
            print Following + '\t' + User
--------------------------------------------------------------------------------

/pre/reducer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2.5
"""
Author: Akshay Bhat
Desc: reducer script to collect all ids followed by a user into a single line.
"""
import sys

if __name__ == '__main__':
    curUser = None
    Buf = ''
    for line in sys.stdin:
        entries = line.strip().split('\t')
        User = entries[0]
        Following = entries[1]
        if curUser != User:   # a new user begins; flush the previous one
            if curUser:
                print Buf
            Buf = User
            curUser = User
        Buf += '\t' + Following
    if curUser:   # flush the last user
        print Buf
--------------------------------------------------------------------------------

/pre/reverse.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2.5
"""
Remove 2.5 from the line above; it's a hack needed for this to work on Fedora.

Author: Akshay Bhat
Desc: variant of mapper.py that reverses the ids in each pair by default, so
that all lines can be sorted on the id of the user who is following the
other user.

ToDo: clean up the naming; the current naming convention assumes Reverse to be true.

Usage: give
    -mapper reverse.py
for the twitter dataset from KAIST.
"""
import sys

if __name__ == '__main__':
    # a list indexed by id might be used instead of the dictionary; it is less
    # memory efficient, but it guarantees O(1) access time
    Exclude = {}
    try:
        for line in open('exclude.txt'):
            Exclude[line.strip().split('\t')[0]] = 1
    except IOError:
        pass
    Reverse = True   # this variant reverses by default; any extra command
    if len(sys.argv) > 1:   # line argument disables the reversal
        Reverse = False

    for line in sys.stdin:
        entries = line.strip().split('\t')
        Following = entries[0]
        User = entries[1]
        if User in Exclude or Following in Exclude:
            continue   # drop edges touching excluded ids
        if Reverse:
            print User + '\t' + Following
        else:
            print Following + '\t' + User
--------------------------------------------------------------------------------
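Both pre-processing scripts can be tried the same way, with sort playing the
shuffle; the toy edge list below (SourceID \t TargetID) is made up for
illustration:

    printf '1\t2\n1\t3\n2\t3\n' | python mapper.py | sort -k1,1 | python reducer.py
    # expected output, one adjacency list per line:
    # 1  2  3
    # 2  3
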
/pre/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Generic Usage:
# ./run.sh <edge list on HDFS>, e.g. ./run.sh DP
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output $1.net -mapper mapper.py -reducer reducer.py -file mapper.py -file reducer.py

# for twitter
#hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output $1.net -mapper 'mapper.py Reverse' -reducer reducer.py -file mapper.py -file reducer.py -file exclude.txt
--------------------------------------------------------------------------------

/readme:
--------------------------------------------------------------------------------
Scalable Community Detection using Label Propagation and Map Reduce

Author: Akshay Bhat
Contact: akshaybhat [at] gmail.com

Please visit http://www.akshaybhat.com/LPMR for more information

Organization:
    Folder      Description
    lp          Code for community detection
    pre         Code for pre-processing the edge list file
    twitter     Code for automating everything for the twitter dataset

Usage:
    Note that this is experimental code, not a library, and thus it involves
    multiple hacks.

    You will need a working Hadoop installation. This code has been tested on
    a cluster running Hadoop 0.19, so it should also work with versions
    > 0.19. You will still need to change the path to the Hadoop streaming
    jar file; a one-pass fix is sketched right after this readme.

    Download twitter_rv.net from http://an.kaist.ac.kr/traces/WWW2010.html

    Download numeric2users.tar.gz from the same website, extract it, rename it
    to users.txt and put it outside the LPMR folder, which is where run.py
    ships it from as ../../users.txt. (Sorry if this sounds weird; it will be
    fixed soon.)

    cd into the twitter directory and execute
        ./run-twitter.sh twitter_rv.net

    [you will most likely get errors due to hadoop not being ]

License: Research purpose only
--------------------------------------------------------------------------------
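The streaming-jar path is hardcoded in several scripts. One way to point them
all at your own installation in a single pass, from the repository root (a
sketch; substitute the placeholder with the real location of your jar):

    grep -rl 'hadoop-0.19.2-streaming.jar' . | xargs sed -i \
        's|/usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar|/path/to/hadoop-streaming.jar|g'
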
/twitter/run-twitter.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Generic Usage:
# ./run-twitter.sh twitter_rv.net

cd ../pre
# reversed edge list, kept for the Pegasus jobs below (map only, no reducer)
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output Twitter/Net.pegasus -mapper 'mapper.py Reverse' -reducer NONE -file mapper.py
# adjacency lists with the ids listed in exclude.txt removed
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output Twitter/Net.exclude -mapper 'mapper.py Reverse' -reducer reducer.py -file mapper.py -file reducer.py -file ../twitter/exclude.txt
# complete adjacency lists, nothing excluded
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output Twitter/Net.complete -mapper 'mapper.py Reverse' -reducer reducer.py -file mapper.py -file reducer.py
cd ..

# 15 iterations of label propagation with maximum node index 70000000
cd lp
./run.py Twitter/Net.exclude 15 70000000
cd ..

# cd ../PEG
# code for performing in/out degree and page rank calculation using Pegasus
# ./run_dd.sh in 72 Twitter/Net.pegasus
# ./run_dd.sh out 72 Twitter/Net.pegasus
# ./run_pr.sh 72 70000000 nosym Twitter/Net.pegasus
# cd ..
--------------------------------------------------------------------------------
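Once run-twitter.sh finishes, the summarized communities sit in the HDFS
directory named by exec_final_string in run.py; a sketch for pulling them to
the local disk:

    hadoop fs -getmerge Twitter/Net.exclude.Label.final.txt communities.txt
    head -n 3 communities.txt   # cluster-label \t num_nodes \t nodeid \t weight ...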