├── lp
│   ├── endmapper.py
│   ├── mapper.py
│   ├── reducer.py
│   └── run.py
├── pre
│   ├── mapper.py
│   ├── reducer.py
│   ├── reverse.py
│   └── run.sh
├── readme
└── twitter
    ├── exclude.txt
    └── run-twitter.sh

/lp/endmapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/python2.6
"""
Final-pass mapper for the Label Propagation algorithm using map-reduce.
Re-keys each record on its cluster label and, when a names file is supplied,
replaces numeric ids with user names.

Author:
    Name: Akshay Bhat
    WebSite: http://www.akshaybhat.com
"""
import sys

if __name__ == '__main__':
    filename = sys.argv[1]             # id -> name mapping file ('id name' per line)
    names = [''] * int(sys.argv[2])    # sys.argv[2] = maximum node id + 1
    for line in open(filename):
        try:
            entries = line.strip().split(' ')
            names[int(entries[0])] = entries[1]
        except (ValueError, IndexError):
            pass                       # skip malformed name entries
    for line in sys.stdin:
        try:
            entries = line.strip().split('\t')   # input: node \t label \t weight
            node = int(entries[0])
            label = int(entries[1])
            score = entries[2]
            if names:
                if names[node]:
                    node = names[node]
                if names[label]:
                    label = names[label]
            print str(label) + '\t' + str(node) + '\t' + score
        except:
            print line
            raise
--------------------------------------------------------------------------------

/lp/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/python2.6
"""
Mapper for the Label Propagation algorithm using map-reduce.

Author:
    Name: Akshay Bhat
    WebSite: http://www.akshaybhat.com
"""
import random, sys, collections


def ParseOptions(argv):
    """
    Parse the command line options:
    argv[1] = SizeHint (maximum node id + 1), argv[2] = optional labels file.
    """
    SizeHint = 0
    if len(argv) > 1:
        SizeHint = int(argv[1])
    if len(argv) > 2:
        Labels = argv[2]
    else:
        Labels = ''
    return [SizeHint, Labels]


def ApplyAndVote(line, delim='\t'):
    """
    Looks up the current label of each neighbor and calls maxVote.
    """
    entry = line.strip().split(delim)
    node = int(entry[0])
    if len(entry) != 1:   # the node has at least one neighbor
        nLabels = [Label[int(k)] for k in entry[1:]]
        newLabel, weight = maxVote(nLabels)
        return node, newLabel, weight
    else:                 # an isolated node keeps its own id as its label
        return node, node, 1


def maxVote(nLabels):
    """
    Used by the map function: given the list of neighbor labels, find the
    most frequent label(s) and randomly return one of them, together with
    the fraction of neighbors that voted for it.
    """
    cnt = collections.defaultdict(int)
    for i in nLabels:
        cnt[i] += 1
    maxv = max(cnt.itervalues())
    weight = float(maxv) / len(nLabels)
    return random.choice([k for k, v in cnt.iteritems() if v == maxv]), weight


if __name__ == '__main__':
    [SizeHint, Labels] = ParseOptions(sys.argv)   # parse the command line options
    # a plain list is faster than array.array here, according to a post on
    # stackoverflow and confirmed by the author; initially every node is
    # labeled with its own id
    Label = range(SizeHint)
    if Labels:
        for line in open(Labels):
            entries = line.strip().split('\t')
            Label[int(entries[0])] = int(entries[1])
    for line in sys.stdin:
        try:
            node, newLabel, weight = ApplyAndVote(line)
            print str(node) + '\t' + str(newLabel) + '\t' + str(weight)
        except:
            print line
            raise
--------------------------------------------------------------------------------
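A quick sanity check for mapper.py outside Hadoop is to pipe a toy adjacency
list through it (the graph below is made up for illustration, and a Python 2
interpreter is assumed to be on the PATH as python):

    # toy graph: 0-1, 0-2, 1-2, plus isolated node 3; SizeHint = 4
    printf '0\t1\t2\n1\t0\t2\n2\t0\t1\n3\n' | python mapper.py 4
    # each output line is: node \t newLabel \t weight, e.g. "0  2  0.5";
    # ties in maxVote break randomly, so repeated runs differ
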
""" 3 | 4 | Description: Reducer used to summarize the results 5 | 6 | Used for summarizing results into following format. Provide input only from endmapper.py, 7 | also sort only based on node as key, else a single cluster might get split into two different reducers. 8 | 9 | cluster-label \t num_nodes \t nodeid \t weight \t nodeid \t weight ...... 10 | """ 11 | import sys 12 | 13 | if __name__ == '__main__': 14 | oldcommunity = '' 15 | temp = '' 16 | count = 0 17 | community = '' 18 | for line in sys.stdin: # line must be output from end mapper sorted on label as key, 19 | entries = line.strip().split('\t') 20 | community = entries[0] 21 | node = entries[1] 22 | weight = entries[2] 23 | if community != oldcommunity or count==10000: 24 | if oldcommunity: 25 | print oldcommunity+'\t'+str(count)+'\t'+temp 26 | oldcommunity = community 27 | temp = node + '\t' + weight 28 | count = 1 29 | else: 30 | count += 1 31 | temp += '\t' + node + '\t' + weight 32 | if community: 33 | print community+'\t'+str(count)+'\t'+temp 34 | 35 | -------------------------------------------------------------------------------- /lp/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Author: Akshay U. Bhat 4 | Description: Python file for setting up and executing Label Propgation job 5 | example call ./run.py Net.txt 4 17000000 6 | TO DO: wriete a custom partitionar instead of setting reducer to 1 7 | """ 8 | import sys,os 9 | exec_init_string = 'hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input %s -output %s.Label%d.txt -mapper "mapper.py %d" -reducer NONE -file mapper.py' 10 | exec_string = 'hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input %s -output %s.Label%d.txt -mapper "mapper.py %d Label.txt" -reducer NONE -file mapper.py -file Label.txt' 11 | exec_final_string = 'hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input %s.Label%d.txt -output %s.Label.final.txt -mapper "endmapper.py users.txt 70000000" -reducer reducer.py -file reducer.py -file endmapper.py -file ../../users.txt' 12 | 13 | get_string = 'hadoop fs -getmerge &.Label#.txt Label.txt' 14 | 15 | if __name__ == '__main__': 16 | sizehint = 0 17 | infile = '' 18 | iterations = 0 19 | predict = False 20 | try: 21 | infile = sys.argv[1] 22 | iterations = int(sys.argv[2]) 23 | sizehint = int(sys.argv[3]) 24 | except: 25 | print "Please specify infile, number of iterations, maximum node index" 26 | raise 27 | # run the mapper with all nodes with labels same as node id 28 | os.system(exec_init_string%(infile,infile,0,sizehint)) 29 | # download the new Labels file 30 | os.system(get_string.replace('#','0').replace('&',infile)) 31 | 32 | for i in range(1,iterations): 33 | os.system(exec_string%(infile,infile,i,sizehint)) 34 | os.system('rm Label.txt') 35 | os.system(get_string.replace('#',str(i)).replace('&',infile)) 36 | os.system(exec_final_string%(infile,i,infile)) 37 | os.system('rm Label.txt') 38 | -------------------------------------------------------------------------------- /pre/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.5 2 | """ 3 | Remove 2.5 from the line above, its a hack needed for it to work on Fedora 4 | 5 | Author: Akshay Bhat 6 | Desc: mapepr script which reverse id's in each pair, such that all lines will be sorted according to the id of the user following other user 7 | 8 | ToDo: clear up the naming, 
/pre/mapper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2.5
"""
Remove 2.5 from the line above; it's a hack needed for this to work on Fedora.

Author: Akshay Bhat
Desc: mapper script which reverses the ids in each pair, so that all lines
can be sorted on the id of the user who is following the other user.

ToDo: clean up the naming; the current naming convention assumes Reverse to be true.

Usage: give
    -mapper "mapper.py Reverse"
for the twitter dataset from KAIST, and just
    -mapper mapper.py
for a
    SourceID\tTargetID\n
file.
"""
import sys

if __name__ == '__main__':
    Exclude = {}
    Reverse = False
    try:
        # exclude.txt is optional and is shipped to the cluster via -file
        for line in open('exclude.txt'):
            Exclude[line.strip().split('\t')[0]] = 1
    except IOError:
        pass
    if len(sys.argv) > 1:
        Reverse = True

    for line in sys.stdin:
        entries = line.strip().split('\t')
        Following = entries[0]
        User = entries[1]
        if User in Exclude or Following in Exclude:
            continue   # drop edges touching excluded ids
        if Reverse:
            print User + '\t' + Following
        else:
            print Following + '\t' + User
--------------------------------------------------------------------------------

/pre/reducer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2.5
"""
Author: Akshay Bhat
Desc: reducer script to collect all ids followed by a user into a single line.
"""
import sys

if __name__ == '__main__':
    curUser = None
    Buf = ''
    for line in sys.stdin:
        entries = line.strip().split('\t')
        User = entries[0]
        Following = entries[1]
        if curUser != User:   # a new user begins; flush the previous one
            if curUser:
                print Buf
            Buf = User
            curUser = User
        Buf += '\t' + Following
    if curUser:   # flush the last user
        print Buf
--------------------------------------------------------------------------------

/pre/reverse.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2.5
"""
Remove 2.5 from the line above; it's a hack needed for this to work on Fedora.

Author: Akshay Bhat
Desc: variant of mapper.py that reverses the ids in each pair by default, so
that all lines can be sorted on the id of the user who is following the
other user.

ToDo: clean up the naming; the current naming convention assumes Reverse to be true.

Usage: give
    -mapper reverse.py
for the twitter dataset from KAIST.
"""
import sys

if __name__ == '__main__':
    # a list indexed by id might be used instead of the dictionary; it is less
    # memory efficient, but it guarantees O(1) access time
    Exclude = {}
    try:
        for line in open('exclude.txt'):
            Exclude[line.strip().split('\t')[0]] = 1
    except IOError:
        pass
    Reverse = True   # this variant reverses by default; any extra command
    if len(sys.argv) > 1:   # line argument disables the reversal
        Reverse = False

    for line in sys.stdin:
        entries = line.strip().split('\t')
        Following = entries[0]
        User = entries[1]
        if User in Exclude or Following in Exclude:
            continue   # drop edges touching excluded ids
        if Reverse:
            print User + '\t' + Following
        else:
            print Following + '\t' + User
--------------------------------------------------------------------------------
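Both pre-processing scripts can be tried the same way, with sort playing the
shuffle; the toy edge list below (SourceID \t TargetID) is made up for
illustration:

    printf '1\t2\n1\t3\n2\t3\n' | python mapper.py | sort -k1,1 | python reducer.py
    # expected output, one adjacency list per line:
    # 1  2  3
    # 2  3
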
/pre/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Generic Usage:
# ./run.sh <edge list on HDFS>, e.g. ./run.sh DP
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output $1.net -mapper mapper.py -reducer reducer.py -file mapper.py -file reducer.py

# for twitter
#hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output $1.net -mapper 'mapper.py Reverse' -reducer reducer.py -file mapper.py -file reducer.py -file exclude.txt
--------------------------------------------------------------------------------

/readme:
--------------------------------------------------------------------------------
Scalable Community Detection using Label Propagation and Map Reduce

Author: Akshay Bhat
Contact: akshaybhat [at] gmail.com

Please visit http://www.akshaybhat.com/LPMR for more information

Organization:
    Folder      Description
    lp          Code for community detection
    pre         Code for pre-processing the edge list file
    twitter     Code for automating everything for the twitter dataset

Usage:
    Note that this is experimental code, not a library, and thus it involves
    multiple hacks.

    You will need a working Hadoop installation. This code has been tested on
    a cluster running Hadoop 0.19, so it should also work with versions
    > 0.19. You will still need to change the path to the Hadoop streaming
    jar file; a one-pass fix is sketched right after this readme.

    Download twitter_rv.net from http://an.kaist.ac.kr/traces/WWW2010.html

    Download numeric2users.tar.gz from the same website, extract it, rename it
    to users.txt and put it outside the LPMR folder, which is where run.py
    ships it from as ../../users.txt. (Sorry if this sounds weird; it will be
    fixed soon.)

    cd into the twitter directory and execute
        ./run-twitter.sh twitter_rv.net

    [you will most likely get errors due to hadoop not being ]

License: Research purpose only
--------------------------------------------------------------------------------
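The streaming-jar path is hardcoded in several scripts. One way to point them
all at your own installation in a single pass, from the repository root (a
sketch; substitute the placeholder with the real location of your jar):

    grep -rl 'hadoop-0.19.2-streaming.jar' . | xargs sed -i \
        's|/usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar|/path/to/hadoop-streaming.jar|g'
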
/twitter/run-twitter.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Generic Usage:
# ./run-twitter.sh twitter_rv.net

cd ../pre
# reversed edge list, kept for the Pegasus jobs below (map only, no reducer)
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output Twitter/Net.pegasus -mapper 'mapper.py Reverse' -reducer NONE -file mapper.py
# adjacency lists with the ids listed in exclude.txt removed
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output Twitter/Net.exclude -mapper 'mapper.py Reverse' -reducer reducer.py -file mapper.py -file reducer.py -file ../twitter/exclude.txt
# complete adjacency lists, nothing excluded
hadoop jar /usr/local/hadoop/contrib/streaming/hadoop-0.19.2-streaming.jar -input $1 -output Twitter/Net.complete -mapper 'mapper.py Reverse' -reducer reducer.py -file mapper.py -file reducer.py
cd ..

# 15 iterations of label propagation with maximum node index 70000000
cd lp
./run.py Twitter/Net.exclude 15 70000000
cd ..

# cd ../PEG
# code for performing in/out degree and page rank calculation using Pegasus
# ./run_dd.sh in 72 Twitter/Net.pegasus
# ./run_dd.sh out 72 Twitter/Net.pegasus
# ./run_pr.sh 72 70000000 nosym Twitter/Net.pegasus
# cd ..
--------------------------------------------------------------------------------
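Once run-twitter.sh finishes, the summarized communities sit in the HDFS
directory named by exec_final_string in run.py; a sketch for pulling them to
the local disk:

    hadoop fs -getmerge Twitter/Net.exclude.Label.final.txt communities.txt
    head -n 3 communities.txt   # cluster-label \t num_nodes \t nodeid \t weight ...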