├── .gitignore ├── .gitmodules ├── AUTHORS ├── CONTRIBUTING.md ├── LICENSE.txt ├── MAINTAINERS.md ├── README.md ├── circulo ├── algorithms │ ├── __init__.py │ ├── betweenness.py │ ├── biSBM │ │ ├── Makefile │ │ ├── biSBM.c │ │ └── biSBM.h │ ├── conga.py │ ├── congo.py │ ├── congo_test.py │ ├── girvan_newman.py │ ├── min_conductance.py │ ├── overlap.py │ ├── radicchi.py │ ├── rolx.py │ ├── snap_bigclam.py │ ├── snap_cesna.py │ ├── snap_cnm.py │ ├── snap_coda.py │ ├── snap_cpm.py │ ├── snap_girvan_newman.py │ ├── snap_infomap.py │ └── spectral.py ├── data │ ├── README.md │ ├── README_template.md │ ├── amazon │ │ └── run.py │ ├── as_data │ │ ├── README.md │ │ └── run.py │ ├── databot.py │ ├── flights │ │ ├── README.md │ │ └── run.py │ ├── football │ │ ├── README.md │ │ └── run.py │ ├── house_voting │ │ ├── README.md │ │ ├── download.sh │ │ └── run.py │ ├── karate │ │ ├── README.md │ │ └── run.py │ ├── malaria │ │ ├── README.md │ │ └── run.py │ ├── nba_schedule │ │ ├── README.md │ │ └── run.py │ ├── netscience │ │ ├── README.md │ │ └── run.py │ ├── pgp │ │ ├── README.md │ │ └── run.py │ ├── revolution │ │ ├── README.md │ │ └── run.py │ ├── school │ │ ├── README.md │ │ └── run.py │ ├── scotus │ │ ├── README.md │ │ └── run.py │ ├── senate_voting │ │ ├── README.md │ │ ├── download.sh │ │ ├── exercise.md │ │ └── run.py │ └── southernwomen │ │ ├── README.md │ │ └── run.py ├── metrics │ ├── cover.py │ ├── graph.py │ ├── omega.py │ └── probability_metric.py ├── setup │ ├── run_algos.py │ └── run_metrics.py ├── unit_tests │ ├── karate.gml │ ├── metrics.py │ └── test_metrics.py ├── utils │ ├── downloader.py │ ├── general.py │ ├── snap.py │ └── stochastic_selector.py └── wrappers │ └── community.py ├── experiments ├── README.md ├── cluster_omega_comparison.py ├── community_label.py ├── gephi_plot │ ├── create_graphml.py │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── lab41 │ │ └── circulo │ │ └── gephi_plot │ │ └── PlotGraphs.java ├── goodness_indicators.py ├── histogram_metrics.py ├── images │ ├── bubble_plot.png │ ├── community_label_results.png │ ├── counts.png │ ├── flights_algo_infomap.png │ ├── football--groundtruth--0.png │ ├── football_histogram.png │ └── time_vs_omega.png ├── metricsCharts.R ├── metrics_clustering.py ├── omega_comparison.py └── partition_metrics.R └── support ├── Dockerfile ├── requirements.txt └── server_scripts ├── circulo_server.sh └── clean_circulo.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .DS_Store 4 | *.swp 5 | *.pickle 6 | circulo/data/*/raw/ 7 | circulo/data/GRAPHS/ 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/snap"] 2 | path = lib/snap 3 | url = https://github.com/snap-stanford/snap.git 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This file lists all individuals having contributed content to the repository. 2 | # If you're submitting a patch, please add your name here in alphabetical order as part of the patch. 3 | # 4 | # For a list of active project maintainers, see the MAINTAINERS file. 
5 | # 6 | Paul M 7 | Yonas Tesfaye 8 | Nikhil Desai 9 | Robbie Ostrow 10 | and various US Government Participants 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | Paul M 2 | Yonas Tesfaye 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Circulo: A Community Detection Evaluation Framework 2 | 3 | ###Contribute 4 | Contributors welcome! If you want to contribute, please issue a pull request. 5 | 6 | ##About 7 | ####The Framework: 8 | Circulo is a "Community Detection" Evaluation Framework written primarily in Python. The Framework performs statistical analysis against partitions of a Graph resulting from the execution of a given community detection algorithm. The resulting quantitative measures can be used to drive experiments such as measuring algorithm efficacy against specific dataset types or comparing different algorithm execution results against the same dataset. The framework includes the following components: 9 | 10 | - __Data ETL (Extract Transform Load) Engine__: Circulo includes functionality to incorporate existing datasets into the evaluation framework. By default, the framework includes several dataset "downloaders" in the directory [circulo/data](circulo/data). To learn how to add a dataset, please see the data [README](circulo/data/README.md). We encourage users to issue pull requests for new datasets. 11 | - __Algorithm Execution Engine:__ Circulo includes several algorithms by default which are located in the [algorithms](circulo/algorithms) directory. Using the framework, these algorithms can run in parallel against the included datasets by running the [run_algos.py](circulo/setup/run_algos.py) tool. Because some algorithms include parameters that can better cater execution to the type of input Graph (i.e directed and/or weighted), algorithm execution is wrapped in the file [community.py](circulo/wrappers/community.py). This enables the same algorithm to automatically operate differently depending on the dataset--enabling the algorithm to adapt to the dataset if allowed. 
To add an algorithm to the framework, add the files to [algorithms](circulo/algorithms) and update the wrapper [community.py](circulo/wrappers/community.py) appropriately.
12 | - __Metrics Engine:__ The metrics engine provides quantitative analysis of a given partitioning of a graph. The metrics include internal statistical measures of a community (i.e. density), external measurements (i.e. expansion), and network-wide metrics (ground truth comparisons).
13 | - __Experiments Engine:__ Different types of experiments have been designed to find patterns in the metric results. For example, how do algorithms compare when considering both time and accuracy? This component is meant to be a "playground" for experimentation on metric results. Experiments may vary significantly. Each file in the [experiments](experiments) directory is meant to be an independent experiment. See the [README](experiments/README.md) for more information.
14 | 
15 | ####The Research
16 | Prior to building the Circulo framework, Lab41 conducted a market survey into Community Detection algorithms and metrics. The survey was used to guide the development of Circulo. The survey includes, but is not limited to, summaries of algorithms, references to academic literature, and general information about the field. The survey can be found here: http://lab41.github.io/survey-community-detection/.
17 | 
18 | 
19 | ####The Underlying Graph Framework
20 | Since we did not want to reimplement the notion of a graph, we decided to pick an existing graph framework as a backdrop for our work. Though any of the popular graph frameworks could have been used, iGraph was chosen as our primary graph framework. iGraph offers a number of features:
21 | 
22 | - First and foremost, iGraph implements a number of community detection algorithms out of the box. It also provides two data structures for community detection: VertexClustering (non-overlapping communities) and VertexCover (overlapping communities)
23 | - iGraph is written in C at its core, making it fast
24 | - iGraph has wrappers for Python and R
25 | - iGraph is a mature framework
26 | 
27 | Other frameworks that could have been used include GraphX, GraphLab, SNAP, and NetworkX.
28 | 
29 | 
30 | ##Installation and Setup
31 | ####Package Requirements
32 | 
33 | - git
34 | - python3
35 | - igraph (Refer to Appendix A for further instructions)
36 | - matplotlib
37 | - cairo (if you want to plot directly from igraph)
38 | - scipy
39 | - scikit-learn
40 | 
41 | 
42 | ####Installation
43 | Below are instructions for installing Circulo:
44 | 
45 | #clone Circulo repository (note: this also clones SNAP)
46 | git clone --recursive https://github.com/Lab41/circulo.git
47 | #set PYTHONPATH env variable
48 | export PYTHONPATH=/path/to/Circulo
49 | #make the snap code base
50 | pushd lib/snap; make; popd
51 | 
52 | 
53 | 
54 | #### Running the Evaluation Framework
55 | At its core, the evaluation framework runs various community detection algorithms against various datasets.
56 | 
57 | #To run your algorithms against the data
58 | python circulo/setup/run_algos.py [parameters ...]
59 | #To run metrics against the results of run_algos
60 | python circulo/setup/run_metrics.py [parameters ...]
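The datasets and algorithms can also be driven directly from Python. The snippet below is a minimal, illustrative sketch rather than part of the framework itself; it assumes Circulo is on your PYTHONPATH, uses the Amazon databot from [circulo/data/amazon/run.py](circulo/data/amazon/run.py), and runs one of igraph's built-in community detection methods on the resulting graph:

    from circulo.data.amazon.run import AmazonData

    databot = AmazonData("amazon")              # downloads and prepares the raw data on first use
    G = databot.get_graph()                     # igraph.Graph loaded from the generated graphml file
    ground_truth = databot.get_ground_truth(G)  # igraph.VertexCover of the known communities
    result = G.community_infomap()              # any igraph community detection method works here
    print(len(result), "communities found")

The same pattern works for every dataset under [circulo/data](circulo/data), since each databot subclasses CirculoData.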
61 | 62 | 63 | 64 | ##Appendix 65 | ####Appendix A: iGraph Installation 66 | #####Ubuntu 67 | 68 | sudo apt-get install igraph 69 | sudo apt-get install libxml2-dev libz-dev python-dev 70 | 71 | #####OS X 72 | 73 | #using brew install igraph dylibs 74 | brew install homebrew/science/igraph 75 | 76 | #install Cairo 77 | #installs the core libraries for cairo 78 | brew install cairo 79 | 80 | #installs the python site-packages. NOTE: pip does not work for pycairo. 81 | #If you want to use pip, create sym links to the site packages in brew 82 | brew install py3cairo 83 | 84 | #install python igraph 85 | pip3 install python-igraph 86 | 87 | -------------------------------------------------------------------------------- /circulo/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | modules = glob.glob(os.path.dirname(__file__)+"/*.py") 4 | __all__ = [ os.path.basename(f)[:-3] for f in modules] 5 | -------------------------------------------------------------------------------- /circulo/algorithms/betweenness.py: -------------------------------------------------------------------------------- 1 | import igraph as ig 2 | import itertools 3 | from collections import Counter 4 | 5 | 6 | def edge_and_pair_betweenness(G): 7 | """ 8 | An attempt to find the edge and pair betweennesses without finding all 9 | shortest paths, using flows. Currently unused. 10 | """ 11 | eb = {edge.tuple : 0 for edge in G.es} 12 | pb = {vertex.index : {uw : 0 for uw in itertools.combinations(G.neighbors(vertex), 2)} for vertex in G.vs} 13 | for v in G.vs: 14 | flows, pairFlows = get_flows(G, v.index, eb, pb) 15 | for flow in flows: # pythonify 16 | eb[flow] += flows[flow] / 2. # counted twice. 17 | for pflow in pairFlows: 18 | for uw in pairFlows[pflow]: 19 | pb[pflow][uw] += pairFlows[pflow][uw] / 2. 20 | return eb, pb 21 | 22 | 23 | def get_flows(G, index, eb, pb): 24 | """ 25 | Initializing the edge and pair betweenness dicts using flows. 26 | Edge betweenness correct, but pair betweenness needs work. 27 | This can be used as a template for future work concerning flows. 28 | """ 29 | # don't reinitialize these dicts each time. 
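# flows maps each edge (as an ordered vertex tuple) to the shortest-path flow it carries
# away from the BFS root `index`; pairFlows[v][(u, w)] accumulates the flow routed through
# v between its neighbour pair (u, w). Both are rebuilt for every root and then summed
# (and halved, since each path is counted twice) by edge_and_pair_betweenness above.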
30 | flows = {edge.tuple : 0 for edge in G.es} 31 | pairFlows = {vertex.index : {uw : 0 for uw in itertools.combinations(G.neighbors(vertex), 2)} for vertex in G.vs} 32 | bfs = G.bfsiter(index, advanced=True) 33 | bfsList = [bfs.next()[0].index] 34 | # skipping root, manually adding it 35 | bfsDict = {index : {"depth" : 0, "parents" : [], "numPaths": 1, "flow": 1}} 36 | nodesSeen = set([index]) 37 | # initializing bfs dict (decompose) 38 | for v, depth, parent in bfs: 39 | i = v.index 40 | bfsList.append(i) # probably don't need 41 | parents = [p for p in G.neighbors(v) if p in nodesSeen and bfsDict[p]["depth"] < depth] 42 | nodesSeen.add(i) 43 | numPaths = sum(bfsDict[p]["numPaths"] for p in parents) 44 | bfsDict[i] = {"depth": depth, "parents": parents, "numPaths": numPaths, "flow": 1} 45 | 46 | # getting flows (decompose) 47 | for v in reversed(bfsList): 48 | # getting edge flows 49 | parents = bfsDict[v]["parents"] 50 | totalPaths = float(sum(bfsDict[p]["numPaths"] for p in parents)) 51 | for p in parents: 52 | flowProportion = bfsDict[p]["numPaths"] / totalPaths 53 | flow = flowProportion * bfsDict[v]["flow"] 54 | flows[order_tuple((v, p))] = flow 55 | bfsDict[p]["flow"] += flow 56 | grandparents = bfsDict[p]["parents"] 57 | totalGrandparentPaths = float(sum(bfsDict[g]["numPaths"] for g in set(grandparents))) 58 | for g in grandparents: 59 | gCount = Counter(grandparents) 60 | gFlowProportion = bfsDict[g]["numPaths"] / totalPaths / float(gCount[g]) 61 | gFlow = gFlowProportion * bfsDict[v]["flow"] 62 | pairFlows[p][order_tuple((v, g))] = gFlow 63 | 64 | ## 65 | # pairFlows are incorrect!!! 66 | ## 67 | return flows, pairFlows -------------------------------------------------------------------------------- /circulo/algorithms/biSBM/Makefile: -------------------------------------------------------------------------------- 1 | # # Makefile modified from http://www.cs.swarthmore.edu/~newhall/unixhelp/howto_makefiles.html 2 | 3 | 4 | # Essentially the commands that are being run: 5 | 6 | # all: 7 | # gcc -O3 -Wall -pedantic biSBM.c -I/usr/local/Cellar/igraph/0.7.1/include/igraph -o biSBM -L/usr/local/Cellar/igraph/0.7.1/lib -ligraph 8 | 9 | # debug: 10 | # gcc -Wall -g -pedantic biSBM.c -I/usr/local/Cellar/igraph/0.7.1/include/igraph -o biSBM_debug -L/usr/local/Cellar/igraph/0.7.1/lib -ligraph 11 | 12 | 13 | # # define the compiler to use 14 | CC = gcc 15 | 16 | # # define any compile-time flags 17 | CFLAGS = -O3 -g -Wall -pedantic 18 | CFLAGS_DEBUG = -Wall -g -pedantic 19 | 20 | # # define any directories containing header files other than /usr/include 21 | INCLUDES = -I/usr/local/Cellar/igraph/0.7.1/include/igraph 22 | 23 | # # define library paths in addition to /usr/lib 24 | LFLAGS = -L/usr/local/Cellar/igraph/0.7.1/lib 25 | 26 | # # define any libraries to link into executable: 27 | LIBS = -ligraph 28 | 29 | # # define the source files 30 | SRCS = biSBM.c 31 | 32 | # # define the object files 33 | OBJS = $(SRCS:.c=.o) 34 | 35 | # # define the executable file 36 | MAIN = biSBM 37 | 38 | MAIN_DEBUG = biSBM_debug 39 | 40 | .PHONY: depend clean 41 | 42 | all: $(MAIN) 43 | @echo Compilation completed successfully. 44 | 45 | debug: $(MAIN_DEBUG) 46 | @echo Unoptimized compilation completed successfully. 
47 | 48 | $(MAIN): $(OBJS) 49 | $(CC) $(CFLAGS) $(INCLUDES) -o $(MAIN) $(OBJS) $(LFLAGS) $(LIBS) 50 | 51 | $(MAIN_DEBUG): $(OBJS) 52 | $(CC) $(CFLAGS_UNOPT) $(INCLUDES) -o $(MAIN_DEBUG) $(OBJS) $(LFLAGS) $(LIBS) 53 | 54 | # this is a suffix replacement rule for building .o's from .c's 55 | # it uses automatic variables $<: the name of the prerequisite of 56 | # the rule(a .c file) and $@: the name of the target of the rule (a .o file) 57 | # (see the gnu make manual section about automatic variables) 58 | .c.o: 59 | $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ 60 | 61 | clean: 62 | $(RM) *.o *~ $(MAIN) $(MAIN_DEBUG) 63 | 64 | depend: $(SRCS) 65 | makedepend $(INCLUDES) $^ 66 | 67 | # # DO NOT DELETE THIS LINE -- make depend needs it 68 | -------------------------------------------------------------------------------- /circulo/algorithms/biSBM/biSBM.h: -------------------------------------------------------------------------------- 1 | /** 2 | * TODO: main comment 3 | * 4 | * 5 | */ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | int igraph_community_bipartite_sbm(igraph_t *graph, igraph_vector_t *membership, 18 | igraph_integer_t k_a, igraph_integer_t k_b, 19 | igraph_integer_t max_iters, igraph_bool_t degree_correct); 20 | 21 | int log_message(const char *message, ...); 22 | 23 | void igraph_read_graph_generic(igraph_t *graph, char *type, char *file_name); 24 | 25 | void print_usage_and_exit(int exitstatus); 26 | 27 | 28 | -------------------------------------------------------------------------------- /circulo/algorithms/congo_test.py: -------------------------------------------------------------------------------- 1 | import circulo.algorithms.congo as CONGO 2 | import unittest 3 | import igraph 4 | import itertools 5 | 6 | class TestCongoFunctions(unittest.TestCase): 7 | 8 | def setUp(self): 9 | """ 10 | Initializes the graph for testing to Zachary's 11 | karate club. 12 | """ 13 | self.graph = igraph.Graph.Famous("zachary") 14 | self.graph.vs['CONGA_orig'] = [i.index for i in self.graph.vs] 15 | self.graph.es['eb'] = 0 16 | self.graph.vs['pb'] = [{uw : 0 for uw in itertools.combinations(self.graph.neighbors(vertex), 2)} for vertex in self.graph.vs] 17 | 18 | 19 | def tearDown(self): 20 | self.graph = None 21 | 22 | 23 | def test_test(self): 24 | """ 25 | Calculates the edge betweenness twice and checks 26 | equality. Just making sure the testing framework 27 | and igraph are working properly. 28 | """ 29 | eb = self.graph.edge_betweenness() 30 | self.assertEqual(self.graph.edge_betweenness(), eb) 31 | 32 | 33 | def test_edge_betweenness(self): 34 | """ 35 | Checks that the implementation of edge_betweenness in 36 | edge_and_pair_betweenness matches that of igraph's 37 | graph.edge_betweenness. 38 | """ 39 | ebtheirs = self.graph.edge_betweenness() 40 | ebmine, _ = CONGO.edge_and_pair_betweenness(self.graph) 41 | for e in self.graph.es: 42 | self.assertAlmostEqual(ebtheirs[e.index], ebmine[e.tuple]) 43 | 44 | 45 | def test_pair_betweenness(self): 46 | """ 47 | Checks to make sure that the sum of all pair betweennesses 48 | on a specific vertex are equal to its vertex betweenness. 
49 | """ 50 | _, pb = CONGO.edge_and_pair_betweenness(self.graph) 51 | vb = self.graph.betweenness() 52 | for v in pb: 53 | self.assertAlmostEqual(sum(pb[v].values()), vb[v]) 54 | 55 | 56 | # def test_vertex_betweeenness_from_eb(self): 57 | # """ 58 | # Checks that the implementation of vertex_betweeenness_from_eb 59 | # yields the same results as that of igraph's graph.betweenness 60 | # """ 61 | # eb = self.graph.edge_betweenness() 62 | # ebmine, _ = CONGO.edge_and_pair_betweenness(self.graph) 63 | # vbtheirs = self.graph.betweenness() 64 | # vbmine = CONGO.vertex_betweeenness_from_eb(self.graph, ebmine) 65 | # for v in self.graph.vs: 66 | # self.assertAlmostEqual(vbtheirs[v.index], vbmine[v.index]) 67 | 68 | 69 | def test_initialize_betweenness(self): 70 | cp = self.graph.copy() 71 | 72 | eb = self.graph.edge_betweenness() 73 | CONGO.do_initial_betweenness(cp, 3) 74 | for i, e in enumerate(eb): 75 | self.assertAlmostEqual(e, cp.es[i]['eb']) 76 | 77 | 78 | # def testBetweennesses(G, h): 79 | # eb = G.edge_betweenness(cutoff=h) 80 | # for i, v in enumerate(G.es): 81 | # print v['eb'], 2 * eb[i], abs(v['eb'] - 2 * eb[i]) > .001 82 | 83 | 84 | def suite(): 85 | suite = unittest.TestSuite() 86 | tests = ['test_test', 'test_vertex_betweeenness_from_eb', 'test_edge_betweenness', 'test_pair_betweenness'] 87 | 88 | return unittest.TestSuite(list(map(TestCongoFunctions, tests))) 89 | 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /circulo/algorithms/girvan_newman.py: -------------------------------------------------------------------------------- 1 | import igraph as ig 2 | import operator 3 | import sys 4 | 5 | 6 | def gn(origGraph): 7 | """ 8 | Parameters: 9 | origGraph: a graph in igraph format 10 | 11 | Return value: 12 | A dendrogram (VertexDendrogram) created by running Girvan-Newman 13 | 14 | Notes: 15 | Runs the Girvan-Newman (edge-betweenness) algorithm on the graph provided. 16 | Iteratively removes the edge with the highest edge-betweenness, then recalculates. 17 | """ 18 | 19 | # initialize a list of removed edges that result in a split of the graph 20 | splits = [] 21 | 22 | G = origGraph.copy() 23 | 24 | while G.es: 25 | 26 | # Calculate all edge betweennesses 27 | # TODO: only recalculate on relevant portions 28 | edge_betweennesses = G.edge_betweenness() 29 | 30 | # returns an the first index if there is a tie at max. 31 | max_index, _ = max(enumerate(edge_betweennesses), key=operator.itemgetter(1)) 32 | 33 | # edge with the max betweenness 34 | edge = G.es[max_index].tuple 35 | 36 | G.delete_edges(edge) 37 | 38 | if splitGraph(G, edge): 39 | 40 | # edge is a tuple, but we want a list of lists. 41 | splits += [list(edge)] 42 | 43 | vd = createDendrogram(origGraph, splits) 44 | 45 | # If we don't call this then as_clustering() fails. bugfix in development branch. 46 | vd.optimal_count 47 | 48 | return vd 49 | 50 | 51 | def splitGraph(G, edge): 52 | """ 53 | Parameters: 54 | G: an igraph graph 55 | edge: an edge of the form (v1, v2) where v1 and v2 are vertices in G. 56 | 57 | Return value: 58 | A boolean value. True if removing the edge splits the graph. 59 | 60 | Notes: 61 | Checks to see if removing edge from G splits the graph into 2 disjoint 62 | communities. If so, returns True, otherwise False. 
63 | """ 64 | 65 | return not G.edge_disjoint_paths(source=edge[0], target=edge[1]) 66 | 67 | 68 | def createDendrogram(G, splits): 69 | """ 70 | Given a historical list of split edges, creates a dendrogram 71 | by calculating the merges. 72 | 73 | Runs in O(nlgn) (But really, close to O(n).) This is a useful function 74 | for any divisive algorithm for which splits can be saved more easily 75 | than merges. 76 | """ 77 | 78 | # To create a dendrogram, new merges have id of max id + 1 79 | n = len(splits) + 1 80 | merges = [] 81 | 82 | mergeDict = {} 83 | 84 | while splits: 85 | # most recent split popped off 86 | edge = splits.pop() 87 | 88 | # Get the values the dendrogram wants for each vertex by finding 89 | # where merges have already happened. 90 | edge = [traverse(vertex, mergeDict) for vertex in edge] 91 | 92 | merges += [edge] 93 | 94 | # Update the dict to reflect a new merge. 95 | for vertex in edge: 96 | mergeDict[vertex] = n 97 | 98 | n += 1 99 | 100 | return ig.VertexDendrogram(G, merges) 101 | 102 | 103 | def traverse(vertex, mergeDict): 104 | """ 105 | Given a vertex and a dictionaty of merges, returns the id of the cluster 106 | the vertex belongs to. 107 | """ 108 | while vertex in mergeDict: 109 | vertex = mergeDict[vertex] 110 | return vertex 111 | 112 | 113 | 114 | if __name__ == "__main__": 115 | G = ig.load(sys.argv[1]) 116 | gn(G) -------------------------------------------------------------------------------- /circulo/algorithms/min_conductance.py: -------------------------------------------------------------------------------- 1 | import circulo.metrics 2 | import circulo.algorithms.spectral 3 | from igraph import Graph 4 | 5 | def min_conductance(G, weights=None, tries=3): 6 | ''' 7 | Returns the minimum conductance of a Graph by using spectral clustering to ``approximate'' the minimum ratio-cut. 
8 | http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf 9 | ''' 10 | (rv_val, rv_vc) = (float("inf"), None) 11 | for i in range(0,tries): 12 | try: 13 | #Obtain a cut of G, it should already be a minimum 14 | curr_vc = G.community_spectral(k=2, weights=weights, which='NCut') 15 | curr_val = max(curr_vc.as_cover().conductance()) 16 | if curr_val < rv_val : 17 | (rv_val, rv_vc) = (curr_val, curr_vc) 18 | except: 19 | pass 20 | 21 | 22 | return rv_val, rv_vc 23 | 24 | Graph.min_conductance = min_conductance 25 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_bigclam.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import os 3 | import subprocess 4 | from circulo.utils.snap import setup,read_communities_by_community 5 | from multiprocessing import cpu_count 6 | 7 | def bigclam(G, data_prefix='snap_', node_filepath='', detect_comm=100, min_comm=5, max_comm=100, trials=5, threads=cpu_count(), alpha=0.3, beta=0.3): 8 | ''' 9 | BigClam from Snap 10 | 11 | Parameters 12 | ---------- 13 | G : A NetworkX graph or edge list file 14 | data_prefix: Output file for communitities (data_prefix + cmtyvv.txt) 15 | node_file_path: Input file name for node names (Node ID, Node label) 16 | detect_comm: The number of communities to detect (-1: detect automatically) (Default: 100) 17 | min_comm: Minimum number of communities to try (Default = 5) 18 | max_comm: Maximum number of communities to try (Default = 100) 19 | trials: How many trials for the number of communities (Default = 10) 20 | threads: Number of threads for parallelization (Default = 4) 21 | alpha: Alpha for backtracking line search (Default = 0.05) 22 | beta: Beta for backtracking line search (Default = 0.3) 23 | 24 | Returns: List of SubGraphs representing the communities. The SubGraphs are automatically serialized to disk as file data_prefix+'cmtyvv.txt' 25 | ''' 26 | 27 | snap_home, graph_file = setup(G, include_header=False) 28 | 29 | 30 | if graph_file is None: 31 | return None 32 | 33 | path_bigclam = os.path.join(snap_home, "examples", "bigclam", "bigclam") 34 | 35 | try: 36 | FNULL = open(os.devnull, 'w') 37 | out = subprocess.Popen([path_bigclam,"-o:"+data_prefix,"-i:"+graph_file,"-l:"+node_filepath,"-c:"+str(detect_comm), "-mc:"+str(min_comm), "-xc:"+str(max_comm), "-nc:"+str(trials), "-nt:"+str(threads), "-sa:"+str(alpha), "-sb:"+str(beta)], stdout=FNULL).wait() 38 | 39 | except TypeError as e: 40 | print("Error occurred: {}".format(e)) 41 | return 42 | 43 | 44 | os.remove(graph_file) 45 | 46 | return read_communities_by_community(data_prefix + "cmtyvv.txt", G, delete_file=True) 47 | 48 | 49 | def main(): 50 | 51 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 52 | snap_home, filename = setup(G) 53 | 54 | vc = bigclam(G) 55 | print(vc) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_cesna.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | import igraph 17 | import os 18 | import subprocess 19 | from circulo.utils import snap 20 | import shutil 21 | from multiprocessing import cpu_count 22 | 23 | def cesna(G, attributes_to_include, data_prefix='snap_', node_filepath='', detect_comm=100, min_comm=5, max_comm=100, trials=5, threads=cpu_count(), alpha=.3, beta=0.3): 24 | 25 | ''' 26 | Parameters 27 | ----------- 28 | G: An iGraph or edge list file 29 | f_attributes: Input node attribute file name (Required) 30 | f_attribute_names: Input file name for node attribute names (Required) 31 | nodes: Input file name for node names (Node ID, Node label) 32 | detect_comm: The number of communities to detect (-1: detect automatically) (default:10) 33 | min_comm: Minimum number of communities to try (default:3) 34 | max_comm: Maximum number of communities to try (default:20) 35 | trials: How many trials for the number of communities (default:5) 36 | threads: Number of threads for parallelization (default:4) 37 | aw: We maximize (1 - aw) P(Network) + aw * P(Attributes) (default:0.5) 38 | lw: Weight for l-1 regularization on learning the logistic model parameters (default:1) 39 | alpha: Alpha for backtracking line search (default:0.05) 40 | beta: Beta for backtracking line search (default:0.3) 41 | mf if the fraction of nodes with positive values for an attribute is smaller than this, we ignore that attribute (default:0) 42 | ''' 43 | 44 | snap_home, graph_file = snap.setup(G) 45 | 46 | f_attribute_names, f_attributes = snap.attribute_setup(G, attributes_to_include) 47 | if graph_file is None: 48 | return 49 | 50 | path_cesna = os.path.join(snap_home, "examples", "cesna", "cesna") 51 | 52 | try: 53 | FNULL = open(os.devnull, 'w') 54 | out = subprocess.Popen([path_cesna,"-o:"+data_prefix,"-i:"+graph_file,"-l:"+node_filepath, "-c:" + str(detect_comm), "-mc:"+str(min_comm), "-xc:"+str(max_comm), "-nc:"+str(trials), "-nt:"+str(threads), "-sa:"+str(alpha), "-sb:"+str(beta), "-a:"+f_attributes, "-n:"+f_attribute_names],stdout=FNULL).wait() 55 | 56 | 57 | except TypeError as e: 58 | print("Error occurred: {}".format(e)) 59 | return 60 | 61 | os.remove(graph_file) 62 | 63 | return snap.read_communities_by_community(data_prefix + "cmtyvv.txt", G, delete_file=True) 64 | 65 | 66 | 67 | def main(): 68 | 69 | G = igraph.load('/Users/ytesfaye/tmp/GRAPHS/flights.graphml') 70 | #snap_home, filename = setup(G) 71 | 72 | vc = cesna(G) 73 | print(vc) 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_cnm.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from circulo.utils.snap import divisive, setup 3 | 4 | def clauset_newman_moore(G, output="communities.txt"): 5 | return divisive(G, "2", output) 6 | 7 | def main(): 8 | 9 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 10 | snap_home, filename = setup(G) 11 | 12 | vc = clauset_newman_moore(G) 13 | print(vc) 14 | 15 | if __name__ == "__main__": 16 | main() 17 | 
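# The string passed to divisive() selects which algorithm SNAP runs: these wrappers use
# "1" for Girvan-Newman (snap_girvan_newman.py), "2" for Clauset-Newman-Moore (this file),
# and "3" for Infomap (snap_infomap.py).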
--------------------------------------------------------------------------------
/circulo/algorithms/snap_coda.py:
--------------------------------------------------------------------------------
 1 | import igraph
 2 | import os
 3 | import subprocess
 4 | from circulo.utils.snap import setup, read_communities_by_community
 5 | 
 6 | 
 7 | def coda(G, data_prefix='snap_', node_filepath='', graph_type=0, detect_comm=100, min_comm=5, max_comm=100, trials=10, threads=4, alpha=0.05, beta=0.3):
 8 | '''
 9 | CODA from SNAP
10 | 
11 | Parameters
12 | ----------
13 | G : An igraph graph or edge list file
14 | node_filepath: Input file name for node names (Node ID, Node label)
15 | graph_type: 0=directed, 1=undirected (default: 0)
16 | detect_comm: The number of communities to detect (-1: detect automatically) (Default: 100)
17 | min_comm: Minimum number of communities to try (Default = 5)
18 | max_comm: Maximum number of communities to try (Default = 100)
19 | trials: How many trials for the number of communities (Default = 10)
20 | threads: Number of threads for parallelization (Default = 4)
21 | alpha: Alpha for backtracking line search (Default = 0.05)
22 | beta: Beta for backtracking line search (Default = 0.3)
23 | '''
24 | 
25 | snap_home, graph_file = setup(G)
26 | path_coda = os.path.join(snap_home, "examples", "coda", "coda")
27 | 
28 | try:
29 | FNULL = open(os.devnull, 'w')
30 | 
31 | out = subprocess.Popen([path_coda,"-o:"+data_prefix,"-i:"+graph_file,"-l:"+node_filepath,"-g:"+str(graph_type),"-c:"+str(detect_comm), "-mc:"+str(min_comm), "-xc:"+str(max_comm), "-nc:"+str(trials), "-nt:"+str(threads), "-sa:"+str(alpha), "-sb:"+str(beta)], stdout=FNULL).wait()
32 | 
33 | except TypeError as e:
34 | print("Error occurred: {}".format(e))
35 | return
36 | 
37 | 
38 | os.remove(graph_file)
39 | 
40 | #CODA returns both an "in" and an "out" communities file; only the "out" file is used here
41 | return read_communities_by_community(data_prefix + "cmtyvv.out.txt", G)
42 | 
43 | 
44 | 
45 | def main():
46 | 
47 | G = igraph.Graph.Erdos_Renyi(n=30, m=100)
48 | snap_home, filename = setup(G)
49 | 
50 | vc = coda(G)
51 | print(vc)
52 | 
53 | 
54 | if __name__ == "__main__":
55 | main()
56 | 
--------------------------------------------------------------------------------
/circulo/algorithms/snap_cpm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
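# Wrapper around SNAP's clique percolation ("cliques") example; see clique_percolation() below.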
16 | import igraph 17 | import os 18 | import subprocess 19 | from circulo.utils import snap 20 | 21 | def clique_percolation(G, data_prefix='snap_'): 22 | 23 | ''' 24 | Parameters 25 | ----------- 26 | G: An iGraph or edge list file 27 | ''' 28 | 29 | snap_home, graph_file = snap.setup(G) 30 | 31 | if graph_file is None: 32 | return 33 | 34 | path_cpm = os.path.join(snap_home, "examples", "cliques", "cliquesmain") 35 | 36 | try: 37 | FNULL = open(os.devnull, 'w') 38 | out = subprocess.Popen([path_cpm,"-o:"+data_prefix,"-i:"+graph_file], stdout=FNULL).wait() 39 | 40 | 41 | except TypeError as e: 42 | print("Error occurred: {}".format(e)) 43 | return 44 | 45 | os.remove(graph_file) 46 | 47 | return snap.read_communities_by_community("cpm-" + data_prefix + ".txt", G, delete_file=True) 48 | 49 | 50 | 51 | def main(): 52 | 53 | G = igraph.load('/Users/ytesfaye/tmp/GRAPHS/flights.graphml') 54 | #snap_home, filename = setup(G) 55 | 56 | vc = cesna(G) 57 | print(vc) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() -------------------------------------------------------------------------------- /circulo/algorithms/snap_girvan_newman.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from circulo.utils.snap import setup, divisive 3 | 4 | def girvan_newman(G, output="communities.txt"): 5 | return divisive(G, "1", output) 6 | 7 | def main(): 8 | 9 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 10 | snap_home, filename = setup(G) 11 | 12 | vc = girvan_newman(G) 13 | print(vc) 14 | 15 | if __name__ == "__main__": 16 | main() 17 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_infomap.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from circulo.utils.snap import divisive, setup 3 | 4 | def infomap(G, output="communities"): 5 | return divisive(G, "3", output) 6 | 7 | def main(): 8 | 9 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 10 | snap_home, filename = setup(G) 11 | 12 | vc = infomap(G) 13 | print(vc) 14 | 15 | if __name__ == "__main__": 16 | main() 17 | -------------------------------------------------------------------------------- /circulo/algorithms/spectral.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | from scipy.sparse import csc_matrix, diags 4 | from scipy.sparse.linalg import eigsh 5 | from scipy.cluster.vq import vq, kmeans2 6 | 7 | from igraph import Graph, VertexClustering 8 | 9 | def __eigenvectors_to_vc(G, eigvc): 10 | centroid, label = kmeans2(eigvc, eigvc.shape[1], minit='points') 11 | return VertexClustering(G, label) 12 | 13 | def __community_spectral_base(G, k, weights, normalized): 14 | L = csc_matrix(G.laplacian(weights=weights, 15 | normalized=normalized), 16 | dtype='d') 17 | eigvl, eigvc = eigsh(L, k, which='SM') 18 | if normalized: 19 | for row in eigvc: 20 | row /= norm(row) 21 | return __eigenvectors_to_vc(G, eigvc) 22 | 23 | def __community_spectral_rw(G, k, weights): 24 | L = G.laplacian(weights=weights) 25 | D = np.diag(L) 26 | L = csc_matrix(L, dtype='d') 27 | D = diags(D, 0, dtype='d', format=L.format) 28 | 29 | eigvl, eigvc = eigsh(L, k, M=D, which='SM') 30 | 31 | return __eigenvectors_to_vc(G, eigvc) 32 | 33 | def community_spectral(G, k=2, weights=None, which='NCut_rw'): 34 | ''' 35 | Performs a relaxed version of Ratio or N-cut by performing k-means on 36 | the (n, k)-matrix of eigenvectors from 
different versions of the Graph 37 | Laplacian. 38 | @params 39 | G : an igraph.Graph. 40 | k : number of communities to cluster. 41 | weights : A weight vector or the name of an edge property. 42 | which : the type of cut to perform, one of RatioCut, NCut, or NCut_rw. 43 | @returns 44 | vc : VertexClustering with up to k clusters 45 | ''' 46 | method = { 47 | 'RatioCut'.lower() : lambda g, c, w: __community_spectral_base(g, c, w, normalized=False), 48 | 'NCut'.lower() : lambda g, c, w: __community_spectral_base(g, c, w, normalized=True), 49 | 'NCut_rw'.lower() : lambda g, c, w: __community_spectral_rw(g, c, w) 50 | } 51 | 52 | # The default cut is accross components 53 | vc = G.components() 54 | if len(vc) >= k: 55 | membership = [ x%k for x in vc.membership ] 56 | vc = VertexClustering(G, membership) 57 | else: 58 | vc = method[which.lower()](G,k,weights) 59 | 60 | return vc 61 | 62 | Graph.community_spectral = community_spectral 63 | -------------------------------------------------------------------------------- /circulo/data/README.md: -------------------------------------------------------------------------------- 1 | # Circulo Datasets 2 | 3 | ###Summary 4 | This directory contains the python scripts that download the individual datasets for the Circulo framework. Each subdirectory represents a single dataset. Each dataset is converted into graphml and stored in the [GRAPHS](circulo/data/GRAPHS) directory by the run.py script. As such, run.py is responsible for downloading and converting raw data into a graphml formatted file. Each run.py script must contain a class that inherits from the CirculoData class found in the [databot](circulo/data/databot.py) module. 5 | 6 | 7 | ###How do I add a new dataset? 8 | 9 | The key to understanding how to import a dataset into Circulo is to be familiar with the CirculoData class in the [databot](circulo/data/databot.py) module. We'll pretend our new dataset is called "friends". To import the friends dataset into the Circulo framework, follow these steps: 10 | 11 | 1. Create a new subdirectory with a name describing the new dataset: `mkdir friends` 12 | 2. Create the python file `friends/run.py` and be sure that `run.py` contains a class that inherits from CirculoData. In this case, we will call the class `FriendsData`. 13 | 3. Copy the README template into the new directory, naming it `README.md`: `cp README_template.md friends/README.md`. Be sure to be as thorough as possible when writing the README so that others will understand your dataset. 14 | 4. Override the necessary functions from the CirculoData class in the FriendsData class in `run.py`. Please see other `run.py` files for examples. The amount of code required in the `run.py` file largely depends on how close the original data is to a graph format. 15 | 5. Add a row to the Dataset Index in this README. 16 | 6. In setup/run_algos.py there is a list called "data_choices", add your newly created datasets to that list (it must match the folder name) 17 | 18 | ## Dataset Index 19 | | Dataset | Description | Has Ground Truth? 
20 | | ------- | ------------|:---------------------:| 21 | | amazon | Co-purchasing Data | Yes | 22 | | as_data | Autonomous System Relationship Data | Yes | 23 | | house_voting | 2014 congress (house) voting data | Yes | 24 | | flights | Flights data from | Yes | 25 | | football | NCAA D1A games played in the Fall 2000 season | Yes | 26 | | karate | Famous data set of Zachary's karate club | Yes | 27 | | malaria | Amino acids in malaria parasite | **No** | 28 | | nba_schedule | Games played in the 2013-2014 NBA season | Yes | 29 | | netscience | Graph of collaborators on papers about network science | **No** | 30 | | pgp | Interactions in pretty good privacy | **No** | 31 | | revolution |This is a bipartite graph representing colonial American dissidents' membership |**No**| 32 | | school | Face-to-face interactions in a primary school | Yes | 33 | | scotus | Supreme court case citation network | **No** | 34 | | senate_voting | 2014 congress (senate) voting data | Yes | 35 | | southern_women | bipartite graph of southern women social groups | __No__ | 36 | 37 | ## Resources 38 | Here are some links with lots of graphs. Most of these sites also point you towards other resources. If you need a graph that we don't provide a script for, these sites are a good place to start looking. 39 | 40 | : igraph's own repository of graphs. Available in several formats. 41 | 42 | UC Irvine's repository of graphs. Available in several formats. 43 | 44 | Mark Newman's personal collection of graphs. Available in gml. 45 | 46 | Snap's repository of (especially large) datasets. 47 | 48 | Interesting datasets curated by University College Dublin. 49 | -------------------------------------------------------------------------------- /circulo/data/README_template.md: -------------------------------------------------------------------------------- 1 | ## [Dataset Name] 2 | 3 | The data can be found at (Link to dataset) 4 | 5 | ## Description 6 | (Give a high level description of the data set.) 7 | 8 | Directed: TODO 9 | 10 | Weighted: TODO 11 | 12 | Multigraph: TODO 13 | 14 | ### Vertices 15 | (describe what the vertices represent, and their attributes) 16 | 17 | Attributes: 18 | 19 | ### Edges 20 | (describe what the edges represent, and their attributes) 21 | 22 | Attributes: 23 | 24 | ## Ground Truth 25 | (describe the ground truth implemented, if any) 26 | 27 | ## Other Notes 28 | * See `run.py` for specific details 29 | 30 | ## References -------------------------------------------------------------------------------- /circulo/data/amazon/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | import igraph as ig 4 | import gzip 5 | import pickle 6 | import shutil 7 | import sys 8 | from circulo.utils.downloader import download_with_notes, _unzip 9 | import csv 10 | 11 | from circulo.data.databot import * 12 | 13 | ## First pass at downloading SNAP data. 14 | # 1. SNAP uses gzip for compression 15 | # 2. There are overlapping communities, but graphml attributes cannot be lists so stored as string 16 | # 3. Uses pickle file instead to store graph object (keeping groundtruth as list) 17 | # 4. igraph wants to keep vertex ids sequential but SNAP data is not, so some empty nodes are created 18 | # 5. 
after deleting these isolate nodes, the ids are remapped to remain sequetial, so have to also remap ground truth 19 | 20 | DOWNLOAD_URL = 'http://snap.stanford.edu/data/bigdata/communities/com-amazon.ungraph.txt.gz' 21 | DATA_NAME = 'com-amazon.ungraph.txt' 22 | GRAPH_NAME = 'amazon.graphml' 23 | 24 | #ground truth of the top 5000 copurchasing items 25 | DOWNLOAD_URL_GROUNDTRUTH = 'http://snap.stanford.edu/data/bigdata/communities/com-amazon.top5000.cmty.txt.gz' 26 | #DOWNLOAD_URL_GROUNDTRUTH = 'http://snap.stanford.edu/data/bigdata/communities/com-amazon.all.cmty.txt.gz' 27 | GROUNDTRUTH_NAME = 'com-amazon.top5000.cmty.txt' 28 | 29 | PICKLE_NAME = 'amazon-graph.pickle' 30 | 31 | 32 | class AmazonData(CirculoData): 33 | 34 | def __download__(self): 35 | ''' 36 | downloads graph from SNAP website 37 | ''' 38 | #download the graph as an edgelist 39 | self.download_with_notes(DOWNLOAD_URL) 40 | 41 | #download ground truth 42 | self.download_with_notes(DOWNLOAD_URL_GROUNDTRUTH) 43 | 44 | def __prepare__(self): 45 | 46 | data_path_old = os.path.join(self.raw_data_path, DATA_NAME + ".old") 47 | data_path = os.path.join(self.raw_data_path, DATA_NAME) 48 | 49 | #remove non edge data from edgelist 50 | shutil.move(data_path, data_path_old) 51 | 52 | with open(data_path_old, "r") as f: 53 | with open(data_path, "w") as out: 54 | for line in f: 55 | if(line.startswith('#') == False): 56 | out.write(line) 57 | 58 | groundtruth_path = os.path.join(self.raw_data_path,GROUNDTRUTH_NAME) 59 | 60 | # Read in Edgelist. Note that igraph creates extra nodes 61 | # with no edges for ids missing in sequential order 62 | # from the graph. We will delete these isolates later 63 | g = ig.Graph.Read_Edgelist(data_path,directed=False) 64 | 65 | # Assign communities as node attributes 66 | with open(groundtruth_path,'r') as gtp: 67 | csvreader = csv.reader(gtp,delimiter='\t') 68 | # note that converting to graphml, attributes cannot be lists 69 | # only boolean,int,long,float,double,or string 70 | # 71 | # storing groundtruth communities as both arrays and strings 72 | # so that graphml file can retain attribute 73 | g.vs()['groundtruth_str'] = '' 74 | 75 | count = 0 76 | for line in csvreader: 77 | for v in line: 78 | v = int(v) 79 | if g.vs[v]['groundtruth_str']: 80 | g.vs[v]['groundtruth_str'] += ',' + str(count) 81 | else: 82 | g.vs[v]['groundtruth_str'] = str(count) 83 | count += 1 84 | max_clusters = count 85 | 86 | # remove isolates - this changes node ids! 
87 | g.delete_vertices(g.vs.select(_degree=0)) 88 | 89 | # Write out graphml file 90 | g.write_graphml(self.graph_path) 91 | 92 | 93 | 94 | def get_context(self): 95 | return { 96 | CirculoData.CONTEXT_OPTIMAL_PARTITIONS:5000 97 | } 98 | 99 | def get_ground_truth(self, G): 100 | 101 | cluster_dict = {} 102 | 103 | for idx, cluster_str in enumerate(G.vs()['groundtruth_str']): 104 | for c in cluster_str.split(): 105 | if c not in cluster_dict: 106 | cluster_dict[c] = [] 107 | 108 | #have to re-do this since id's likely changed by removing isolates 109 | cluster_dict[c].append(idx) 110 | 111 | return ig.VertexCover(G,[v for v in cluster_dict.values()]) 112 | 113 | 114 | def main(): 115 | databot = AmazonData("amazon") 116 | databot.get_ground_truth(databot.get_graph()) 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /circulo/data/as_data/README.md: -------------------------------------------------------------------------------- 1 | ## AS Relationship Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | This dataset is taken from the Center for Applied Internet Data Analysis (CAIDA). This dataset assigns labels, either peer or isp, to Autonomous System (AS) Relationships. Understand AS relationships is useful for understanding the structure of the internet and why routing properties are the way they are. 7 | 8 | Directed: No 9 | 10 | Weighted: No 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents an AS. 16 | 17 | Attributes: 18 | * **ASN**: The name of the researcher 19 | * **aut_name**: Name of AS 20 | * **changed**: Date of change to infomartion 21 | * **country**: Country of AS 22 | * **org_name**: Name of organization running AS 23 | * **source**: Registrar (i.e. ARIN) 24 | 25 | 26 | ### Edges 27 | An edge represents a relationship between two AS. 28 | 29 | Attributes: 30 | * **relationship**: 1 if it is a provider/customer link, 0 if it is a peer AS link 31 | 32 | ## Ground Truth 33 | Currently set to country the AS is in. Registrar might more closely reflect the community structure 34 | 35 | ## Other Notes 36 | * See `run.py` for specific details 37 | 38 | ## References 39 | The CAIDA UCSD AS-Relationship - 20141201, 40 | -------------------------------------------------------------------------------- /circulo/data/as_data/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
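# Databot for the CAIDA AS-relationship dataset: builds an undirected graph of autonomous
# systems with provider/customer and peer links, and uses each AS's country as the
# ground-truth community label (see README.md in this directory).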
16 | import igraph 17 | from igraph import VertexCover 18 | import gzip 19 | import bz2 20 | import os 21 | from operator import itemgetter 22 | from collections import defaultdict 23 | 24 | 25 | from circulo.data.databot import CirculoData 26 | 27 | DOWNLOAD_URL = 'http://data.caida.org/datasets/as-relationships/serial-1/20141201.as-rel.txt.bz2' 28 | AS_INFO_URL = 'http://data.caida.org/datasets/as-organizations/20141001.as-org2info.txt.gz' 29 | 30 | 31 | class ASData(CirculoData): 32 | @staticmethod 33 | def __make_asn_mapping(fname): 34 | """ 35 | Helper function to turn downloaded file into dictionary where key = ASN and values are dictionary of properties 36 | """ 37 | FORMAT_STRING = '# format' 38 | ORG_ID_STRING = 'org_id' 39 | ASN_STRING = 'aut' 40 | 41 | data_by_org_id = {} # Dictionary of properties by org ID {org_id:{prop_name:prop_value}} 42 | # Read source file using # format lines to parse fields 43 | with gzip.open(fname, 'rt') as f: 44 | for line in f: 45 | line = line.strip() 46 | if line.startswith(FORMAT_STRING): 47 | # Extract format 48 | format_fields = line[len(FORMAT_STRING)+1:].split('|') 49 | org_id_index = format_fields.index(ORG_ID_STRING) 50 | print(format_fields, org_id_index) 51 | elif line.startswith("#"): 52 | # Ignore comments that aren't format 53 | pass 54 | else: 55 | # Decode data and add to data_by_org_id 56 | data = line.split('|') 57 | org_id = data[org_id_index] 58 | if org_id not in data_by_org_id: 59 | data_by_org_id[org_id] = {} 60 | for i,data_field in enumerate(format_fields): 61 | if i != org_id_index: 62 | data_by_org_id[org_id][data_field] = data[i] 63 | 64 | # Restructure data to be sorted by asn 65 | data_by_asn = {} # {asn:{prop_name:prop_value}} 66 | for org_id in data_by_org_id: 67 | asn = data_by_org_id[org_id][ASN_STRING] 68 | if asn not in data_by_asn: 69 | data_by_asn[asn] = {} 70 | for (field_name, field_val) in data_by_org_id[org_id].items(): 71 | if field_name != ASN_STRING: 72 | data_by_asn[asn][field_name] = field_val 73 | return data_by_asn 74 | 75 | def __download__(self): 76 | print("Downloading") 77 | self.download_with_notes(DOWNLOAD_URL) 78 | self.download_with_notes(AS_INFO_URL) 79 | 80 | def __prepare__(self): 81 | filename = os.path.join(self.raw_data_path, os.path.basename(DOWNLOAD_URL)) 82 | 83 | edges = [] 84 | relationships = [] 85 | print("Reading links") 86 | num_nodes = -1 87 | # Read in raw AS Links 88 | with bz2.open(filename, 'rt') as f: 89 | for line in f: 90 | line = line.strip() 91 | if not line.startswith('#'): 92 | (src, dst, relationship) = line.split('|') 93 | src = int(src) 94 | dst = int(dst) 95 | if src and dst: 96 | # TODO: Consider changing to directed graph and duplicating peer links in both directions? 
97 | edges.append((src, dst)) 98 | relationships.append(relationship) 99 | # Keep track of max node seen 100 | if src > num_nodes: 101 | num_nodes = src 102 | if dst > num_nodes: 103 | num_nodes = dst 104 | 105 | print("Creating Graph") 106 | g = igraph.Graph(directed=False) 107 | g.add_vertices(num_nodes+1) # Need +1 since ASN are 1 indexed but verticies are 0 indexed 108 | g.add_edges(edges) 109 | 110 | # Keep AS Names through pruning 111 | g.vs["ASN"] = [str(as_num) for as_num in range(len(g.vs))] 112 | # Add relationships before pruning 113 | g.es["relationship"] = relationships 114 | 115 | # Add other ASN Properties 116 | asn_filename = os.path.join(self.raw_data_path, os.path.basename(AS_INFO_URL)) 117 | asn_info = self.__make_asn_mapping(asn_filename) 118 | print("Num Nodes:", num_nodes) 119 | for asn in asn_info: 120 | if int(asn) <= num_nodes: 121 | for field_name, field_val in asn_info[asn].items(): 122 | g.vs[int(asn)][field_name] = field_val 123 | 124 | print("Checking Graph") 125 | # Take largest connected component 126 | components = g.components(mode=igraph.WEAK) 127 | if len(components) > 1: 128 | print("[Graph Prep - as_data]... Disconnected Graph Detected. Using largest component.") 129 | print("[Graph Prep - as_data]... Original graph: {} vertices and {} edges.".format(g.vcount(), g.ecount())) 130 | g = g.subgraph(max(components, key=len)) 131 | print("[Graph Prep - as_data]... Largest component: {} vertices and {} edges.".format(g.vcount(), g.ecount())) 132 | g.write_graphml(self.graph_path) 133 | 134 | def prune(self, G): 135 | # There aren't edge weights so there's no way to prune 136 | pass 137 | 138 | def get_ground_truth(self, G): 139 | """ 140 | Get a Vertex Cover representing the ground truth for this graph. It's not apparent what the right "ground truth" 141 | is but a guess is "country". It might be true that "source" (which is the registrar that handled the transaction 142 | ) is a better guess 143 | """ 144 | if G is None: 145 | return 146 | 147 | GROUND_TRUTH_FIELD = 'country' 148 | 149 | membership = G.vs[GROUND_TRUTH_FIELD] 150 | # Map community names to integers 151 | community_name_to_id = {} 152 | max_community_seen = 0 153 | 154 | cluster_dict = defaultdict(list) 155 | for vertex_id, community_name in enumerate(membership): 156 | cluster_dict[community_name].append(vertex_id) 157 | 158 | cluster_list = [v for v in cluster_dict.values()] 159 | return VertexCover(G, cluster_list) 160 | 161 | 162 | def main(): 163 | databot = ASData("as_data") 164 | G = databot.get_graph() 165 | databot.get_ground_truth(G) 166 | 167 | 168 | if __name__ == "__main__": 169 | main() -------------------------------------------------------------------------------- /circulo/data/databot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
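# Defines CirculoData, the base class that each dataset's run.py subclasses in order to
# download raw data, convert it to a graphml file under circulo/data/GRAPHS/, and expose
# a ground-truth VertexCover (see circulo/data/README.md).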
16 | 
17 | import sys
18 | import os
19 | import igraph
20 | from igraph import VertexCover
21 | import urllib.request
22 | import zipfile
23 | import gzip
24 | import statistics
25 | 
26 | PRINT_PREFIX="[===DATA===]"
27 | 
28 | class CirculoData:
29 | 
30 | #class variable
31 | CONTEXT_OPTIMAL_PARTITIONS = "optimal_partitions"
32 | CONTEXT_ATTRS_TO_USE = "attributes_to_use"
33 | 
34 | def __init__(self, dataset_name):
35 | data_dir = os.path.dirname(__file__)
36 | graph_dir = os.path.join(data_dir, "GRAPHS")
37 | 
38 | #make sure that the graph dir exists
39 | if not os.path.exists(graph_dir):
40 | os.mkdir(graph_dir)
41 | 
42 | self.raw_data_path = os.path.join(data_dir,dataset_name, "raw")
43 | self.dataset_name = dataset_name
44 | self.graph_path = os.path.join(graph_dir, dataset_name+".graphml")
45 | 
46 | def __download__(self):
47 | '''
48 | Downloads the raw data for this dataset.
49 | 
50 | Raw files are stored in self.raw_data_path
51 | '''
52 | raise NotImplementedError("function must be overridden")
53 | 
54 | 
55 | def __prepare__(self):
56 | '''
57 | Converts the raw data in self.raw_data_path into a graph,
58 | then serializes the graph as the graphml file at
59 | self.graph_path
60 | '''
61 | raise NotImplementedError("function must be overridden")
62 | 
63 | 
64 | def get_context(self):
65 | '''
66 | Returns a dictionary of recommended optimizations for running certain algorithms against this data.
67 | By default, returns an empty dictionary.
68 | '''
69 | 
70 | return dict()
71 | 
72 | 
73 | def get_ground_truth(self, G):
74 | '''
75 | Returns a VertexCover representing the ground truth for the given graph
76 | '''
77 | raise NotImplementedError("function must be overridden")
78 | 
79 | def get_graph(self):
80 | '''
81 | Returns the graph loaded in memory
82 | '''
83 | 
84 | if not os.path.exists(self.raw_data_path):
85 | os.mkdir(self.raw_data_path)
86 | self.__download__()
87 | 
88 | if not os.path.exists(self.graph_path):
89 | self.__prepare__()
90 | 
91 | return igraph.load(self.graph_path)
92 | 
93 | 
94 | def download_with_notes(self,url, progressbar=True, download_file=None):
95 | """
96 | Uses urllib to download data from url and save it to download_file. Provides basic logging to stdout.
97 | 98 | :url source url 99 | :file_downlaod destination file path 100 | :progressbar shows progress bar (default: true) 101 | 102 | """ 103 | print(PRINT_PREFIX, "Downloading data from " + url + ".....") 104 | 105 | if download_file is None: 106 | download_file = os.path.basename(url) 107 | 108 | download_path = os.path.join(self.raw_data_path, download_file) 109 | 110 | try: 111 | if progressbar: 112 | urllib.request.urlretrieve(url, download_path, reporthook=progress) 113 | else: 114 | urllib.request.urlretrieve(url, download_path) 115 | except Exception as e: 116 | print(PRINT_PREFIX, "Data download failed -- make sure the url is still valid, and that urllib is properly installed.\n\n") 117 | raise(e) 118 | print("Download complete.") 119 | 120 | _unzip(download_path) 121 | 122 | 123 | def _unzip(zip_path): 124 | ''' 125 | Unzips the file at zip_path into the current directory 126 | 127 | :zip_path src of zip file 128 | ''' 129 | 130 | if zipfile.is_zipfile(zip_path): 131 | try: 132 | z = zipfile.ZipFile(zip_path) 133 | except zipfile.BadZipFile as e: 134 | print(PRINT_PREFIX, "ZipFile error: {}".format(e)) 135 | sys.exit(0) 136 | print(PRINT_PREFIX, "Extracting from zip...") 137 | z.extractall(path=os.path.dirname(zip_path)) 138 | 139 | else: 140 | unzip_file = os.path.splitext(zip_path)[0] 141 | 142 | with gzip.open(zip_path,'rb') as infile: 143 | try: 144 | file_content = infile.read() 145 | except OSError as e: 146 | print(PRINT_PREFIX, "Neither gzip nor zipfile. No extraction necessary.") 147 | return 148 | 149 | with open(unzip_file, "wb") as f: 150 | print(PRINT_PREFIX, "Extracting from gzip...") 151 | f.write(file_content) 152 | 153 | def progress(blockNum, blockSize, totSize): 154 | """ 155 | Provides an ascii progress bar that is 50 characters wide. 156 | totSize is the total size of the task, blockSize is the size 157 | of each block, and blockNum is the current block being worked on. 158 | 159 | For example: 160 | 161 | for i in range(100): 162 | progress(i + 1, 1, 100) 163 | sleep(1) 164 | 165 | will print a progress bar over 100 seconds. 166 | """ 167 | downloaded = blockNum * blockSize 168 | per = min(100 * downloaded / totSize, 100) 169 | sys.stdout.write("\r%d%%" %per) 170 | for i in range(int(per / 2)): 171 | sys.stdout.write(".") 172 | for i in range(50 - int(per/2)): 173 | sys.stdout.write(" ") 174 | sys.stdout.write("# ") 175 | sys.stdout.flush() 176 | 177 | -------------------------------------------------------------------------------- /circulo/data/flights/README.md: -------------------------------------------------------------------------------- 1 | ## Airline Flight Data: Airport, Airline, and Route Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Route data between airports. 7 | 8 | **Directed**: Yes 9 | 10 | **Weighted**: No 11 | 12 | **Multigraph**: Default: No, but information is available. 13 | 14 | ### Vertices 15 | Each vertex represents some airport for which we have at least one flight record. 16 | 17 | Attributes: 18 | * **DST**: Daylight savings time. One of E (Europe), A (US/Canada), S (South America), O (Australia), Z (New Zealand), N (None) or U (Unknown). 19 | * **altitude**: In feet. 20 | * **ICAO**: 4-letter ICAO code. Blank if not assigned. 21 | * **id**: Unique integral identifier. Generally use "name" instead, to reference. 22 | * **name**: Unique OpenFlights identifier for this airport. 23 | * **city**: Main city served by airport. May be spelled differently from airport_name. 
24 | * **latitude**: Decimal degrees, usually to six significant digits. Negative is South, positive is North. 25 | * **longitude**: Decimal degrees, usually to six significant digits. Negative is West, positive is East. 26 | * **timezone**: Hours offset from UTC. Fractional hours are expressed as decimals, eg. India is 5.5. 27 | * **IATA/FAA**: 3-letter FAA code, for airports located in Country "United States of America". 3-letter IATA code, for all other airports. Blank if not assigned. 28 | * **country**: Country or territory where airport is located. 29 | * **airport_name**: Name of airport. May or may not contain the city name. 30 | 31 | 32 | ### Edges 33 | There is a directed edge between two nodes wherever there is a flight between those nodes. By "flight," we mean a recurring flight like UA123, not an individual instance of a flight. Different airlines have flights between the same source and destination, so this is a multigraph that can be modified into a weighted graph by calling download_utils.multigraph_to_weights. `get_graph` does this automatically, but the graph is saved as a multigraph. 34 | 35 | Attributes (only available in multigraph. Otherwise, the only attribute is "weight"): 36 | * **airline_id**: Unique OpenFlights identifier for this airline. 37 | * **equipment**: 3-letter codes for plane type(s) generally used on this flight, separated by spaces. 38 | * **source_airport**: 3-letter (IATA) or 4-letter (ICAO) code of the source airport. 39 | * **stops**: Number of stops on this flight ("0" for direct) 40 | * **source_id**: Unique OpenFlights identifier for source airport 41 | * **codeshare**: "Y" if this flight is a codeshare (that is, not operated by Airline, but another carrier), empty otherwise. 42 | * **dest_airport**: 3-letter (IATA) or 4-letter (ICAO) code of the destination airport. 43 | * **dest_id**: Unique OpenFlights identifier for destination airport. 44 | * **airline**: 2-letter (IATA) or 3-letter (ICAO) code of the airline. 45 | 46 | ## Ground Truth 47 | `get_ground_truth` returns a VertexClustering of vertices grouped by some attribute from the vertex attributes supplied by the user. Currently, the ground truth defaults to clustering by country. 48 | 49 | ## Other Notes 50 | * See `run.py` for specific details 51 | 52 | ## References 53 | Thanks to OpenFlights.org -------------------------------------------------------------------------------- /circulo/data/football/README.md: -------------------------------------------------------------------------------- 1 | ## American College Football 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Football games played between Division 1A colleges during the regular season. 7 | 8 | Directed: No 9 | 10 | Weighted: No 11 | 12 | Multigraph: Yes 13 | 14 | ### Vertices 15 | Each vertex represents a team. 16 | 17 | Attributes: 18 | * **label**: School name 19 | * **id**: Unique identifying integer id 20 | * **value**: Integer value specifying conference. 21 | 22 | ### Edges 23 | There is an edge between two vertices for each game the teams have played each other. Since a few teams play each other multiple times, this is a multigraph. It can be converted into a weighted graph by calling `download_utils.multigraph_to_weights` from the Circulo package. 24 | 25 | Attributes: None 26 | 27 | ## Other Notes 28 | * See `run.py` for specific details 29 | 30 | ## Ground Truth 31 | `get_ground_truth` groups the vertices by conference. 32 | 33 | ## References 34 | Data from Mark Newman's personal website. 35 | 36 | M. 
Girvan and M. E. J. Newman, *Proc. Natl. Acad. Sci. USA* **99**, 7821-7826 (2002). -------------------------------------------------------------------------------- /circulo/data/football/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from igraph import VertexCover 3 | import os 4 | import sys 5 | import urllib.request 6 | import shutil 7 | from circulo.data.databot import * 8 | 9 | DOWNLOAD_URL = "http://www-personal.umich.edu/~mejn/netdata/football.zip" 10 | 11 | class FootballData(CirculoData): 12 | 13 | def __download__(self): 14 | """ 15 | downloads the graph from DOWNLOAD_URL into data_dir/GRAPH_NAME 16 | """ 17 | self.download_with_notes(DOWNLOAD_URL) 18 | 19 | def __prepare__(self): 20 | """ 21 | """ 22 | #convert gml to graphml 23 | G = igraph.load( os.path.join(self.raw_data_path, "football.gml")) 24 | #must delete the id attribute since graphml uses it as a reserved attribute and gml does not 25 | del G.vs['id'] 26 | G.write_graphml(self.graph_path) 27 | 28 | 29 | def get_ground_truth(self, G): 30 | """ 31 | Returns a VertexClustering object of the 32 | ground truth of the graph G. The ground truth for this 33 | football data is the conference to which each team belongs. 34 | """ 35 | 36 | #by default conferences are identified by a float number 37 | float_membership = G.vs['value'] 38 | conf_map = {} 39 | for vertex_id, conference_id in enumerate(float_membership): 40 | if conference_id not in conf_map: 41 | conf_map[conference_id] = [] 42 | conf_map[conference_id].append(vertex_id) 43 | 44 | 45 | cluster_list = [v for k,v in conf_map.items()] 46 | 47 | return VertexCover(G, cluster_list) 48 | 49 | 50 | def main(): 51 | FootballData("football").get_ground_truth() 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /circulo/data/house_voting/README.md: -------------------------------------------------------------------------------- 1 | ## Congress Voting Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Congress voting records from 2014. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a congressperson for whom we have voting data. 16 | 17 | Attributes: 18 | * **name**: Unique identifying id 19 | * **full_name**: Name of the congressperson 20 | * **state**: State represented 21 | * **id**: Unique identifier. In most cases, use "name." 22 | * **party**: Political party. 23 | 24 | ### Edges 25 | There is an edge between two nodes whenever the congresspeople vote together on an issue. The edges are weighted by the number of votes that are shared. 26 | 27 | Attributes: 28 | * **weight**: The number of times the congresspeople on each side of this edge have voted the same way. 29 | 30 | ## Ground Truth 31 | `get_ground_truth` returns a VertexClustering grouped by the parties of the politicians. 32 | 33 | ## Other Notes 34 | * See `run.py` for specific details 35 | 36 | ## References 37 | Thanks to GovTrack.us 38 | -------------------------------------------------------------------------------- /circulo/data/house_voting/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ -z "$1" ]; then 5 | echo "Data dir required" 6 | exit 0 7 | fi 8 | 9 | if [ ! 
-d "$1" ]; then 10 | echo "Data dir does not exist" 11 | exit 0 12 | fi 13 | 14 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress/113/votes/2014 $1 > /dev/null 2>&1 15 | 16 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress-legislators/legislators-current.csv $1 > /dev/null 2>&1 17 | 18 | 19 | -------------------------------------------------------------------------------- /circulo/data/house_voting/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import json 18 | import glob 19 | import csv 20 | import itertools 21 | import os 22 | import igraph 23 | from igraph import VertexCover 24 | from subprocess import call 25 | from circulo.data.databot import CirculoData 26 | 27 | 28 | class HouseData(CirculoData): 29 | 30 | def __download__(self): 31 | 32 | try: 33 | call(["bash", os.path.join(os.path.dirname(__file__), "download.sh"), self.raw_data_path]) 34 | except Exception as e: 35 | #print("rsync failed to retrieve data") 36 | raise(e) 37 | 38 | def __prepare__(self): 39 | ''' 40 | Prepare congress data. 
NOTE: the vertex lookups should be indexed, however this 41 | funciton could prob be sped up by just created a dict with all possible congress pairs 42 | and counting how often they vote together, then at the end creating the edges 43 | ''' 44 | 45 | src_files = os.path.join(self.raw_data_path, "2014", "h*","*.json") 46 | c_type = "rep" 47 | G = igraph.Graph() 48 | 49 | 50 | #first load the vertices 51 | with open(os.path.join(self.raw_data_path, "legislators-current.csv"), 'r') as f: 52 | 53 | csvreader = csv.reader(f,delimiter=',',quotechar='"') 54 | #skip the headers 55 | next(csvreader, None) # skip the headers 56 | for row in csvreader: 57 | 58 | if c_type != row[4]: 59 | continue 60 | elif row[4] == "sen": 61 | congress_id = row[21] 62 | elif row[4] == "rep": 63 | congress_id = row[18] 64 | else: 65 | raise("Unidentified congress: {}".format(row[4])) 66 | 67 | G.add_vertex( 68 | congress_id, 69 | full_name="{} {}".format(row[1],row[0]), 70 | party=row[7], 71 | state=row[5] 72 | ) 73 | 74 | 75 | 76 | missing_ids = set() 77 | 78 | #now create the edges 79 | for fname in glob.glob(src_files): 80 | with open(fname,'r') as inputfile: 81 | data = json.load(inputfile) 82 | for vt in data['votes']: 83 | congress_ids = [n['id'] for n in data['votes'][vt]] 84 | pairs = itertools.combinations(congress_ids,2) 85 | 86 | for congress_id0, congress_id1 in pairs: 87 | try: 88 | v0 = G.vs.find(congress_id0) 89 | except ValueError as e: 90 | missing_ids.add(congress_id0) 91 | continue 92 | 93 | try: 94 | v1 = G.vs.find(congress_id1) 95 | except ValueError as e: 96 | missing_ids.add(congress_id1) 97 | continue 98 | 99 | e = G.get_eid(v0.index, v1.index, directed=False, error=False) 100 | 101 | if e>=0: 102 | G.es[e]['weight'] += 1 103 | else: 104 | G.add_edge(v0, v1, weight=1) 105 | 106 | #the graph is highly connected, so we will prune it 107 | self.prune(G) 108 | 109 | components = G.components(mode=igraph.WEAK) 110 | 111 | #the dataset by default is diconnected, so we must take the largest component 112 | if len(components) is not 1: 113 | G = G.subgraph(max(components, key=len)) 114 | 115 | 116 | G.write_graphml(self.graph_path) 117 | 118 | def prune(self,G): 119 | 120 | if G.is_weighted() is False: 121 | print("Error: Unable to prune a graph without edge weights") 122 | return 123 | 124 | weights = G.es()['weight'] 125 | threshold = .65 * max(weights) 126 | orig_edge_count = G.ecount() 127 | edges = G.es.select(weight_lt=threshold) 128 | G.delete_edges(edges) 129 | 130 | def __party_to_cluster__(self, party): 131 | if party == "Democrat": 132 | return 0 133 | elif party == "Republican": 134 | return 1 135 | elif party == "Independent": 136 | return 2 137 | else: 138 | raise("Unknown party affiliation {}".format(party)) 139 | 140 | def get_ground_truth(self, G): 141 | 142 | cluster_list = [[],[],[]] 143 | 144 | for vertex_id, party in enumerate(G.vs['party']): 145 | cluster_list[self.__party_to_cluster__(party)].append(vertex_id) 146 | 147 | return VertexCover(G, cluster_list) 148 | 149 | 150 | def main(): 151 | 152 | databot = HouseData("house_voting") 153 | G = databot.get_graph() 154 | databot.get_ground_truth(G) 155 | 156 | if __name__ == "__main__": 157 | main() 158 | -------------------------------------------------------------------------------- /circulo/data/karate/README.md: -------------------------------------------------------------------------------- 1 | ## Zachary's Karate Club 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Friendships at a university karate club in the 
1970s. 7 | 8 | Directed: No 9 | 10 | Weighted: No 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a member of the karate club. 16 | 17 | Attributes: 18 | * **id**: Unique identifier. 19 | 20 | ### Edges 21 | Each edge represents a friendship between two members of the club. 22 | 23 | Attributes: None. 24 | 25 | ## Ground Truth 26 | Not yet implemented. 27 | 28 | ## Other Notes 29 | * See `run.py` for specific details 30 | 31 | ## References 32 | Taken from Mark Newman's personal site. 33 | 34 | W. W. Zachary, An information flow model for conflict and fission in small groups, *Journal of Anthropological Research* **33**, 452-473 (1977). -------------------------------------------------------------------------------- /circulo/data/karate/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from igraph import VertexCover 3 | import os 4 | import sys 5 | import urllib.request 6 | from circulo.utils.downloader import download_with_notes 7 | import shutil 8 | from circulo.data.databot import * 9 | 10 | 11 | DOWNLOAD_URL = 'http://www-personal.umich.edu/~mejn/netdata/karate.zip' 12 | KARATE_RAW = "karate.gml" 13 | 14 | class KarateData(CirculoData): 15 | 16 | def __download__(self): 17 | self.download_with_notes(DOWNLOAD_URL) 18 | 19 | def __prepare__(self): 20 | 21 | G = igraph.load(os.path.join(self.raw_data_path, KARATE_RAW)) 22 | del G.vs['id'] 23 | G.write_graphml(self.graph_path) 24 | 25 | def get_ground_truth(self, G): 26 | """ 27 | returns a VertexClustering object of the 28 | ground truth of the graph G. 29 | """ 30 | 31 | clusters_list = [ 32 | [0,1,2,3,4,5,6,7,10,11,12,13, 16,17,19, 20, 22,23,24,25,26,27,28,29,30,31,32,33], 33 | [8,9,14, 15, 18,21] 34 | ] 35 | 36 | return VertexCover(G, clusters_list) 37 | 38 | def main(): 39 | databot = KarateData("karate") 40 | databot.get_ground_truth(databot.get_graph()) 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /circulo/data/malaria/README.md: -------------------------------------------------------------------------------- 1 | ## malaria 2 | 3 | The data can be found at http://danlarremore.com/bipartiteSBM/malariaData.zip 4 | 5 | ## Description 6 | A bipartite graph representing genetic sequences from the malaria parasite 7 | *Plasmodium falciparum*. 8 | 9 | Directed: No 10 | 11 | Weighted: No 12 | 13 | Multigraph: No 14 | 15 | Bipartite: Yes 16 | 17 | ### Vertices 18 | 297 genes and their 806 shared amino acid substrings 19 | 20 | Attributes: 21 | None 22 | 23 | ### Edges 24 | Edge between a gene and an acid if theeamino acid appears in the gene. 25 | 26 | Attributes: 27 | None 28 | 29 | ## Ground Truth 30 | Not yet implemented 31 | 32 | ## Other Notes 33 | * See `run.py` for specific details 34 | 35 | ## References 36 | http://danlarremore.com/pdf/2014_LCJ_EfficientlyInferringCommunityStructureInBipartiteNetworks_PRE.pdf 37 | 38 | Larremore, D. B., Clauset, A., and Buckee, C. O. (2013). A Network Approach to Analyzing Highly Recombinant Malaria Parasite Genes. PLoS Computational Biology, 9(10), e1003268. -------------------------------------------------------------------------------- /circulo/data/malaria/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import igraph 18 | import os 19 | 20 | from circulo.data.databot import CirculoData 21 | 22 | DOWNLOAD_URL = 'http://danlarremore.com/bipartiteSBM/malariaData.zip' 23 | DATA_DIR = "malariaData" 24 | 25 | class MalariaData(CirculoData): 26 | 27 | def __download__(self): 28 | self.download_with_notes(DOWNLOAD_URL) 29 | 30 | 31 | def __prepare__(self): 32 | 33 | data = os.path.join(self.raw_data_path, DATA_DIR, "malaria.edgelist") 34 | mod_data = os.path.join(self.raw_data_path, DATA_DIR, "mod_malaria.edgelist") 35 | 36 | #we just need to remove the third column which has 1's in it 37 | #so igraph can read it as an edgelist 38 | with open(data, 'r') as f: 39 | with open(mod_data, 'w') as new: 40 | for line in f: 41 | new.write(line[:-2] + '\n') 42 | 43 | G = igraph.load(mod_data) 44 | G.write_graphml(self.graph_path) 45 | 46 | def main(): 47 | databot = MalariaData("malaria") 48 | databot.get_graph() 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /circulo/data/nba_schedule/README.md: -------------------------------------------------------------------------------- 1 | ## NBA Schedule 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Games played in the 2013-2014 NBA season. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a team. 16 | 17 | Attributes: 18 | * **id**: Unique identifier. 19 | * **name**: Team name. 20 | 21 | ### Edges 22 | There is an edge between each team that plays each other, weighted by the number of games played. 23 | 24 | Attributes: 25 | * **weight**: Number of games played between the two teams. 26 | 27 | ## Ground Truth 28 | `get_ground_truth` returns a VertexClustering of teams clustered by the six divisions. 29 | 30 | ## Other Notes 31 | * See `run.py` for specific details 32 | 33 | ## References 34 | Thanks to [Dave Walk](https://github.com/davewalk) and ESPN.com -------------------------------------------------------------------------------- /circulo/data/nba_schedule/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | import csv 19 | import re 20 | import sys 21 | import os 22 | import glob 23 | import statistics 24 | 25 | from subprocess import call 26 | 27 | import igraph 28 | from igraph import VertexCover 29 | 30 | from circulo.data.databot import CirculoData 31 | 32 | 33 | class NBAData(CirculoData): 34 | 35 | 36 | def __download__(self): 37 | 38 | try: 39 | call(["git", "clone", "https://github.com/davewalk/2013-2014-nba-schedule", self.raw_data_path]) 40 | except Exception as e: 41 | print("Git Clone Failed to retrieve data") 42 | raise(e) 43 | 44 | 45 | def convert(self, string): 46 | ''' 47 | Puts the team names into a consistent format since the naming is inconsistent throughout 48 | the datasets 49 | ''' 50 | string = string.lower() 51 | string = re.sub('_',"-", string) 52 | string = re.sub(' ',"-", string) 53 | return string 54 | 55 | 56 | def __prepare__(self): 57 | 58 | team_dict = {} 59 | 60 | G = igraph.Graph(directed=True) 61 | 62 | data_regex = os.path.join(self.raw_data_path,"data","csv",'*.csv') 63 | 64 | #adds the vertices strictly based on the names of the files 65 | for filename in glob.glob(data_regex): 66 | std_team_name = re.sub('.csv',"",os.path.basename(filename)) 67 | std_team_name = re.sub('_',"-",std_team_name) 68 | G.add_vertex(std_team_name) 69 | 70 | #each file represents a team 71 | for filename in glob.glob(data_regex): 72 | 73 | with open(filename, "r") as data: 74 | 75 | reader = csv.reader(data) 76 | std_team_name = re.sub('.csv',"",os.path.basename(filename)) 77 | std_team_name = re.sub('_',"-", std_team_name) 78 | 79 | team0 = G.vs.find(name=std_team_name) 80 | 81 | #skip first row of header info 82 | next(reader,None) 83 | 84 | for row in reader: 85 | 86 | std_opponent = self.convert(row[2]) 87 | team1 = G.vs.find(name=std_opponent) 88 | 89 | if team0 is None or team1 is None: 90 | raise("Vertex not found for input team name") 91 | sys.exit(1) 92 | 93 | G.add_edge(team0, team1) 94 | 95 | 96 | #we need to set a weight of 1 to every edge 97 | G.es['weight'] = 1 98 | 99 | #we simplify the multigraph 100 | G.simplify(combine_edges={'weight':sum}) 101 | 102 | #we collapse the graph 103 | self.prune(G) 104 | G.write_graphml(self.graph_path) 105 | 106 | def get_context(self): 107 | return { 108 | CirculoData.CONTEXT_OPTIMAL_PARTITIONS:6 109 | } 110 | 111 | 112 | def get_ground_truth(self, G): 113 | 114 | #ground truth table 115 | divisions = { 116 | "boston-celtics":0, 117 | "brooklyn-nets":0, 118 | "new-york-knicks":0, 119 | "philadelphia-76ers":0, 120 | "toronto-raptors":0, 121 | "chicago-bulls":1, 122 | "cleveland-cavaliers":1, 123 | "detroit-pistons":1, 124 | "indiana-pacers":1, 125 | "milwaukee-bucks":1, 126 | "atlanta-hawks":2, 127 | "charlotte-bobcats":2, 128 | "miami-heat":2, 129 | "orlando-magic":2, 130 | "washington-wizards":2, 131 | "dallas-mavericks":3, 132 | "houston-rockets":3, 133 | "memphis-grizzlies":3, 134 | "new-orleans-pelicans":3, 135 | "san-antonio-spurs":3, 136 | "denver-nuggets":4, 137 | "minnesota-timberwolves":4, 138 | "oklahoma-city-thunder":4, 139 | "portland-trail-blazers":4, 140 | "utah-jazz":4, 141 | "golden-state-warriors":5, 142 | "los-angeles-clippers":5, 143 | "los-angeles-lakers":5, 144 | "phoenix-suns":5, 145 | "sacramento-kings":5 146 | } 147 | 148 | cluster_list = [[],[],[],[],[],[]] 149 | 150 | for vertex_id, team_name in enumerate(G.vs['name']): 151 | cluster_list[divisions[team_name]].append(vertex_id) 152 | 153 | 154 | return VertexCover(G, cluster_list) 155 | 156 | def prune(self,G): 157 | if G.is_weighted() is 
False: 158 | return G 159 | 160 | weights = G.es()['weight'] 161 | 162 | threshold = statistics.median(weights) + .0001 163 | 164 | orig_edge_count = G.ecount() 165 | edges = G.es.select(weight_lt=threshold) 166 | G.delete_edges(edges) 167 | #this is a special case because this pruning will create a disconnected component, so let's add back in one edge 168 | v0 = G.vs.find(name="washington-wizards") 169 | v1 = G.vs.find(name="san-antonio-spurs") 170 | G.add_edge(v0, v1, weight=1) 171 | 172 | 173 | def main(): 174 | databot = NBAData("nba_schedule") 175 | databot.get_ground_truth(databot.get_graph()) 176 | 177 | if __name__ == "__main__": 178 | main() 179 | -------------------------------------------------------------------------------- /circulo/data/netscience/README.md: -------------------------------------------------------------------------------- 1 | ## Network Science Collaborations 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Coauthorships of papers in the network science community. 7 | 8 | Directed: No 9 | 10 | Weighted: No (but can be) 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents an author of some paper on network science. 16 | 17 | Attributes: 18 | * **label**: The name of the researcher 19 | * **id**: Unique identifier 20 | 21 | ### Edges 22 | There is an edge between two authors if they are coauthors on a paper. 23 | 24 | Attributes: 25 | * **value**: (I think) that the value represents the "importance" of a connection. If there are n authors on a paper, each author adds 1/n to the value of their edge to each other author. 26 | 27 | ## Ground Truth 28 | Not yet implemented. 29 | 30 | ## Other Notes 31 | * See `run.py` for specific details 32 | 33 | ## References 34 | Taken from Mark Newman's personal website. 35 | 36 | M. E. J. Newman, *Phys. Rev. E* **74**, 036104 (2006). -------------------------------------------------------------------------------- /circulo/data/netscience/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import igraph 18 | import os 19 | 20 | from circulo.data.databot import CirculoData 21 | 22 | DOWNLOAD_URL = 'http://www-personal.umich.edu/~mejn/netdata/netscience.zip' 23 | 24 | class NetscienceData(CirculoData): 25 | 26 | def __download__(self): 27 | self.download_with_notes(DOWNLOAD_URL) 28 | 29 | def __prepare__(self): 30 | 31 | G = igraph.load(os.path.join(self.raw_data_path, "netscience.gml")) 32 | del G.vs['id'] #graphml uses the id field, so we must remove it 33 | G.write_graphml(self.graph_path) 34 | 35 | def main(): 36 | NetscienceData("netscience").get_graph() 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /circulo/data/pgp/README.md: -------------------------------------------------------------------------------- 1 | ## Interactions within Pretty Good Privacy 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Users of the Pretty-Good-Privacy algorithm. Only the giant component included. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes (But all weights are 1, so not really) 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex is a person using the Pretty-Good-Privacy algorithm. 16 | 17 | Attributes: 18 | * **x**: x-coordinate for plotting 19 | * **y**: y-coordinate for plotting 20 | * **z**: z-coordinate for plotting 21 | * **id**: Unique identifier 22 | 23 | ### Edges 24 | Interactions under PGP. 25 | 26 | Attributes: 27 | * **weight**: Always 1. Unweighted, for all intents and purposes. 28 | 29 | ## Ground Truth 30 | Not yet implemented, 31 | 32 | ## Other Notes 33 | * See `run.py` for specific details 34 | 35 | ## References 36 | Taken from Alexandre Arenas' personal site. 37 | 38 | M. Boguña, R. Pastor-Satorras, A. Diaz-Guilera and A. Arenas, *Physical Review E*, vol. **70**, 056122 (2004). -------------------------------------------------------------------------------- /circulo/data/pgp/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import os 3 | import urllib.request 4 | 5 | from circulo.data.databot import CirculoData 6 | 7 | 8 | 9 | GRAPH_NAME = 'PGPgiantcompo.net' 10 | DOWNLOAD_URL = 'http://deim.urv.cat/~aarenas/data/xarxes/PGP.zip' 11 | 12 | 13 | class PGPData(CirculoData): 14 | 15 | def __download__(self): 16 | self.download_with_notes(DOWNLOAD_URL) 17 | 18 | def __prepare__(self): 19 | 20 | data_path = os.path.join(self.raw_data_path, GRAPH_NAME) 21 | G = igraph.load(data_path) 22 | del G.vs['id'] #graphml uses the id field 23 | G.write_graphml(self.graph_path) 24 | 25 | def get_ground_truth(self, G): 26 | raise(NotImplementedError) 27 | 28 | 29 | def main(): 30 | PGPData("pgp").get_graph() 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /circulo/data/revolution/README.md: -------------------------------------------------------------------------------- 1 | ## Revolutionary War participants 2 | This dataset is drawn from the appendix of David Hackett Fischer's _Paul Revere's Ride_; a CSV version is available within 3 | the repository https://github.com/kjhealy/revere.git. 4 | 5 | ## Description 6 | This is a bipartite graph representing colonial American dissidents' membership in seven Whig (anti-British) groups during the 7 | buildup to the American Revolutionary War. 
8 | See http://kieranhealy.org/blog/archives/2013/06/09/using-metadata-to-find-paul-revere/ and http://www.sscnet.ucla.edu/polisci/faculty/chwe/ps269/han.pdf 9 | for some analyses using this data. 10 | 11 | For more traditional SNA applications, a one-mode projection of this data will yield a co-attendance network of the 254 12 | examined Revolutionary War figures. 13 | 14 | Graph properties: 15 | - Directed: False 16 | - Weighted: False 17 | - Multigraph: False 18 | - Bipartite: True 19 | 20 | ### Vertices 21 | The seven Type 1 nodes represent seven Whig groups: St. Andrew's Lodge, the Loyal Nine, the North Caucus, 22 | the Long Room Club, the Boston Tea Party, the Boston Committe, and the London Enemies. 23 | The 254 Type 2 nodes represent colonial Americans who attended meetings of these groups (including John Adams, Paul Revere, and Joseph Warren). 24 | 25 | Attributes: None. 26 | 27 | ### Edges 28 | Each edge represents membership by a colonial American in a Whig group. 29 | 30 | Attributes: None. 31 | 32 | ## Ground Truth 33 | None provided. 34 | 35 | ## Other Notes 36 | * See `run.py` for specific details 37 | 38 | ## References 39 | [1] Fischer, David Hackett. 1994. _Paul Revere’s Ride._ New York: Oxford University Press. 40 | -------------------------------------------------------------------------------- /circulo/data/revolution/run.py: -------------------------------------------------------------------------------- 1 | # This template should be copied and modified as necessary to become the 2 | # `run.py` in each directory. 3 | # 4 | # Do not modify this file unless the template needs changing -- modify 5 | # its copies in each data directory. 6 | 7 | import igraph 8 | import os 9 | import csv 10 | from subprocess import call 11 | 12 | from circulo.data.databot import CirculoData 13 | 14 | GIT_URL = 'https://github.com/kjhealy/revere.git' 15 | CSV_FILE = 'data/PaulRevereAppD.csv' 16 | 17 | class RevolutionData(CirculoData): 18 | 19 | def __download__(self): 20 | try: 21 | call(["git", "clone", GIT_URL, self.raw_data_path]) 22 | except Exception as e: 23 | print("Git clone failed to retrieve data. Please try again.") 24 | raise(e) 25 | 26 | def __prepare__(self): 27 | 28 | csv_path = os.path.join(self.raw_data_path, CSV_FILE) 29 | 30 | g = igraph.Graph() 31 | 32 | with open(csv_path) as f: 33 | 34 | reader = csv.DictReader(f) 35 | clubs = reader.fieldnames[:] 36 | clubs.remove('') 37 | 38 | for club in clubs: 39 | g.add_vertex(name=club) 40 | 41 | for patriot in reader: 42 | g.add_vertex(name=patriot['']) 43 | for club in clubs: 44 | if(patriot[club] == '1'): 45 | g.add_edge(patriot[''], club) 46 | 47 | g.write_graphml(self.graph_path) 48 | 49 | def get_ground_truth(self, G): 50 | raise(NotImplementedError) 51 | 52 | 53 | def main(): 54 | RevolutionData("revolution").get_graph() 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /circulo/data/school/README.md: -------------------------------------------------------------------------------- 1 | ## Primary School - Cumulative Networks 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | A network of face to face time between people at a primary school. 7 | 8 | Directed: No 9 | 10 | Weighted: No (but can easily be modified to be weighted) 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a person at the school (either a student or a teacher). 
16 | 17 | Attributes: 18 | * **classname**: The school class and grade, if a student. Otherwise, "Teachers" 19 | * **label**: Unique identifier. 20 | * **id**: Yet another unique identifier. 21 | * **gender**: M, F, or Unknown 22 | * **viz**: Undocumented. Always 0.0 23 | 24 | ### Edges 25 | An edge exists where some actor was face to face with another one. 26 | 27 | Attributes: 28 | * **id**: Unique identifier. 29 | * **count**: The number of times that contact was established during the day. 30 | * **duration**: The total time that the nodes on this edge spent in face to face contact, measured in 20 second intervals. 31 | 32 | ## Ground Truth 33 | `get_ground_truth` returns a VertexClustering object in which the vertices are grouped by "classname". 34 | 35 | ## Other Notes 36 | * See `run.py` for specific details 37 | * Either "count" or "duration" would make sense as a weight for use with a weighted algorithm. 38 | * `run.py` requires NetworkX from . 39 | 40 | ## References 41 | 42 | Thanks to sociopatterns.org. 43 | -------------------------------------------------------------------------------- /circulo/data/school/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | import networkx as nx 19 | from subprocess import call 20 | import igraph 21 | from igraph import VertexCover 22 | import os 23 | import glob 24 | 25 | from circulo.data.databot import CirculoData 26 | 27 | 28 | 29 | class SchoolData(CirculoData): 30 | 31 | def __download__(self): 32 | """ 33 | Downloads graphs from http://www.sociopatterns.org/datasets/primary-school-cumulative-networks/ 34 | and saves them in a directory data. If data already exists, it will not redownload 35 | the files 36 | """ 37 | 38 | try: 39 | # Probably shouldn't be starting a subprocess, do this with a library 40 | # like urllib2 41 | call(["curl", "--create-dirs","-o", os.path.join(self.raw_data_path, "out_#1.gexf.gz"),"http://www.sociopatterns.org/wp-content/uploads/2014/05/sp_data_school_day_[1-2]_g.gexf_.gz"]) 42 | except Exception as e: 43 | print("cURL failed -- make sure you have cURL, and make sure the site still has the graph data.") 44 | raise(e) 45 | for filename in glob.glob(os.path.join(self.raw_data_path,"*.gz")): 46 | call(["gunzip", filename]) 47 | 48 | 49 | 50 | def __prepare__(self): 51 | 52 | for f in glob.glob(os.path.join(self.raw_data_path, "*.gexf")): 53 | 54 | G = nx.read_gexf(f) 55 | for node in G.node: 56 | for attrib in G.node[node]: 57 | if type(G.node[node][attrib]) == dict: 58 | # graphML doesn't play nice with dictionaries as attributes. 59 | # this line just deletes positional information. 
60 | G.node[node][attrib] = 0 61 | #newFileName = f[:f.rfind('.')] + ".graphml" 62 | nx.write_graphml(G, self.graph_path) 63 | 64 | 65 | def get_ground_truth(self, G): 66 | 67 | class_list = G.vs['classname'] 68 | class_dict = dict.fromkeys(class_list) 69 | 70 | #set the indices for lookup purposes. These will be the cluster ids 71 | for idx, k in enumerate(class_dict): 72 | class_dict[k] = [] 73 | 74 | for student_number, class_id in enumerate(class_list): 75 | class_dict[class_id].append(student_number) 76 | 77 | cluster_list = [] 78 | 79 | for cluster in class_dict.values(): 80 | cluster_list.append(cluster) 81 | 82 | return VertexCover(G, cluster_list) 83 | 84 | 85 | def main(): 86 | databot = SchoolData("school") 87 | databot.get_ground_truth(databot.get_graph()) 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /circulo/data/scotus/README.md: -------------------------------------------------------------------------------- 1 | ## SCOTUS Citation Network 2 | The data can be found at http://jhfowler.ucsd.edu/judicial.htm (see [1]) 3 | 4 | ## Description 5 | The dataset represents the citation graph of the Supreme Court of the United States from 1762-2002, drawn from 534 6 | volumes of the U.S. Reports. 7 | 8 | Graph properties: 9 | - Directed: True 10 | - Weighted: False 11 | - Multigraph: False 12 | 13 | ### Vertices 14 | Each vertex of the graph represents a case argued before the U.S. Supreme Court. 15 | 16 | Attributes: 17 | - caseid: Internal ID used for identifying cases by authors of the dataset. 18 | - usid: ID of the case in the U.S. Supreme Court archives (volume and case number) 19 | - parties: Disputing parties in the case (e.g. 'Marbury v. Madison', 'Brown v. Board of Education of Topeka') 20 | - year: Year the case was argued. 21 | 22 | ### Edges 23 | Edges represent majority opinion citations of previous cases; they are directed and point from citing case to cited case. 24 | 25 | Attributes: none. 26 | 27 | ## Ground Truth 28 | No ground truth exists for this dataset. 29 | 30 | ## Other Notes 31 | * See `run.py` for specific details. 32 | 33 | ## References 34 | - [1] "The Authority of Supreme Court Precedent." James H. Fowler, Sangick Jeon. _Social Neworks_ 30 (1): 16-30 (January 2008) 35 | - [2] "Network Analysis and the Law: Measuring the Legal Importance of Supreme Court Precedents." James H. Fowler, Timothy R. Johnson, James F. Spriggs II, Sangick Jeon, Paul J. Wahlbeck. _Political Analysis,_ 15 (3): 324-346 (July 2007). 
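
## Example
A minimal sketch of loading the prepared graph and grouping cases by decade via the `year` vertex attribute described above. The GraphML path follows the convention in `circulo/data/databot.py`; the decade grouping itself is only an illustration, since this dataset ships no ground truth.

    import igraph
    from collections import defaultdict

    # run.py writes the prepared graph to circulo/data/GRAPHS/scotus.graphml
    G = igraph.load("circulo/data/GRAPHS/scotus.graphml")

    # Group cases by decade using the 'year' vertex attribute (illustrative only)
    by_decade = defaultdict(list)
    for v in G.vs:
        by_decade[int(v["year"]) // 10 * 10].append(v.index)

    print({decade: len(cases) for decade, cases in sorted(by_decade.items())})
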
36 | -------------------------------------------------------------------------------- /circulo/data/scotus/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import os 3 | import csv 4 | from circulo.data.databot import * 5 | 6 | DATA_DOWNLOAD_URL = 'http://jhfowler.ucsd.edu/data/judicial.csv' 7 | VERTEX_DATA_FILE = 'judicial.csv' 8 | 9 | EDGE_DOWNLOAD_URL = 'http://jhfowler.ucsd.edu/data/allcites.txt' 10 | EDGE_DATA_FILE = 'allcites.txt' 11 | 12 | 13 | class SCOTUSData(CirculoData): 14 | 15 | def __download__(self): 16 | self.download_with_notes(DATA_DOWNLOAD_URL) 17 | self.download_with_notes(EDGE_DOWNLOAD_URL) 18 | 19 | def __prepare__(self): 20 | vertex_filename = os.path.join(self.raw_data_path, VERTEX_DATA_FILE) 21 | edge_filename = os.path.join(self.raw_data_path, EDGE_DATA_FILE) 22 | 23 | g = igraph.load(edge_filename) 24 | 25 | vertex_file = open(vertex_filename) 26 | reader = csv.DictReader(vertex_file) 27 | 28 | for case in reader: 29 | caseid = int(case['caseid']) 30 | v = g.vs[caseid] 31 | 32 | v['caseid'] = case['caseid'] 33 | v['usid'] = case['usid'] 34 | v['parties'] = case['parties'] 35 | v['year'] = case['year'] 36 | 37 | vertex_file.close() 38 | 39 | # Case IDs are 1-indexed, so we delete the 0th vertex as it was extraneous. 40 | g.delete_vertices([0]) 41 | g.write_graphml(self.graph_path) 42 | 43 | def main(): 44 | SCOTUSData("scotus").get_graph() 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/README.md: -------------------------------------------------------------------------------- 1 | ## Congress Voting Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Congress voting records from 2014. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a congressperson for whom we have voting data. 16 | 17 | Attributes: 18 | * **name**: Unique identifying id 19 | * **full_name**: Name of the congressperson 20 | * **state**: State represented 21 | * **id**: Unique identifier. In most cases, use "name." 22 | * **party**: Political party. 23 | 24 | ### Edges 25 | There is an edge between two nodes whenever the congresspeople vote together on an issue. The edges are weighted by the number of votes that are shared. 26 | 27 | Attributes: 28 | * **weight**: The number of times the congresspeople on each side of this edge have voted the same way. 29 | 30 | ## Ground Truth 31 | `get_ground_truth` returns a VertexClustering grouped by the parties of the politicians. 32 | 33 | ## Other Notes 34 | * See `run.py` for specific details 35 | 36 | ## References 37 | Thanks to GovTrack.us 38 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ -z "$1" ]; then 5 | echo "Data dir required" 6 | exit 0 7 | fi 8 | 9 | if [ ! 
-d "$1" ]; then 10 | echo "Data dir does not exist" 11 | exit 0 12 | fi 13 | 14 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress/113/votes/2014 $1 > /dev/null 2>&1 15 | 16 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress-legislators/legislators-current.csv $1 > /dev/null 2>&1 17 | 18 | 19 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/exercise.md: -------------------------------------------------------------------------------- 1 | ## Exercise 2 | 3 | __Requirements__ 4 | 5 | - NetworkX Fork from .... 6 | - IPython QtConsole 7 | - Snap from ... 8 | 9 | 10 | 11 | From the iPython QtConsole 12 | 13 | #inline images 14 | import matplotlib 15 | %matplotlib inline 16 | 17 | import os 18 | os.environ['SNAPHOME'] = '/path/to/snap' 19 | 20 | 21 | #set the inline image size to be larger 22 | import matplotlib.pylab as pylab 23 | pylab.rcParams['figure.figsize'] = (14.0, 12.0) 24 | 25 | #ETL the congress voting data 26 | %run parse_congress.py Filter 27 | 28 | import networkx as nx 29 | 30 | #from the senate dir, read in the senate data (you can do the house data too) 31 | G = nx.read_graphml('senate/senate.graphml', node_type=int) 32 | 33 | #set the layout 34 | pos = nx.fruchterman_reingold_layout(G, k=2) 35 | 36 | #create the labels 37 | labels=dict((n, d['name'] + ' ' + d['party']) for n,d in G.nodes(data=True) if d.has_key('party')) 38 | 39 | 40 | nx.draw(G, pos = pos, node_size=60, node_color="red", edge_color="grey", with_labels=True, labels=labels) 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import json 18 | import glob 19 | import csv 20 | import itertools 21 | import os 22 | import igraph 23 | from igraph import VertexCover 24 | from subprocess import call 25 | 26 | from circulo.data.databot import CirculoData 27 | 28 | 29 | class SenateData(CirculoData): 30 | 31 | def __download__(self): 32 | 33 | try: 34 | call(["bash", os.path.join(os.path.dirname(__file__), "download.sh"), self.raw_data_path]) 35 | except Exception as e: 36 | print("rsync failed to retrieve data") 37 | raise(e) 38 | 39 | def __prepare__(self): 40 | ''' 41 | Prepare congress data. 
NOTE: the vertex lookups should be indexed, however this 42 | funciton could prob be sped up by just created a dict with all possible congress pairs 43 | and counting how often they vote together, then at the end creating the edges 44 | ''' 45 | 46 | src_files = os.path.join(self.raw_data_path, "2014", "s*","*.json") 47 | c_type = "sen" 48 | G = igraph.Graph() 49 | 50 | 51 | #first load the vertices 52 | with open(os.path.join(self.raw_data_path, "legislators-current.csv"), 'r') as f: 53 | 54 | csvreader = csv.reader(f,delimiter=',',quotechar='"') 55 | #skip the headers 56 | next(csvreader, None) # skip the headers 57 | for row in csvreader: 58 | 59 | if c_type != row[4]: 60 | continue 61 | elif row[4] == "sen": 62 | congress_id = row[21] 63 | else: 64 | raise("Unidentified congress: {}".format(row[4])) 65 | 66 | G.add_vertex( 67 | congress_id, 68 | full_name="{} {}".format(row[1],row[0]), 69 | party=row[7], 70 | state=row[5] 71 | ) 72 | 73 | 74 | missing_ids = set() 75 | 76 | #now create the edges 77 | for fname in glob.glob(src_files): 78 | with open(fname,'r') as inputfile: 79 | data = json.load(inputfile) 80 | #print("Processing: {}".format(fname)) 81 | for vt in data['votes']: 82 | congress_ids = [n['id'] for n in data['votes'][vt]] 83 | #print(congress_ids) 84 | pairs = itertools.combinations(congress_ids,2) 85 | 86 | for congress_id0, congress_id1 in pairs: 87 | #print("{} {}".format(congress_id0, congress_id1)) 88 | try: 89 | v0 = G.vs.find(congress_id0) 90 | except ValueError as e: 91 | missing_ids.add(congress_id0) 92 | continue 93 | 94 | try: 95 | v1 = G.vs.find(congress_id1) 96 | except ValueError as e: 97 | missing_ids.add(congress_id1) 98 | continue 99 | 100 | e = G.get_eid(v0.index, v1.index, directed=False, error=False) 101 | 102 | if e>=0: 103 | G.es[e]['weight'] += 1 104 | else: 105 | G.add_edge(v0, v1, weight=1) 106 | 107 | print("Ids not found: {}".format(missing_ids)) 108 | 109 | #prune the graph 110 | weights = G.es()['weight'] 111 | threshold = .65 * max(weights) 112 | edges = G.es.select(weight_lt=threshold) 113 | G.delete_edges(edges) 114 | 115 | #make sure that the graph is not disconnected. if so take larger component 116 | components = G.components(mode=igraph.WEAK) 117 | if len(components) > 1: 118 | print("[Graph Prep - Congress]... Disconnected Graph Detected. Using largest component.") 119 | print("[Graph Prep - Congress]... Original graph: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 120 | G = G.subgraph(max(components, key=len)) 121 | print("[Graph Prep - Congress]... 
Largest component: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 122 | 123 | 124 | 125 | G.write_graphml(self.graph_path) 126 | 127 | 128 | def __party_to_cluster__(self, party): 129 | if party == "Democrat": 130 | return 0 131 | elif party == "Republican": 132 | return 1 133 | elif party == "Independent": 134 | return 2 135 | else: 136 | raise("Unknown party affiliation {}".format(party)) 137 | 138 | def get_ground_truth(self, G): 139 | 140 | cluster_list = [[],[],[]] 141 | 142 | for vertex_id, party in enumerate(G.vs['party']): 143 | cluster_list[self.__party_to_cluster__(party)].append(vertex_id) 144 | 145 | return VertexCover(G, cluster_list) 146 | 147 | 148 | def main(): 149 | 150 | databot = SenateData('senate_voting') 151 | G = databot.get_graph() 152 | databot.get_ground_truth(G) 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /circulo/data/southernwomen/README.md: -------------------------------------------------------------------------------- 1 | ## Davis "Southern Women" dataset 2 | 3 | The data can be found at http://nexus.igraph.org/api/dataset_info?id=23&format=html. 4 | 5 | ## Description 6 | The network given represents a bipartite attendance network of 18 Southern women attending 14 social events in the Deep South, collected by Davis et al. in their 7 | book "Deep South." 8 | 9 | Directed: No 10 | 11 | Weighted: No 12 | 13 | Multigraph: No 14 | 15 | ### Vertices 16 | The graph is bipartite. Type 1 vertices represent the 18 women; type 2 vertices represent the 14 social events. 17 | 18 | Attributes: 19 | - name: Name of the woman or event. 20 | 21 | ### Edges 22 | Edges represent attendance by a woman at an event. 23 | 24 | Attributes: None 25 | 26 | ## Ground Truth 27 | No ground truth, although a clustering of the women was generated in later paper by Breiger et al. 28 | 29 | ## Other Notes 30 | * See `run.py` for specific details 31 | 32 | ## References 33 | - Breiger R. (1974). The duality of persons and groups. Social Forces, 53, 181-190. 34 | - Davis, A et al. (1941). Deep South. Chicago: University of Chicago Press. 
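
## Example
A minimal sketch of a one-mode projection (women linked by co-attendance, events linked by shared attendees), assuming the downloaded GraphML carries the boolean `type` vertex attribute that igraph uses to mark the two classes of a bipartite graph. The GraphML path follows the convention in `circulo/data/databot.py`.

    import igraph

    # run.py copies the downloaded GraphML to circulo/data/GRAPHS/southernwomen.graphml
    G = igraph.load("circulo/data/GRAPHS/southernwomen.graphml")

    # Project onto the two vertex classes; which projection holds the women and
    # which holds the events depends on how 'type' is encoded in the file.
    proj_a, proj_b = G.bipartite_projection()
    print(proj_a.summary())
    print(proj_b.summary())
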
35 | -------------------------------------------------------------------------------- /circulo/data/southernwomen/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from circulo.data.databot import * 4 | 5 | DOWNLOAD_URL = "http://nexus.igraph.org/api/dataset?id=23&format=GraphML" 6 | DATA_ID = "southernwomen" 7 | DOWNLOAD_FILE="Davis.GraphML" 8 | 9 | class SouthernWomenData(CirculoData): 10 | 11 | def __download__(self): 12 | self.download_with_notes(DOWNLOAD_URL, progressbar=False, download_file=DOWNLOAD_FILE) 13 | 14 | def __prepare__(self): 15 | shutil.copyfile(os.path.join(self.raw_data_path, DOWNLOAD_FILE), self.graph_path) 16 | 17 | def get_ground_truth(self, G): 18 | raise(NotImplementedError) 19 | 20 | def get_context(self): 21 | return { 22 | CirculoData.CONTEXT_OPTIMAL_PARTITIONS:10 23 | } 24 | 25 | 26 | def main(): 27 | SouthernWomenData(DATA_ID).get_graph() 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /circulo/metrics/graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Goal is to annotate a vertex cover with dictionary representing various cluster metrics 3 | 4 | from igraph import Graph 5 | from circulo.utils.general import aggregate 6 | import circulo.algorithms.min_conductance 7 | 8 | def triangle_participation(G): 9 | ''' 10 | This returns an array indicating whether the ith node in the graph belongs to a triad. 11 | ''' 12 | rv = [False]*G.vcount() 13 | 14 | for u in G.vs(): 15 | if rv[u.index]: 16 | continue 17 | for v in u.neighbors(): 18 | for w in v.neighbors(): 19 | is_triad = u in w.neighbors() 20 | rv[u.index] |= is_triad 21 | rv[v.index] |= is_triad 22 | rv[w.index] |= is_triad 23 | return rv 24 | 25 | def triangle_participation_ratio(G): 26 | ''' 27 | The fraction of nodes in a graph that belong to a triad. 28 | ''' 29 | rv = G.triangle_participation() 30 | return 1.0*sum(rv)/G.vcount() 31 | 32 | def cohesiveness(G, weights=None): 33 | ''' 34 | Equation: g(S) = minS′⊂S φ(S′) where φ(S′) is the conductance of S′ measured in the induced subgraph by S. 35 | To iterate over all possible subgraphs of a community would be too inefficient 2^n, therefore we approximate 36 | the best subgraph (which would have the lowest conductance) by using Local Spectral communitying to find the best 37 | cut 38 | (cite: http://cs.stanford.edu/people/jure/pubs/comscore-icdm12.pdf) 39 | ''' 40 | from circulo.algorithms import spectral 41 | if G.vcount() <= 2: 42 | val = 1 43 | else: 44 | #TODO: Consider using G_i.mincut() instead. 45 | val, vc = G.min_conductance(weights=weights) 46 | return val 47 | 48 | def __helper_m(key_prefix, describe_dict): 49 | dict0 = {} 50 | 51 | for k, v in describe_dict.items(): 52 | new_key = key_prefix + " (" + k + ")" 53 | dict0[new_key] = v 54 | 55 | return dict0 56 | 57 | def compute_metrics(G, refresh = True): 58 | 59 | descriptTLU = 'TLU--Local Clustering Coefficient' 60 | descriptDegree = 'Degree Statistics' 61 | 62 | if refresh or G.metrics == None: 63 | 64 | #we treat a single node graph to have a density of 1 65 | #TODO: This is undefined for multigraphs. 
Prob should simplify if this happens 66 | density = G.density() if G.vcount() > 1 else 1.0 67 | 68 | G.metrics = { 69 | 'Internal Number Nodes' : G.vcount(), 70 | 'Internal Number Edges' : G.ecount(), 71 | 'Density' : density, 72 | 'Diameter' : G.diameter(), 73 | 'Cohesiveness' : G.cohesiveness(), 74 | 'Triangle Participation Ratio' : G.triangle_participation_ratio(), 75 | 'Transitivity Undirected (Global Clustering Coefficient)' 76 | : G.transitivity_undirected(mode='zero') 77 | } 78 | G.metrics.update(aggregate(G.transitivity_local_undirected(mode='zero'), prefix=descriptTLU)) 79 | G.metrics.update(aggregate(G.degree(), prefix=descriptDegree)) 80 | 81 | Graph.metrics = None 82 | Graph.compute_metrics = compute_metrics 83 | Graph.cohesiveness = cohesiveness 84 | Graph.triangle_participation = triangle_participation 85 | Graph.triangle_participation_ratio = triangle_participation_ratio 86 | -------------------------------------------------------------------------------- /circulo/metrics/omega.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import scipy.sparse as sp 4 | 5 | from igraph import Graph, VertexCover 6 | 7 | def __reset_diagonal(A, sparse): 8 | ''' 9 | input: matrix 10 | ouput: matrix object with diagonals set to 0 11 | ''' 12 | 13 | if sparse: 14 | A = A - sp.dia_matrix((A.diagonal()[scipy.newaxis, :], [0]), shape=A.shape) 15 | else: 16 | A = A.copy() 17 | np.fill_diagonal(A, 0) 18 | return A 19 | 20 | def __get_diagonal(A, sparse): 21 | ''' 22 | input: Matrix 23 | output: vector with the diagonal entries 24 | ''' 25 | if sparse: 26 | return A.diagonal() 27 | else: 28 | return np.diag(A) 29 | 30 | 31 | def __get_matrix(vc, sparse): 32 | ''' 33 | inputs: List of lists (vertexCover) object 34 | output: Node x Node matrix with the cell values indicating the number of clusters 35 | each pair of nodes shares 36 | ''' 37 | n = len(vc) # number of nodes 38 | nc = max([max(i) for i in vc if i]) + 1 # number of clusters 39 | 40 | create_zero_matrix = sp.csr_matrix if sparse else np.zeros 41 | A = create_zero_matrix((n,n), dtype='int') 42 | for i in range(nc): 43 | # Create a Clique from Membership 44 | v = np.matrix([ (i in m)*1 for m in vc]) 45 | if sparse: 46 | v = sp.csr_matrix(v) 47 | Ai = v.T*v 48 | A = A+Ai 49 | # DO NOT ZERO THE DIAGONALS HERE, __get_omega_e depends on them. 
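    # At this point A[i][j] counts the clusters shared by nodes i and j, and the
    # diagonal A[i][i] is the number of clusters containing node i. __get_omega_e
    # reads the largest diagonal entry of the two matrices as k, an upper bound on
    # how many clusters any pair can share, and iterates agreement levels 1..k.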
50 | return A.tocsr() if sparse else A 51 | 52 | def __get_omega_u(A1, A2, sparse): 53 | ''' 54 | inputs: Two __get_matrix results 55 | outputs: un-adjusted omega score 56 | 57 | ''' 58 | n = A1.shape[0] 59 | M = n*(n-1)/2.0 60 | notA = __reset_diagonal((A1 != A2), sparse) 61 | rv = n*(n-1) - notA.sum() 62 | return rv/(2*M) 63 | 64 | def __get_omega_e(A1, A2, sparse): 65 | ''' 66 | inputs: Two __get_matrix results 67 | outputs: expected omega score 68 | 69 | ''' 70 | n = A1.shape[0] 71 | M = n*(n-1)/2.0 72 | k = max(max((__get_diagonal(A1, sparse))), max(__get_diagonal(A2, sparse))) 73 | 74 | # The 0th iteration is done with a negation since it is a sparse matrix 75 | t_not0_1 = __reset_diagonal((A1 != 0), sparse) 76 | t_not0_2 = __reset_diagonal((A2 != 0), sparse) 77 | rv = n*(n-1) - t_not0_1.sum() 78 | rv *= n*(n-1) - t_not0_2.sum() 79 | for i in range(1, k+1): 80 | t_i_1 = __reset_diagonal((A1 == i), sparse) 81 | t_i_2 = __reset_diagonal((A2 == i), sparse) 82 | 83 | rv += t_i_1.sum()*t_i_2.sum() 84 | rv /= (2*M)**2 85 | return rv; 86 | 87 | def omega_index(cover_membership_a, cover_membership_b, sparse=True): 88 | ''' 89 | Uses the Omega Index metrics to compare two covers of a given domain, e.g. a Graph. 90 | @param cover_membership_a : A list of vertex to membership list. 91 | Example - a = [[0,1],[1],[0,2]] 92 | @param cover_membership_b : A list of vertex to membership list. 93 | @returns: Best match = 1, No match = 0 94 | ''' 95 | 96 | A1 = __get_matrix(cover_membership_a, sparse) 97 | A2 = __get_matrix(cover_membership_b, sparse) 98 | omega_u = __get_omega_u(A1, A2, sparse) 99 | omega_e = __get_omega_e(A1, A2, sparse) 100 | 101 | return (omega_u - omega_e)/(1-omega_e) 102 | 103 | -------------------------------------------------------------------------------- /circulo/metrics/probability_metric.py: -------------------------------------------------------------------------------- 1 | import igraph as ig 2 | import statistics 3 | 4 | 5 | def probability_metric_score(G, clusters): 6 | """ 7 | Returns the mean of all of the cluster's scores under the probability metric 8 | defined below. Contains much less information than probability_metric_graph, 9 | but allows for a single number with which one can compare clustering algorithms. 10 | """ 11 | l = [p[0] for p in probability_metric_graph(G, clusters)] 12 | return statistics.mean(l); 13 | 14 | def probability_metric_graph(G, clusters): 15 | """ 16 | Calculates the probability metric on the graph G for each cluster in 17 | clusters. Returns a list of 3-tuples [(a, b, c),...] where a is the mean, 18 | b the standard deviation, and c the variance, indexed by cluster id. 19 | 20 | This metric measures how likely a particle placed on some vertex will stay within 21 | the original community after n random steps, where n is the number of vertices in 22 | the community (or some other, better value for normalization). 23 | 24 | This returns a list [(mean, variance)_0, (mean, variance)_1..., (mean, variance)_n] 25 | where the statistics within each cluster are represented by its index in the list. 26 | 27 | A high mean suggests strong community structure, but a high variance suggests that 28 | a few objects in the community might be outliers and not necessarily belong. 29 | 30 | Currently, we use the size of the community as the number of steps taken, but this 31 | isn't based on much theory. Some analysis is necessary to find a better choice for n. 
32 | 33 | If you want one "score" to score the clustering instead of individual communities, 34 | call probability_metric_score instead. 35 | """ 36 | fullStats = [] 37 | for cluster in clusters: 38 | fullStats.append(probability_metric_cluster(G, cluster)) 39 | return fullStats 40 | 41 | 42 | def probability_metric_cluster(G, members): 43 | """ 44 | Given the members of a cluster and the graph they belong to, finds 45 | the cluster's mean, standard deviation, and variance. 46 | 47 | Note: n currently defaults to the number of members in the community. 48 | TODO: testing, to find out whether this is a legitimate normalization. 49 | """ 50 | nMembers = len(members) # figure out if this is a good normalization 51 | # numVertices = G.vcount(); 52 | # normalization = nMembers / numVertices 53 | data = [p_in_after_n(G, v, nMembers, members) for v in members] 54 | mean = statistics.mean(data) # could divide mean by normalization 55 | var = statistics.pvariance(data, mu=mean) 56 | return mean, var 57 | 58 | 59 | def p_in_after_n(G, v, n, comm): 60 | """ 61 | Finds the probability that a particle will remain 62 | within the community during every step of an 63 | n step random walk, beginning from v. At each step, 64 | the particle is equally likely to travel to any 65 | of its neighbors. 66 | 67 | TODO: use weights 68 | """ 69 | return p_in_after_n_r_cached(G, v, n, set(comm), {}) 70 | 71 | 72 | def p_in_after_n_r_cached(G, v, n, comm, cache): 73 | """ 74 | Memoized, recursive implementation of p_in_after_n. 75 | Internal function. 76 | """ 77 | if (v, n) in cache: 78 | # memoized, don't recurse 79 | return cache[(v, n)] 80 | if v not in comm: 81 | # left the community 82 | return 0 83 | 84 | neighbors = set(G.neighbors(v)) 85 | numNeighbors = float(len(neighbors)) 86 | 87 | if n == 1: 88 | # Second to last step of the possible recursion, 89 | # short circuit so we don't have to recurse down to 0 90 | return len(neighbors & comm) / numNeighbors 91 | 92 | totalP = 0. 
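# Recursive case: the particle moves to each neighbor with equal probability (1/numNeighbors),
# so sum each neighbor's probability of staying inside the community for the remaining n-1 steps.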
93 | for neighbor in neighbors: 94 | pGivenNeighbor = p_in_after_n_r_cached(G, neighbor, n-1, comm, cache) 95 | cache[(neighbor, n-1)] = pGivenNeighbor 96 | totalP += 1/numNeighbors * pGivenNeighbor 97 | cache[(v, n)] = totalP 98 | return totalP 99 | -------------------------------------------------------------------------------- /circulo/setup/run_metrics.py: -------------------------------------------------------------------------------- 1 | # Now cluster the clusters 2 | from circulo import metrics 3 | from sklearn import metrics as skmetrics 4 | import numpy as np 5 | import pickle 6 | import argparse 7 | import os 8 | import glob 9 | import json 10 | from igraph import VertexCover 11 | import importlib 12 | import circulo.metrics.cover 13 | import multiprocessing 14 | import time 15 | import signal 16 | import os 17 | import errno 18 | import traceback 19 | import sys 20 | from collections import namedtuple 21 | import inspect 22 | from circulo.data.databot import CirculoData 23 | 24 | 25 | Worker = namedtuple('Worker', 'json_path output_path timeout') 26 | 27 | def main(): 28 | 29 | parser = argparse.ArgumentParser(description='Compute metrics for given cover.') 30 | parser.add_argument('input_path', type=str, help='file or directory containing results') 31 | parser.add_argument('output_path', type=str, help='output directory to write metric files') 32 | parser.add_argument('--workers', type=int, default=None, help='Number of workers to process (DEFAULT: number of processors)') 33 | parser.add_argument('--timeout', type=int, default=3600, help="timeout for a work item in seconds (DEFAULT: 3600)") 34 | args = parser.parse_args() 35 | 36 | if not os.path.exists(args.input_path): 37 | print("Path \"{}\" does not exist".format(args.input_path)) 38 | return 39 | 40 | if not os.path.exists(args.output_path): 41 | os.makedirs(args.output_path) 42 | 43 | workers = [] 44 | 45 | if os.path.isdir(args.input_path): 46 | for f in glob.glob(os.path.join(args.input_path, '*.json')): 47 | workers.append(Worker(f, args.output_path, args.timeout)) 48 | else: 49 | workers.append(Worker(args.input_path, args.output_path, args.timeout)) 50 | 51 | if args.workers is not None: 52 | pool = multiprocessing.Pool(processes = args.workers) 53 | else: 54 | pool = multiprocessing.Pool() 55 | 56 | r = pool.map_async(analyze_json, workers) 57 | r.get() #must call in order to get error from inside the child processes 58 | pool.close() 59 | pool.join() 60 | 61 | class TimeoutError(Exception): 62 | pass 63 | 64 | def __handle_timeout(signum, frame): 65 | raise TimeoutError(os.strerror(errno.ETIME)) 66 | 67 | 68 | def analyze_json(worker): 69 | 70 | signal.signal(signal.SIGALRM, __handle_timeout) 71 | signal.setitimer(signal.ITIMER_REAL, worker.timeout) 72 | t0 = time.time() 73 | 74 | 75 | data = None 76 | 77 | with open(worker.json_path) as f: 78 | data = json.load(f) 79 | 80 | if(data is None): 81 | print("No data found for ", worker.json_path) 82 | return 83 | 84 | print("###### Running metrics against " + data['job_name']) 85 | #load the graph and ground truth in 86 | data_mod = importlib.import_module('circulo.data.'+data['dataset']+'.run') 87 | 88 | instance = None 89 | 90 | for name,cls in inspect.getmembers(data_mod): 91 | if inspect.isclass(cls) and issubclass(cls, CirculoData) and name != "CirculoData": 92 | instance = cls(data['dataset']) 93 | 94 | if instance == None: 95 | print("Unable to find data module for ", data['dataset']) 96 | return 97 | 98 | G = instance.get_graph() 99 | 100 | #apply similar 
alterations as were done with the algos 101 | alterations = data['alterations'] 102 | 103 | if len(alterations) > 0: 104 | if "weighted" in alterations: 105 | G.es()['weight'] = 1 106 | 107 | if "undirected" in alterations: 108 | G.to_undirected(combine_edges={'weight':sum}) 109 | 110 | if "simple" in alterations: 111 | G.simplify(combine_edges={'weight':sum}) 112 | 113 | if "pruned" in alterations: 114 | instance.prune(G) 115 | 116 | 117 | weights = 'weight' if G.is_weighted() else None 118 | #some datasets might not have ground truth 119 | try: 120 | vc = instance.get_ground_truth(G) 121 | ground_truth_cover = cover_from_membership( vc.membership, G) 122 | except Exception as e: 123 | print("\t++NOTE for ", data['dataset'], ": Ground Truth Not Available") 124 | ground_truth_cover = None 125 | 126 | results_cover = cover_from_membership(data['membership'], G) 127 | 128 | try: 129 | t0 = time.time() 130 | #results are currently stored within the cover object 131 | results_cover.compute_metrics(weights=weights, ground_truth_cover=ground_truth_cover ) 132 | except TimeoutError as t: 133 | print("\t+Timeout ERROR: was analyzing: ", data['job_name']) 134 | signal.alarm(0) 135 | return 136 | except Exception as e: 137 | print(e) 138 | traceback.print_exc(file=sys.stdout) 139 | return 140 | out_dict = { 141 | "name" : data['job_name'], 142 | "elapsed" :data['elapsed'], 143 | "membership" : data['membership'], 144 | "omega": results_cover.compare_omega(ground_truth_cover), 145 | "metrics": results_cover.metrics, 146 | "metrics_elapsed": (time.time() - t0) 147 | } 148 | 149 | 150 | try: 151 | 152 | full_path = os.path.join(worker.output_path,data['job_name'] + ".json") 153 | with open(full_path, 'w') as outfile: 154 | json.dump(out_dict, outfile) 155 | except Exception as e: 156 | traceback.print_exc(file=sys.stdout) 157 | print(e) 158 | 159 | def cover_from_membership(membership, G): 160 | 161 | if(membership is None): 162 | return None 163 | 164 | cluster_dict = {} 165 | 166 | for vertex_id, cluster_id_list in enumerate(membership): 167 | for cluster_id in cluster_id_list: 168 | if(cluster_id not in cluster_dict): 169 | cluster_dict[cluster_id] = [] 170 | cluster_dict[cluster_id].append(vertex_id) 171 | 172 | return VertexCover(G, [v for v in cluster_dict.values()]) 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /circulo/unit_tests/karate.gml: -------------------------------------------------------------------------------- 1 | Creator "Mark Newman on Fri Jul 21 12:39:27 2006" 2 | graph 3 | [ 4 | node 5 | [ 6 | id 1 7 | ] 8 | node 9 | [ 10 | id 2 11 | ] 12 | node 13 | [ 14 | id 3 15 | ] 16 | node 17 | [ 18 | id 4 19 | ] 20 | node 21 | [ 22 | id 5 23 | ] 24 | node 25 | [ 26 | id 6 27 | ] 28 | node 29 | [ 30 | id 7 31 | ] 32 | node 33 | [ 34 | id 8 35 | ] 36 | node 37 | [ 38 | id 9 39 | ] 40 | node 41 | [ 42 | id 10 43 | ] 44 | node 45 | [ 46 | id 11 47 | ] 48 | node 49 | [ 50 | id 12 51 | ] 52 | node 53 | [ 54 | id 13 55 | ] 56 | node 57 | [ 58 | id 14 59 | ] 60 | node 61 | [ 62 | id 15 63 | ] 64 | node 65 | [ 66 | id 16 67 | ] 68 | node 69 | [ 70 | id 17 71 | ] 72 | node 73 | [ 74 | id 18 75 | ] 76 | node 77 | [ 78 | id 19 79 | ] 80 | node 81 | [ 82 | id 20 83 | ] 84 | node 85 | [ 86 | id 21 87 | ] 88 | node 89 | [ 90 | id 22 91 | ] 92 | node 93 | [ 94 | id 23 95 | ] 96 | node 97 | [ 98 | id 24 99 | ] 100 | node 101 | [ 102 | id 25 103 | ] 104 | node 105 | [ 106 | id 26 107 | ] 108 | node 109 | [ 110 
| id 27 111 | ] 112 | node 113 | [ 114 | id 28 115 | ] 116 | node 117 | [ 118 | id 29 119 | ] 120 | node 121 | [ 122 | id 30 123 | ] 124 | node 125 | [ 126 | id 31 127 | ] 128 | node 129 | [ 130 | id 32 131 | ] 132 | node 133 | [ 134 | id 33 135 | ] 136 | node 137 | [ 138 | id 34 139 | ] 140 | edge 141 | [ 142 | source 2 143 | target 1 144 | ] 145 | edge 146 | [ 147 | source 3 148 | target 1 149 | ] 150 | edge 151 | [ 152 | source 3 153 | target 2 154 | ] 155 | edge 156 | [ 157 | source 4 158 | target 1 159 | ] 160 | edge 161 | [ 162 | source 4 163 | target 2 164 | ] 165 | edge 166 | [ 167 | source 4 168 | target 3 169 | ] 170 | edge 171 | [ 172 | source 5 173 | target 1 174 | ] 175 | edge 176 | [ 177 | source 6 178 | target 1 179 | ] 180 | edge 181 | [ 182 | source 7 183 | target 1 184 | ] 185 | edge 186 | [ 187 | source 7 188 | target 5 189 | ] 190 | edge 191 | [ 192 | source 7 193 | target 6 194 | ] 195 | edge 196 | [ 197 | source 8 198 | target 1 199 | ] 200 | edge 201 | [ 202 | source 8 203 | target 2 204 | ] 205 | edge 206 | [ 207 | source 8 208 | target 3 209 | ] 210 | edge 211 | [ 212 | source 8 213 | target 4 214 | ] 215 | edge 216 | [ 217 | source 9 218 | target 1 219 | ] 220 | edge 221 | [ 222 | source 9 223 | target 3 224 | ] 225 | edge 226 | [ 227 | source 10 228 | target 3 229 | ] 230 | edge 231 | [ 232 | source 11 233 | target 1 234 | ] 235 | edge 236 | [ 237 | source 11 238 | target 5 239 | ] 240 | edge 241 | [ 242 | source 11 243 | target 6 244 | ] 245 | edge 246 | [ 247 | source 12 248 | target 1 249 | ] 250 | edge 251 | [ 252 | source 13 253 | target 1 254 | ] 255 | edge 256 | [ 257 | source 13 258 | target 4 259 | ] 260 | edge 261 | [ 262 | source 14 263 | target 1 264 | ] 265 | edge 266 | [ 267 | source 14 268 | target 2 269 | ] 270 | edge 271 | [ 272 | source 14 273 | target 3 274 | ] 275 | edge 276 | [ 277 | source 14 278 | target 4 279 | ] 280 | edge 281 | [ 282 | source 17 283 | target 6 284 | ] 285 | edge 286 | [ 287 | source 17 288 | target 7 289 | ] 290 | edge 291 | [ 292 | source 18 293 | target 1 294 | ] 295 | edge 296 | [ 297 | source 18 298 | target 2 299 | ] 300 | edge 301 | [ 302 | source 20 303 | target 1 304 | ] 305 | edge 306 | [ 307 | source 20 308 | target 2 309 | ] 310 | edge 311 | [ 312 | source 22 313 | target 1 314 | ] 315 | edge 316 | [ 317 | source 22 318 | target 2 319 | ] 320 | edge 321 | [ 322 | source 26 323 | target 24 324 | ] 325 | edge 326 | [ 327 | source 26 328 | target 25 329 | ] 330 | edge 331 | [ 332 | source 28 333 | target 3 334 | ] 335 | edge 336 | [ 337 | source 28 338 | target 24 339 | ] 340 | edge 341 | [ 342 | source 28 343 | target 25 344 | ] 345 | edge 346 | [ 347 | source 29 348 | target 3 349 | ] 350 | edge 351 | [ 352 | source 30 353 | target 24 354 | ] 355 | edge 356 | [ 357 | source 30 358 | target 27 359 | ] 360 | edge 361 | [ 362 | source 31 363 | target 2 364 | ] 365 | edge 366 | [ 367 | source 31 368 | target 9 369 | ] 370 | edge 371 | [ 372 | source 32 373 | target 1 374 | ] 375 | edge 376 | [ 377 | source 32 378 | target 25 379 | ] 380 | edge 381 | [ 382 | source 32 383 | target 26 384 | ] 385 | edge 386 | [ 387 | source 32 388 | target 29 389 | ] 390 | edge 391 | [ 392 | source 33 393 | target 3 394 | ] 395 | edge 396 | [ 397 | source 33 398 | target 9 399 | ] 400 | edge 401 | [ 402 | source 33 403 | target 15 404 | ] 405 | edge 406 | [ 407 | source 33 408 | target 16 409 | ] 410 | edge 411 | [ 412 | source 33 413 | target 19 414 | ] 415 | edge 416 | [ 417 | source 33 418 | target 21 419 | ] 420 | edge 421 | [ 
422 | source 33 423 | target 23 424 | ] 425 | edge 426 | [ 427 | source 33 428 | target 24 429 | ] 430 | edge 431 | [ 432 | source 33 433 | target 30 434 | ] 435 | edge 436 | [ 437 | source 33 438 | target 31 439 | ] 440 | edge 441 | [ 442 | source 33 443 | target 32 444 | ] 445 | edge 446 | [ 447 | source 34 448 | target 9 449 | ] 450 | edge 451 | [ 452 | source 34 453 | target 10 454 | ] 455 | edge 456 | [ 457 | source 34 458 | target 14 459 | ] 460 | edge 461 | [ 462 | source 34 463 | target 15 464 | ] 465 | edge 466 | [ 467 | source 34 468 | target 16 469 | ] 470 | edge 471 | [ 472 | source 34 473 | target 19 474 | ] 475 | edge 476 | [ 477 | source 34 478 | target 20 479 | ] 480 | edge 481 | [ 482 | source 34 483 | target 21 484 | ] 485 | edge 486 | [ 487 | source 34 488 | target 23 489 | ] 490 | edge 491 | [ 492 | source 34 493 | target 24 494 | ] 495 | edge 496 | [ 497 | source 34 498 | target 27 499 | ] 500 | edge 501 | [ 502 | source 34 503 | target 28 504 | ] 505 | edge 506 | [ 507 | source 34 508 | target 29 509 | ] 510 | edge 511 | [ 512 | source 34 513 | target 30 514 | ] 515 | edge 516 | [ 517 | source 34 518 | target 31 519 | ] 520 | edge 521 | [ 522 | source 34 523 | target 32 524 | ] 525 | edge 526 | [ 527 | source 34 528 | target 33 529 | ] 530 | ] 531 | -------------------------------------------------------------------------------- /circulo/unit_tests/metrics.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | import numpy as np 4 | import circulo.metrics 5 | import igraph 6 | 7 | import importlib 8 | import inspect 9 | from circulo.data.databot import CirculoData 10 | from circulo.setup.run_metrics import cover_from_membership 11 | import circulo.metrics.cover 12 | 13 | 14 | class TestMetrics(unittest.TestCase): 15 | def setUp(self): 16 | DATASET='karate' 17 | #load the graph and ground truth in 18 | data_mod = importlib.import_module('circulo.data.'+DATASET+'.run') 19 | 20 | instance = None 21 | 22 | for name,cls in inspect.getmembers(data_mod): 23 | if inspect.isclass(cls) and issubclass(cls, CirculoData) and name != "CirculoData": 24 | instance = cls(DATASET) 25 | 26 | self.G = instance.get_graph() 27 | membership=[[0,1,2,3,7,11,12,13,17,19,21], 28 | [4,5,6,10,16], 29 | [8,9,14,15,18,20,22,23,24,25,26,27,28,29,30,31,32,33]] 30 | self.weights=[5,7,4,5,8,7,2,1,1,6,7,4,9,6,8,2,2,1,2,5,6,5,7,7,3,4,4,6,7,7,5,7,4,8,5,4,5,3,1,6,4,3,3,3,1,6,2,7,8,8,1,7,5,7,5,4,7,3,7,5,8,9,4,2,8,8,6,3,6,6,8,5,6,7,5,7,7,7] 31 | self.G.es['weight'] = self.weights 32 | 33 | self.cover = circulo.metrics.cover.VertexCover(self.G, membership) 34 | 35 | def test_internaldensity(self): 36 | #doesn't apply to weighted graphs 37 | truth = [.4181818, .6, .22875817] 38 | 39 | #Density is an igraph ``metric'' 40 | test = [ s.density() for s in self.cover.subgraphs()] 41 | self.assertListAlmostEquals(truth, test, 2) 42 | 43 | def test_avgdegree(self): 44 | truth = [4.181818182, 2.4, 3.8888889] 45 | 46 | # Average degree is an igraph + python method 47 | from scipy import mean 48 | test = [ mean(s.degree()) for s in self.cover.subgraphs() ] 49 | self.assertListAlmostEquals(truth, test, 2) 50 | 51 | def test_wavgdegree(self): 52 | truth = [24, 15.2, 23.3333334] 53 | 54 | # Average degree is an igraph + python method 55 | from scipy import mean 56 | test = [ mean(s.strength(weights='weight')) for s in self.cover.subgraphs() ] 57 | self.assertListAlmostEquals(truth, test, 2) 58 | 59 | def test_FOMD(self): 60 | truth = [0.545454545, 0, 
0.277777778] 61 | 62 | test = circulo.metrics.cover.fomd(self.cover) 63 | self.assertListAlmostEquals(truth, test, 2) 64 | 65 | def test_WFOMD(self): 66 | truth = [0.545454545, 0.4 , 0.388888889] 67 | 68 | test = circulo.metrics.cover.fomd(self.cover, weights='weight') 69 | self.assertListAlmostEquals(truth, test, 2) 70 | 71 | def test_expansion(self): 72 | truth = [1.272727, 0.8, 0.555556] 73 | 74 | test = self.cover.expansion() 75 | self.assertListAlmostEquals(truth, test, 2) 76 | 77 | def test_wexpansion(self): 78 | truth = [2.181818, 1.2, 1] 79 | 80 | test = self.cover.expansion(weights='weight') 81 | self.assertListAlmostEquals(truth, test, 2) 82 | 83 | def test_cutratio(self): 84 | #not applicable to weighted graphs 85 | truth = [.05534,.02759,.03472,] 86 | 87 | test = circulo.metrics.cover.cut_ratio(self.cover, allow_nan=True) 88 | self.assertListAlmostEquals(truth, test, 2) 89 | 90 | def test_conductance(self): 91 | truth = [0.2333333,0.25, 0.125] 92 | 93 | test = self.cover.conductance() 94 | self.assertListAlmostEquals(truth, test, 2) 95 | 96 | def test_wconductance(self): 97 | truth = [0.083333, 0.0731707, 0.0410959] 98 | 99 | test = self.cover.conductance(weights='weight') 100 | self.assertListAlmostEquals(truth, test, 2) 101 | 102 | def test_normalizedcut(self): 103 | truth = [0.346236559, 0.277027027, 0.229166667] 104 | 105 | test = self.cover.normalized_cut() 106 | self.assertListAlmostEquals(truth, test, 2) 107 | 108 | def test_wnormalizedcut(self): 109 | truth = [0.125586854, 0.081300813, 0.085430866] 110 | 111 | test = self.cover.normalized_cut(weights='weight') 112 | self.assertListAlmostEquals(truth, test, 2) 113 | 114 | def test_TPR(self): 115 | #same for weighted and unweighted graphs 116 | truth = [0.9091,0.6, 0.9444444] 117 | 118 | test = [ s.triangle_participation_ratio() 119 | for s in self.cover.subgraphs() ] 120 | self.assertListAlmostEquals(truth, test, 2) 121 | 122 | def test_MaxODF(self): 123 | truth = [.5,0.3333333, 0.5 ] 124 | 125 | test = circulo.metrics.cover.maximum_out_degree_fraction(self.cover) 126 | self.assertListAlmostEquals(truth, test, 2) 127 | 128 | def test_WMaxODF(self): 129 | truth = [0.222222222, 0.153846154, 0.2] 130 | 131 | test = self.cover.maximum_out_degree_fraction(weights='weight') 132 | self.assertListAlmostEquals(truth, test, 2) 133 | 134 | def test_avgODF(self): 135 | truth = [0.138131313, 0.233333333, 0.117592593] 136 | 137 | test = self.cover.average_out_degree_fraction() 138 | self.assertListAlmostEquals(truth, test, 2) 139 | 140 | def test_wavgODF(self): 141 | truth = [0.064922913, 0.080586081, 0.041399798] 142 | 143 | test = self.cover.average_out_degree_fraction(weights='weight') 144 | self.assertListAlmostEquals(truth, test, 2) 145 | 146 | def test_FlakeODF(self): 147 | truth = [0,0,0] 148 | 149 | test = circulo.metrics.cover.flake_out_degree_fraction(self.cover) 150 | #test = self.cover.flake_out_degree_fraction() 151 | self.assertListAlmostEquals(truth, test, 2) 152 | 153 | def test_WFLakeODF(self): 154 | truth = [0,0,0] 155 | 156 | test = circulo.metrics.cover.flake_out_degree_fraction(self.cover, weights='weight') 157 | self.assertListAlmostEquals(truth, test, 2) 158 | 159 | def test_separability(self): 160 | truth = [1.6428571,1.5, 3.5] 161 | 162 | test = circulo.metrics.cover.separability(self.cover) 163 | self.assertListAlmostEquals(truth, test, 2) 164 | 165 | def test_wseparability(self): 166 | truth = [5.5, 6.3333333333, 11.666666667] 167 | 168 | test = self.cover.separability(weights='weight') 169 | 
self.assertListAlmostEquals(truth, test, 2) 170 | 171 | def test_localclusteringcoefficient(self): 172 | #This averages the local clustering coefficient 173 | #Results are the same for weighted and unweighted graphs 174 | 175 | truth = [0.75310245, 0.33333333, 0.65153920] 176 | 177 | # Local Clustering Coeff is an igraph function 178 | from scipy import mean 179 | test = [ mean(s.transitivity_local_undirected(mode='zero')) 180 | for s in self.cover.subgraphs() ] 181 | self.assertListAlmostEquals(truth, test, 2) 182 | 183 | def test_cohesiveness(self): 184 | # TODO: Calculate cohesiveness "truth" cohesiveness truth 185 | self.skipTest("Not sure what truth values for this should be, skipping for now") 186 | truth = [] 187 | 188 | test = [ s.cohesiveness() for s in self.cover.subgraphs() ] 189 | 190 | self.assertListAlmostEquals(truth, test, 2) 191 | 192 | def assertListAlmostEquals(self, a, b, places=None, msg=None): 193 | self.assertEquals(np.round(a,places).tolist(), 194 | np.round(b,places).tolist(), msg=msg) 195 | 196 | 197 | if __name__ == '__main__' : 198 | unittest.main() 199 | 200 | -------------------------------------------------------------------------------- /circulo/unit_tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | import igraph 4 | from circulo.metrics import VertexCoverMetric 5 | 6 | class TestMetrics(unittest.TestCase): 7 | def setUp(self): 8 | self.G=igraph.load("karate.gml") 9 | 10 | membership=[ 11 | [0,1,2,3,7,11,12,13,17,19,21], 12 | [4,5,6,10,16], 13 | [8,9,14,15,18,20,22,23,24,25,26,27,28,29,30,31,32,33]] 14 | cover=igraph.VertexCover(self.G, membership) 15 | metrics=VertexCoverMetric.run_analysis(cover, weights=None) 16 | metrics.report() 17 | self.comm_metrics=metrics.comm_metrics 18 | 19 | def test_density(self): 20 | self.assertEqual(round(.4181818, 2), round(self.comm_metrics[0].density, 2)) 21 | self.assertEqual(round(.6, 2), round(self.comm_metrics[1].density,2)) 22 | self.assertEqual(round(.22875817, 2), round(self.comm_metrics[2].density,2)) 23 | 24 | def test_avgdegree(self): 25 | self.assertEqual(round(4.181818182, 2), round(self.comm_metrics[0].degree_avg,2)) 26 | self.assertEqual(round(2.4, 2), round(self.comm_metrics[1].degree_avg,2)) 27 | self.assertEqual(round(3.8888889,2), round(self.comm_metrics[2].degree_avg,2)) 28 | 29 | def test_FOMD(self): 30 | self.assertEqual(round(0.545454545,2), round(self.comm_metrics[0].fomd, 2)) 31 | self.assertEqual(round(0, 2), round(self.comm_metrics[1].fomd, 2)) 32 | self.assertEqual(round(0.277777778,2), round(self.comm_metrics[2].fomd,2)) 33 | 34 | def test_expansion(self): 35 | self.assertEqual(round(1.272727, 2), round(self.comm_metrics[0].degree_boundary_avg, 2)) 36 | self.assertEqual(round(0.8, 2), round(self.comm_metrics[1].degree_boundary_avg, 2)) 37 | self.assertEqual(round(0.555556, 2), round(self.comm_metrics[2].degree_boundary_avg,2)) 38 | 39 | def test_cutratio(self): 40 | self.assertEqual(round(.05534, 2), round(self.comm_metrics[0].cut_ratio, 2)) 41 | self.assertEqual(round(.02759, 2), round(self.comm_metrics[1].cut_ratio, 2)) 42 | self.assertEqual(round(.03472, 2), round(self.comm_metrics[2].cut_ratio, 2)) 43 | 44 | def test_conductance(self): 45 | self.assertEqual(round(0.2333333,2), round(self.comm_metrics[0].conductance,2)) 46 | self.assertEqual(round(0.25,2), round(self.comm_metrics[1].conductance,2)) 47 | self.assertEqual(round(0.125,2), round(self.comm_metrics[2].conductance,2)) 48 | 49 | def 
test_normalizedcut(self): 50 | self.assertEqual(round(0.346236559,2), round(self.comm_metrics[0].normalized_cut,2)) 51 | self.assertEqual(round(0.277027027,2), round(self.comm_metrics[1].normalized_cut,2)) 52 | self.assertEqual(round(0.229166667, 2), round(self.comm_metrics[2].normalized_cut,2)) 53 | 54 | def test_TPR(self): 55 | self.assertEqual(round(0.9091, 2), round(self.comm_metrics[0].tpr[1], 2)) 56 | self.assertEqual(round(0.6, 2), round(self.comm_metrics[1].tpr[1], 2)) 57 | self.assertEqual(round(0.9444, 2), round(self.comm_metrics[2].tpr[1], 2)) 58 | 59 | def test_MaxODF(self): 60 | self.assertEqual(round(0.5,2), round(self.comm_metrics[0].odf_dict["max"], 2)) 61 | self.assertEqual(round(0.3333333,2), round(self.comm_metrics[1].odf_dict["max"], 2)) 62 | self.assertEqual(round(0.5, 2), round(self.comm_metrics[2].odf_dict["max"], 2)) 63 | 64 | def test_avgODF(self): 65 | self.assertEqual(round(0.138131313,2), round(self.comm_metrics[0].odf_dict["average"], 2)) 66 | self.assertEqual(round(0.233333333,2), round(self.comm_metrics[1].odf_dict["average"], 2)) 67 | self.assertEqual(round(0.117592593, 2), round(self.comm_metrics[2].odf_dict["average"], 2)) 68 | 69 | def test_FlakeODF(self): 70 | self.assertEqual(round(0, 2), round(self.comm_metrics[0].odf_dict["flake"], 2)) 71 | self.assertEqual(round(0, 2), round(self.comm_metrics[1].odf_dict["flake"], 2)) 72 | self.assertEqual(round(0, 2), round(self.comm_metrics[2].odf_dict["flake"], 2)) 73 | 74 | def test_separability(self): 75 | self.assertEqual(round(1.6428571,2), round(self.comm_metrics[0].separability, 2)) 76 | self.assertEqual(round(1.5, 2), round(self.comm_metrics[1].separability, 2)) 77 | self.assertEqual(round(3.5, 2), round(self.comm_metrics[2].separability, 2)) 78 | 79 | def test_clusteringcoefficient(self): 80 | self.assertEqual(round(0.72503608, 2), round(self.comm_metrics[0].clustering_coefficient, 2)) 81 | self.assertEqual(round(0.66666667, 2), round(self.comm_metrics[1].clustering_coefficient, 2)) 82 | self.assertEqual(round(0.72045177, 2), round(self.comm_metrics[2].clustering_coefficient, 2)) 83 | 84 | 85 | 86 | 87 | 88 | 89 | if __name__ == '__main__' : 90 | unittest.main() 91 | 92 | -------------------------------------------------------------------------------- /circulo/utils/downloader.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import os 3 | import zipfile 4 | import gzip 5 | import sys 6 | import igraph as ig 7 | from collections import defaultdict 8 | 9 | def download_with_notes(url, filename, data_dir, progressbar=True): 10 | """ 11 | Uses urllib to download data from URL. Saves the results in 12 | data_dir/FILENAME. Provides basic logging to stdout. 
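Illustrative example (the URL and paths here are placeholders): download_with_notes("http://example.com/archive.zip", "archive.zip", "/tmp/raw")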
13 | """ 14 | print("Downloading data from " + url + ".....") 15 | try: 16 | if progressbar: 17 | urllib.request.urlretrieve(url, os.path.join(data_dir, filename), progress) 18 | else: 19 | urllib.request.urlretrieve(url, os.path.join(data_dir, filename)) 20 | except Exception as e: 21 | print("Data download failed -- make sure the url is still valid, and that urllib is properly installed.\n\n") 22 | raise(e) 23 | print("Download complete.") 24 | 25 | _unzip(data_dir, filename) 26 | 27 | def _unzip(data_dir, filename): 28 | 29 | zip_path = os.path.join(data_dir, filename) 30 | 31 | if zipfile.is_zipfile(zip_path): 32 | try: 33 | z = zipfile.ZipFile(zip_path) 34 | except zipfile.BadZipFile as e: 35 | print("ZipFile error: {}".format(e)) 36 | sys.exit(0) 37 | print("Extracting from zip...") 38 | z.extractall(path=data_dir) 39 | 40 | else: 41 | unzip_file = os.path.splitext(zip_path)[0] 42 | 43 | with gzip.open(zip_path,'rb') as infile: 44 | try: 45 | file_content = infile.read() 46 | except OSError as e: 47 | print("Neither gzip nor zipfile. No extraction necessary.") 48 | return 49 | 50 | with open(unzip_file, "wb") as f: 51 | print("Extracting from gzip...") 52 | f.write(file_content) 53 | 54 | def progress(blockNum, blockSize, totSize): 55 | """ 56 | Provides an ascii progress bar that is 50 characters wide. 57 | totSize is the total size of the task, blockSize is the size 58 | of each block, and blockNum is the current block being worked on. 59 | 60 | For example: 61 | 62 | for i in range(100): 63 | progress(i + 1, 1, 100) 64 | sleep(1) 65 | 66 | will print a progress bar over 100 seconds. 67 | """ 68 | downloaded = blockNum * blockSize 69 | per = min(100 * downloaded / totSize, 100) 70 | sys.stdout.write("\r%d%%" %per) 71 | for i in range(int(per / 2)): 72 | sys.stdout.write(".") 73 | for i in range(50 - int(per/2)): 74 | sys.stdout.write(" ") 75 | sys.stdout.write("# ") 76 | sys.stdout.flush() 77 | 78 | 79 | def membership_to_clustering_list(membership): 80 | 81 | cluster_dict = {} 82 | 83 | for idx, cluster_id in enumerate(membership): 84 | if cluster_id not in cluster_dict: 85 | cluster_dict[cluster_id] = [] 86 | cluster_dict[cluster_id].append(idx) 87 | 88 | return [v for v in cluster_dict.values()] 89 | 90 | def multigraph_to_weights(G): 91 | """ 92 | Given a multigraph, coalesces all duplicate edges into a single 93 | weighted edge. Removes all other attributes. Assumes all edges 94 | are either weighted or unweighted. 
95 | """ 96 | seen = defaultdict(float) 97 | for e in G.es: 98 | try: 99 | weight = e['weight'] 100 | except KeyError: 101 | weight = 1 102 | seen[e.tuple] += weight 103 | G.delete_edges(None) 104 | es = list(seen.keys()) 105 | weights = list(seen.values()) 106 | G.add_edges(es) 107 | G.es['weight'] = weights 108 | 109 | 110 | -------------------------------------------------------------------------------- /circulo/utils/general.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import describe 2 | from scipy import median 3 | import igraph 4 | import numpy as np 5 | from itertools import combinations 6 | 7 | from circulo.metrics.omega import omega_index 8 | 9 | def aggregate(array, prefix="",axis=0): 10 | 11 | stats = describe(array, axis) 12 | 13 | if len(array) == 1: 14 | variance = -1.0 15 | else: 16 | variance = float(stats[3]) 17 | 18 | 19 | return { 20 | prefix+'Size':int(stats[0]), 21 | prefix+'Min':float(stats[1][0]), 22 | prefix+'Max':float(stats[1][1]), 23 | prefix+'Mean':float(stats[2]), 24 | prefix+'Unbiased Variance':variance, 25 | prefix+'Biased Skewness':float(stats[4]), 26 | prefix+'Biased Kurtosis':float(stats[5]), 27 | prefix+'Median':float(median(array,axis)) 28 | } 29 | 30 | 31 | 32 | def get_largest_component(G, descript="not specified"): 33 | """ 34 | Given a graph, returns the subgraph containing only its largest component". 35 | """ 36 | components = G.components(mode=igraph.WEAK) 37 | if len(components) == 1: 38 | return G 39 | print("[Graph Prep -",descript,"]... Disconnected Graph Detected. Using largest component.") 40 | print("[Graph Prep -",descript,"]... Original graph: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 41 | G = G.subgraph(max(components, key=len)) 42 | print("[Graph Prep -",descript,"]... 
Largest component: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 43 | return G 44 | 45 | 46 | 47 | def run_comparison(memberships, comparator="omega"): 48 | ''' 49 | Given a list of memberships, uses the comparator to compare results 50 | 51 | Args: 52 | membershps: a list of membership arrays 53 | comparator: the algorithm to use at the comparator (default: omega) 54 | 55 | Return: 56 | a symetric matrix containing the results 57 | ''' 58 | 59 | size = len(memberships) 60 | pairs = combinations(range(size), 2) 61 | M = np.zeros((size, size), dtype=float) 62 | np.fill_diagonal(M, 1) 63 | 64 | if comparator == "omega": 65 | comp_func = omega_index 66 | else: 67 | raise NotImplementedError('Unknown comparison function') 68 | 69 | #fill in top right 70 | for i, j in pairs: 71 | score = comp_func(memberships[i], memberships[j]) 72 | M[i,j] = score 73 | M[j,i] = score 74 | 75 | return M 76 | -------------------------------------------------------------------------------- /circulo/utils/snap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import tempfile 4 | import sys 5 | import igraph 6 | from igraph.clustering import VertexCover 7 | from collections import OrderedDict 8 | 9 | from sklearn.feature_extraction import DictVectorizer 10 | import numpy as np 11 | 12 | import circulo 13 | 14 | __author__="""Paul M""" 15 | 16 | __all__ = [] 17 | 18 | 19 | ENV_SNAPPATH_VAR = "SNAPHOME" 20 | 21 | 22 | def read_communities_by_community(f_name, G, delete_file=False): 23 | ''' 24 | Reads a community file in the format where each line represents a community where the line is a list of nodes separated by white space 25 | ''' 26 | 27 | comm_list = list() 28 | 29 | with open(f_name, 'r') as community_file: 30 | 31 | for line in community_file: 32 | if line.startswith('#'): 33 | continue 34 | try: 35 | comm_list.append(map(int, line.split())) 36 | except ValueError as e: 37 | print("Node type is unclear for line: {}".format(line)) 38 | return 39 | 40 | if delete_file: 41 | os.remove(f_name) 42 | 43 | return VertexCover(G, comm_list) 44 | 45 | 46 | def read_communities_by_node(f_name, G): 47 | ''' 48 | Reads a community file where each line is a node and the community to which it belongs 49 | For example 50 | 0 1 51 | 0 4 52 | 0 0 53 | 1 3 54 | 1 4 55 | 2 5 56 | ''' 57 | 58 | #dict with keys as community_id and values are a list of nodes 59 | community_dict = dict() 60 | max_node_id = len(G.vs) 61 | with open(f_name, 'r') as community_file: 62 | for line in community_file: 63 | if line.startswith('#'): 64 | continue 65 | 66 | node_id, community_id = (int(x) for x in line.split()) 67 | if node_id <= max_node_id: 68 | if community_id not in community_dict: 69 | community_dict[community_id] = [] 70 | 71 | community_dict[community_id].append(node_id) 72 | 73 | return VertexCover(G, [v for v in community_dict.values()]) 74 | 75 | 76 | 77 | def divisive(G, algo_id, output): 78 | 79 | snap_home, graph_file = setup(G) 80 | 81 | if graph_file is None: 82 | return 83 | 84 | path_girvan_newman = os.path.join(snap_home, "examples", "community", "community") 85 | 86 | 87 | try: 88 | out = subprocess.Popen([path_girvan_newman, "-i:"+graph_file, "-o:"+output, "-a:"+algo_id]) 89 | except TypeError as e: 90 | print("Error occurred: {}".format(e)) 91 | return 92 | 93 | out.wait() 94 | 95 | os.remove(graph_file) 96 | return read_communities_by_node(output, G) 97 | 98 | 99 | def attribute_setup(G, attrs_of_interest): 100 | """ 101 | Create 
node name and node attribute files. Uses DictVectorizer to encode free form attribute input into set of 102 | binary classes. node_attribute_name_file contains the mapping of binary classes to names 103 | """ 104 | f = tempfile.mkstemp() 105 | node_attribute_name_file = f[1] 106 | 107 | f2 = tempfile.mkstemp() 108 | node_attribute_file = f2[1] 109 | 110 | # Create an array of attributes of interest 111 | attr_array = [] 112 | for node in G.vs: 113 | node_attributes_dict = {} 114 | for attr_name, attr_val in node.attributes().items(): 115 | if attr_name in attrs_of_interest: 116 | node_attributes_dict[attr_name] = attr_val 117 | attr_array.append(node_attributes_dict) 118 | 119 | # TODO: Don't make dense array for sparse input 120 | vec = DictVectorizer(dtype=np.int32) 121 | vectorized_array = vec.fit_transform(attr_array).toarray() 122 | try: 123 | with open(node_attribute_name_file, 'w') as out: 124 | for i, name in enumerate(vec.get_feature_names()): 125 | out.write("{}\t{}\n".format(i, name)) 126 | 127 | with open(node_attribute_file, 'w') as out: 128 | for node_num, bool_feature_array in enumerate(vectorized_array): 129 | for attr_num, val in enumerate(bool_feature_array): 130 | if val != 0: 131 | out.write("{}\t{}\n".format(node_num, attr_num)) 132 | except: 133 | print("Error writing attribute info") 134 | return None 135 | 136 | return (node_attribute_name_file, node_attribute_file) 137 | 138 | 139 | def setup(G, include_header=True): 140 | snap_home = os.path.join(os.path.dirname(circulo.__path__._path[0]), "lib","snap") 141 | 142 | if not os.path.exists(os.path.join(snap_home,"examples","bigclam","bigclam")): 143 | raise Exception("SNAP must be downloaded and built prior to using the snap algorithms") 144 | 145 | f = tempfile.mkstemp() 146 | filename = f[1] 147 | 148 | try: 149 | 150 | #some snap algos can't handle single space edge delimiters, and igraph can't output 151 | #tab delimited edgelist, so we always convert the single spaced output to a tabbed output 152 | with open(filename, 'w') as out: 153 | if include_header: 154 | out.write("# Directed Node Graph\n") 155 | out.write("# Descriptions\n") 156 | out.write("# Nodes: {}\tEdges: {}\n".format(len(G.vs), len(G.es))) 157 | out.write("# SrcNId\tDstNId\n") 158 | for src in G.vs: 159 | for dst in src.neighbors(mode=igraph.ALL): 160 | out.write("{}\t{}\n".format(src.index, dst.index)) 161 | #print(node.neighbors()) 162 | #for u,v in G.get_edgelist(): 163 | # out.write("{}\t{}\n".format(u, v)) 164 | # out.write("{}\t{}\n".format(v, u)) 165 | 166 | except: 167 | print("Error writing edgelist") 168 | return None 169 | 170 | return (snap_home, filename) 171 | -------------------------------------------------------------------------------- /circulo/utils/stochastic_selector.py: -------------------------------------------------------------------------------- 1 | # Now cluster the clusters 2 | from circulo import metrics 3 | from sklearn import metrics as skmetrics 4 | from scipy.spatial.distance import squareform 5 | from scipy.cluster.hierarchy import average,fcluster 6 | import igraph 7 | import numpy as np 8 | import pickle 9 | 10 | 11 | def to_crisp_membership(ovp_membership): 12 | return [ a[0] for a in ovp_membership ] 13 | 14 | 15 | def argmax(array): 16 | return max(zip(array, range(len(array))))[1] 17 | 18 | 19 | def select(covers): 20 | #distance_matrix, y, Z = compute_distance_matrix(covers) 21 | 22 | #pick_representatives(covers, distance_matrix, y, Z) 23 | 24 | #for now just return the first cover. 
TODO: Cluster the covers correctly 25 | return 0 26 | 27 | def pick_representatives(covers, dist_matrix, y, Z): 28 | 29 | mega_clusters = fcluster(Z,.5) 30 | 31 | G = igraph.Graph.Adjacency((dist_matrix < 1).tolist(), 'UNDIRECTED') 32 | 33 | G.vs()['vc'] = covers 34 | 35 | #G.vs()['vc'] = results['vc.orig'] if 'vc.original' in results else results['vc'] 36 | for e in G.es(): 37 | e['weight'] = 1-dist_matrix[e.source, e.target] 38 | 39 | #mega_clusters -= 1 40 | #cluster = igraph.VertexClustering(G, mega_clusters.tolist()) 41 | 42 | #reps = [] 43 | #for s in cluster.subgraphs(): 44 | # rep_id = argmax(s.strength(weights='weight')) 45 | # reps += [ s.vs()[rep_id]['vc'] ] 46 | 47 | 48 | 49 | def compute_distance_matrix(covers): 50 | # Compute stochastic clusters 51 | num_results = len(covers) 52 | distance_matrix= np.zeros((num_results,num_results)) 53 | print('Calculating distance matrix ... ') 54 | for i in range(num_results): 55 | for j in range(i+1,num_results): 56 | #score = metrics.omega_index(results['vc'][i].membership,results['vc'][j].membership) 57 | #score = skmetrics.f1_score(to_crisp_membership(results['vc'][i].membership), 58 | # to_crisp_membership(results['vc'][j].membership)) 59 | score = skmetrics.adjusted_rand_score(to_crisp_membership(covers[i].membership), 60 | to_crisp_membership(covers[j].membership)) 61 | distance_matrix[i,j] = 1-score 62 | distance_matrix[j,i] = 1-score 63 | distance_matrix = np.matrix(distance_matrix) 64 | 65 | y = squareform(distance_matrix) 66 | Z = average(y) 67 | return distance_matrix, y, Z 68 | -------------------------------------------------------------------------------- /circulo/wrappers/community.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import igraph 3 | 4 | import circulo.algorithms 5 | from circulo.algorithms import * 6 | 7 | import statistics 8 | 9 | 10 | from circulo.data.databot import CirculoData 11 | 12 | 13 | def cleanup(G, databot, descript, algo_directed, algo_simple, algo_uses_weights): 14 | ''' 15 | GRAPH Cleaning: Sometimes the graphs need to be cleaned for certain type of algorithms. 16 | The idea here is that we are trying to best match the data to what the algorithm can do. 17 | We start with specific matches and work our way to more general. 18 | ''' 19 | 20 | alterations = [] 21 | 22 | #first we check if algo and data have same directedness and type. 
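#if they already match, the graph is returned unchanged and no alterations are recorded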
23 | if G.is_directed() == algo_directed and G.is_simple() == algo_simple and G.is_weighted() == algo_uses_weights: 24 | weight_attr = "weight" if G.is_weighted() else None 25 | return G, weight_attr, alterations 26 | 27 | if algo_directed and not G.is_directed(): 28 | print("\t[Info - ", descript, "] - Warning: Passing undirected graph to directed algo") 29 | 30 | #make a copy to prevserve original 31 | G_copy = G.copy() 32 | 33 | #add edge weights if not existing 34 | if not G_copy.is_weighted(): 35 | G_copy.es()['weight'] = 1 36 | alterations.append('weighted') 37 | 38 | #if the graph is directed and algo is not directed, we make the graph undirected 39 | if G_copy.is_directed() and not algo_directed: 40 | orig_edge_count = G_copy.ecount() 41 | G_copy.to_undirected(combine_edges={'weight':sum}) 42 | alterations.append('undirected') 43 | edges_removed = orig_edge_count - G_copy.ecount() 44 | print("\t[Info - ", descript, "] Converted directed to undirected: ", edges_removed, " edges collapsed of ", orig_edge_count) 45 | 46 | #if the algo is simple but the data is not, then we have to make the data simple 47 | if algo_simple and not G.is_simple(): 48 | orig_edge_count = G_copy.ecount() 49 | G_copy.simplify(combine_edges={'weight':sum}) 50 | alterations.append('simple') 51 | edges_removed = orig_edge_count - G_copy.ecount() 52 | print("\t[Info - ", descript, "] Simplifying multigraph: ", edges_removed, " edges collapsed of ", orig_edge_count) 53 | 54 | #just quick check to see if the graph is nearly complete. If so we want to warn the user 55 | #since many algos don't do well with nearly complete graphs 56 | if G_copy.is_simple(): 57 | complete_edges = G_copy.vcount()*(G.vcount()-1)/2 58 | 59 | if complete_edges *.8 < G_copy.ecount(): 60 | print("\t[WARNING: ",descript,"] Graph is nearly complete") 61 | 62 | return G_copy, "weight", alterations 63 | 64 | 65 | stochastic_algos = { 66 | "infomap", 67 | "fastgreedy", 68 | "leading_eigenvector", 69 | "multilevel", 70 | "label_propogation", 71 | "walktrap", 72 | "spinglass", 73 | "bigclam", 74 | "clauset_newman_moore" 75 | } 76 | 77 | def comm_infomap(G, databot, descript): 78 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 79 | return alterations, partial(igraph.Graph.community_infomap, G, edge_weights=weights, vertex_weights=None) 80 | 81 | def comm_fastgreedy(G, databot, descript): 82 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 83 | return alterations, partial(igraph.Graph.community_fastgreedy, G, weights=weights) 84 | 85 | def comm_edge_betweenness(G, databot, descript): 86 | #edge betweenness does support undirected and directed, so just say that the algo_directed is the 87 | #same as the data being passed to it 88 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=G.is_directed(), algo_simple=True, algo_uses_weights=True) 89 | return alterations, partial(igraph.Graph.community_edge_betweenness, G, G.is_directed(), weights) 90 | 91 | def comm_leading_eigenvector(G, databot, descript): 92 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 93 | return alterations, partial(igraph.Graph.community_leading_eigenvector, G, weights=weights) 94 | 95 | def comm_multilevel(G, databot, descript): 96 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, 
algo_uses_weights=True) 97 | return alterations, partial(igraph.Graph.community_multilevel, G, weights=weights) 98 | 99 | def comm_label_propagation(G, databot, descript): 100 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 101 | return alterations, partial(igraph.Graph.community_label_propagation, G, weights=weights) 102 | 103 | def comm_walktrap(G, databot, descript): 104 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 105 | return alterations, partial(igraph.Graph.community_walktrap, G, weights=weights) 106 | 107 | def comm_spinglass(G, databot, descript): 108 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 109 | return alterations, partial(igraph.Graph.community_spinglass, G, weights=weights) 110 | 111 | def comm_conga(G, databot, descript): 112 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 113 | return alterations, partial(circulo.algorithms.conga.conga, G) 114 | 115 | def comm_congo(G, databot, descript): 116 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 117 | return alterations, partial(circulo.algorithms.congo.congo, G) 118 | 119 | def comm_radicchi_strong(G, databot, descript): 120 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 121 | return alterations, partial(circulo.algorithms.radicchi.radicchi,G,'strong') 122 | 123 | def comm_radicchi_weak(G, databot, descript): 124 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 125 | return alterations, partial(circulo.algorithms.radicchi.radicchi,G,'weak') 126 | 127 | def comm_clique_percolation(G, databot, descript): 128 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 129 | return alterations, partial(circulo.algorithms.snap_cpm.clique_percolation,G) 130 | 131 | 132 | def comm_bigclam(G, databot, descript): 133 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=True, algo_simple=True, algo_uses_weights=False) 134 | ctx = databot.get_context() 135 | num_comms = -1 # Detect automatically 136 | min_comms = 1 137 | max_comms = len(G.vs) 138 | 139 | return alterations, partial(circulo.algorithms.snap_bigclam.bigclam, G, detect_comm=num_comms, min_comm=min_comms, max_comm=max_comms) 140 | 141 | def comm_cesna(G, databot, descript): 142 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 143 | ctx = databot.get_context() 144 | num_comms = -1 # Detect automatically 145 | 146 | min_comms = 1 147 | max_comms = len(G.vs) 148 | 149 | try: 150 | attrs_to_use = ctx[CirculoData.CONTEXT_ATTRS_TO_USE] 151 | except KeyError: 152 | print("\t[skipping cesna because attributes not provided for ", descript) 153 | return None,None 154 | return alterations, partial(circulo.algorithms.snap_cesna.cesna, G, attrs_to_use, detect_comm=num_comms, min_comm=min_comms, max_comm=max_comms) 155 | 156 | 157 | def comm_coda(G, databot, descript): 158 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 159 | return alterations, 
partial(circulo.algorithms.snap_coda.coda, G) 160 | 161 | def comm_clauset_newman_moore(G, databot, descript): 162 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 163 | return alterations, partial(circulo.algorithms.snap_cnm.clauset_newman_moore, G) 164 | -------------------------------------------------------------------------------- /experiments/README.md: -------------------------------------------------------------------------------- 1 | #Experiments 2 | 3 | 4 | ### Ground Truth Similarity Test 5 | - __PATH__: [metrics_clustering.py](metrics_clustering.py) 6 | - __GOAL__: Determine whether metrics computed on Ground Truth communities look similar to metrics computed on non Ground Truth communities. The experiment leverages k-means clustering where the features are the metrics and each observation is the set of feature values for a given community. The intuition behind this experiment is that if the ground truth communities fall into the same cluster, then there must exist some combination of metrics that represents the ideal community for this 7 | particular dataset. 8 | - __RUN__: `python experiments/metrics_clustering.py metrics_dir dataset_name` 9 | - __RESULTS__: The results show the _Groundtruth similarity_ (the largest percentage of ground truth communities in the same cluster) and the frequency of ground truth communities in each cluster. For example, you might see the following: 10 | Groundtruth similarity: 0.5833333333333334 11 | Frequency of groundtruth communities as part of centroids [[0 7][1 5]] 12 |
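The core idea can be sketched as follows (an illustrative sketch only, not the actual script; it assumes scikit-learn is installed, that `X` is a numpy matrix with one row of metric values per community, and that `is_ground_truth` is a boolean list marking the rows that came from the ground truth cover):

```python
import numpy as np
from sklearn.cluster import KMeans

# Cluster the communities in metric space.
labels = KMeans(n_clusters=2).fit_predict(X)

# How concentrated are the ground truth communities in a single cluster?
gt_labels = labels[np.array(is_ground_truth)]
counts = np.bincount(gt_labels)
print("Groundtruth similarity:", counts.max() / len(gt_labels))
```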
13 | ### Label Communities 14 | - __PATH__: [community_label.py](community_label.py) 15 | - __GOAL__: Attempt to label communities by using node and edge attributes. For each community, look at attribute values that are common in that community. 16 | - __RUN__: `python community_label.py ` 17 | - __RESULTS__: For each community, the most common label for each attribute is displayed [Note: the most common attribute value is only displayed if it is assigned to more than half of the nodes in the community] 18 | 19 | ![Community Labels](images/community_label_results.png) 20 | 21 | ### Time vs Accuracy (two approaches) 22 | #####Approach 1 23 | - __PATH__: [metricsCharts.R](metricsCharts.R) (Function: plotRunOmega) 24 | - __GOAL__: For a given data set with Ground Truth, measure how result accuracy changes across algorithms, specifically taking into consideration execution time (time complexity). The idea is to test, for example, whether time is correlated with accuracy. 25 | - __RUN__: 26 | - ` metrics <- getMetrics("/path/to/json/metrics", "dataset name (i.e. football)")` 27 | - `plotRunOmega(metrics)` 28 | - __RESULTS__: This example shows the log of the calculation time vs. omega score for all the datasets from https://github.com/Lab41/Circulo-Data/releases/tag/2 29 | 30 | ![Log(time) vs. Omega Score](images/time_vs_omega.png) 31 | #####Approach 2 32 | - __PATH__: [metricsCharts.R](metricsCharts.R) (Function: plotMetrics) 33 | - __GOAL__: Compare result computation time and accuracy across datasets 34 | - __RUN__: 35 | - `metrics <- getMetrics("/path/to/json/metrics", "dataset name (i.e. football)")` 36 | - `plotMetrics(metrics)` 37 | - __RESULTS__: This example shows datasets vs. algorithms. The size of each bubble represents the Omega score and the color represents how long it took to compute that result 38 | 39 | ![Dataset vs. Algorithm](images/bubble_plot.png) 40 | 41 | 42 | ### Similar Algorithms 43 | - __PATH__: [cluster_omega_comparison.py](cluster_omega_comparison.py) 44 | - __GOAL__: Determine which algorithms produce similar results by comparing how similar their respective partitions are to each other. 45 | - __RUN__: `python cluster_omega_comparison.py ` 46 | - __RESULTS__: Counts of how often two algorithms produce similar results. For example: 47 | 48 | ![Counts](images/counts.png) 49 | 50 | 51 | ### Histogram metrics across datasets 52 | - __PATH__: [histogram_metrics.py](histogram_metrics.py) 53 | - __GOAL__: This script allows you to compare metric results across algorithms for a single dataset. It creates a histogram for each metric/algorithm pair showing the number of communities that fall into each bin for that metric. 54 | - __RUN__: `python histogram_metrics.py [Optional: --metrics Density,Cohesiveness]` 55 | - __RESULTS__: This example shows the distributions of five metrics across algorithms for the football data 56 | 57 | ![Histogram of Football Data ](images/football_histogram.png) 58 | 59 | ### Goodness Metrics 60 | - __PATH__: [goodness_indicators.py](goodness_indicators.py) 61 | - __GOAL__: This experiment is based on _Jaewon Yang and Jure Leskovec, http://cs.stanford.edu/people/jure/pubs/comscore-icdm12.pdf, Defining and Evaluating Network Communities based on Ground-truth_. It determines which community metrics are most correlated. 62 | - __RUN__: `python goodness_indicators.py metrics_dir` 63 | - __RESULTS__: An example result for the football ground truth dataset is shown below: 64 | 65 | ![Correlated Metrics](images/football--groundtruth--0.png) 66 | 67 | 68 | #### Plot Community Detection 69 | - __PATH__: [gephi_plot](gephi_plot) 70 | - __GOAL__: The graphml file created by create_graphml.py makes it easy to view the dataset in Gephi and explore the graph. This Java program creates static PDFs of the results from the various algorithms, using Gephi as a layout and plotting engine to visualize the results. 71 | - __COMPILE__: Use Maven to compile the project. `mvn compile assembly:single` will give you a jar that contains all the dependencies needed to run the executable 72 | - __RUN__: 73 | - `python create_graphml.py [--least]` 74 | - `java -jar gephi_plot-0.0.1-SNAPSHOT-jar-with-dependencies.jar ` 75 | - __RESULTS__: A set of PDFs is produced, using the community detection results to color a visualization of the underlying graph laid out with a force-directed layout (Gephi's Force Atlas 2). This plot is the flights data colored using the Infomap community detection results: 76 | 77 | ![Flights data colored using Infomap results](images/flights_algo_infomap.png) 78 | 79 | 80 | -------------------------------------------------------------------------------- /experiments/cluster_omega_comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import numpy as np 18 | from sklearn.cluster import spectral_clustering 19 | import argparse 20 | from math import floor, sqrt 21 | from operator import itemgetter 22 | from itertools import combinations 23 | import os 24 | import glob 25 | import json 26 | import operator 27 | 28 | from circulo.utils.general import run_comparison 29 | 30 | THRESHOLD = .7 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser(description= 'Use Relative Omega Scores to determine similarity of algorithms') 34 | parser.add_argument('results_path', type=str, help='directory containing algorithm results') 35 | args = parser.parse_args() 36 | 37 | if not os.path.exists(args.results_path): 38 | print("Path \"{}\" does not exist".format(args.results_path)) 39 | return 40 | 41 | dataset_groups = {} 42 | algos = set() 43 | 44 | #sets the list of json files to a Key (dataset name) 45 | #Allows us to quickly iterate over all result files for each dataset 46 | #At the same time, we collect the list of algos from the results 47 | for fname in glob.glob(os.path.join(args.results_path, '*.json')): 48 | dataset = os.path.basename(fname).split('--')[0] 49 | algos.add(os.path.basename(fname).split('--')[1]) 50 | if dataset in dataset_groups: 51 | dataset_groups[dataset].append(fname) 52 | else: 53 | dataset_groups[dataset] = [fname] 54 | 55 | #create count dict for all possible pairs of algos (includes groundtruth) 56 | counts=dict.fromkeys(combinations(sorted([a for a in algos]),2),0) 57 | 58 | #now iterate over each dataset name (there json files) and update the 59 | #counts accordingly 60 | for dataset_name, json_files in dataset_groups.items(): 61 | memberships = [] 62 | algo_names = [] 63 | for fjson in json_files: 64 | 65 | algo_names.append(os.path.basename(fjson).split("--")[1]) 66 | 67 | with open(fjson) as f: 68 | memberships.append(json.load(f)['membership']) 69 | 70 | coords = np.argwhere(run_comparison(memberships) > THRESHOLD) 71 | 72 | for v in coords: 73 | x,y = v.flatten() 74 | if x != y and algo_names[x] < algo_names[y]: 75 | counts[(algo_names[x], algo_names[y])]+=1 76 | 77 | sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True) 78 | 79 | print("Total Datasets: ", len(dataset_groups)) 80 | 81 | for s in sorted_counts: 82 | print(s) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /experiments/gephi_plot/create_graphml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | import multiprocessing 17 | import time 18 | import signal 19 | import os 20 | import errno 21 | import traceback 22 | from collections import namedtuple 23 | import glob 24 | import sys 25 | import igraph 26 | import matplotlib.pyplot as plt 27 | from circulo.wrappers import community 28 | from circulo.metrics import omega 29 | import argparse 30 | import os 31 | import json 32 | import datetime 33 | import multiprocessing 34 | 35 | Worker = namedtuple('Worker', 'json_path raw_graph_path output_path pick_least_frequent pick_most_frequent timeout') 36 | 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser(description='Compute metrics for given cover.') 40 | parser.add_argument('input_path', type=str, help='file or directory containing results') 41 | parser.add_argument('raw_graph_path', type=str, help='File or directory graphml files [typically circulo/data/GRAPHS/]') 42 | parser.add_argument('output_path', type=str, help='output directory to write metric files') 43 | parser.add_argument('--least', action="store_true", help='If you add this flag only keep least frequent community for a given node is kept (useful for plotting)') 44 | parser.add_argument('--most', action="store_true", help='If you add this flag only keep most frequent community for a given node is kept (useful for plotting)') 45 | parser.add_argument('--workers', type=int, default=multiprocessing.cpu_count(), help='Number of workers to process (DEFAULT: number of processors)') 46 | parser.add_argument('--timeout', type=int, default=3600, help="timeout for a work item in seconds (DEFAULT: 3600)") 47 | args = parser.parse_args() 48 | 49 | if args.least and args.most: 50 | print('Cannot select both least and most common community') 51 | return 52 | 53 | if not os.path.exists(args.input_path): 54 | print("Path \"{}\" does not exist".format(args.input_path)) 55 | return 56 | 57 | if not os.path.exists(args.output_path): 58 | os.makedirs(args.output_path) 59 | 60 | workers = [] 61 | json_groups = {} 62 | json_files = glob.glob(os.path.join(args.input_path, '*.json')) 63 | for json_file in json_files: 64 | dataset = os.path.basename(json_file).split('--')[0] 65 | if dataset in json_groups: 66 | json_groups[dataset].append(json_file) 67 | else: 68 | json_groups[dataset] = [json_file] 69 | 70 | raw_graph_files = glob.glob(os.path.join(args.raw_graph_path, '*.graphml')) 71 | for (dataset, json_files) in json_groups.items(): 72 | raw_graph_file_path = None 73 | for raw_graph_file in raw_graph_files: 74 | if os.path.basename(raw_graph_file).startswith(dataset): 75 | raw_graph_file_path = raw_graph_file 76 | workers.append(Worker(json_files, raw_graph_file_path, args.output_path, args.least, args.most, args.timeout)) 77 | 78 | if args.workers is not None: 79 | pool = multiprocessing.Pool(processes = args.workers) 80 | else: 81 | pool = multiprocessing.Pool() 82 | 83 | r = pool.map_async(analyze_json, workers) 84 | r.get() #must call in order to get error from inside the child processes 85 | pool.close() 86 | pool.join() 87 | 88 | 89 | class TimeoutError(Exception): 90 | pass 91 | 92 | 93 | def __handle_timeout(signum, frame): 94 | raise TimeoutError(os.strerror(errno.ETIME)) 95 | 96 | 97 | def __get_least_frequent_community(community_array, community_counts, reverse): 98 | counts = [] 99 | for community in community_array: 100 | counts.append((community_counts[community], community)) 101 | 102 | counts.sort() 103 | if reverse: 104 | counts.reverse() 105 | 106 | for i, (count,community) in enumerate(counts): 107 | if count != 
1 or i == len(counts)-1: 108 | return community 109 | 110 | 111 | def analyze_json(worker): 112 | """ 113 | Take in a set of json community detection results files and a graphml file representing the raw graph and output a 114 | graphml file that contains, as attributes, the results of the algorithms 115 | 116 | Args: 117 | worker: Named tuple of json_path raw_graph_path output_path timeout 118 | """ 119 | signal.signal(signal.SIGALRM, __handle_timeout) 120 | signal.setitimer(signal.ITIMER_REAL, worker.timeout) 121 | 122 | print('Loading raw Graphml file truth file: %s'%worker.raw_graph_path) 123 | if worker.raw_graph_path is not None: 124 | G = igraph.load(worker.raw_graph_path) 125 | else: 126 | print("ERROR: Not able to load graph") 127 | return 128 | 129 | try: 130 | for json_path in worker.json_path: 131 | with open(json_path) as f: 132 | data = json.load(f) 133 | (name, algorithm) = data['job_name'].split('--')[:2] 134 | 135 | algo_name = 'algo_%s'%algorithm 136 | 137 | # Only if we are pulling least frequent 138 | if worker.pick_least_frequent or worker.pick_most_frequent: 139 | # Calculate number of nodes in each community 140 | community_counts = {} 141 | for node in data['membership']: 142 | for community in node: 143 | if community in community_counts: 144 | community_counts[community] += 1 145 | else: 146 | community_counts[community] = 1 147 | 148 | # Add property to graph 149 | for node in G.vs(): 150 | # Get cover Array 151 | # TODO: Fix this hacky way to turn node id (i.e. "n1") into node index (i.e. 1) 152 | try: 153 | community_array = data['membership'][int(node['id'][1:])] 154 | except IndexError: 155 | community_array= [] 156 | 157 | if worker.pick_least_frequent: 158 | least_frequent_community = __get_least_frequent_community(community_array, community_counts, reverse=False) 159 | if least_frequent_community is None: 160 | least_frequent_community = -1 161 | G.vs[node.index][algo_name] = str(least_frequent_community) 162 | elif worker.pick_most_frequent: 163 | least_frequent_community = __get_least_frequent_community(community_array, community_counts, reverse=True) 164 | if least_frequent_community is None: 165 | least_frequent_community = -1 166 | G.vs[node.index][algo_name] = str(least_frequent_community) 167 | else: 168 | G.vs[node.index][algo_name] = ','.join([str(x) for x in community_array]) 169 | 170 | except TimeoutError as t: 171 | print("\t+Timeout ERROR: was analyzing: ", data['job_name']) 172 | signal.alarm(0) 173 | return 174 | except Exception as e: 175 | print(e) 176 | traceback.print_exc(file=sys.stdout) 177 | return 178 | 179 | graphml_file_output = os.path.join(worker.output_path, "%s.graphml"% name) 180 | print("Writing Graph: %s"%graphml_file_output ) 181 | igraph.write(G, graphml_file_output) 182 | 183 | 184 | if __name__ == "__main__": 185 | main() -------------------------------------------------------------------------------- /experiments/gephi_plot/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.lab41.circulo 6 | gephi_plot 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | gephi_plot 11 | http://maven.apache.org 12 | 13 | 14 | 15 | 16 | gephi-snapshots 17 | Gephi Snapshots 18 | http://nexus.gephi.org/nexus/content/repositories/snapshots/ 19 | 20 | 21 | gephi-releases 22 | Gephi Releases 23 | http://nexus.gephi.org/nexus/content/repositories/releases/ 24 | 25 | 26 | 27 | UTF-8 28 | 29 | 30 | 31 | org.gephi 32 | gephi-toolkit 33 | 0.8.2 34 | 35 | 36 | junit 37 | junit 38 | 3.8.1 39 | test 40 
| 41 | 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-compiler-plugin 47 | 3.2 48 | 49 | 1.7 50 | 1.7 51 | 52 | 53 | 54 | maven-assembly-plugin 55 | 56 | 57 | 58 | com.lab41.circulo.gephi_plot.PlotGraphs 59 | 60 | 61 | 62 | jar-with-dependencies 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /experiments/gephi_plot/src/main/java/com/lab41/circulo/gephi_plot/PlotGraphs.java: -------------------------------------------------------------------------------- 1 | package com.lab41.circulo.gephi_plot; 2 | /* 3 | * Based on Gephi Headless Example by Mathieu Bastian (GPL v3) 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.nio.file.Paths; 9 | import java.util.ArrayList; 10 | 11 | import org.gephi.data.attributes.api.AttributeColumn; 12 | import org.gephi.data.attributes.api.AttributeController; 13 | import org.gephi.data.attributes.api.AttributeModel; 14 | import org.gephi.graph.api.DirectedGraph; 15 | import org.gephi.graph.api.GraphController; 16 | import org.gephi.graph.api.GraphModel; 17 | import org.gephi.io.exporter.api.ExportController; 18 | import org.gephi.io.importer.api.Container; 19 | import org.gephi.io.importer.api.ImportController; 20 | import org.gephi.io.processor.plugin.DefaultProcessor; 21 | import org.gephi.layout.plugin.forceAtlas2.ForceAtlas2; 22 | import org.gephi.layout.plugin.forceAtlas2.ForceAtlas2Builder; 23 | import org.gephi.partition.api.NodePartition; 24 | import org.gephi.partition.api.PartitionController; 25 | import org.gephi.partition.plugin.NodeColorTransformer; 26 | import org.gephi.preview.api.PreviewController; 27 | import org.gephi.preview.api.PreviewModel; 28 | import org.gephi.preview.api.PreviewProperty; 29 | import org.gephi.project.api.ProjectController; 30 | import org.gephi.project.api.Workspace; 31 | import org.openide.util.Lookup; 32 | 33 | public class PlotGraphs { 34 | public static void main(String[] args){ 35 | if(args.length != 2){ 36 | System.err.println("Usage java -jar gephi_plot.jar "); 37 | System.exit(65); 38 | } 39 | 40 | ArrayList filesToProcess = new ArrayList(); 41 | File inputPath = new File(args[0]); 42 | if (inputPath.exists()){ 43 | // If input is a single file add that and continue 44 | if (inputPath.isFile()){ 45 | filesToProcess.add(inputPath.getPath()); 46 | // For each input file process output 47 | }else{ 48 | for (String filePath: inputPath.list()){ 49 | if (filePath.endsWith(".graphml") == true){ 50 | String fullFilePath = Paths.get(inputPath.getPath(), filePath).toString(); 51 | filesToProcess.add(fullFilePath); 52 | } 53 | } 54 | } 55 | }else{ 56 | System.err.println("Input path does not exist: " + args[0]); 57 | System.exit(65); 58 | } 59 | 60 | for (String fileToProcess: filesToProcess){ 61 | PlotGraphs hs = new PlotGraphs(); 62 | hs.script(fileToProcess, args[1]); 63 | } 64 | } 65 | 66 | 67 | public void script(String graphPath, String outputPath) { 68 | // Extract dataset name 69 | String graphFileName = new File(graphPath).getName(); 70 | String datasetName = graphFileName.substring(0, graphFileName.indexOf(".graphml")); 71 | 72 | // Initialize a Gephi project and workspace 73 | ProjectController pc = Lookup.getDefault().lookup(ProjectController.class); 74 | pc.newProject(); 75 | Workspace workspace = pc.getCurrentWorkspace(); 76 | 77 | // Get models and controllers for this new workspace - will be useful later 78 | AttributeModel attributeModel = 
Lookup.getDefault().lookup(AttributeController.class).getModel(); 79 | GraphModel graphModel = Lookup.getDefault().lookup(GraphController.class).getModel(); 80 | PreviewModel model = Lookup.getDefault().lookup(PreviewController.class).getModel(); 81 | ImportController importController = Lookup.getDefault().lookup(ImportController.class); 82 | PartitionController partitionController = Lookup.getDefault().lookup(PartitionController.class); 83 | 84 | // Import file 85 | Container container; 86 | try { 87 | File file = new File(graphPath); 88 | container = importController.importFile(file); 89 | } catch (Exception ex) { 90 | ex.printStackTrace(); 91 | return; 92 | } 93 | 94 | // Append imported data to GraphAPI 95 | importController.process(container, new DefaultProcessor(), workspace); 96 | 97 | // See if graph is well imported 98 | DirectedGraph graph = graphModel.getDirectedGraph(); 99 | 100 | // Do ForceAtlas2 based layout 101 | ForceAtlas2Builder fa2b = new ForceAtlas2Builder(); 102 | ForceAtlas2 fa2Layout = fa2b.buildLayout(); 103 | fa2Layout.setGraphModel(graphModel); 104 | fa2Layout.setThreadsCount(Runtime.getRuntime().availableProcessors()); 105 | fa2Layout.initAlgo(); 106 | int i_max = 1000; // TODO: Look into setting this more intelligently (some sort of convergence metric) 107 | long startTime = System.currentTimeMillis(); 108 | long currentTime = System.currentTimeMillis(); 109 | for (int i = 0; i < i_max && fa2Layout.canAlgo() && currentTime - startTime < 1000*60*10 ; i++) { 110 | // Want to take faster steps at first but then be more careful 111 | if (i < i_max/4.0){ 112 | fa2Layout.setJitterTolerance(1.0); 113 | }else{ 114 | fa2Layout.setJitterTolerance(0.1); 115 | } 116 | fa2Layout.goAlgo(); 117 | currentTime = System.currentTimeMillis(); 118 | } 119 | fa2Layout.endAlgo(); 120 | 121 | // Figure out which algorithms are in the results set 122 | ArrayList algoResultsPresent = new ArrayList(); 123 | for(AttributeColumn ac: attributeModel.getNodeTable().getColumns()){ 124 | String title = ac.getTitle(); 125 | if (title.startsWith("algo")){ 126 | algoResultsPresent.add(ac.getTitle()); 127 | } 128 | } 129 | 130 | // For each algorithm, create an output of the results 131 | for (String algoResult: algoResultsPresent){ 132 | System.out.println("Printing: " + algoResult); 133 | NodePartition p = (NodePartition) partitionController.buildPartition(attributeModel.getNodeTable().getColumn(algoResult), graph); 134 | NodeColorTransformer nodeColorTransformer = new NodeColorTransformer(); 135 | nodeColorTransformer.randomizeColors(p); 136 | partitionController.transform(p, nodeColorTransformer); 137 | 138 | // Don't show node labels, make edges straight lines 139 | model.getProperties().putValue(PreviewProperty.SHOW_NODE_LABELS, Boolean.FALSE); 140 | model.getProperties().putValue(PreviewProperty.EDGE_CURVED, Boolean.FALSE); 141 | 142 | // Export to PDF file 143 | ExportController ec = Lookup.getDefault().lookup(ExportController.class); 144 | try { 145 | String outputFileName = datasetName + "_"+algoResult+".pdf"; 146 | ec.exportFile(Paths.get(outputPath, outputFileName).toFile()); 147 | } catch (IOException ex) { 148 | ex.printStackTrace(); 149 | return; 150 | } 151 | } 152 | 153 | } 154 | } -------------------------------------------------------------------------------- /experiments/histogram_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | import argparse 17 | import os 18 | import glob 19 | import json 20 | import sys 21 | import numpy as np 22 | import matplotlib.pyplot as plt 23 | 24 | 25 | def analyze_metrics(dataset, output_dir, file_names, metrics_to_evaluate): 26 | """ 27 | Creates histograms of specific metrics across algorithms 28 | 29 | Args: 30 | dataset (string): dataset being processed [used for naming output file] 31 | output_dir (string): output path 32 | file_names (list of strings): Input metrics json files 33 | metrics_to_evaluate (list of strings): Metrics to be histogramed 34 | Return: 35 | None 36 | """ 37 | num_files = len(file_names) 38 | # Load metrics into memory 39 | metrics = [] 40 | for json_path in file_names: 41 | with open(json_path) as f: 42 | metrics.append(json.load(f)) 43 | 44 | # Get min/max for each metric across all datasets 45 | metric_min_max = {} 46 | for column, metric_to_evaluate in enumerate(metrics_to_evaluate): 47 | mins = [] 48 | maxes = [] 49 | for i, data in enumerate(metrics): 50 | mins.append(min(data['metrics'][metric_to_evaluate]['results'])) 51 | maxes.append(max(data['metrics'][metric_to_evaluate]['results'])) 52 | 53 | metric_min_max[metric_to_evaluate] = (min(mins), max(maxes)) 54 | 55 | # Create Plots 56 | plt.clf() 57 | for column, metric_to_evaluate in enumerate(metrics_to_evaluate): 58 | for i, data in enumerate(metrics): 59 | (dataset, algorithm, number) = data['name'].split('--') 60 | print('Processing: ', dataset, algorithm) 61 | 62 | # Create subplot 63 | ax = plt.subplot(num_files, len(metrics_to_evaluate), i*(len(metrics_to_evaluate)) + 1 + column) 64 | plt.hist(data['metrics'][metric_to_evaluate]['results'], bins=20, range=metric_min_max[metric_to_evaluate]) 65 | plt.yticks(ax.get_ylim(), fontsize=8) 66 | 67 | # Set algorithm name on left hand side 68 | if column == 0: 69 | plt.ylabel(algorithm, rotation='horizontal', fontsize=8) 70 | 71 | # Set metric name on top of coluns 72 | if i == 0: 73 | print('Printing Title: ', metric_to_evaluate) 74 | plt.title(metric_to_evaluate, fontsize=8) 75 | 76 | # Only print x axis ticks at bottom of the columns 77 | if i != len(metrics)-1: 78 | plt.xticks(fontsize=0) 79 | else: 80 | plt.xticks(rotation='vertical', fontsize=8) 81 | 82 | plt.savefig(os.path.join(output_dir, '%s.png'%dataset)) 83 | 84 | def main(): 85 | 86 | parser = argparse.ArgumentParser(description= 87 | 'Create side by side histograms for various metrics across algorithms for a given dataset') 88 | parser.add_argument('input_path', type=str, help='file or directory containing metric json files') 89 | parser.add_argument('dataset', type=str, help='Dataset desired (i.e. 
football)') 90 | parser.add_argument('--metrics', type=str, 91 | default=','.join(['Separability', 'Cohesiveness', 'Density', 'Triangle Participation Ratio', 'Conductance']), 92 | help='Metrics to Compare (comma separated)') 93 | parser.add_argument('--output', type=str, default=os.getcwd(), help='Base output directory') 94 | args = parser.parse_args() 95 | 96 | if not os.path.exists(args.input_path): 97 | print("Path \"{}\" does not exist".format(args.input_path)) 98 | return 99 | 100 | 101 | if os.path.isdir(args.input_path): 102 | file_names = glob.glob(os.path.join(args.input_path, '*%s*.json'%args.dataset)) 103 | analyze_metrics(args.dataset, args.output, file_names, args.metrics.split(',')) 104 | else: 105 | analyze_metrics(args.dataset, args.output, [args.input_path], args.metrics.split(',')) 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /experiments/images/bubble_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/bubble_plot.png -------------------------------------------------------------------------------- /experiments/images/community_label_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/community_label_results.png -------------------------------------------------------------------------------- /experiments/images/counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/counts.png -------------------------------------------------------------------------------- /experiments/images/flights_algo_infomap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/flights_algo_infomap.png -------------------------------------------------------------------------------- /experiments/images/football--groundtruth--0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/football--groundtruth--0.png -------------------------------------------------------------------------------- /experiments/images/football_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/football_histogram.png -------------------------------------------------------------------------------- /experiments/images/time_vs_omega.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/time_vs_omega.png -------------------------------------------------------------------------------- /experiments/metricsCharts.R: -------------------------------------------------------------------------------- 1 | #For Lab 41 Circulo metrics json files 2 | #Patrick Wheatley NGA + others :) 3 | #29Aug2014. 
Modified 22 Sep 2014 4 | #reads json files from directory and plots pdf bubble chart of computation time and omega accuracy 5 | 6 | # Sample Usage: 7 | # metrics <- getMetrics(datapath, "dataset name") 8 | # plotMetrics(metrics) 9 | # plotHist(metrics,'omega') 10 | # plotHist(metrics,'time') 11 | # plotRunOmega(metrics) 12 | 13 | library(ggplot2) 14 | #Switched from jsonlite to RJSIONIO for speed reasons 15 | library(RJSONIO) 16 | 17 | # Read metrics from json 18 | getMetrics <- function(datapath='/Users/paulm/Desktop/metrics', dataset="football") { 19 | 20 | #Get file names and load the json files 21 | filenames <- list.files(datapath, pattern=paste(".*",dataset,".*.json", sep=""), full.names=TRUE) 22 | N <- length(filenames) 23 | results <- lapply(filenames, fromJSON) 24 | 25 | #parse filenames to get algorithm names and dataset names 26 | names <- basename(filenames) 27 | names2 <- sapply(1:N, function(x) strsplit(names, "\\.")[[x]][1]) 28 | Datasets <- sapply(1:N, function(x) strsplit(names2, "--")[[x]][1]) 29 | Algorithms <-sapply(1:N, function(x) strsplit(names2, "--")[[x]][2]) 30 | 31 | #Pull computation time and omega from the json files 32 | ComputationTime <- sapply(1:N, function (x) results[[x]]$elapsed) 33 | OmegaAccuracy <-sapply(1:N, function (x) results[[x]]$omega) 34 | 35 | #fussy R data type formatting 36 | metrics <- cbind(Algorithms,Datasets,ComputationTime,OmegaAccuracy) 37 | ind <- which(metrics[,"OmegaAccuracy"] != "NULL") 38 | metrics <-data.frame(metrics[ind,],stringsAsFactors=FALSE) 39 | metrics <- data.frame(lapply(metrics, unlist),stringsAsFactors=FALSE) 40 | metrics$ComputationTime <- as.numeric(metrics$ComputationTime) 41 | # Normalize computation time by dataset 42 | metrics$ComputationTime <- ave(metrics$ComputationTime, list(metrics$Datasets), FUN=function(L) L/min(L)) 43 | metrics$OmegaAccuracy <- as.numeric(metrics$OmegaAccuracy) 44 | 45 | 46 | return(metrics) 47 | } 48 | 49 | # Plot Metrics 50 | plotMetrics <- function(metrics,toPDF=FALSE) { 51 | # Group metrics by Dataset and Algorithm, then summarize 52 | data <- aggregate(metrics[,c('ComputationTime','OmegaAccuracy')],list(metrics$Datasets,metrics$Algorithms),mean) 53 | colnames(data)[1:2] <- c("Datasets","Algorithms") 54 | 55 | keep <- which(data$Algorithms != 'groundtruth') 56 | data <- data[keep,] 57 | 58 | bubbleplot <- ggplot(data, aes(x=Datasets, y=Algorithms))+ 59 | geom_point(aes(size=ComputationTime, colour=OmegaAccuracy), alpha=0.75)+ 60 | scale_size_continuous(range =c(8, 25), trans='log')+ 61 | scale_colour_gradient2(midpoint=0.4, low="red",mid="yellow", high="dark green")+ 62 | theme_bw() + 63 | theme(text = element_text(size=20))+ 64 | ggtitle('Accuracy and Computation Time across Datasets and Algorithms') 65 | 66 | if (toPDF) { 67 | pdffile <- paste(Sys.time(),"metricsGraph.pdf", sep='') 68 | pdf(pdffile,height=10,width=12) 69 | print(bubbleplot) 70 | dev.off() 71 | cat(sprintf('printed to %s \n', pdffile)) 72 | } else { 73 | print(bubbleplot) 74 | } 75 | } 76 | 77 | # Plot chart comparing runtime to accuracy 78 | plotRunOmega <- function(metrics, toPDF=FALSE) { 79 | runtimeplot <- ggplot(metrics, aes(x=log10(ComputationTime), y=OmegaAccuracy)) + 80 | geom_point(size=0) + 81 | theme_bw()+ 82 | geom_text(aes(x=log10(ComputationTime), y=OmegaAccuracy, label=Algorithms, color=Datasets), size=4, angle=45) 83 | 84 | 85 | if (toPDF) { 86 | pdffile <- paste(Sys.time(),"runtimeVsOmegaAccuracy.png", sep='') 87 | png(pdffile, height=10, width=12, units='in', res=300) 88 | print(runtimeplot) 89 | 
dev.off() 90 | cat(sprintf('printed to %s \n', pdffile)) 91 | } else { 92 | print(runtimeplot) 93 | } 94 | } 95 | 96 | # Plots histogram of specified metric (omega or computation time right now) 97 | plotHist <- function(metrics,col='ComputationTime',toPDF=FALSE) { 98 | 99 | data <- metrics[c('Algorithms','Datasets',col)] 100 | colnames(data)[3] <- "value" 101 | 102 | p <- ggplot(data,aes(x=value,fill=Algorithms)) + 103 | facet_grid(. ~ Datasets) + 104 | geom_density(alpha=0.5) + 105 | #geom_histogram(alpha=0.5,position='identity') 106 | xlab(col) + 107 | ylab('Probability Density') + 108 | theme_bw() + 109 | ggtitle(Sys.time()) 110 | 111 | 112 | if (toPDF) { 113 | pdffile <- paste(Sys.time(),"metricsGraph.pdf", sep='') 114 | pdf(pdffile,height=10,width=12) 115 | print(p) 116 | dev.off() 117 | cat(sprintf('printed to %s \n', pdffile)) 118 | } else { 119 | print(p) 120 | } 121 | } 122 | 123 | -------------------------------------------------------------------------------- /experiments/metrics_clustering.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | from collections import Counter 5 | import numpy as np 6 | import argparse 7 | 8 | from scipy.stats import itemfreq 9 | from scipy.cluster.vq import kmeans2, whiten 10 | 11 | 12 | 13 | metric_list = [ 14 | "Conductance", 15 | "Cut Ratio", 16 | "Degree StatisticsBiased Kurtosis", 17 | "Density", 18 | "Expansion", 19 | "Cohesiveness", 20 | "Flake Out Degree Fraction", 21 | ] 22 | 23 | 24 | NUM_DIMENSIONS = len(metric_list) 25 | 26 | 27 | def run_experiment(metrics_path, dataset_name): 28 | 29 | num_comms = 0 30 | 31 | files_analyzed = 0 32 | #go through quickly to determine how many communities you have 33 | for f in glob.glob(metrics_path+"/"+dataset_name+"--*--*.json"): 34 | json_f = open(f) 35 | j = json.load(json_f) 36 | json_f.close() 37 | num_comms+=len(j['metrics']['Density']['results']) 38 | files_analyzed+=1 39 | 40 | if(files_analyzed == 0): 41 | print("No files to analyze") 42 | return 43 | 44 | print("Files Analyzed: ", files_analyzed) 45 | 46 | 47 | print("Running kmeans on ", num_comms, " communities") 48 | 49 | matrix = np.zeros((num_comms, NUM_DIMENSIONS)) 50 | comm_count = 0 51 | gt_start = -1 52 | gt_end = -1 53 | 54 | 55 | for i, f in enumerate(glob.glob(metrics_path+"/"+dataset_name+"--*--*.json")): 56 | 57 | print(f) 58 | json_f = open(f) 59 | j = json.load(json_f) 60 | json_f.close() 61 | metrics = j['metrics'] 62 | 63 | #get the number of comms for this file 64 | add_comms = len(metrics['Density']['results']) + comm_count 65 | 66 | if f == metrics_path+"/"+dataset_name+"--groundtruth--0.json": 67 | gt_start = comm_count 68 | gt_end = add_comms 69 | 70 | dim_idx=0 71 | for metric_name in metric_list: 72 | 73 | results = metrics[metric_name]['results'] 74 | 75 | try: 76 | matrix[comm_count:add_comms,dim_idx] = results 77 | except Exception as e: 78 | print(result_dict['results']) 79 | print("Error: ",e) 80 | 81 | dim_idx+=1 82 | if dim_idx == NUM_DIMENSIONS: 83 | break 84 | 85 | comm_count=add_comms 86 | 87 | matrix_norm = whiten(matrix) 88 | centroid, label = kmeans2(matrix_norm, k=3) 89 | 90 | freq = itemfreq(label[gt_start:gt_end]) 91 | 92 | m = max(freq, key=lambda y: y[1]) 93 | 94 | ratio = float(m[1])/(gt_end-gt_start) 95 | 96 | print("Groundtruth similarity: ", ratio) 97 | 98 | print("Frequency of groundtruth communities as part of centroids") 99 | print(freq) 100 | 101 | i = gt_start 102 | 103 | print("GroundTruth Centroids range: 
", gt_start, "-> ", gt_end) 104 | while i < gt_end: 105 | # print(label[i]) 106 | i+=1 107 | 108 | 109 | 110 | def main(): 111 | # Parse user input 112 | parser = argparse.ArgumentParser(description='Experiment clustering community detection results') 113 | parser.add_argument('metrics_dir', help="path to metrics dir") 114 | parser.add_argument('dataset', help='dataset name.') 115 | args = parser.parse_args() 116 | 117 | run_experiment(args.metrics_dir, args.dataset) 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | 123 | -------------------------------------------------------------------------------- /experiments/omega_comparison.py: -------------------------------------------------------------------------------- 1 | import json 2 | import circulo 3 | import circulo.metrics 4 | import numpy 5 | import argparse 6 | import os 7 | import scipy 8 | import csv 9 | 10 | def omega_loop(path, output_filename): 11 | i = 0 12 | j = 0 13 | k = 0 14 | df_dimension = 0 15 | total_omega_fs = 0 16 | omega_fs = 0 17 | 18 | files = sorted(os.listdir(path[0])) 19 | 20 | for filename in files: 21 | df_dimension = df_dimension + 1 22 | omega_df = numpy.ones(shape = [df_dimension, df_dimension])*-1 23 | #omega_list = numpy.empty(shape = [df_dimension*df_dimension+df_dimension, 3], dtype='S100') 24 | omega_list = [] 25 | 26 | 27 | for f in files: 28 | print(f) 29 | json_data_f = open(path[0]+'/'+f) 30 | data_f = json.load(json_data_f) 31 | for s in files: 32 | json_data_s = open(path[0]+'/'+s) 33 | data_s = json.load(json_data_s) 34 | omega_fs = circulo.metrics.omega.omega_index(data_f['membership'], data_s['membership']) 35 | omega_list.append([f, s, omega_fs]) 36 | omega_df[i, j] = omega_fs 37 | if f != s: 38 | total_omega_fs = total_omega_fs + omega_fs 39 | j = j + 1 40 | omega_list.append([f, 'Adjusted_Average', total_omega_fs/(df_dimension-1)]) 41 | total_omega_fs = 0 42 | i = i + 1 43 | j = 0 44 | print(omega_df) 45 | numpy.save(output_filename[0], omega_df) 46 | 47 | with open(output_filename[0],'w') as myfile: 48 | csvwriter = csv.writer(myfile,delimiter='\t') 49 | csvwriter.writerow(['Graph1','Graph2','omega']) 50 | for x in omega_list: 51 | csvwriter.writerow(x) 52 | 53 | if omega_df.min() == 1: 54 | print('All files give identical results') 55 | else: 56 | print('Differences exist among the files') 57 | 58 | def main(): 59 | # Parse user input 60 | parser = argparse.ArgumentParser(description='Run metrics across several algorithms or across iterations of a stochastic algorithm.') 61 | parser.add_argument('path', nargs=1,help='Filepath location') 62 | parser.add_argument('output_filename', nargs=1, help='Output filename') 63 | args = parser.parse_args() 64 | omega_loop(args.path, args.output_filename) 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /experiments/partition_metrics.R: -------------------------------------------------------------------------------- 1 | # Sample Usage: 2 | # metrics <- getMetrics(datapath) 3 | # plotMetrics(metrics) 4 | # plotHist(metrics,'omega') 5 | # plotHist(metrics,'time') 6 | 7 | library(ggplot2) 8 | library(jsonlite) 9 | 10 | # Read metrics from json 11 | getMetrics <- function(datapath) { 12 | 13 | #Get file names and load the json files 14 | filenames <- list.files(datapath, pattern="*.json", full.names=TRUE) 15 | N <- length(filenames) 16 | results <- lapply(filenames, fromJSON) 17 | 18 | #parse filenames to get algorithm names and dataset names 19 | names <- 
basename(filenames) 20 | names2 <- sapply(1:N, function(x) strsplit(names, "\\.")[[x]][1]) 21 | Datasets <- sapply(1:N, function(x) strsplit(names2, "--")[[x]][1]) 22 | Algorithms <-sapply(1:N, function(x) strsplit(names2, "--")[[x]][2]) 23 | 24 | #fussy R data type formatting 25 | metrics <- data.frame(Algorithms,Datasets) 26 | 27 | metric.names <- names(results[[1]]$metrics) 28 | df <- data.frame(sapply(metric.names,function(l){ 29 | sapply(1:N,function(i) {results[[i]]$metrics[[l]]['results'] }) })) 30 | 31 | metrics <- cbind(metrics,df) 32 | 33 | return(metrics) 34 | } 35 | 36 | # Plot one metric across a collection of datasets/algorithms 37 | plotMetric<- function(metrics,column='Conductance',datasets=NULL,algos=NULL,logx=FALSE,logy=FALSE,toPDF=FALSE) { 38 | # Keep only data that matches datasets/algos criteria 39 | data <- metrics 40 | if (is.null(datasets)) {datasets <- unique(metrics$Datasets)} 41 | if (is.null(algos)) {algos <- unique(metrics$Algorithms)} 42 | 43 | keep <- (data$Datasets %in% datasets) & (data$Algorithms %in% algos) 44 | data <- data[keep,] 45 | 46 | # Reformat data columns into "long" format 47 | Algorithms <- rep(data$Algorithms,sapply(data$Conductance,length)) 48 | Datasets <- rep(data$Datasets,sapply(data$Conductance,length)) 49 | value <- unlist(data[column]) 50 | data <- data.frame(Algorithms,Datasets,value) 51 | 52 | # Create density plot 53 | densityplot<- ggplot(data, aes(x=value,colour=Algorithms,fill=Algorithms))+ 54 | facet_grid(. ~ Datasets) + 55 | geom_density(alpha=0.5) + 56 | #geom_histogram(alpha=0.5,position='identity') + 57 | xlab(column) + 58 | ylab('Counts') + 59 | theme_bw()+ 60 | ggtitle(Sys.time()) 61 | 62 | if (logx) {densityplot <- densityplot + scale_x_log10()} 63 | if (logy) {densityplot <- densityplot + scale_y_log10()} 64 | 65 | # Print plot to PDF or screen 66 | if (toPDF) { 67 | pdffile <- paste(Sys.time(),"metricsGraph.pdf", sep='') 68 | pdf(pdffile,height=10,width=12) 69 | print(densityplot) 70 | dev.off() 71 | cat(sprintf('printed to %s \n', pdffile)) 72 | } else { 73 | print(densityplot) 74 | } 75 | } 76 | 77 | # Plot specified metrics for one run (dataset/algorithm) 78 | ## NEEDS TO BE FINISHED. 79 | plotRun <- function(metrics,dataset='karate',algo='fastgreedy',columns=c('Conductance','Expansion'),logx=FALSE,logy=FALSE,toPDF=FALSE) { 80 | # Keep only data that matches datasets/algos criteria 81 | data <- metrics 82 | keep <- (data$Datasets == dataset) & (data$Algorithms == algo) 83 | data <- data[keep,] 84 | 85 | # Reformat data columns into "long" format 86 | df <- sapply(data[,columns],unlist) 87 | df <- data.frame(df,row.names=NULL) 88 | # not done yet.. 89 | } 90 | -------------------------------------------------------------------------------- /support/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | RUN apt-get update && apt-get install --assume-yes git openssl curl \ 4 | gcc g++ gfortran \ 5 | libopenblas-dev liblapack-dev \ 6 | libigraph0 \ 7 | libpng12-dev libfreetype6-dev 8 | 9 | ENV CFLAGS '-Wno-error=declaration-after-statement' 10 | RUN pip3 install numpy scipy scikit-learn matplotlib python-igraph 11 | 12 | ADD . 
/Circulo 13 | WORKDIR /Circulo 14 | RUN pip3 install -r requirements.txt 15 | ENV PYTHONPATH /Circulo 16 | 17 | CMD /bin/bash 18 | -------------------------------------------------------------------------------- /support/requirements.txt: -------------------------------------------------------------------------------- 1 | ipython==2.2.0 2 | matplotlib==1.4.0 3 | networkx==1.9 4 | numpy>=1.8.2 5 | python-igraph>=0.7 6 | scikit-learn>=0.15.1 7 | scipy>=0.14.0 8 | -------------------------------------------------------------------------------- /support/server_scripts/circulo_server.sh: -------------------------------------------------------------------------------- 1 | # Circulo server instructions! 2 | # This script should help with setting up a clean server instance with Circulo 3 | 4 | # Currently it requires root access 5 | 6 | ### 7 | # Requirements before you run this script: 8 | # git 9 | # virtualenv 10 | # virtualenvwrapper 11 | # python3 12 | # pip3 13 | ### 14 | 15 | 16 | if [ "$#" -ne 1 ]; then 17 | echo "Please provide remote for Circulo Git repo" 18 | exit 1 19 | fi 20 | 21 | CIRCULO_GIT_LOC=$1 22 | 23 | 24 | # create virtual environment folder and set the proper permissions 25 | cd /home 26 | sudo mkdir .venvs 27 | sudo chgrp -R admin .venvs/ 28 | sudo chmod -R 770 .venvs/ 29 | 30 | # create circulo folder 31 | sudo mkdir circulo 32 | sudo chgrp -R admin circulo/ 33 | sudo chmod -R 770 circulo/ 34 | echo "created circulo folder" 35 | 36 | # set up virtualenv. Requires virtualenvwrapper 37 | cd circulo 38 | export WORKON_HOME='~/.venvs' 39 | source '/usr/local/bin/virtualenvwrapper.sh' 40 | mkvirtualenv --python=`which python3` circulo 41 | deactivate 42 | 43 | # adding a couple minor files for convenience 44 | echo "export WORKON_HOME='/home/.venvs' 45 | source '/usr/local/bin/virtualenvwrapper.sh' 46 | workon circulo" > setup 47 | 48 | echo "Circulo 49 | 50 | To work on circulo on this virtualenv, run 51 | 52 | source setup 53 | 54 | Your prompt should start with (circulo) if it worked correctly. 55 | 56 | To exit the virtualenv, run 57 | 58 | deactivate 59 | 60 | 61 | If you need to add new packages to the virtualenv, pip3 should work\ 62 | as expected as long as you are within the virtualenv. However, you may\ 63 | have to install unexpected dependencies that this OS didn't ship with. 64 | " > README 65 | 66 | # get circulo! 67 | git clone "$CIRCULO_GIT_LOC" 68 | git clone https://github.com/snap-stanford/snap.git 69 | pushd snap 70 | popd # pushd above leaves us in snap/; return to /home/circulo before sourcing setup 71 | # start using the circulo virtualenv 72 | source setup 73 | 74 | # add paths to virtualenv 75 | add2virtualenv /home/circulo/ /home/circulo/Circulo/ /home/circulo/Circulo/circulo/ 76 | 77 | # install system packages needed to build circulo's dependencies 78 | # (you may have to add more, depending on your machine) 79 | sudo apt-get install gfortran libopenblas-dev liblapack-dev # for scipy 80 | sudo apt-get install libpng12-dev libfreetype6-dev # for matplotlib 81 | sudo apt-get install libxml2-dev libz-dev python3-dev #for igraph 82 | 83 | 84 | # install igraph 85 | pip3 install python-igraph 86 | pip3 install networkx 87 | 88 | # install circulo's dependencies 89 | pip3 install numpy 90 | pip3 install scipy 91 | pip3 install matplotlib 92 | pip3 install scikit-learn 93 | pip3 install ipython 94 | 95 | # finally, to use circulo, just cd into /home/circulo and run 96 | # source setup 97 | # if your prompt begins with (circulo), you're ready to go.
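# Example invocation (illustrative; any reachable Circulo git remote works, run as a user with sudo rights):
#   bash circulo_server.sh https://github.com/Lab41/Circulo.git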
98 | -------------------------------------------------------------------------------- /support/server_scripts/clean_circulo.sh: -------------------------------------------------------------------------------- 1 | rm -rf /home/circulo 2 | rm -rf /home/.venvs 3 | --------------------------------------------------------------------------------