├── .gitignore ├── .gitmodules ├── AUTHORS ├── CONTRIBUTING.md ├── LICENSE.txt ├── MAINTAINERS.md ├── README.md ├── circulo ├── algorithms │ ├── __init__.py │ ├── betweenness.py │ ├── biSBM │ │ ├── Makefile │ │ ├── biSBM.c │ │ └── biSBM.h │ ├── conga.py │ ├── congo.py │ ├── congo_test.py │ ├── girvan_newman.py │ ├── min_conductance.py │ ├── overlap.py │ ├── radicchi.py │ ├── rolx.py │ ├── snap_bigclam.py │ ├── snap_cesna.py │ ├── snap_cnm.py │ ├── snap_coda.py │ ├── snap_cpm.py │ ├── snap_girvan_newman.py │ ├── snap_infomap.py │ └── spectral.py ├── data │ ├── README.md │ ├── README_template.md │ ├── amazon │ │ └── run.py │ ├── as_data │ │ ├── README.md │ │ └── run.py │ ├── databot.py │ ├── flights │ │ ├── README.md │ │ └── run.py │ ├── football │ │ ├── README.md │ │ └── run.py │ ├── house_voting │ │ ├── README.md │ │ ├── download.sh │ │ └── run.py │ ├── karate │ │ ├── README.md │ │ └── run.py │ ├── malaria │ │ ├── README.md │ │ └── run.py │ ├── nba_schedule │ │ ├── README.md │ │ └── run.py │ ├── netscience │ │ ├── README.md │ │ └── run.py │ ├── pgp │ │ ├── README.md │ │ └── run.py │ ├── revolution │ │ ├── README.md │ │ └── run.py │ ├── school │ │ ├── README.md │ │ └── run.py │ ├── scotus │ │ ├── README.md │ │ └── run.py │ ├── senate_voting │ │ ├── README.md │ │ ├── download.sh │ │ ├── exercise.md │ │ └── run.py │ └── southernwomen │ │ ├── README.md │ │ └── run.py ├── metrics │ ├── cover.py │ ├── graph.py │ ├── omega.py │ └── probability_metric.py ├── setup │ ├── run_algos.py │ └── run_metrics.py ├── unit_tests │ ├── karate.gml │ ├── metrics.py │ └── test_metrics.py ├── utils │ ├── downloader.py │ ├── general.py │ ├── snap.py │ └── stochastic_selector.py └── wrappers │ └── community.py ├── experiments ├── README.md ├── cluster_omega_comparison.py ├── community_label.py ├── gephi_plot │ ├── create_graphml.py │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── lab41 │ │ └── circulo │ │ └── gephi_plot │ │ └── PlotGraphs.java ├── goodness_indicators.py ├── histogram_metrics.py ├── images │ ├── bubble_plot.png │ ├── community_label_results.png │ ├── counts.png │ ├── flights_algo_infomap.png │ ├── football--groundtruth--0.png │ ├── football_histogram.png │ └── time_vs_omega.png ├── metricsCharts.R ├── metrics_clustering.py ├── omega_comparison.py └── partition_metrics.R └── support ├── Dockerfile ├── requirements.txt └── server_scripts ├── circulo_server.sh └── clean_circulo.sh /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .DS_Store 4 | *.swp 5 | *.pickle 6 | circulo/data/*/raw/ 7 | circulo/data/GRAPHS/ 8 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib/snap"] 2 | path = lib/snap 3 | url = https://github.com/snap-stanford/snap.git 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This file lists all individuals having contributed content to the repository. 2 | # If you're submitting a patch, please add your name here in alphabetical order as part of the patch. 3 | # 4 | # For a list of active project maintainers, see the MAINTAINERS file. 
5 | # 6 | Paul M 7 | Yonas Tesfaye 8 | Nikhil Desai 9 | Robbie Ostrow 10 | and various US Government Participants 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | Paul M 2 | Yonas Tesfaye 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Circulo: A Community Detection Evaluation Framework 2 | 3 | ###Contribute 4 | Contributors welcome! If you want to contribute, please issue a pull request. 5 | 6 | ##About 7 | ####The Framework: 8 | Circulo is a "Community Detection" Evaluation Framework written primarily in Python. The Framework performs statistical analysis against partitions of a Graph resulting from the execution of a given community detection algorithm. The resulting quantitative measures can be used to drive experiments such as measuring algorithm efficacy against specific dataset types or comparing different algorithm execution results against the same dataset. The framework includes the following components: 9 | 10 | - __Data ETL (Extract Transform Load) Engine__: Circulo includes functionality to incorporate existing datasets into the evaluation framework. By default, the framework includes several dataset "downloaders" in the directory [circulo/data](circulo/data). To learn how to add a dataset, please see the data [README](circulo/data/README.md). We encourage users to issue pull requests for new datasets. 11 | - __Algorithm Execution Engine:__ Circulo includes several algorithms by default which are located in the [algorithms](circulo/algorithms) directory. Using the framework, these algorithms can run in parallel against the included datasets by running the [run_algos.py](circulo/setup/run_algos.py) tool. Because some algorithms include parameters that can better cater execution to the type of input Graph (i.e directed and/or weighted), algorithm execution is wrapped in the file [community.py](circulo/wrappers/community.py). This enables the same algorithm to automatically operate differently depending on the dataset--enabling the algorithm to adapt to the dataset if allowed. 
To add an algorithm to the framework, add the files to [algorithms](circulo/algorithms) and update the wrapper [community.py](circulo/wrappers/community.py) appropriately.
12 | - __Metrics Engine:__ The metrics engine provides quantitative analysis of a given partitioning of a graph. The metrics include internal statistical measures of a community (i.e. density), external measurements (i.e. expansion), and network-wide metrics (ground truth comparisons).
13 | - __Experiments Engine:__ Different types of experiments have been designed to find patterns in the metric results. For example, how do algorithms compare when considering both time and accuracy? This component is meant to be a "playground" for experimentation on metric results. Experiments may vary significantly. Each file in the [experiments](experiments) directory is meant to be an independent experiment. See the [README](experiments/README.md) for more information.
14 | 
15 | ####The Research
16 | Prior to building the Circulo framework, Lab41 conducted a market survey into Community Detection algorithms and metrics. The survey was used to guide the development of Circulo. The survey includes, but is not limited to, summaries of algorithms, references to academic literature, and general information about the field. The survey can be found here: http://lab41.github.io/survey-community-detection/.
17 | 
18 | 
19 | ####The Underlying Graph Framework
20 | Since we did not want to reimplement the notion of a graph, we decided to pick an existing graph framework as a backdrop for our work. Though any of the popular graph frameworks could have been used, iGraph was chosen as our primary graph framework. iGraph offers a number of features:
21 | 
22 | - First and foremost, iGraph implements a number of community detection algorithms out of the box. It also provides two data structures for community detection: VertexClustering (non-overlapping communities) and VertexCover (overlapping communities)
23 | - iGraph is written in C at its core, making it fast
24 | - iGraph has wrappers for Python and R
25 | - iGraph is a mature framework
26 | 
27 | Other frameworks that could have been used include GraphX, GraphLab, SNAP, and NetworkX.
28 | 
29 | 
30 | ##Installation and Setup
31 | ####Package Requirements
32 | 
33 | - git
34 | - python3
35 | - igraph (Refer to Appendix A for further instructions)
36 | - matplotlib
37 | - cairo (if you want to plot directly from igraph)
38 | - scipy
39 | - scikit-learn
40 | 
41 | 
42 | ####Installation
43 | Below are instructions for installing Circulo:
44 | 
45 | #clone Circulo repository (note: this also clones SNAP)
46 | git clone --recursive https://github.com/Lab41/circulo.git
47 | #set PYTHONPATH env variable
48 | export PYTHONPATH=/path/to/Circulo
49 | #make the snap code base
50 | pushd lib/snap; make; popd
51 | 
52 | 
53 | 
54 | #### Running the Evaluation Framework
55 | At its core, the evaluation framework runs various community detection algorithms against various datasets.
56 | 
57 | #To run your algorithms against the data
58 | python circulo/setup/run_algos.py [parameters ...]
59 | #To run metrics against the results of run_algos
60 | python circulo/setup/run_metrics.py [parameters ...]
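The datasets and algorithms can also be driven directly from Python. The snippet below is a minimal, illustrative sketch rather than part of the framework itself; it assumes Circulo is on your PYTHONPATH, uses the Amazon databot from [circulo/data/amazon/run.py](circulo/data/amazon/run.py), and runs one of igraph's built-in community detection methods on the resulting graph:

    from circulo.data.amazon.run import AmazonData

    databot = AmazonData("amazon")              # downloads and prepares the raw data on first use
    G = databot.get_graph()                     # igraph.Graph loaded from the generated graphml file
    ground_truth = databot.get_ground_truth(G)  # igraph.VertexCover of the known communities
    result = G.community_infomap()              # any igraph community detection method works here
    print(len(result), "communities found")

The same pattern works for every dataset under [circulo/data](circulo/data), since each databot subclasses CirculoData.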
61 | 62 | 63 | 64 | ##Appendix 65 | ####Appendix A: iGraph Installation 66 | #####Ubuntu 67 | 68 | sudo apt-get install igraph 69 | sudo apt-get install libxml2-dev libz-dev python-dev 70 | 71 | #####OS X 72 | 73 | #using brew install igraph dylibs 74 | brew install homebrew/science/igraph 75 | 76 | #install Cairo 77 | #installs the core libraries for cairo 78 | brew install cairo 79 | 80 | #installs the python site-packages. NOTE: pip does not work for pycairo. 81 | #If you want to use pip, create sym links to the site packages in brew 82 | brew install py3cairo 83 | 84 | #install python igraph 85 | pip3 install python-igraph 86 | 87 | -------------------------------------------------------------------------------- /circulo/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | modules = glob.glob(os.path.dirname(__file__)+"/*.py") 4 | __all__ = [ os.path.basename(f)[:-3] for f in modules] 5 | -------------------------------------------------------------------------------- /circulo/algorithms/betweenness.py: -------------------------------------------------------------------------------- 1 | import igraph as ig 2 | import itertools 3 | from collections import Counter 4 | 5 | 6 | def edge_and_pair_betweenness(G): 7 | """ 8 | An attempt to find the edge and pair betweennesses without finding all 9 | shortest paths, using flows. Currently unused. 10 | """ 11 | eb = {edge.tuple : 0 for edge in G.es} 12 | pb = {vertex.index : {uw : 0 for uw in itertools.combinations(G.neighbors(vertex), 2)} for vertex in G.vs} 13 | for v in G.vs: 14 | flows, pairFlows = get_flows(G, v.index, eb, pb) 15 | for flow in flows: # pythonify 16 | eb[flow] += flows[flow] / 2. # counted twice. 17 | for pflow in pairFlows: 18 | for uw in pairFlows[pflow]: 19 | pb[pflow][uw] += pairFlows[pflow][uw] / 2. 20 | return eb, pb 21 | 22 | 23 | def get_flows(G, index, eb, pb): 24 | """ 25 | Initializing the edge and pair betweenness dicts using flows. 26 | Edge betweenness correct, but pair betweenness needs work. 27 | This can be used as a template for future work concerning flows. 28 | """ 29 | # don't reinitialize these dicts each time. 
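# flows maps each edge (as an ordered vertex tuple) to the shortest-path flow it carries
# away from the BFS root `index`; pairFlows[v][(u, w)] accumulates the flow routed through
# v between its neighbour pair (u, w). Both are rebuilt for every root and then summed
# (and halved, since each path is counted twice) by edge_and_pair_betweenness above.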
30 | flows = {edge.tuple : 0 for edge in G.es} 31 | pairFlows = {vertex.index : {uw : 0 for uw in itertools.combinations(G.neighbors(vertex), 2)} for vertex in G.vs} 32 | bfs = G.bfsiter(index, advanced=True) 33 | bfsList = [bfs.next()[0].index] 34 | # skipping root, manually adding it 35 | bfsDict = {index : {"depth" : 0, "parents" : [], "numPaths": 1, "flow": 1}} 36 | nodesSeen = set([index]) 37 | # initializing bfs dict (decompose) 38 | for v, depth, parent in bfs: 39 | i = v.index 40 | bfsList.append(i) # probably don't need 41 | parents = [p for p in G.neighbors(v) if p in nodesSeen and bfsDict[p]["depth"] < depth] 42 | nodesSeen.add(i) 43 | numPaths = sum(bfsDict[p]["numPaths"] for p in parents) 44 | bfsDict[i] = {"depth": depth, "parents": parents, "numPaths": numPaths, "flow": 1} 45 | 46 | # getting flows (decompose) 47 | for v in reversed(bfsList): 48 | # getting edge flows 49 | parents = bfsDict[v]["parents"] 50 | totalPaths = float(sum(bfsDict[p]["numPaths"] for p in parents)) 51 | for p in parents: 52 | flowProportion = bfsDict[p]["numPaths"] / totalPaths 53 | flow = flowProportion * bfsDict[v]["flow"] 54 | flows[order_tuple((v, p))] = flow 55 | bfsDict[p]["flow"] += flow 56 | grandparents = bfsDict[p]["parents"] 57 | totalGrandparentPaths = float(sum(bfsDict[g]["numPaths"] for g in set(grandparents))) 58 | for g in grandparents: 59 | gCount = Counter(grandparents) 60 | gFlowProportion = bfsDict[g]["numPaths"] / totalPaths / float(gCount[g]) 61 | gFlow = gFlowProportion * bfsDict[v]["flow"] 62 | pairFlows[p][order_tuple((v, g))] = gFlow 63 | 64 | ## 65 | # pairFlows are incorrect!!! 66 | ## 67 | return flows, pairFlows -------------------------------------------------------------------------------- /circulo/algorithms/biSBM/Makefile: -------------------------------------------------------------------------------- 1 | # # Makefile modified from http://www.cs.swarthmore.edu/~newhall/unixhelp/howto_makefiles.html 2 | 3 | 4 | # Essentially the commands that are being run: 5 | 6 | # all: 7 | # gcc -O3 -Wall -pedantic biSBM.c -I/usr/local/Cellar/igraph/0.7.1/include/igraph -o biSBM -L/usr/local/Cellar/igraph/0.7.1/lib -ligraph 8 | 9 | # debug: 10 | # gcc -Wall -g -pedantic biSBM.c -I/usr/local/Cellar/igraph/0.7.1/include/igraph -o biSBM_debug -L/usr/local/Cellar/igraph/0.7.1/lib -ligraph 11 | 12 | 13 | # # define the compiler to use 14 | CC = gcc 15 | 16 | # # define any compile-time flags 17 | CFLAGS = -O3 -g -Wall -pedantic 18 | CFLAGS_DEBUG = -Wall -g -pedantic 19 | 20 | # # define any directories containing header files other than /usr/include 21 | INCLUDES = -I/usr/local/Cellar/igraph/0.7.1/include/igraph 22 | 23 | # # define library paths in addition to /usr/lib 24 | LFLAGS = -L/usr/local/Cellar/igraph/0.7.1/lib 25 | 26 | # # define any libraries to link into executable: 27 | LIBS = -ligraph 28 | 29 | # # define the source files 30 | SRCS = biSBM.c 31 | 32 | # # define the object files 33 | OBJS = $(SRCS:.c=.o) 34 | 35 | # # define the executable file 36 | MAIN = biSBM 37 | 38 | MAIN_DEBUG = biSBM_debug 39 | 40 | .PHONY: depend clean 41 | 42 | all: $(MAIN) 43 | @echo Compilation completed successfully. 44 | 45 | debug: $(MAIN_DEBUG) 46 | @echo Unoptimized compilation completed successfully. 
47 | 48 | $(MAIN): $(OBJS) 49 | $(CC) $(CFLAGS) $(INCLUDES) -o $(MAIN) $(OBJS) $(LFLAGS) $(LIBS) 50 | 51 | $(MAIN_DEBUG): $(OBJS) 52 | $(CC) $(CFLAGS_UNOPT) $(INCLUDES) -o $(MAIN_DEBUG) $(OBJS) $(LFLAGS) $(LIBS) 53 | 54 | # this is a suffix replacement rule for building .o's from .c's 55 | # it uses automatic variables $<: the name of the prerequisite of 56 | # the rule(a .c file) and $@: the name of the target of the rule (a .o file) 57 | # (see the gnu make manual section about automatic variables) 58 | .c.o: 59 | $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ 60 | 61 | clean: 62 | $(RM) *.o *~ $(MAIN) $(MAIN_DEBUG) 63 | 64 | depend: $(SRCS) 65 | makedepend $(INCLUDES) $^ 66 | 67 | # # DO NOT DELETE THIS LINE -- make depend needs it 68 | -------------------------------------------------------------------------------- /circulo/algorithms/biSBM/biSBM.h: -------------------------------------------------------------------------------- 1 | /** 2 | * TODO: main comment 3 | * 4 | * 5 | */ 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | int igraph_community_bipartite_sbm(igraph_t *graph, igraph_vector_t *membership, 18 | igraph_integer_t k_a, igraph_integer_t k_b, 19 | igraph_integer_t max_iters, igraph_bool_t degree_correct); 20 | 21 | int log_message(const char *message, ...); 22 | 23 | void igraph_read_graph_generic(igraph_t *graph, char *type, char *file_name); 24 | 25 | void print_usage_and_exit(int exitstatus); 26 | 27 | 28 | -------------------------------------------------------------------------------- /circulo/algorithms/congo_test.py: -------------------------------------------------------------------------------- 1 | import circulo.algorithms.congo as CONGO 2 | import unittest 3 | import igraph 4 | import itertools 5 | 6 | class TestCongoFunctions(unittest.TestCase): 7 | 8 | def setUp(self): 9 | """ 10 | Initializes the graph for testing to Zachary's 11 | karate club. 12 | """ 13 | self.graph = igraph.Graph.Famous("zachary") 14 | self.graph.vs['CONGA_orig'] = [i.index for i in self.graph.vs] 15 | self.graph.es['eb'] = 0 16 | self.graph.vs['pb'] = [{uw : 0 for uw in itertools.combinations(self.graph.neighbors(vertex), 2)} for vertex in self.graph.vs] 17 | 18 | 19 | def tearDown(self): 20 | self.graph = None 21 | 22 | 23 | def test_test(self): 24 | """ 25 | Calculates the edge betweenness twice and checks 26 | equality. Just making sure the testing framework 27 | and igraph are working properly. 28 | """ 29 | eb = self.graph.edge_betweenness() 30 | self.assertEqual(self.graph.edge_betweenness(), eb) 31 | 32 | 33 | def test_edge_betweenness(self): 34 | """ 35 | Checks that the implementation of edge_betweenness in 36 | edge_and_pair_betweenness matches that of igraph's 37 | graph.edge_betweenness. 38 | """ 39 | ebtheirs = self.graph.edge_betweenness() 40 | ebmine, _ = CONGO.edge_and_pair_betweenness(self.graph) 41 | for e in self.graph.es: 42 | self.assertAlmostEqual(ebtheirs[e.index], ebmine[e.tuple]) 43 | 44 | 45 | def test_pair_betweenness(self): 46 | """ 47 | Checks to make sure that the sum of all pair betweennesses 48 | on a specific vertex are equal to its vertex betweenness. 
49 | """ 50 | _, pb = CONGO.edge_and_pair_betweenness(self.graph) 51 | vb = self.graph.betweenness() 52 | for v in pb: 53 | self.assertAlmostEqual(sum(pb[v].values()), vb[v]) 54 | 55 | 56 | # def test_vertex_betweeenness_from_eb(self): 57 | # """ 58 | # Checks that the implementation of vertex_betweeenness_from_eb 59 | # yields the same results as that of igraph's graph.betweenness 60 | # """ 61 | # eb = self.graph.edge_betweenness() 62 | # ebmine, _ = CONGO.edge_and_pair_betweenness(self.graph) 63 | # vbtheirs = self.graph.betweenness() 64 | # vbmine = CONGO.vertex_betweeenness_from_eb(self.graph, ebmine) 65 | # for v in self.graph.vs: 66 | # self.assertAlmostEqual(vbtheirs[v.index], vbmine[v.index]) 67 | 68 | 69 | def test_initialize_betweenness(self): 70 | cp = self.graph.copy() 71 | 72 | eb = self.graph.edge_betweenness() 73 | CONGO.do_initial_betweenness(cp, 3) 74 | for i, e in enumerate(eb): 75 | self.assertAlmostEqual(e, cp.es[i]['eb']) 76 | 77 | 78 | # def testBetweennesses(G, h): 79 | # eb = G.edge_betweenness(cutoff=h) 80 | # for i, v in enumerate(G.es): 81 | # print v['eb'], 2 * eb[i], abs(v['eb'] - 2 * eb[i]) > .001 82 | 83 | 84 | def suite(): 85 | suite = unittest.TestSuite() 86 | tests = ['test_test', 'test_vertex_betweeenness_from_eb', 'test_edge_betweenness', 'test_pair_betweenness'] 87 | 88 | return unittest.TestSuite(list(map(TestCongoFunctions, tests))) 89 | 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /circulo/algorithms/girvan_newman.py: -------------------------------------------------------------------------------- 1 | import igraph as ig 2 | import operator 3 | import sys 4 | 5 | 6 | def gn(origGraph): 7 | """ 8 | Parameters: 9 | origGraph: a graph in igraph format 10 | 11 | Return value: 12 | A dendrogram (VertexDendrogram) created by running Girvan-Newman 13 | 14 | Notes: 15 | Runs the Girvan-Newman (edge-betweenness) algorithm on the graph provided. 16 | Iteratively removes the edge with the highest edge-betweenness, then recalculates. 17 | """ 18 | 19 | # initialize a list of removed edges that result in a split of the graph 20 | splits = [] 21 | 22 | G = origGraph.copy() 23 | 24 | while G.es: 25 | 26 | # Calculate all edge betweennesses 27 | # TODO: only recalculate on relevant portions 28 | edge_betweennesses = G.edge_betweenness() 29 | 30 | # returns an the first index if there is a tie at max. 31 | max_index, _ = max(enumerate(edge_betweennesses), key=operator.itemgetter(1)) 32 | 33 | # edge with the max betweenness 34 | edge = G.es[max_index].tuple 35 | 36 | G.delete_edges(edge) 37 | 38 | if splitGraph(G, edge): 39 | 40 | # edge is a tuple, but we want a list of lists. 41 | splits += [list(edge)] 42 | 43 | vd = createDendrogram(origGraph, splits) 44 | 45 | # If we don't call this then as_clustering() fails. bugfix in development branch. 46 | vd.optimal_count 47 | 48 | return vd 49 | 50 | 51 | def splitGraph(G, edge): 52 | """ 53 | Parameters: 54 | G: an igraph graph 55 | edge: an edge of the form (v1, v2) where v1 and v2 are vertices in G. 56 | 57 | Return value: 58 | A boolean value. True if removing the edge splits the graph. 59 | 60 | Notes: 61 | Checks to see if removing edge from G splits the graph into 2 disjoint 62 | communities. If so, returns True, otherwise False. 
63 | """ 64 | 65 | return not G.edge_disjoint_paths(source=edge[0], target=edge[1]) 66 | 67 | 68 | def createDendrogram(G, splits): 69 | """ 70 | Given a historical list of split edges, creates a dendrogram 71 | by calculating the merges. 72 | 73 | Runs in O(nlgn) (But really, close to O(n).) This is a useful function 74 | for any divisive algorithm for which splits can be saved more easily 75 | than merges. 76 | """ 77 | 78 | # To create a dendrogram, new merges have id of max id + 1 79 | n = len(splits) + 1 80 | merges = [] 81 | 82 | mergeDict = {} 83 | 84 | while splits: 85 | # most recent split popped off 86 | edge = splits.pop() 87 | 88 | # Get the values the dendrogram wants for each vertex by finding 89 | # where merges have already happened. 90 | edge = [traverse(vertex, mergeDict) for vertex in edge] 91 | 92 | merges += [edge] 93 | 94 | # Update the dict to reflect a new merge. 95 | for vertex in edge: 96 | mergeDict[vertex] = n 97 | 98 | n += 1 99 | 100 | return ig.VertexDendrogram(G, merges) 101 | 102 | 103 | def traverse(vertex, mergeDict): 104 | """ 105 | Given a vertex and a dictionaty of merges, returns the id of the cluster 106 | the vertex belongs to. 107 | """ 108 | while vertex in mergeDict: 109 | vertex = mergeDict[vertex] 110 | return vertex 111 | 112 | 113 | 114 | if __name__ == "__main__": 115 | G = ig.load(sys.argv[1]) 116 | gn(G) -------------------------------------------------------------------------------- /circulo/algorithms/min_conductance.py: -------------------------------------------------------------------------------- 1 | import circulo.metrics 2 | import circulo.algorithms.spectral 3 | from igraph import Graph 4 | 5 | def min_conductance(G, weights=None, tries=3): 6 | ''' 7 | Returns the minimum conductance of a Graph by using spectral clustering to ``approximate'' the minimum ratio-cut. 
8 | http://www.kyb.mpg.de/fileadmin/user_upload/files/publications/attachments/Luxburg07_tutorial_4488%5b0%5d.pdf 9 | ''' 10 | (rv_val, rv_vc) = (float("inf"), None) 11 | for i in range(0,tries): 12 | try: 13 | #Obtain a cut of G, it should already be a minimum 14 | curr_vc = G.community_spectral(k=2, weights=weights, which='NCut') 15 | curr_val = max(curr_vc.as_cover().conductance()) 16 | if curr_val < rv_val : 17 | (rv_val, rv_vc) = (curr_val, curr_vc) 18 | except: 19 | pass 20 | 21 | 22 | return rv_val, rv_vc 23 | 24 | Graph.min_conductance = min_conductance 25 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_bigclam.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import os 3 | import subprocess 4 | from circulo.utils.snap import setup,read_communities_by_community 5 | from multiprocessing import cpu_count 6 | 7 | def bigclam(G, data_prefix='snap_', node_filepath='', detect_comm=100, min_comm=5, max_comm=100, trials=5, threads=cpu_count(), alpha=0.3, beta=0.3): 8 | ''' 9 | BigClam from Snap 10 | 11 | Parameters 12 | ---------- 13 | G : A NetworkX graph or edge list file 14 | data_prefix: Output file for communitities (data_prefix + cmtyvv.txt) 15 | node_file_path: Input file name for node names (Node ID, Node label) 16 | detect_comm: The number of communities to detect (-1: detect automatically) (Default: 100) 17 | min_comm: Minimum number of communities to try (Default = 5) 18 | max_comm: Maximum number of communities to try (Default = 100) 19 | trials: How many trials for the number of communities (Default = 10) 20 | threads: Number of threads for parallelization (Default = 4) 21 | alpha: Alpha for backtracking line search (Default = 0.05) 22 | beta: Beta for backtracking line search (Default = 0.3) 23 | 24 | Returns: List of SubGraphs representing the communities. The SubGraphs are automatically serialized to disk as file data_prefix+'cmtyvv.txt' 25 | ''' 26 | 27 | snap_home, graph_file = setup(G, include_header=False) 28 | 29 | 30 | if graph_file is None: 31 | return None 32 | 33 | path_bigclam = os.path.join(snap_home, "examples", "bigclam", "bigclam") 34 | 35 | try: 36 | FNULL = open(os.devnull, 'w') 37 | out = subprocess.Popen([path_bigclam,"-o:"+data_prefix,"-i:"+graph_file,"-l:"+node_filepath,"-c:"+str(detect_comm), "-mc:"+str(min_comm), "-xc:"+str(max_comm), "-nc:"+str(trials), "-nt:"+str(threads), "-sa:"+str(alpha), "-sb:"+str(beta)], stdout=FNULL).wait() 38 | 39 | except TypeError as e: 40 | print("Error occurred: {}".format(e)) 41 | return 42 | 43 | 44 | os.remove(graph_file) 45 | 46 | return read_communities_by_community(data_prefix + "cmtyvv.txt", G, delete_file=True) 47 | 48 | 49 | def main(): 50 | 51 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 52 | snap_home, filename = setup(G) 53 | 54 | vc = bigclam(G) 55 | print(vc) 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_cesna.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | import igraph 17 | import os 18 | import subprocess 19 | from circulo.utils import snap 20 | import shutil 21 | from multiprocessing import cpu_count 22 | 23 | def cesna(G, attributes_to_include, data_prefix='snap_', node_filepath='', detect_comm=100, min_comm=5, max_comm=100, trials=5, threads=cpu_count(), alpha=.3, beta=0.3): 24 | 25 | ''' 26 | Parameters 27 | ----------- 28 | G: An iGraph or edge list file 29 | f_attributes: Input node attribute file name (Required) 30 | f_attribute_names: Input file name for node attribute names (Required) 31 | nodes: Input file name for node names (Node ID, Node label) 32 | detect_comm: The number of communities to detect (-1: detect automatically) (default:10) 33 | min_comm: Minimum number of communities to try (default:3) 34 | max_comm: Maximum number of communities to try (default:20) 35 | trials: How many trials for the number of communities (default:5) 36 | threads: Number of threads for parallelization (default:4) 37 | aw: We maximize (1 - aw) P(Network) + aw * P(Attributes) (default:0.5) 38 | lw: Weight for l-1 regularization on learning the logistic model parameters (default:1) 39 | alpha: Alpha for backtracking line search (default:0.05) 40 | beta: Beta for backtracking line search (default:0.3) 41 | mf if the fraction of nodes with positive values for an attribute is smaller than this, we ignore that attribute (default:0) 42 | ''' 43 | 44 | snap_home, graph_file = snap.setup(G) 45 | 46 | f_attribute_names, f_attributes = snap.attribute_setup(G, attributes_to_include) 47 | if graph_file is None: 48 | return 49 | 50 | path_cesna = os.path.join(snap_home, "examples", "cesna", "cesna") 51 | 52 | try: 53 | FNULL = open(os.devnull, 'w') 54 | out = subprocess.Popen([path_cesna,"-o:"+data_prefix,"-i:"+graph_file,"-l:"+node_filepath, "-c:" + str(detect_comm), "-mc:"+str(min_comm), "-xc:"+str(max_comm), "-nc:"+str(trials), "-nt:"+str(threads), "-sa:"+str(alpha), "-sb:"+str(beta), "-a:"+f_attributes, "-n:"+f_attribute_names],stdout=FNULL).wait() 55 | 56 | 57 | except TypeError as e: 58 | print("Error occurred: {}".format(e)) 59 | return 60 | 61 | os.remove(graph_file) 62 | 63 | return snap.read_communities_by_community(data_prefix + "cmtyvv.txt", G, delete_file=True) 64 | 65 | 66 | 67 | def main(): 68 | 69 | G = igraph.load('/Users/ytesfaye/tmp/GRAPHS/flights.graphml') 70 | #snap_home, filename = setup(G) 71 | 72 | vc = cesna(G) 73 | print(vc) 74 | 75 | 76 | if __name__ == "__main__": 77 | main() 78 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_cnm.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from circulo.utils.snap import divisive, setup 3 | 4 | def clauset_newman_moore(G, output="communities.txt"): 5 | return divisive(G, "2", output) 6 | 7 | def main(): 8 | 9 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 10 | snap_home, filename = setup(G) 11 | 12 | vc = clauset_newman_moore(G) 13 | print(vc) 14 | 15 | if __name__ == "__main__": 16 | main() 17 | 
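# The string passed to divisive() selects which algorithm SNAP runs: these wrappers use
# "1" for Girvan-Newman (snap_girvan_newman.py), "2" for Clauset-Newman-Moore (this file),
# and "3" for Infomap (snap_infomap.py).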
--------------------------------------------------------------------------------
/circulo/algorithms/snap_coda.py:
--------------------------------------------------------------------------------
 1 | import igraph
 2 | import os
 3 | import subprocess
 4 | from circulo.utils.snap import setup, read_communities_by_community
 5 | 
 6 | 
 7 | def coda(G, data_prefix='snap_', node_filepath='', graph_type=0, detect_comm=100, min_comm=5, max_comm=100, trials=10, threads=4, alpha=0.05, beta=0.3):
 8 | '''
 9 | CODA from SNAP
10 | 
11 | Parameters
12 | ----------
13 | G : An igraph graph or edge list file
14 | node_filepath: Input file name for node names (Node ID, Node label)
15 | graph_type: 0=directed, 1=undirected (default: 0)
16 | detect_comm: The number of communities to detect (-1: detect automatically) (Default: 100)
17 | min_comm: Minimum number of communities to try (Default = 5)
18 | max_comm: Maximum number of communities to try (Default = 100)
19 | trials: How many trials for the number of communities (Default = 10)
20 | threads: Number of threads for parallelization (Default = 4)
21 | alpha: Alpha for backtracking line search (Default = 0.05)
22 | beta: Beta for backtracking line search (Default = 0.3)
23 | '''
24 | 
25 | snap_home, graph_file = setup(G)
26 | path_coda = os.path.join(snap_home, "examples", "coda", "coda")
27 | 
28 | try:
29 | FNULL = open(os.devnull, 'w')
30 | 
31 | out = subprocess.Popen([path_coda,"-o:"+data_prefix,"-i:"+graph_file,"-l:"+node_filepath,"-g:"+str(graph_type),"-c:"+str(detect_comm), "-mc:"+str(min_comm), "-xc:"+str(max_comm), "-nc:"+str(trials), "-nt:"+str(threads), "-sa:"+str(alpha), "-sb:"+str(beta)], stdout=FNULL).wait()
32 | 
33 | except TypeError as e:
34 | print("Error occurred: {}".format(e))
35 | return
36 | 
37 | 
38 | os.remove(graph_file)
39 | 
40 | #CODA returns both an "in" and an "out" communities file; only the "out" file is used here
41 | return read_communities_by_community(data_prefix + "cmtyvv.out.txt", G)
42 | 
43 | 
44 | 
45 | def main():
46 | 
47 | G = igraph.Graph.Erdos_Renyi(n=30, m=100)
48 | snap_home, filename = setup(G)
49 | 
50 | vc = coda(G)
51 | print(vc)
52 | 
53 | 
54 | if __name__ == "__main__":
55 | main()
56 | 
--------------------------------------------------------------------------------
/circulo/algorithms/snap_cpm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
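# Wrapper around SNAP's clique percolation ("cliques") example; see clique_percolation() below.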
16 | import igraph 17 | import os 18 | import subprocess 19 | from circulo.utils import snap 20 | 21 | def clique_percolation(G, data_prefix='snap_'): 22 | 23 | ''' 24 | Parameters 25 | ----------- 26 | G: An iGraph or edge list file 27 | ''' 28 | 29 | snap_home, graph_file = snap.setup(G) 30 | 31 | if graph_file is None: 32 | return 33 | 34 | path_cpm = os.path.join(snap_home, "examples", "cliques", "cliquesmain") 35 | 36 | try: 37 | FNULL = open(os.devnull, 'w') 38 | out = subprocess.Popen([path_cpm,"-o:"+data_prefix,"-i:"+graph_file], stdout=FNULL).wait() 39 | 40 | 41 | except TypeError as e: 42 | print("Error occurred: {}".format(e)) 43 | return 44 | 45 | os.remove(graph_file) 46 | 47 | return snap.read_communities_by_community("cpm-" + data_prefix + ".txt", G, delete_file=True) 48 | 49 | 50 | 51 | def main(): 52 | 53 | G = igraph.load('/Users/ytesfaye/tmp/GRAPHS/flights.graphml') 54 | #snap_home, filename = setup(G) 55 | 56 | vc = cesna(G) 57 | print(vc) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() -------------------------------------------------------------------------------- /circulo/algorithms/snap_girvan_newman.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from circulo.utils.snap import setup, divisive 3 | 4 | def girvan_newman(G, output="communities.txt"): 5 | return divisive(G, "1", output) 6 | 7 | def main(): 8 | 9 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 10 | snap_home, filename = setup(G) 11 | 12 | vc = girvan_newman(G) 13 | print(vc) 14 | 15 | if __name__ == "__main__": 16 | main() 17 | -------------------------------------------------------------------------------- /circulo/algorithms/snap_infomap.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from circulo.utils.snap import divisive, setup 3 | 4 | def infomap(G, output="communities"): 5 | return divisive(G, "3", output) 6 | 7 | def main(): 8 | 9 | G = igraph.Graph.Erdos_Renyi(n=30, m=100) 10 | snap_home, filename = setup(G) 11 | 12 | vc = infomap(G) 13 | print(vc) 14 | 15 | if __name__ == "__main__": 16 | main() 17 | -------------------------------------------------------------------------------- /circulo/algorithms/spectral.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | from scipy.sparse import csc_matrix, diags 4 | from scipy.sparse.linalg import eigsh 5 | from scipy.cluster.vq import vq, kmeans2 6 | 7 | from igraph import Graph, VertexClustering 8 | 9 | def __eigenvectors_to_vc(G, eigvc): 10 | centroid, label = kmeans2(eigvc, eigvc.shape[1], minit='points') 11 | return VertexClustering(G, label) 12 | 13 | def __community_spectral_base(G, k, weights, normalized): 14 | L = csc_matrix(G.laplacian(weights=weights, 15 | normalized=normalized), 16 | dtype='d') 17 | eigvl, eigvc = eigsh(L, k, which='SM') 18 | if normalized: 19 | for row in eigvc: 20 | row /= norm(row) 21 | return __eigenvectors_to_vc(G, eigvc) 22 | 23 | def __community_spectral_rw(G, k, weights): 24 | L = G.laplacian(weights=weights) 25 | D = np.diag(L) 26 | L = csc_matrix(L, dtype='d') 27 | D = diags(D, 0, dtype='d', format=L.format) 28 | 29 | eigvl, eigvc = eigsh(L, k, M=D, which='SM') 30 | 31 | return __eigenvectors_to_vc(G, eigvc) 32 | 33 | def community_spectral(G, k=2, weights=None, which='NCut_rw'): 34 | ''' 35 | Performs a relaxed version of Ratio or N-cut by performing k-means on 36 | the (n, k)-matrix of eigenvectors from 
different versions of the Graph 37 | Laplacian. 38 | @params 39 | G : an igraph.Graph. 40 | k : number of communities to cluster. 41 | weights : A weight vector or the name of an edge property. 42 | which : the type of cut to perform, one of RatioCut, NCut, or NCut_rw. 43 | @returns 44 | vc : VertexClustering with up to k clusters 45 | ''' 46 | method = { 47 | 'RatioCut'.lower() : lambda g, c, w: __community_spectral_base(g, c, w, normalized=False), 48 | 'NCut'.lower() : lambda g, c, w: __community_spectral_base(g, c, w, normalized=True), 49 | 'NCut_rw'.lower() : lambda g, c, w: __community_spectral_rw(g, c, w) 50 | } 51 | 52 | # The default cut is accross components 53 | vc = G.components() 54 | if len(vc) >= k: 55 | membership = [ x%k for x in vc.membership ] 56 | vc = VertexClustering(G, membership) 57 | else: 58 | vc = method[which.lower()](G,k,weights) 59 | 60 | return vc 61 | 62 | Graph.community_spectral = community_spectral 63 | -------------------------------------------------------------------------------- /circulo/data/README.md: -------------------------------------------------------------------------------- 1 | # Circulo Datasets 2 | 3 | ###Summary 4 | This directory contains the python scripts that download the individual datasets for the Circulo framework. Each subdirectory represents a single dataset. Each dataset is converted into graphml and stored in the [GRAPHS](circulo/data/GRAPHS) directory by the run.py script. As such, run.py is responsible for downloading and converting raw data into a graphml formatted file. Each run.py script must contain a class that inherits from the CirculoData class found in the [databot](circulo/data/databot.py) module. 5 | 6 | 7 | ###How do I add a new dataset? 8 | 9 | The key to understanding how to import a dataset into Circulo is to be familiar with the CirculoData class in the [databot](circulo/data/databot.py) module. We'll pretend our new dataset is called "friends". To import the friends dataset into the Circulo framework, follow these steps: 10 | 11 | 1. Create a new subdirectory with a name describing the new dataset: `mkdir friends` 12 | 2. Create the python file `friends/run.py` and be sure that `run.py` contains a class that inherits from CirculoData. In this case, we will call the class `FriendsData`. 13 | 3. Copy the README template into the new directory, naming it `README.md`: `cp README_template.md friends/README.md`. Be sure to be as thorough as possible when writing the README so that others will understand your dataset. 14 | 4. Override the necessary functions from the CirculoData class in the FriendsData class in `run.py`. Please see other `run.py` files for examples. The amount of code required in the `run.py` file largely depends on how close the original data is to a graph format. 15 | 5. Add a row to the Dataset Index in this README. 16 | 6. In setup/run_algos.py there is a list called "data_choices", add your newly created datasets to that list (it must match the folder name) 17 | 18 | ## Dataset Index 19 | | Dataset | Description | Has Ground Truth? 
20 | | ------- | ------------|:---------------------:| 21 | | amazon | Co-purchasing Data | Yes | 22 | | as_data | Autonomous System Relationship Data | Yes | 23 | | house_voting | 2014 congress (house) voting data | Yes | 24 | | flights | Flights data from | Yes | 25 | | football | NCAA D1A games played in the Fall 2000 season | Yes | 26 | | karate | Famous data set of Zachary's karate club | Yes | 27 | | malaria | Amino acids in malaria parasite | **No** | 28 | | nba_schedule | Games played in the 2013-2014 NBA season | Yes | 29 | | netscience | Graph of collaborators on papers about network science | **No** | 30 | | pgp | Interactions in pretty good privacy | **No** | 31 | | revolution |This is a bipartite graph representing colonial American dissidents' membership |**No**| 32 | | school | Face-to-face interactions in a primary school | Yes | 33 | | scotus | Supreme court case citation network | **No** | 34 | | senate_voting | 2014 congress (senate) voting data | Yes | 35 | | southern_women | bipartite graph of southern women social groups | __No__ | 36 | 37 | ## Resources 38 | Here are some links with lots of graphs. Most of these sites also point you towards other resources. If you need a graph that we don't provide a script for, these sites are a good place to start looking. 39 | 40 | : igraph's own repository of graphs. Available in several formats. 41 | 42 | UC Irvine's repository of graphs. Available in several formats. 43 | 44 | Mark Newman's personal collection of graphs. Available in gml. 45 | 46 | Snap's repository of (especially large) datasets. 47 | 48 | Interesting datasets curated by University College Dublin. 49 | -------------------------------------------------------------------------------- /circulo/data/README_template.md: -------------------------------------------------------------------------------- 1 | ## [Dataset Name] 2 | 3 | The data can be found at (Link to dataset) 4 | 5 | ## Description 6 | (Give a high level description of the data set.) 7 | 8 | Directed: TODO 9 | 10 | Weighted: TODO 11 | 12 | Multigraph: TODO 13 | 14 | ### Vertices 15 | (describe what the vertices represent, and their attributes) 16 | 17 | Attributes: 18 | 19 | ### Edges 20 | (describe what the edges represent, and their attributes) 21 | 22 | Attributes: 23 | 24 | ## Ground Truth 25 | (describe the ground truth implemented, if any) 26 | 27 | ## Other Notes 28 | * See `run.py` for specific details 29 | 30 | ## References -------------------------------------------------------------------------------- /circulo/data/amazon/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib.request 3 | import igraph as ig 4 | import gzip 5 | import pickle 6 | import shutil 7 | import sys 8 | from circulo.utils.downloader import download_with_notes, _unzip 9 | import csv 10 | 11 | from circulo.data.databot import * 12 | 13 | ## First pass at downloading SNAP data. 14 | # 1. SNAP uses gzip for compression 15 | # 2. There are overlapping communities, but graphml attributes cannot be lists so stored as string 16 | # 3. Uses pickle file instead to store graph object (keeping groundtruth as list) 17 | # 4. igraph wants to keep vertex ids sequential but SNAP data is not, so some empty nodes are created 18 | # 5. 
after deleting these isolate nodes, the ids are remapped to remain sequetial, so have to also remap ground truth 19 | 20 | DOWNLOAD_URL = 'http://snap.stanford.edu/data/bigdata/communities/com-amazon.ungraph.txt.gz' 21 | DATA_NAME = 'com-amazon.ungraph.txt' 22 | GRAPH_NAME = 'amazon.graphml' 23 | 24 | #ground truth of the top 5000 copurchasing items 25 | DOWNLOAD_URL_GROUNDTRUTH = 'http://snap.stanford.edu/data/bigdata/communities/com-amazon.top5000.cmty.txt.gz' 26 | #DOWNLOAD_URL_GROUNDTRUTH = 'http://snap.stanford.edu/data/bigdata/communities/com-amazon.all.cmty.txt.gz' 27 | GROUNDTRUTH_NAME = 'com-amazon.top5000.cmty.txt' 28 | 29 | PICKLE_NAME = 'amazon-graph.pickle' 30 | 31 | 32 | class AmazonData(CirculoData): 33 | 34 | def __download__(self): 35 | ''' 36 | downloads graph from SNAP website 37 | ''' 38 | #download the graph as an edgelist 39 | self.download_with_notes(DOWNLOAD_URL) 40 | 41 | #download ground truth 42 | self.download_with_notes(DOWNLOAD_URL_GROUNDTRUTH) 43 | 44 | def __prepare__(self): 45 | 46 | data_path_old = os.path.join(self.raw_data_path, DATA_NAME + ".old") 47 | data_path = os.path.join(self.raw_data_path, DATA_NAME) 48 | 49 | #remove non edge data from edgelist 50 | shutil.move(data_path, data_path_old) 51 | 52 | with open(data_path_old, "r") as f: 53 | with open(data_path, "w") as out: 54 | for line in f: 55 | if(line.startswith('#') == False): 56 | out.write(line) 57 | 58 | groundtruth_path = os.path.join(self.raw_data_path,GROUNDTRUTH_NAME) 59 | 60 | # Read in Edgelist. Note that igraph creates extra nodes 61 | # with no edges for ids missing in sequential order 62 | # from the graph. We will delete these isolates later 63 | g = ig.Graph.Read_Edgelist(data_path,directed=False) 64 | 65 | # Assign communities as node attributes 66 | with open(groundtruth_path,'r') as gtp: 67 | csvreader = csv.reader(gtp,delimiter='\t') 68 | # note that converting to graphml, attributes cannot be lists 69 | # only boolean,int,long,float,double,or string 70 | # 71 | # storing groundtruth communities as both arrays and strings 72 | # so that graphml file can retain attribute 73 | g.vs()['groundtruth_str'] = '' 74 | 75 | count = 0 76 | for line in csvreader: 77 | for v in line: 78 | v = int(v) 79 | if g.vs[v]['groundtruth_str']: 80 | g.vs[v]['groundtruth_str'] += ',' + str(count) 81 | else: 82 | g.vs[v]['groundtruth_str'] = str(count) 83 | count += 1 84 | max_clusters = count 85 | 86 | # remove isolates - this changes node ids! 
87 | g.delete_vertices(g.vs.select(_degree=0)) 88 | 89 | # Write out graphml file 90 | g.write_graphml(self.graph_path) 91 | 92 | 93 | 94 | def get_context(self): 95 | return { 96 | CirculoData.CONTEXT_OPTIMAL_PARTITIONS:5000 97 | } 98 | 99 | def get_ground_truth(self, G): 100 | 101 | cluster_dict = {} 102 | 103 | for idx, cluster_str in enumerate(G.vs()['groundtruth_str']): 104 | for c in cluster_str.split(): 105 | if c not in cluster_dict: 106 | cluster_dict[c] = [] 107 | 108 | #have to re-do this since id's likely changed by removing isolates 109 | cluster_dict[c].append(idx) 110 | 111 | return ig.VertexCover(G,[v for v in cluster_dict.values()]) 112 | 113 | 114 | def main(): 115 | databot = AmazonData("amazon") 116 | databot.get_ground_truth(databot.get_graph()) 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /circulo/data/as_data/README.md: -------------------------------------------------------------------------------- 1 | ## AS Relationship Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | This dataset is taken from the Center for Applied Internet Data Analysis (CAIDA). This dataset assigns labels, either peer or isp, to Autonomous System (AS) Relationships. Understand AS relationships is useful for understanding the structure of the internet and why routing properties are the way they are. 7 | 8 | Directed: No 9 | 10 | Weighted: No 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents an AS. 16 | 17 | Attributes: 18 | * **ASN**: The name of the researcher 19 | * **aut_name**: Name of AS 20 | * **changed**: Date of change to infomartion 21 | * **country**: Country of AS 22 | * **org_name**: Name of organization running AS 23 | * **source**: Registrar (i.e. ARIN) 24 | 25 | 26 | ### Edges 27 | An edge represents a relationship between two AS. 28 | 29 | Attributes: 30 | * **relationship**: 1 if it is a provider/customer link, 0 if it is a peer AS link 31 | 32 | ## Ground Truth 33 | Currently set to country the AS is in. Registrar might more closely reflect the community structure 34 | 35 | ## Other Notes 36 | * See `run.py` for specific details 37 | 38 | ## References 39 | The CAIDA UCSD AS-Relationship - 20141201, 40 | -------------------------------------------------------------------------------- /circulo/data/as_data/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
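# Databot for the CAIDA AS-relationship dataset: builds an undirected graph of autonomous
# systems with provider/customer and peer links, and uses each AS's country as the
# ground-truth community label (see README.md in this directory).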
16 | import igraph 17 | from igraph import VertexCover 18 | import gzip 19 | import bz2 20 | import os 21 | from operator import itemgetter 22 | from collections import defaultdict 23 | 24 | 25 | from circulo.data.databot import CirculoData 26 | 27 | DOWNLOAD_URL = 'http://data.caida.org/datasets/as-relationships/serial-1/20141201.as-rel.txt.bz2' 28 | AS_INFO_URL = 'http://data.caida.org/datasets/as-organizations/20141001.as-org2info.txt.gz' 29 | 30 | 31 | class ASData(CirculoData): 32 | @staticmethod 33 | def __make_asn_mapping(fname): 34 | """ 35 | Helper function to turn downloaded file into dictionary where key = ASN and values are dictionary of properties 36 | """ 37 | FORMAT_STRING = '# format' 38 | ORG_ID_STRING = 'org_id' 39 | ASN_STRING = 'aut' 40 | 41 | data_by_org_id = {} # Dictionary of properties by org ID {org_id:{prop_name:prop_value}} 42 | # Read source file using # format lines to parse fields 43 | with gzip.open(fname, 'rt') as f: 44 | for line in f: 45 | line = line.strip() 46 | if line.startswith(FORMAT_STRING): 47 | # Extract format 48 | format_fields = line[len(FORMAT_STRING)+1:].split('|') 49 | org_id_index = format_fields.index(ORG_ID_STRING) 50 | print(format_fields, org_id_index) 51 | elif line.startswith("#"): 52 | # Ignore comments that aren't format 53 | pass 54 | else: 55 | # Decode data and add to data_by_org_id 56 | data = line.split('|') 57 | org_id = data[org_id_index] 58 | if org_id not in data_by_org_id: 59 | data_by_org_id[org_id] = {} 60 | for i,data_field in enumerate(format_fields): 61 | if i != org_id_index: 62 | data_by_org_id[org_id][data_field] = data[i] 63 | 64 | # Restructure data to be sorted by asn 65 | data_by_asn = {} # {asn:{prop_name:prop_value}} 66 | for org_id in data_by_org_id: 67 | asn = data_by_org_id[org_id][ASN_STRING] 68 | if asn not in data_by_asn: 69 | data_by_asn[asn] = {} 70 | for (field_name, field_val) in data_by_org_id[org_id].items(): 71 | if field_name != ASN_STRING: 72 | data_by_asn[asn][field_name] = field_val 73 | return data_by_asn 74 | 75 | def __download__(self): 76 | print("Downloading") 77 | self.download_with_notes(DOWNLOAD_URL) 78 | self.download_with_notes(AS_INFO_URL) 79 | 80 | def __prepare__(self): 81 | filename = os.path.join(self.raw_data_path, os.path.basename(DOWNLOAD_URL)) 82 | 83 | edges = [] 84 | relationships = [] 85 | print("Reading links") 86 | num_nodes = -1 87 | # Read in raw AS Links 88 | with bz2.open(filename, 'rt') as f: 89 | for line in f: 90 | line = line.strip() 91 | if not line.startswith('#'): 92 | (src, dst, relationship) = line.split('|') 93 | src = int(src) 94 | dst = int(dst) 95 | if src and dst: 96 | # TODO: Consider changing to directed graph and duplicating peer links in both directions? 
97 | edges.append((src, dst)) 98 | relationships.append(relationship) 99 | # Keep track of max node seen 100 | if src > num_nodes: 101 | num_nodes = src 102 | if dst > num_nodes: 103 | num_nodes = dst 104 | 105 | print("Creating Graph") 106 | g = igraph.Graph(directed=False) 107 | g.add_vertices(num_nodes+1) # Need +1 since ASN are 1 indexed but verticies are 0 indexed 108 | g.add_edges(edges) 109 | 110 | # Keep AS Names through pruning 111 | g.vs["ASN"] = [str(as_num) for as_num in range(len(g.vs))] 112 | # Add relationships before pruning 113 | g.es["relationship"] = relationships 114 | 115 | # Add other ASN Properties 116 | asn_filename = os.path.join(self.raw_data_path, os.path.basename(AS_INFO_URL)) 117 | asn_info = self.__make_asn_mapping(asn_filename) 118 | print("Num Nodes:", num_nodes) 119 | for asn in asn_info: 120 | if int(asn) <= num_nodes: 121 | for field_name, field_val in asn_info[asn].items(): 122 | g.vs[int(asn)][field_name] = field_val 123 | 124 | print("Checking Graph") 125 | # Take largest connected component 126 | components = g.components(mode=igraph.WEAK) 127 | if len(components) > 1: 128 | print("[Graph Prep - as_data]... Disconnected Graph Detected. Using largest component.") 129 | print("[Graph Prep - as_data]... Original graph: {} vertices and {} edges.".format(g.vcount(), g.ecount())) 130 | g = g.subgraph(max(components, key=len)) 131 | print("[Graph Prep - as_data]... Largest component: {} vertices and {} edges.".format(g.vcount(), g.ecount())) 132 | g.write_graphml(self.graph_path) 133 | 134 | def prune(self, G): 135 | # There aren't edge weights so there's no way to prune 136 | pass 137 | 138 | def get_ground_truth(self, G): 139 | """ 140 | Get a Vertex Cover representing the ground truth for this graph. It's not apparent what the right "ground truth" 141 | is but a guess is "country". It might be true that "source" (which is the registrar that handled the transaction 142 | ) is a better guess 143 | """ 144 | if G is None: 145 | return 146 | 147 | GROUND_TRUTH_FIELD = 'country' 148 | 149 | membership = G.vs[GROUND_TRUTH_FIELD] 150 | # Map community names to integers 151 | community_name_to_id = {} 152 | max_community_seen = 0 153 | 154 | cluster_dict = defaultdict(list) 155 | for vertex_id, community_name in enumerate(membership): 156 | cluster_dict[community_name].append(vertex_id) 157 | 158 | cluster_list = [v for v in cluster_dict.values()] 159 | return VertexCover(G, cluster_list) 160 | 161 | 162 | def main(): 163 | databot = ASData("as_data") 164 | G = databot.get_graph() 165 | databot.get_ground_truth(G) 166 | 167 | 168 | if __name__ == "__main__": 169 | main() -------------------------------------------------------------------------------- /circulo/data/databot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
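# Defines CirculoData, the base class that each dataset's run.py subclasses in order to
# download raw data, convert it to a graphml file under circulo/data/GRAPHS/, and expose
# a ground-truth VertexCover (see circulo/data/README.md).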
16 | 
17 | import sys
18 | import os
19 | import igraph
20 | from igraph import VertexCover
21 | import urllib.request
22 | import zipfile
23 | import gzip
24 | import statistics
25 | 
26 | PRINT_PREFIX="[===DATA===]"
27 | 
28 | class CirculoData:
29 | 
30 | #class variable
31 | CONTEXT_OPTIMAL_PARTITIONS = "optimal_partitions"
32 | CONTEXT_ATTRS_TO_USE = "attributes_to_use"
33 | 
34 | def __init__(self, dataset_name):
35 | data_dir = os.path.dirname(__file__)
36 | graph_dir = os.path.join(data_dir, "GRAPHS")
37 | 
38 | #make sure that the graph dir exists
39 | if not os.path.exists(graph_dir):
40 | os.mkdir(graph_dir)
41 | 
42 | self.raw_data_path = os.path.join(data_dir,dataset_name, "raw")
43 | self.dataset_name = dataset_name
44 | self.graph_path = os.path.join(graph_dir, dataset_name+".graphml")
45 | 
46 | def __download__(self):
47 | '''
48 | Downloads the raw data for this dataset.
49 | 
50 | Raw files are stored in self.raw_data_path
51 | '''
52 | raise NotImplementedError("function must be overridden")
53 | 
54 | 
55 | def __prepare__(self):
56 | '''
57 | Converts the raw data in self.raw_data_path into a graph,
58 | then serializes the graph as the graphml file at
59 | self.graph_path
60 | '''
61 | raise NotImplementedError("function must be overridden")
62 | 
63 | 
64 | def get_context(self):
65 | '''
66 | Returns a dictionary of recommended optimizations for running certain algorithms against this data.
67 | By default, returns an empty dictionary.
68 | '''
69 | 
70 | return dict()
71 | 
72 | 
73 | def get_ground_truth(self, G):
74 | '''
75 | Returns a VertexCover representing the ground truth for the given graph
76 | '''
77 | raise NotImplementedError("function must be overridden")
78 | 
79 | def get_graph(self):
80 | '''
81 | Returns the graph loaded in memory
82 | '''
83 | 
84 | if not os.path.exists(self.raw_data_path):
85 | os.mkdir(self.raw_data_path)
86 | self.__download__()
87 | 
88 | if not os.path.exists(self.graph_path):
89 | self.__prepare__()
90 | 
91 | return igraph.load(self.graph_path)
92 | 
93 | 
94 | def download_with_notes(self,url, progressbar=True, download_file=None):
95 | """
96 | Uses urllib to download data from url and save it to download_file. Provides basic logging to stdout.
97 | 98 | :url source url 99 | :file_downlaod destination file path 100 | :progressbar shows progress bar (default: true) 101 | 102 | """ 103 | print(PRINT_PREFIX, "Downloading data from " + url + ".....") 104 | 105 | if download_file is None: 106 | download_file = os.path.basename(url) 107 | 108 | download_path = os.path.join(self.raw_data_path, download_file) 109 | 110 | try: 111 | if progressbar: 112 | urllib.request.urlretrieve(url, download_path, reporthook=progress) 113 | else: 114 | urllib.request.urlretrieve(url, download_path) 115 | except Exception as e: 116 | print(PRINT_PREFIX, "Data download failed -- make sure the url is still valid, and that urllib is properly installed.\n\n") 117 | raise(e) 118 | print("Download complete.") 119 | 120 | _unzip(download_path) 121 | 122 | 123 | def _unzip(zip_path): 124 | ''' 125 | Unzips the file at zip_path into the current directory 126 | 127 | :zip_path src of zip file 128 | ''' 129 | 130 | if zipfile.is_zipfile(zip_path): 131 | try: 132 | z = zipfile.ZipFile(zip_path) 133 | except zipfile.BadZipFile as e: 134 | print(PRINT_PREFIX, "ZipFile error: {}".format(e)) 135 | sys.exit(0) 136 | print(PRINT_PREFIX, "Extracting from zip...") 137 | z.extractall(path=os.path.dirname(zip_path)) 138 | 139 | else: 140 | unzip_file = os.path.splitext(zip_path)[0] 141 | 142 | with gzip.open(zip_path,'rb') as infile: 143 | try: 144 | file_content = infile.read() 145 | except OSError as e: 146 | print(PRINT_PREFIX, "Neither gzip nor zipfile. No extraction necessary.") 147 | return 148 | 149 | with open(unzip_file, "wb") as f: 150 | print(PRINT_PREFIX, "Extracting from gzip...") 151 | f.write(file_content) 152 | 153 | def progress(blockNum, blockSize, totSize): 154 | """ 155 | Provides an ascii progress bar that is 50 characters wide. 156 | totSize is the total size of the task, blockSize is the size 157 | of each block, and blockNum is the current block being worked on. 158 | 159 | For example: 160 | 161 | for i in range(100): 162 | progress(i + 1, 1, 100) 163 | sleep(1) 164 | 165 | will print a progress bar over 100 seconds. 166 | """ 167 | downloaded = blockNum * blockSize 168 | per = min(100 * downloaded / totSize, 100) 169 | sys.stdout.write("\r%d%%" %per) 170 | for i in range(int(per / 2)): 171 | sys.stdout.write(".") 172 | for i in range(50 - int(per/2)): 173 | sys.stdout.write(" ") 174 | sys.stdout.write("# ") 175 | sys.stdout.flush() 176 | 177 | -------------------------------------------------------------------------------- /circulo/data/flights/README.md: -------------------------------------------------------------------------------- 1 | ## Airline Flight Data: Airport, Airline, and Route Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Route data between airports. 7 | 8 | **Directed**: Yes 9 | 10 | **Weighted**: No 11 | 12 | **Multigraph**: Default: No, but information is available. 13 | 14 | ### Vertices 15 | Each vertex represents some airport for which we have at least one flight record. 16 | 17 | Attributes: 18 | * **DST**: Daylight savings time. One of E (Europe), A (US/Canada), S (South America), O (Australia), Z (New Zealand), N (None) or U (Unknown). 19 | * **altitude**: In feet. 20 | * **ICAO**: 4-letter ICAO code. Blank if not assigned. 21 | * **id**: Unique integral identifier. Generally use "name" instead, to reference. 22 | * **name**: Unique OpenFlights identifier for this airport. 23 | * **city**: Main city served by airport. May be spelled differently from airport_name. 
24 | * **latitude**: Decimal degrees, usually to six significant digits. Negative is South, positive is North. 25 | * **longitude**: Decimal degrees, usually to six significant digits. Negative is West, positive is East. 26 | * **timezone**: Hours offset from UTC. Fractional hours are expressed as decimals, eg. India is 5.5. 27 | * **IATA/FAA**: 3-letter FAA code, for airports located in Country "United States of America". 3-letter IATA code, for all other airports. Blank if not assigned. 28 | * **country**: Country or territory where airport is located. 29 | * **airport_name**: Name of airport. May or may not contain the city name. 30 | 31 | 32 | ### Edges 33 | There is a directed edge between two nodes wherever there is a flight between those nodes. By "flight," we mean a recurring flight like UA123, not an individual instance of a flight. Different airlines have flights between the same source and destination, so this is a multigraph that can be modified into a weighted graph by calling download_utils.multigraph_to_weights. `get_graph` does this automatically, but the graph is saved as a multigraph. 34 | 35 | Attributes (only available in multigraph. Otherwise, the only attribute is "weight"): 36 | * **airline_id**: Unique OpenFlights identifier for this airline. 37 | * **equipment**: 3-letter codes for plane type(s) generally used on this flight, separated by spaces. 38 | * **source_airport**: 3-letter (IATA) or 4-letter (ICAO) code of the source airport. 39 | * **stops**: Number of stops on this flight ("0" for direct) 40 | * **source_id**: Unique OpenFlights identifier for source airport 41 | * **codeshare**: "Y" if this flight is a codeshare (that is, not operated by Airline, but another carrier), empty otherwise. 42 | * **dest_airport**: 3-letter (IATA) or 4-letter (ICAO) code of the destination airport. 43 | * **dest_id**: Unique OpenFlights identifier for destination airport. 44 | * **airline**: 2-letter (IATA) or 3-letter (ICAO) code of the airline. 45 | 46 | ## Ground Truth 47 | `get_ground_truth` returns a VertexClustering of vertices grouped by some attribute from the vertex attributes supplied by the user. Currently, the ground truth defaults to clustering by country. 48 | 49 | ## Other Notes 50 | * See `run.py` for specific details 51 | 52 | ## References 53 | Thanks to OpenFlights.org -------------------------------------------------------------------------------- /circulo/data/football/README.md: -------------------------------------------------------------------------------- 1 | ## American College Football 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Football games played between Division 1A colleges during the regular season. 7 | 8 | Directed: No 9 | 10 | Weighted: No 11 | 12 | Multigraph: Yes 13 | 14 | ### Vertices 15 | Each vertex represents a team. 16 | 17 | Attributes: 18 | * **label**: School name 19 | * **id**: Unique identifying integer id 20 | * **value**: Integer value specifying conference. 21 | 22 | ### Edges 23 | There is an edge between two vertices for each game the teams have played each other. Since a few teams play each other multiple times, this is a multigraph. It can be converted into a weighted graph by calling `download_utils.multigraph_to_weights` from the Circulo package. 24 | 25 | Attributes: None 26 | 27 | ## Other Notes 28 | * See `run.py` for specific details 29 | 30 | ## Ground Truth 31 | `get_ground_truth` groups the vertices by conference. 32 | 33 | ## References 34 | Data from Mark Newman's personal website. 35 | 36 | M. 
Girvan and M. E. J. Newman, *Proc. Natl. Acad. Sci. USA* **99**, 7821-7826 (2002). -------------------------------------------------------------------------------- /circulo/data/football/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from igraph import VertexCover 3 | import os 4 | import sys 5 | import urllib.request 6 | import shutil 7 | from circulo.data.databot import * 8 | 9 | DOWNLOAD_URL = "http://www-personal.umich.edu/~mejn/netdata/football.zip" 10 | 11 | class FootballData(CirculoData): 12 | 13 | def __download__(self): 14 | """ 15 | downloads the graph from DOWNLOAD_URL into data_dir/GRAPH_NAME 16 | """ 17 | self.download_with_notes(DOWNLOAD_URL) 18 | 19 | def __prepare__(self): 20 | """ 21 | """ 22 | #convert gml to graphml 23 | G = igraph.load( os.path.join(self.raw_data_path, "football.gml")) 24 | #must delete the id attribute since graphml uses it as a reserved attribute and gml does not 25 | del G.vs['id'] 26 | G.write_graphml(self.graph_path) 27 | 28 | 29 | def get_ground_truth(self, G): 30 | """ 31 | Returns a VertexClustering object of the 32 | ground truth of the graph G. The ground truth for this 33 | football data is the conference to which each team belongs. 34 | """ 35 | 36 | #by default conferences are identified by a float number 37 | float_membership = G.vs['value'] 38 | conf_map = {} 39 | for vertex_id, conference_id in enumerate(float_membership): 40 | if conference_id not in conf_map: 41 | conf_map[conference_id] = [] 42 | conf_map[conference_id].append(vertex_id) 43 | 44 | 45 | cluster_list = [v for k,v in conf_map.items()] 46 | 47 | return VertexCover(G, cluster_list) 48 | 49 | 50 | def main(): 51 | FootballData("football").get_ground_truth() 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /circulo/data/house_voting/README.md: -------------------------------------------------------------------------------- 1 | ## Congress Voting Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Congress voting records from 2014. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a congressperson for whom we have voting data. 16 | 17 | Attributes: 18 | * **name**: Unique identifying id 19 | * **full_name**: Name of the congressperson 20 | * **state**: State represented 21 | * **id**: Unique identifier. In most cases, use "name." 22 | * **party**: Political party. 23 | 24 | ### Edges 25 | There is an edge between two nodes whenever the congresspeople vote together on an issue. The edges are weighted by the number of votes that are shared. 26 | 27 | Attributes: 28 | * **weight**: The number of times the congresspeople on each side of this edge have voted the same way. 29 | 30 | ## Ground Truth 31 | `get_ground_truth` returns a VertexClustering grouped by the parties of the politicians. 32 | 33 | ## Other Notes 34 | * See `run.py` for specific details 35 | 36 | ## References 37 | Thanks to GovTrack.us 38 | -------------------------------------------------------------------------------- /circulo/data/house_voting/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ -z "$1" ]; then 5 | echo "Data dir required" 6 | exit 0 7 | fi 8 | 9 | if [ ! 
-d "$1" ]; then 10 | echo "Data dir does not exist" 11 | exit 0 12 | fi 13 | 14 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress/113/votes/2014 $1 > /dev/null 2>&1 15 | 16 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress-legislators/legislators-current.csv $1 > /dev/null 2>&1 17 | 18 | 19 | -------------------------------------------------------------------------------- /circulo/data/house_voting/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import json 18 | import glob 19 | import csv 20 | import itertools 21 | import os 22 | import igraph 23 | from igraph import VertexCover 24 | from subprocess import call 25 | from circulo.data.databot import CirculoData 26 | 27 | 28 | class HouseData(CirculoData): 29 | 30 | def __download__(self): 31 | 32 | try: 33 | call(["bash", os.path.join(os.path.dirname(__file__), "download.sh"), self.raw_data_path]) 34 | except Exception as e: 35 | #print("rsync failed to retrieve data") 36 | raise(e) 37 | 38 | def __prepare__(self): 39 | ''' 40 | Prepare congress data. 
NOTE: the vertex lookups should be indexed, however this 41 | funciton could prob be sped up by just created a dict with all possible congress pairs 42 | and counting how often they vote together, then at the end creating the edges 43 | ''' 44 | 45 | src_files = os.path.join(self.raw_data_path, "2014", "h*","*.json") 46 | c_type = "rep" 47 | G = igraph.Graph() 48 | 49 | 50 | #first load the vertices 51 | with open(os.path.join(self.raw_data_path, "legislators-current.csv"), 'r') as f: 52 | 53 | csvreader = csv.reader(f,delimiter=',',quotechar='"') 54 | #skip the headers 55 | next(csvreader, None) # skip the headers 56 | for row in csvreader: 57 | 58 | if c_type != row[4]: 59 | continue 60 | elif row[4] == "sen": 61 | congress_id = row[21] 62 | elif row[4] == "rep": 63 | congress_id = row[18] 64 | else: 65 | raise("Unidentified congress: {}".format(row[4])) 66 | 67 | G.add_vertex( 68 | congress_id, 69 | full_name="{} {}".format(row[1],row[0]), 70 | party=row[7], 71 | state=row[5] 72 | ) 73 | 74 | 75 | 76 | missing_ids = set() 77 | 78 | #now create the edges 79 | for fname in glob.glob(src_files): 80 | with open(fname,'r') as inputfile: 81 | data = json.load(inputfile) 82 | for vt in data['votes']: 83 | congress_ids = [n['id'] for n in data['votes'][vt]] 84 | pairs = itertools.combinations(congress_ids,2) 85 | 86 | for congress_id0, congress_id1 in pairs: 87 | try: 88 | v0 = G.vs.find(congress_id0) 89 | except ValueError as e: 90 | missing_ids.add(congress_id0) 91 | continue 92 | 93 | try: 94 | v1 = G.vs.find(congress_id1) 95 | except ValueError as e: 96 | missing_ids.add(congress_id1) 97 | continue 98 | 99 | e = G.get_eid(v0.index, v1.index, directed=False, error=False) 100 | 101 | if e>=0: 102 | G.es[e]['weight'] += 1 103 | else: 104 | G.add_edge(v0, v1, weight=1) 105 | 106 | #the graph is highly connected, so we will prune it 107 | self.prune(G) 108 | 109 | components = G.components(mode=igraph.WEAK) 110 | 111 | #the dataset by default is diconnected, so we must take the largest component 112 | if len(components) is not 1: 113 | G = G.subgraph(max(components, key=len)) 114 | 115 | 116 | G.write_graphml(self.graph_path) 117 | 118 | def prune(self,G): 119 | 120 | if G.is_weighted() is False: 121 | print("Error: Unable to prune a graph without edge weights") 122 | return 123 | 124 | weights = G.es()['weight'] 125 | threshold = .65 * max(weights) 126 | orig_edge_count = G.ecount() 127 | edges = G.es.select(weight_lt=threshold) 128 | G.delete_edges(edges) 129 | 130 | def __party_to_cluster__(self, party): 131 | if party == "Democrat": 132 | return 0 133 | elif party == "Republican": 134 | return 1 135 | elif party == "Independent": 136 | return 2 137 | else: 138 | raise("Unknown party affiliation {}".format(party)) 139 | 140 | def get_ground_truth(self, G): 141 | 142 | cluster_list = [[],[],[]] 143 | 144 | for vertex_id, party in enumerate(G.vs['party']): 145 | cluster_list[self.__party_to_cluster__(party)].append(vertex_id) 146 | 147 | return VertexCover(G, cluster_list) 148 | 149 | 150 | def main(): 151 | 152 | databot = HouseData("house_voting") 153 | G = databot.get_graph() 154 | databot.get_ground_truth(G) 155 | 156 | if __name__ == "__main__": 157 | main() 158 | -------------------------------------------------------------------------------- /circulo/data/karate/README.md: -------------------------------------------------------------------------------- 1 | ## Zachary's Karate Club 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Friendships at a university karate club in the 
1970s. 7 | 8 | Directed: No 9 | 10 | Weighted: No 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a member of the karate club. 16 | 17 | Attributes: 18 | * **id**: Unique identifier. 19 | 20 | ### Edges 21 | Each edge represents a friendship between two members of the club. 22 | 23 | Attributes: None. 24 | 25 | ## Ground Truth 26 | Not yet implemented. 27 | 28 | ## Other Notes 29 | * See `run.py` for specific details 30 | 31 | ## References 32 | Taken from Mark Newman's personal site. 33 | 34 | W. W. Zachary, An information flow model for conflict and fission in small groups, *Journal of Anthropological Research* **33**, 452-473 (1977). -------------------------------------------------------------------------------- /circulo/data/karate/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | from igraph import VertexCover 3 | import os 4 | import sys 5 | import urllib.request 6 | from circulo.utils.downloader import download_with_notes 7 | import shutil 8 | from circulo.data.databot import * 9 | 10 | 11 | DOWNLOAD_URL = 'http://www-personal.umich.edu/~mejn/netdata/karate.zip' 12 | KARATE_RAW = "karate.gml" 13 | 14 | class KarateData(CirculoData): 15 | 16 | def __download__(self): 17 | self.download_with_notes(DOWNLOAD_URL) 18 | 19 | def __prepare__(self): 20 | 21 | G = igraph.load(os.path.join(self.raw_data_path, KARATE_RAW)) 22 | del G.vs['id'] 23 | G.write_graphml(self.graph_path) 24 | 25 | def get_ground_truth(self, G): 26 | """ 27 | returns a VertexClustering object of the 28 | ground truth of the graph G. 29 | """ 30 | 31 | clusters_list = [ 32 | [0,1,2,3,4,5,6,7,10,11,12,13, 16,17,19, 20, 22,23,24,25,26,27,28,29,30,31,32,33], 33 | [8,9,14, 15, 18,21] 34 | ] 35 | 36 | return VertexCover(G, clusters_list) 37 | 38 | def main(): 39 | databot = KarateData("karate") 40 | databot.get_ground_truth(databot.get_graph()) 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /circulo/data/malaria/README.md: -------------------------------------------------------------------------------- 1 | ## malaria 2 | 3 | The data can be found at http://danlarremore.com/bipartiteSBM/malariaData.zip 4 | 5 | ## Description 6 | A bipartite graph representing genetic sequences from the malaria parasite 7 | *Plasmodium falciparum*. 8 | 9 | Directed: No 10 | 11 | Weighted: No 12 | 13 | Multigraph: No 14 | 15 | Bipartite: Yes 16 | 17 | ### Vertices 18 | 297 genes and their 806 shared amino acid substrings 19 | 20 | Attributes: 21 | None 22 | 23 | ### Edges 24 | Edge between a gene and an acid if theeamino acid appears in the gene. 25 | 26 | Attributes: 27 | None 28 | 29 | ## Ground Truth 30 | Not yet implemented 31 | 32 | ## Other Notes 33 | * See `run.py` for specific details 34 | 35 | ## References 36 | http://danlarremore.com/pdf/2014_LCJ_EfficientlyInferringCommunityStructureInBipartiteNetworks_PRE.pdf 37 | 38 | Larremore, D. B., Clauset, A., and Buckee, C. O. (2013). A Network Approach to Analyzing Highly Recombinant Malaria Parasite Genes. PLoS Computational Biology, 9(10), e1003268. -------------------------------------------------------------------------------- /circulo/data/malaria/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import igraph 18 | import os 19 | 20 | from circulo.data.databot import CirculoData 21 | 22 | DOWNLOAD_URL = 'http://danlarremore.com/bipartiteSBM/malariaData.zip' 23 | DATA_DIR = "malariaData" 24 | 25 | class MalariaData(CirculoData): 26 | 27 | def __download__(self): 28 | self.download_with_notes(DOWNLOAD_URL) 29 | 30 | 31 | def __prepare__(self): 32 | 33 | data = os.path.join(self.raw_data_path, DATA_DIR, "malaria.edgelist") 34 | mod_data = os.path.join(self.raw_data_path, DATA_DIR, "mod_malaria.edgelist") 35 | 36 | #we just need to remove the third column which has 1's in it 37 | #so igraph can read it as an edgelist 38 | with open(data, 'r') as f: 39 | with open(mod_data, 'w') as new: 40 | for line in f: 41 | new.write(line[:-2] + '\n') 42 | 43 | G = igraph.load(mod_data) 44 | G.write_graphml(self.graph_path) 45 | 46 | def main(): 47 | databot = MalariaData("malaria") 48 | databot.get_graph() 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /circulo/data/nba_schedule/README.md: -------------------------------------------------------------------------------- 1 | ## NBA Schedule 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Games played in the 2013-2014 NBA season. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a team. 16 | 17 | Attributes: 18 | * **id**: Unique identifier. 19 | * **name**: Team name. 20 | 21 | ### Edges 22 | There is an edge between each team that plays each other, weighted by the number of games played. 23 | 24 | Attributes: 25 | * **weight**: Number of games played between the two teams. 26 | 27 | ## Ground Truth 28 | `get_ground_truth` returns a VertexClustering of teams clustered by the six divisions. 29 | 30 | ## Other Notes 31 | * See `run.py` for specific details 32 | 33 | ## References 34 | Thanks to [Dave Walk](https://github.com/davewalk) and ESPN.com -------------------------------------------------------------------------------- /circulo/data/nba_schedule/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | import csv 19 | import re 20 | import sys 21 | import os 22 | import glob 23 | import statistics 24 | 25 | from subprocess import call 26 | 27 | import igraph 28 | from igraph import VertexCover 29 | 30 | from circulo.data.databot import CirculoData 31 | 32 | 33 | class NBAData(CirculoData): 34 | 35 | 36 | def __download__(self): 37 | 38 | try: 39 | call(["git", "clone", "https://github.com/davewalk/2013-2014-nba-schedule", self.raw_data_path]) 40 | except Exception as e: 41 | print("Git Clone Failed to retrieve data") 42 | raise(e) 43 | 44 | 45 | def convert(self, string): 46 | ''' 47 | Puts the team names into a consistent format since the naming is inconsistent throughout 48 | the datasets 49 | ''' 50 | string = string.lower() 51 | string = re.sub('_',"-", string) 52 | string = re.sub(' ',"-", string) 53 | return string 54 | 55 | 56 | def __prepare__(self): 57 | 58 | team_dict = {} 59 | 60 | G = igraph.Graph(directed=True) 61 | 62 | data_regex = os.path.join(self.raw_data_path,"data","csv",'*.csv') 63 | 64 | #adds the vertices strictly based on the names of the files 65 | for filename in glob.glob(data_regex): 66 | std_team_name = re.sub('.csv',"",os.path.basename(filename)) 67 | std_team_name = re.sub('_',"-",std_team_name) 68 | G.add_vertex(std_team_name) 69 | 70 | #each file represents a team 71 | for filename in glob.glob(data_regex): 72 | 73 | with open(filename, "r") as data: 74 | 75 | reader = csv.reader(data) 76 | std_team_name = re.sub('.csv',"",os.path.basename(filename)) 77 | std_team_name = re.sub('_',"-", std_team_name) 78 | 79 | team0 = G.vs.find(name=std_team_name) 80 | 81 | #skip first row of header info 82 | next(reader,None) 83 | 84 | for row in reader: 85 | 86 | std_opponent = self.convert(row[2]) 87 | team1 = G.vs.find(name=std_opponent) 88 | 89 | if team0 is None or team1 is None: 90 | raise("Vertex not found for input team name") 91 | sys.exit(1) 92 | 93 | G.add_edge(team0, team1) 94 | 95 | 96 | #we need to set a weight of 1 to every edge 97 | G.es['weight'] = 1 98 | 99 | #we simplify the multigraph 100 | G.simplify(combine_edges={'weight':sum}) 101 | 102 | #we collapse the graph 103 | self.prune(G) 104 | G.write_graphml(self.graph_path) 105 | 106 | def get_context(self): 107 | return { 108 | CirculoData.CONTEXT_OPTIMAL_PARTITIONS:6 109 | } 110 | 111 | 112 | def get_ground_truth(self, G): 113 | 114 | #ground truth table 115 | divisions = { 116 | "boston-celtics":0, 117 | "brooklyn-nets":0, 118 | "new-york-knicks":0, 119 | "philadelphia-76ers":0, 120 | "toronto-raptors":0, 121 | "chicago-bulls":1, 122 | "cleveland-cavaliers":1, 123 | "detroit-pistons":1, 124 | "indiana-pacers":1, 125 | "milwaukee-bucks":1, 126 | "atlanta-hawks":2, 127 | "charlotte-bobcats":2, 128 | "miami-heat":2, 129 | "orlando-magic":2, 130 | "washington-wizards":2, 131 | "dallas-mavericks":3, 132 | "houston-rockets":3, 133 | "memphis-grizzlies":3, 134 | "new-orleans-pelicans":3, 135 | "san-antonio-spurs":3, 136 | "denver-nuggets":4, 137 | "minnesota-timberwolves":4, 138 | "oklahoma-city-thunder":4, 139 | "portland-trail-blazers":4, 140 | "utah-jazz":4, 141 | "golden-state-warriors":5, 142 | "los-angeles-clippers":5, 143 | "los-angeles-lakers":5, 144 | "phoenix-suns":5, 145 | "sacramento-kings":5 146 | } 147 | 148 | cluster_list = [[],[],[],[],[],[]] 149 | 150 | for vertex_id, team_name in enumerate(G.vs['name']): 151 | cluster_list[divisions[team_name]].append(vertex_id) 152 | 153 | 154 | return VertexCover(G, cluster_list) 155 | 156 | def prune(self,G): 157 | if G.is_weighted() is 
False: 158 | return G 159 | 160 | weights = G.es()['weight'] 161 | 162 | threshold = statistics.median(weights) + .0001 163 | 164 | orig_edge_count = G.ecount() 165 | edges = G.es.select(weight_lt=threshold) 166 | G.delete_edges(edges) 167 | #this is a special case because this pruning will create a disconnected component, so let's add back in one edge 168 | v0 = G.vs.find(name="washington-wizards") 169 | v1 = G.vs.find(name="san-antonio-spurs") 170 | G.add_edge(v0, v1, weight=1) 171 | 172 | 173 | def main(): 174 | databot = NBAData("nba_schedule") 175 | databot.get_ground_truth(databot.get_graph()) 176 | 177 | if __name__ == "__main__": 178 | main() 179 | -------------------------------------------------------------------------------- /circulo/data/netscience/README.md: -------------------------------------------------------------------------------- 1 | ## Network Science Collaborations 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Coauthorships of papers in the network science community. 7 | 8 | Directed: No 9 | 10 | Weighted: No (but can be) 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents an author of some paper on network science. 16 | 17 | Attributes: 18 | * **label**: The name of the researcher 19 | * **id**: Unique identifier 20 | 21 | ### Edges 22 | There is an edge between two authors if they are coauthors on a paper. 23 | 24 | Attributes: 25 | * **value**: (I think) that the value represents the "importance" of a connection. If there are n authors on a paper, each author adds 1/n to the value of their edge to each other author. 26 | 27 | ## Ground Truth 28 | Not yet implemented. 29 | 30 | ## Other Notes 31 | * See `run.py` for specific details 32 | 33 | ## References 34 | Taken from Mark Newman's personal website. 35 | 36 | M. E. J. Newman, *Phys. Rev. E* **74**, 036104 (2006). -------------------------------------------------------------------------------- /circulo/data/netscience/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | import igraph 18 | import os 19 | 20 | from circulo.data.databot import CirculoData 21 | 22 | DOWNLOAD_URL = 'http://www-personal.umich.edu/~mejn/netdata/netscience.zip' 23 | 24 | class NetscienceData(CirculoData): 25 | 26 | def __download__(self): 27 | self.download_with_notes(DOWNLOAD_URL) 28 | 29 | def __prepare__(self): 30 | 31 | G = igraph.load(os.path.join(self.raw_data_path, "netscience.gml")) 32 | del G.vs['id'] #graphml uses the id field, so we must remove it 33 | G.write_graphml(self.graph_path) 34 | 35 | def main(): 36 | NetscienceData("netscience").get_graph() 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /circulo/data/pgp/README.md: -------------------------------------------------------------------------------- 1 | ## Interactions within Pretty Good Privacy 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Users of the Pretty-Good-Privacy algorithm. Only the giant component included. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes (But all weights are 1, so not really) 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex is a person using the Pretty-Good-Privacy algorithm. 16 | 17 | Attributes: 18 | * **x**: x-coordinate for plotting 19 | * **y**: y-coordinate for plotting 20 | * **z**: z-coordinate for plotting 21 | * **id**: Unique identifier 22 | 23 | ### Edges 24 | Interactions under PGP. 25 | 26 | Attributes: 27 | * **weight**: Always 1. Unweighted, for all intents and purposes. 28 | 29 | ## Ground Truth 30 | Not yet implemented, 31 | 32 | ## Other Notes 33 | * See `run.py` for specific details 34 | 35 | ## References 36 | Taken from Alexandre Arenas' personal site. 37 | 38 | M. Boguña, R. Pastor-Satorras, A. Diaz-Guilera and A. Arenas, *Physical Review E*, vol. **70**, 056122 (2004). -------------------------------------------------------------------------------- /circulo/data/pgp/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import os 3 | import urllib.request 4 | 5 | from circulo.data.databot import CirculoData 6 | 7 | 8 | 9 | GRAPH_NAME = 'PGPgiantcompo.net' 10 | DOWNLOAD_URL = 'http://deim.urv.cat/~aarenas/data/xarxes/PGP.zip' 11 | 12 | 13 | class PGPData(CirculoData): 14 | 15 | def __download__(self): 16 | self.download_with_notes(DOWNLOAD_URL) 17 | 18 | def __prepare__(self): 19 | 20 | data_path = os.path.join(self.raw_data_path, GRAPH_NAME) 21 | G = igraph.load(data_path) 22 | del G.vs['id'] #graphml uses the id field 23 | G.write_graphml(self.graph_path) 24 | 25 | def get_ground_truth(self, G): 26 | raise(NotImplementedError) 27 | 28 | 29 | def main(): 30 | PGPData("pgp").get_graph() 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /circulo/data/revolution/README.md: -------------------------------------------------------------------------------- 1 | ## Revolutionary War participants 2 | This dataset is drawn from the appendix of David Hackett Fischer's _Paul Revere's Ride_; a CSV version is available within 3 | the repository https://github.com/kjhealy/revere.git. 4 | 5 | ## Description 6 | This is a bipartite graph representing colonial American dissidents' membership in seven Whig (anti-British) groups during the 7 | buildup to the American Revolutionary War. 
8 | See http://kieranhealy.org/blog/archives/2013/06/09/using-metadata-to-find-paul-revere/ and http://www.sscnet.ucla.edu/polisci/faculty/chwe/ps269/han.pdf 9 | for some analyses using this data. 10 | 11 | For more traditional SNA applications, a one-mode projection of this data will yield a co-attendance network of the 254 12 | examined Revolutionary War figures. 13 | 14 | Graph properties: 15 | - Directed: False 16 | - Weighted: False 17 | - Multigraph: False 18 | - Bipartite: True 19 | 20 | ### Vertices 21 | The seven Type 1 nodes represent seven Whig groups: St. Andrew's Lodge, the Loyal Nine, the North Caucus, 22 | the Long Room Club, the Boston Tea Party, the Boston Committe, and the London Enemies. 23 | The 254 Type 2 nodes represent colonial Americans who attended meetings of these groups (including John Adams, Paul Revere, and Joseph Warren). 24 | 25 | Attributes: None. 26 | 27 | ### Edges 28 | Each edge represents membership by a colonial American in a Whig group. 29 | 30 | Attributes: None. 31 | 32 | ## Ground Truth 33 | None provided. 34 | 35 | ## Other Notes 36 | * See `run.py` for specific details 37 | 38 | ## References 39 | [1] Fischer, David Hackett. 1994. _Paul Revere’s Ride._ New York: Oxford University Press. 40 | -------------------------------------------------------------------------------- /circulo/data/revolution/run.py: -------------------------------------------------------------------------------- 1 | # This template should be copied and modified as necessary to become the 2 | # `run.py` in each directory. 3 | # 4 | # Do not modify this file unless the template needs changing -- modify 5 | # its copies in each data directory. 6 | 7 | import igraph 8 | import os 9 | import csv 10 | from subprocess import call 11 | 12 | from circulo.data.databot import CirculoData 13 | 14 | GIT_URL = 'https://github.com/kjhealy/revere.git' 15 | CSV_FILE = 'data/PaulRevereAppD.csv' 16 | 17 | class RevolutionData(CirculoData): 18 | 19 | def __download__(self): 20 | try: 21 | call(["git", "clone", GIT_URL, self.raw_data_path]) 22 | except Exception as e: 23 | print("Git clone failed to retrieve data. Please try again.") 24 | raise(e) 25 | 26 | def __prepare__(self): 27 | 28 | csv_path = os.path.join(self.raw_data_path, CSV_FILE) 29 | 30 | g = igraph.Graph() 31 | 32 | with open(csv_path) as f: 33 | 34 | reader = csv.DictReader(f) 35 | clubs = reader.fieldnames[:] 36 | clubs.remove('') 37 | 38 | for club in clubs: 39 | g.add_vertex(name=club) 40 | 41 | for patriot in reader: 42 | g.add_vertex(name=patriot['']) 43 | for club in clubs: 44 | if(patriot[club] == '1'): 45 | g.add_edge(patriot[''], club) 46 | 47 | g.write_graphml(self.graph_path) 48 | 49 | def get_ground_truth(self, G): 50 | raise(NotImplementedError) 51 | 52 | 53 | def main(): 54 | RevolutionData("revolution").get_graph() 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /circulo/data/school/README.md: -------------------------------------------------------------------------------- 1 | ## Primary School - Cumulative Networks 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | A network of face to face time between people at a primary school. 7 | 8 | Directed: No 9 | 10 | Weighted: No (but can easily be modified to be weighted) 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a person at the school (either a student or a teacher). 
16 | 17 | Attributes: 18 | * **classname**: The school class and grade, if a student. Otherwise, "Teachers" 19 | * **label**: Unique identifier. 20 | * **id**: Yet another unique identifier. 21 | * **gender**: M, F, or Unknown 22 | * **viz**: Undocumented. Always 0.0 23 | 24 | ### Edges 25 | An edge exists where some actor was face to face with another one. 26 | 27 | Attributes: 28 | * **id**: Unique identifier. 29 | * **count**: The number of times that contact was established during the day. 30 | * **duration**: The total time that the nodes on this edge spent in face to face contact, measured in 20 second intervals. 31 | 32 | ## Ground Truth 33 | `get_ground_truth` returns a VertexClustering object in which the vertices are grouped by "classname". 34 | 35 | ## Other Notes 36 | * See `run.py` for specific details 37 | * Either "count" or "duration" would make sense as a weight for use with a weighted algorithm. 38 | * `run.py` requires NetworkX from . 39 | 40 | ## References 41 | 42 | Thanks to sociopatterns.org. 43 | -------------------------------------------------------------------------------- /circulo/data/school/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | import networkx as nx 19 | from subprocess import call 20 | import igraph 21 | from igraph import VertexCover 22 | import os 23 | import glob 24 | 25 | from circulo.data.databot import CirculoData 26 | 27 | 28 | 29 | class SchoolData(CirculoData): 30 | 31 | def __download__(self): 32 | """ 33 | Downloads graphs from http://www.sociopatterns.org/datasets/primary-school-cumulative-networks/ 34 | and saves them in a directory data. If data already exists, it will not redownload 35 | the files 36 | """ 37 | 38 | try: 39 | # Probably shouldn't be starting a subprocess, do this with a library 40 | # like urllib2 41 | call(["curl", "--create-dirs","-o", os.path.join(self.raw_data_path, "out_#1.gexf.gz"),"http://www.sociopatterns.org/wp-content/uploads/2014/05/sp_data_school_day_[1-2]_g.gexf_.gz"]) 42 | except Exception as e: 43 | print("cURL failed -- make sure you have cURL, and make sure the site still has the graph data.") 44 | raise(e) 45 | for filename in glob.glob(os.path.join(self.raw_data_path,"*.gz")): 46 | call(["gunzip", filename]) 47 | 48 | 49 | 50 | def __prepare__(self): 51 | 52 | for f in glob.glob(os.path.join(self.raw_data_path, "*.gexf")): 53 | 54 | G = nx.read_gexf(f) 55 | for node in G.node: 56 | for attrib in G.node[node]: 57 | if type(G.node[node][attrib]) == dict: 58 | # graphML doesn't play nice with dictionaries as attributes. 59 | # this line just deletes positional information. 
60 | G.node[node][attrib] = 0 61 | #newFileName = f[:f.rfind('.')] + ".graphml" 62 | nx.write_graphml(G, self.graph_path) 63 | 64 | 65 | def get_ground_truth(self, G): 66 | 67 | class_list = G.vs['classname'] 68 | class_dict = dict.fromkeys(class_list) 69 | 70 | #set the indices for lookup purposes. These will be the cluster ids 71 | for idx, k in enumerate(class_dict): 72 | class_dict[k] = [] 73 | 74 | for student_number, class_id in enumerate(class_list): 75 | class_dict[class_id].append(student_number) 76 | 77 | cluster_list = [] 78 | 79 | for cluster in class_dict.values(): 80 | cluster_list.append(cluster) 81 | 82 | return VertexCover(G, cluster_list) 83 | 84 | 85 | def main(): 86 | databot = SchoolData("school") 87 | databot.get_ground_truth(databot.get_graph()) 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /circulo/data/scotus/README.md: -------------------------------------------------------------------------------- 1 | ## SCOTUS Citation Network 2 | The data can be found at http://jhfowler.ucsd.edu/judicial.htm (see [1]) 3 | 4 | ## Description 5 | The dataset represents the citation graph of the Supreme Court of the United States from 1762-2002, drawn from 534 6 | volumes of the U.S. Reports. 7 | 8 | Graph properties: 9 | - Directed: True 10 | - Weighted: False 11 | - Multigraph: False 12 | 13 | ### Vertices 14 | Each vertex of the graph represents a case argued before the U.S. Supreme Court. 15 | 16 | Attributes: 17 | - caseid: Internal ID used for identifying cases by authors of the dataset. 18 | - usid: ID of the case in the U.S. Supreme Court archives (volume and case number) 19 | - parties: Disputing parties in the case (e.g. 'Marbury v. Madison', 'Brown v. Board of Education of Topeka') 20 | - year: Year the case was argued. 21 | 22 | ### Edges 23 | Edges represent majority opinion citations of previous cases; they are directed and point from citing case to cited case. 24 | 25 | Attributes: none. 26 | 27 | ## Ground Truth 28 | No ground truth exists for this dataset. 29 | 30 | ## Other Notes 31 | * See `run.py` for specific details. 32 | 33 | ## References 34 | - [1] "The Authority of Supreme Court Precedent." James H. Fowler, Sangick Jeon. _Social Neworks_ 30 (1): 16-30 (January 2008) 35 | - [2] "Network Analysis and the Law: Measuring the Legal Importance of Supreme Court Precedents." James H. Fowler, Timothy R. Johnson, James F. Spriggs II, Sangick Jeon, Paul J. Wahlbeck. _Political Analysis,_ 15 (3): 324-346 (July 2007). 
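
## Example
A minimal sketch of loading the prepared graph and grouping cases by decade via the `year` vertex attribute described above. The GraphML path follows the convention in `circulo/data/databot.py`; the decade grouping itself is only an illustration, since this dataset ships no ground truth.

    import igraph
    from collections import defaultdict

    # run.py writes the prepared graph to circulo/data/GRAPHS/scotus.graphml
    G = igraph.load("circulo/data/GRAPHS/scotus.graphml")

    # Group cases by decade using the 'year' vertex attribute (illustrative only)
    by_decade = defaultdict(list)
    for v in G.vs:
        by_decade[int(v["year"]) // 10 * 10].append(v.index)

    print({decade: len(cases) for decade, cases in sorted(by_decade.items())})
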
36 | -------------------------------------------------------------------------------- /circulo/data/scotus/run.py: -------------------------------------------------------------------------------- 1 | import igraph 2 | import os 3 | import csv 4 | from circulo.data.databot import * 5 | 6 | DATA_DOWNLOAD_URL = 'http://jhfowler.ucsd.edu/data/judicial.csv' 7 | VERTEX_DATA_FILE = 'judicial.csv' 8 | 9 | EDGE_DOWNLOAD_URL = 'http://jhfowler.ucsd.edu/data/allcites.txt' 10 | EDGE_DATA_FILE = 'allcites.txt' 11 | 12 | 13 | class SCOTUSData(CirculoData): 14 | 15 | def __download__(self): 16 | self.download_with_notes(DATA_DOWNLOAD_URL) 17 | self.download_with_notes(EDGE_DOWNLOAD_URL) 18 | 19 | def __prepare__(self): 20 | vertex_filename = os.path.join(self.raw_data_path, VERTEX_DATA_FILE) 21 | edge_filename = os.path.join(self.raw_data_path, EDGE_DATA_FILE) 22 | 23 | g = igraph.load(edge_filename) 24 | 25 | vertex_file = open(vertex_filename) 26 | reader = csv.DictReader(vertex_file) 27 | 28 | for case in reader: 29 | caseid = int(case['caseid']) 30 | v = g.vs[caseid] 31 | 32 | v['caseid'] = case['caseid'] 33 | v['usid'] = case['usid'] 34 | v['parties'] = case['parties'] 35 | v['year'] = case['year'] 36 | 37 | vertex_file.close() 38 | 39 | # Case IDs are 1-indexed, so we delete the 0th vertex as it was extraneous. 40 | g.delete_vertices([0]) 41 | g.write_graphml(self.graph_path) 42 | 43 | def main(): 44 | SCOTUSData("scotus").get_graph() 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/README.md: -------------------------------------------------------------------------------- 1 | ## Congress Voting Data 2 | 3 | The data can be found at 4 | 5 | ## Description 6 | Congress voting records from 2014. 7 | 8 | Directed: No 9 | 10 | Weighted: Yes 11 | 12 | Multigraph: No 13 | 14 | ### Vertices 15 | Each vertex represents a congressperson for whom we have voting data. 16 | 17 | Attributes: 18 | * **name**: Unique identifying id 19 | * **full_name**: Name of the congressperson 20 | * **state**: State represented 21 | * **id**: Unique identifier. In most cases, use "name." 22 | * **party**: Political party. 23 | 24 | ### Edges 25 | There is an edge between two nodes whenever the congresspeople vote together on an issue. The edges are weighted by the number of votes that are shared. 26 | 27 | Attributes: 28 | * **weight**: The number of times the congresspeople on each side of this edge have voted the same way. 29 | 30 | ## Ground Truth 31 | `get_ground_truth` returns a VertexClustering grouped by the parties of the politicians. 32 | 33 | ## Other Notes 34 | * See `run.py` for specific details 35 | 36 | ## References 37 | Thanks to GovTrack.us 38 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | if [ -z "$1" ]; then 5 | echo "Data dir required" 6 | exit 0 7 | fi 8 | 9 | if [ ! 
-d "$1" ]; then 10 | echo "Data dir does not exist" 11 | exit 0 12 | fi 13 | 14 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress/113/votes/2014 $1 > /dev/null 2>&1 15 | 16 | rsync -avz --delete --delete-excluded --exclude **/text-versions/ govtrack.us::govtrackdata/congress-legislators/legislators-current.csv $1 > /dev/null 2>&1 17 | 18 | 19 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/exercise.md: -------------------------------------------------------------------------------- 1 | ## Exercise 2 | 3 | __Requirements__ 4 | 5 | - NetworkX Fork from .... 6 | - IPython QtConsole 7 | - Snap from ... 8 | 9 | 10 | 11 | From the iPython QtConsole 12 | 13 | #inline images 14 | import matplotlib 15 | %matplotlib inline 16 | 17 | import os 18 | os.environ['SNAPHOME'] = '/path/to/snap' 19 | 20 | 21 | #set the inline image size to be larger 22 | import matplotlib.pylab as pylab 23 | pylab.rcParams['figure.figsize'] = (14.0, 12.0) 24 | 25 | #ETL the congress voting data 26 | %run parse_congress.py Filter 27 | 28 | import networkx as nx 29 | 30 | #from the senate dir, read in the senate data (you can do the house data too) 31 | G = nx.read_graphml('senate/senate.graphml', node_type=int) 32 | 33 | #set the layout 34 | pos = nx.fruchterman_reingold_layout(G, k=2) 35 | 36 | #create the labels 37 | labels=dict((n, d['name'] + ' ' + d['party']) for n,d in G.nodes(data=True) if d.has_key('party')) 38 | 39 | 40 | nx.draw(G, pos = pos, node_size=60, node_color="red", edge_color="grey", with_labels=True, labels=labels) 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /circulo/data/senate_voting/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import json 18 | import glob 19 | import csv 20 | import itertools 21 | import os 22 | import igraph 23 | from igraph import VertexCover 24 | from subprocess import call 25 | 26 | from circulo.data.databot import CirculoData 27 | 28 | 29 | class SenateData(CirculoData): 30 | 31 | def __download__(self): 32 | 33 | try: 34 | call(["bash", os.path.join(os.path.dirname(__file__), "download.sh"), self.raw_data_path]) 35 | except Exception as e: 36 | print("rsync failed to retrieve data") 37 | raise(e) 38 | 39 | def __prepare__(self): 40 | ''' 41 | Prepare congress data. 
NOTE: the vertex lookups should be indexed, however this 42 | funciton could prob be sped up by just created a dict with all possible congress pairs 43 | and counting how often they vote together, then at the end creating the edges 44 | ''' 45 | 46 | src_files = os.path.join(self.raw_data_path, "2014", "s*","*.json") 47 | c_type = "sen" 48 | G = igraph.Graph() 49 | 50 | 51 | #first load the vertices 52 | with open(os.path.join(self.raw_data_path, "legislators-current.csv"), 'r') as f: 53 | 54 | csvreader = csv.reader(f,delimiter=',',quotechar='"') 55 | #skip the headers 56 | next(csvreader, None) # skip the headers 57 | for row in csvreader: 58 | 59 | if c_type != row[4]: 60 | continue 61 | elif row[4] == "sen": 62 | congress_id = row[21] 63 | else: 64 | raise("Unidentified congress: {}".format(row[4])) 65 | 66 | G.add_vertex( 67 | congress_id, 68 | full_name="{} {}".format(row[1],row[0]), 69 | party=row[7], 70 | state=row[5] 71 | ) 72 | 73 | 74 | missing_ids = set() 75 | 76 | #now create the edges 77 | for fname in glob.glob(src_files): 78 | with open(fname,'r') as inputfile: 79 | data = json.load(inputfile) 80 | #print("Processing: {}".format(fname)) 81 | for vt in data['votes']: 82 | congress_ids = [n['id'] for n in data['votes'][vt]] 83 | #print(congress_ids) 84 | pairs = itertools.combinations(congress_ids,2) 85 | 86 | for congress_id0, congress_id1 in pairs: 87 | #print("{} {}".format(congress_id0, congress_id1)) 88 | try: 89 | v0 = G.vs.find(congress_id0) 90 | except ValueError as e: 91 | missing_ids.add(congress_id0) 92 | continue 93 | 94 | try: 95 | v1 = G.vs.find(congress_id1) 96 | except ValueError as e: 97 | missing_ids.add(congress_id1) 98 | continue 99 | 100 | e = G.get_eid(v0.index, v1.index, directed=False, error=False) 101 | 102 | if e>=0: 103 | G.es[e]['weight'] += 1 104 | else: 105 | G.add_edge(v0, v1, weight=1) 106 | 107 | print("Ids not found: {}".format(missing_ids)) 108 | 109 | #prune the graph 110 | weights = G.es()['weight'] 111 | threshold = .65 * max(weights) 112 | edges = G.es.select(weight_lt=threshold) 113 | G.delete_edges(edges) 114 | 115 | #make sure that the graph is not disconnected. if so take larger component 116 | components = G.components(mode=igraph.WEAK) 117 | if len(components) > 1: 118 | print("[Graph Prep - Congress]... Disconnected Graph Detected. Using largest component.") 119 | print("[Graph Prep - Congress]... Original graph: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 120 | G = G.subgraph(max(components, key=len)) 121 | print("[Graph Prep - Congress]... 
Largest component: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 122 | 123 | 124 | 125 | G.write_graphml(self.graph_path) 126 | 127 | 128 | def __party_to_cluster__(self, party): 129 | if party == "Democrat": 130 | return 0 131 | elif party == "Republican": 132 | return 1 133 | elif party == "Independent": 134 | return 2 135 | else: 136 | raise("Unknown party affiliation {}".format(party)) 137 | 138 | def get_ground_truth(self, G): 139 | 140 | cluster_list = [[],[],[]] 141 | 142 | for vertex_id, party in enumerate(G.vs['party']): 143 | cluster_list[self.__party_to_cluster__(party)].append(vertex_id) 144 | 145 | return VertexCover(G, cluster_list) 146 | 147 | 148 | def main(): 149 | 150 | databot = SenateData('senate_voting') 151 | G = databot.get_graph() 152 | databot.get_ground_truth(G) 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /circulo/data/southernwomen/README.md: -------------------------------------------------------------------------------- 1 | ## Davis "Southern Women" dataset 2 | 3 | The data can be found at http://nexus.igraph.org/api/dataset_info?id=23&format=html. 4 | 5 | ## Description 6 | The network given represents a bipartite attendance network of 18 Southern women attending 14 social events in the Deep South, collected by Davis et al. in their 7 | book "Deep South." 8 | 9 | Directed: No 10 | 11 | Weighted: No 12 | 13 | Multigraph: No 14 | 15 | ### Vertices 16 | The graph is bipartite. Type 1 vertices represent the 18 women; type 2 vertices represent the 14 social events. 17 | 18 | Attributes: 19 | - name: Name of the woman or event. 20 | 21 | ### Edges 22 | Edges represent attendance by a woman at an event. 23 | 24 | Attributes: None 25 | 26 | ## Ground Truth 27 | No ground truth, although a clustering of the women was generated in later paper by Breiger et al. 28 | 29 | ## Other Notes 30 | * See `run.py` for specific details 31 | 32 | ## References 33 | - Breiger R. (1974). The duality of persons and groups. Social Forces, 53, 181-190. 34 | - Davis, A et al. (1941). Deep South. Chicago: University of Chicago Press. 
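
## Example
A minimal sketch of a one-mode projection (women linked by co-attendance, events linked by shared attendees), assuming the downloaded GraphML carries the boolean `type` vertex attribute that igraph uses to mark the two classes of a bipartite graph. The GraphML path follows the convention in `circulo/data/databot.py`.

    import igraph

    # run.py copies the downloaded GraphML to circulo/data/GRAPHS/southernwomen.graphml
    G = igraph.load("circulo/data/GRAPHS/southernwomen.graphml")

    # Project onto the two vertex classes; which projection holds the women and
    # which holds the events depends on how 'type' is encoded in the file.
    proj_a, proj_b = G.bipartite_projection()
    print(proj_a.summary())
    print(proj_b.summary())
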
35 | -------------------------------------------------------------------------------- /circulo/data/southernwomen/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from circulo.data.databot import * 4 | 5 | DOWNLOAD_URL = "http://nexus.igraph.org/api/dataset?id=23&format=GraphML" 6 | DATA_ID = "southernwomen" 7 | DOWNLOAD_FILE="Davis.GraphML" 8 | 9 | class SouthernWomenData(CirculoData): 10 | 11 | def __download__(self): 12 | self.download_with_notes(DOWNLOAD_URL, progressbar=False, download_file=DOWNLOAD_FILE) 13 | 14 | def __prepare__(self): 15 | shutil.copyfile(os.path.join(self.raw_data_path, DOWNLOAD_FILE), self.graph_path) 16 | 17 | def get_ground_truth(self, G): 18 | raise(NotImplementedError) 19 | 20 | def get_context(self): 21 | return { 22 | CirculoData.CONTEXT_OPTIMAL_PARTITIONS:10 23 | } 24 | 25 | 26 | def main(): 27 | SouthernWomenData(DATA_ID).get_graph() 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /circulo/metrics/graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # Goal is to annotate a vertex cover with dictionary representing various cluster metrics 3 | 4 | from igraph import Graph 5 | from circulo.utils.general import aggregate 6 | import circulo.algorithms.min_conductance 7 | 8 | def triangle_participation(G): 9 | ''' 10 | This returns an array indicating whether the ith node in the graph belongs to a triad. 11 | ''' 12 | rv = [False]*G.vcount() 13 | 14 | for u in G.vs(): 15 | if rv[u.index]: 16 | continue 17 | for v in u.neighbors(): 18 | for w in v.neighbors(): 19 | is_triad = u in w.neighbors() 20 | rv[u.index] |= is_triad 21 | rv[v.index] |= is_triad 22 | rv[w.index] |= is_triad 23 | return rv 24 | 25 | def triangle_participation_ratio(G): 26 | ''' 27 | The fraction of nodes in a graph that belong to a triad. 28 | ''' 29 | rv = G.triangle_participation() 30 | return 1.0*sum(rv)/G.vcount() 31 | 32 | def cohesiveness(G, weights=None): 33 | ''' 34 | Equation: g(S) = minS′⊂S φ(S′) where φ(S′) is the conductance of S′ measured in the induced subgraph by S. 35 | To iterate over all possible subgraphs of a community would be too inefficient 2^n, therefore we approximate 36 | the best subgraph (which would have the lowest conductance) by using Local Spectral communitying to find the best 37 | cut 38 | (cite: http://cs.stanford.edu/people/jure/pubs/comscore-icdm12.pdf) 39 | ''' 40 | from circulo.algorithms import spectral 41 | if G.vcount() <= 2: 42 | val = 1 43 | else: 44 | #TODO: Consider using G_i.mincut() instead. 45 | val, vc = G.min_conductance(weights=weights) 46 | return val 47 | 48 | def __helper_m(key_prefix, describe_dict): 49 | dict0 = {} 50 | 51 | for k, v in describe_dict.items(): 52 | new_key = key_prefix + " (" + k + ")" 53 | dict0[new_key] = v 54 | 55 | return dict0 56 | 57 | def compute_metrics(G, refresh = True): 58 | 59 | descriptTLU = 'TLU--Local Clustering Coefficient' 60 | descriptDegree = 'Degree Statistics' 61 | 62 | if refresh or G.metrics == None: 63 | 64 | #we treat a single node graph to have a density of 1 65 | #TODO: This is undefined for multigraphs. 
Prob should simplify if this happens 66 | density = G.density() if G.vcount() > 1 else 1.0 67 | 68 | G.metrics = { 69 | 'Internal Number Nodes' : G.vcount(), 70 | 'Internal Number Edges' : G.ecount(), 71 | 'Density' : density, 72 | 'Diameter' : G.diameter(), 73 | 'Cohesiveness' : G.cohesiveness(), 74 | 'Triangle Participation Ratio' : G.triangle_participation_ratio(), 75 | 'Transitivity Undirected (Global Clustering Coefficient)' 76 | : G.transitivity_undirected(mode='zero') 77 | } 78 | G.metrics.update(aggregate(G.transitivity_local_undirected(mode='zero'), prefix=descriptTLU)) 79 | G.metrics.update(aggregate(G.degree(), prefix=descriptDegree)) 80 | 81 | Graph.metrics = None 82 | Graph.compute_metrics = compute_metrics 83 | Graph.cohesiveness = cohesiveness 84 | Graph.triangle_participation = triangle_participation 85 | Graph.triangle_participation_ratio = triangle_participation_ratio 86 | -------------------------------------------------------------------------------- /circulo/metrics/omega.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import scipy.sparse as sp 4 | 5 | from igraph import Graph, VertexCover 6 | 7 | def __reset_diagonal(A, sparse): 8 | ''' 9 | input: matrix 10 | ouput: matrix object with diagonals set to 0 11 | ''' 12 | 13 | if sparse: 14 | A = A - sp.dia_matrix((A.diagonal()[scipy.newaxis, :], [0]), shape=A.shape) 15 | else: 16 | A = A.copy() 17 | np.fill_diagonal(A, 0) 18 | return A 19 | 20 | def __get_diagonal(A, sparse): 21 | ''' 22 | input: Matrix 23 | output: vector with the diagonal entries 24 | ''' 25 | if sparse: 26 | return A.diagonal() 27 | else: 28 | return np.diag(A) 29 | 30 | 31 | def __get_matrix(vc, sparse): 32 | ''' 33 | inputs: List of lists (vertexCover) object 34 | output: Node x Node matrix with the cell values indicating the number of clusters 35 | each pair of nodes shares 36 | ''' 37 | n = len(vc) # number of nodes 38 | nc = max([max(i) for i in vc if i]) + 1 # number of clusters 39 | 40 | create_zero_matrix = sp.csr_matrix if sparse else np.zeros 41 | A = create_zero_matrix((n,n), dtype='int') 42 | for i in range(nc): 43 | # Create a Clique from Membership 44 | v = np.matrix([ (i in m)*1 for m in vc]) 45 | if sparse: 46 | v = sp.csr_matrix(v) 47 | Ai = v.T*v 48 | A = A+Ai 49 | # DO NOT ZERO THE DIAGONALS HERE, __get_omega_e depends on them. 
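    # At this point A[i][j] counts the clusters shared by nodes i and j, and the
    # diagonal A[i][i] is the number of clusters containing node i. __get_omega_e
    # reads the largest diagonal entry of the two matrices as k, an upper bound on
    # how many clusters any pair can share, and iterates agreement levels 1..k.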
50 | return A.tocsr() if sparse else A 51 | 52 | def __get_omega_u(A1, A2, sparse): 53 | ''' 54 | inputs: Two __get_matrix results 55 | outputs: un-adjusted omega score 56 | 57 | ''' 58 | n = A1.shape[0] 59 | M = n*(n-1)/2.0 60 | notA = __reset_diagonal((A1 != A2), sparse) 61 | rv = n*(n-1) - notA.sum() 62 | return rv/(2*M) 63 | 64 | def __get_omega_e(A1, A2, sparse): 65 | ''' 66 | inputs: Two __get_matrix results 67 | outputs: expected omega score 68 | 69 | ''' 70 | n = A1.shape[0] 71 | M = n*(n-1)/2.0 72 | k = max(max((__get_diagonal(A1, sparse))), max(__get_diagonal(A2, sparse))) 73 | 74 | # The 0th iteration is done with a negation since it is a sparse matrix 75 | t_not0_1 = __reset_diagonal((A1 != 0), sparse) 76 | t_not0_2 = __reset_diagonal((A2 != 0), sparse) 77 | rv = n*(n-1) - t_not0_1.sum() 78 | rv *= n*(n-1) - t_not0_2.sum() 79 | for i in range(1, k+1): 80 | t_i_1 = __reset_diagonal((A1 == i), sparse) 81 | t_i_2 = __reset_diagonal((A2 == i), sparse) 82 | 83 | rv += t_i_1.sum()*t_i_2.sum() 84 | rv /= (2*M)**2 85 | return rv; 86 | 87 | def omega_index(cover_membership_a, cover_membership_b, sparse=True): 88 | ''' 89 | Uses the Omega Index metrics to compare two covers of a given domain, e.g. a Graph. 90 | @param cover_membership_a : A list of vertex to membership list. 91 | Example - a = [[0,1],[1],[0,2]] 92 | @param cover_membership_b : A list of vertex to membership list. 93 | @returns: Best match = 1, No match = 0 94 | ''' 95 | 96 | A1 = __get_matrix(cover_membership_a, sparse) 97 | A2 = __get_matrix(cover_membership_b, sparse) 98 | omega_u = __get_omega_u(A1, A2, sparse) 99 | omega_e = __get_omega_e(A1, A2, sparse) 100 | 101 | return (omega_u - omega_e)/(1-omega_e) 102 | 103 | -------------------------------------------------------------------------------- /circulo/metrics/probability_metric.py: -------------------------------------------------------------------------------- 1 | import igraph as ig 2 | import statistics 3 | 4 | 5 | def probability_metric_score(G, clusters): 6 | """ 7 | Returns the mean of all of the cluster's scores under the probability metric 8 | defined below. Contains much less information than probability_metric_graph, 9 | but allows for a single number with which one can compare clustering algorithms. 10 | """ 11 | l = [p[0] for p in probability_metric_graph(G, clusters)] 12 | return statistics.mean(l); 13 | 14 | def probability_metric_graph(G, clusters): 15 | """ 16 | Calculates the probability metric on the graph G for each cluster in 17 | clusters. Returns a list of 3-tuples [(a, b, c),...] where a is the mean, 18 | b the standard deviation, and c the variance, indexed by cluster id. 19 | 20 | This metric measures how likely a particle placed on some vertex will stay within 21 | the original community after n random steps, where n is the number of vertices in 22 | the community (or some other, better value for normalization). 23 | 24 | This returns a list [(mean, variance)_0, (mean, variance)_1..., (mean, variance)_n] 25 | where the statistics within each cluster are represented by its index in the list. 26 | 27 | A high mean suggests strong community structure, but a high variance suggests that 28 | a few objects in the community might be outliers and not necessarily belong. 29 | 30 | Currently, we use the size of the community as the number of steps taken, but this 31 | isn't based on much theory. Some analysis is necessary to find a better choice for n. 
32 | 33 | If you want one "score" to score the clustering instead of individual communities, 34 | call probability_metric_score instead. 35 | """ 36 | fullStats = [] 37 | for cluster in clusters: 38 | fullStats.append(probability_metric_cluster(G, cluster)) 39 | return fullStats 40 | 41 | 42 | def probability_metric_cluster(G, members): 43 | """ 44 | Given the members of a cluster and the graph they belong to, finds 45 | the cluster's mean, standard deviation, and variance. 46 | 47 | Note: n currently defaults to the number of members in the community. 48 | TODO: testing, to find out whether this is a legitimate normalization. 49 | """ 50 | nMembers = len(members) # figure out if this is a good normalization 51 | # numVertices = G.vcount(); 52 | # normalization = nMembers / numVertices 53 | data = [p_in_after_n(G, v, nMembers, members) for v in members] 54 | mean = statistics.mean(data) # could divide mean by normalization 55 | var = statistics.pvariance(data, mu=mean) 56 | return mean, var 57 | 58 | 59 | def p_in_after_n(G, v, n, comm): 60 | """ 61 | Finds the probability that a particle will remain 62 | within the community during every step of an 63 | n step random walk, beginning from v. At each step, 64 | the particle is equally likely to travel to any 65 | of its neighbors. 66 | 67 | TODO: use weights 68 | """ 69 | return p_in_after_n_r_cached(G, v, n, set(comm), {}) 70 | 71 | 72 | def p_in_after_n_r_cached(G, v, n, comm, cache): 73 | """ 74 | Memoized, recursive implementation of p_in_after_n. 75 | Internal function. 76 | """ 77 | if (v, n) in cache: 78 | # memoized, don't recurse 79 | return cache[(v, n)] 80 | if v not in comm: 81 | # left the community 82 | return 0 83 | 84 | neighbors = set(G.neighbors(v)) 85 | numNeighbors = float(len(neighbors)) 86 | 87 | if n == 1: 88 | # Second to last step of the possible recursion, 89 | # short circuit so we don't have to recurse down to 0 90 | return len(neighbors & comm) / numNeighbors 91 | 92 | totalP = 0. 
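# Recursive case: the particle moves to each neighbor with equal probability (1/numNeighbors),
# so sum each neighbor's probability of staying inside the community for the remaining n-1 steps.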
93 | for neighbor in neighbors: 94 | pGivenNeighbor = p_in_after_n_r_cached(G, neighbor, n-1, comm, cache) 95 | cache[(neighbor, n-1)] = pGivenNeighbor 96 | totalP += 1/numNeighbors * pGivenNeighbor 97 | cache[(v, n)] = totalP 98 | return totalP 99 | -------------------------------------------------------------------------------- /circulo/setup/run_metrics.py: -------------------------------------------------------------------------------- 1 | # Now cluster the clusters 2 | from circulo import metrics 3 | from sklearn import metrics as skmetrics 4 | import numpy as np 5 | import pickle 6 | import argparse 7 | import os 8 | import glob 9 | import json 10 | from igraph import VertexCover 11 | import importlib 12 | import circulo.metrics.cover 13 | import multiprocessing 14 | import time 15 | import signal 16 | import os 17 | import errno 18 | import traceback 19 | import sys 20 | from collections import namedtuple 21 | import inspect 22 | from circulo.data.databot import CirculoData 23 | 24 | 25 | Worker = namedtuple('Worker', 'json_path output_path timeout') 26 | 27 | def main(): 28 | 29 | parser = argparse.ArgumentParser(description='Compute metrics for given cover.') 30 | parser.add_argument('input_path', type=str, help='file or directory containing results') 31 | parser.add_argument('output_path', type=str, help='output directory to write metric files') 32 | parser.add_argument('--workers', type=int, default=None, help='Number of workers to process (DEFAULT: number of processors)') 33 | parser.add_argument('--timeout', type=int, default=3600, help="timeout for a work item in seconds (DEFAULT: 3600)") 34 | args = parser.parse_args() 35 | 36 | if not os.path.exists(args.input_path): 37 | print("Path \"{}\" does not exist".format(args.input_path)) 38 | return 39 | 40 | if not os.path.exists(args.output_path): 41 | os.makedirs(args.output_path) 42 | 43 | workers = [] 44 | 45 | if os.path.isdir(args.input_path): 46 | for f in glob.glob(os.path.join(args.input_path, '*.json')): 47 | workers.append(Worker(f, args.output_path, args.timeout)) 48 | else: 49 | workers.append(Worker(args.input_path, args.output_path, args.timeout)) 50 | 51 | if args.workers is not None: 52 | pool = multiprocessing.Pool(processes = args.workers) 53 | else: 54 | pool = multiprocessing.Pool() 55 | 56 | r = pool.map_async(analyze_json, workers) 57 | r.get() #must call in order to get error from inside the child processes 58 | pool.close() 59 | pool.join() 60 | 61 | class TimeoutError(Exception): 62 | pass 63 | 64 | def __handle_timeout(signum, frame): 65 | raise TimeoutError(os.strerror(errno.ETIME)) 66 | 67 | 68 | def analyze_json(worker): 69 | 70 | signal.signal(signal.SIGALRM, __handle_timeout) 71 | signal.setitimer(signal.ITIMER_REAL, worker.timeout) 72 | t0 = time.time() 73 | 74 | 75 | data = None 76 | 77 | with open(worker.json_path) as f: 78 | data = json.load(f) 79 | 80 | if(data is None): 81 | print("No data found for ", worker.json_path) 82 | return 83 | 84 | print("###### Running metrics against " + data['job_name']) 85 | #load the graph and ground truth in 86 | data_mod = importlib.import_module('circulo.data.'+data['dataset']+'.run') 87 | 88 | instance = None 89 | 90 | for name,cls in inspect.getmembers(data_mod): 91 | if inspect.isclass(cls) and issubclass(cls, CirculoData) and name != "CirculoData": 92 | instance = cls(data['dataset']) 93 | 94 | if instance == None: 95 | print("Unable to find data module for ", data['dataset']) 96 | return 97 | 98 | G = instance.get_graph() 99 | 100 | #apply similar 
alterations as were done with the algos 101 | alterations = data['alterations'] 102 | 103 | if len(alterations) > 0: 104 | if "weighted" in alterations: 105 | G.es()['weight'] = 1 106 | 107 | if "undirected" in alterations: 108 | G.to_undirected(combine_edges={'weight':sum}) 109 | 110 | if "simple" in alterations: 111 | G.simplify(combine_edges={'weight':sum}) 112 | 113 | if "pruned" in alterations: 114 | instance.prune(G) 115 | 116 | 117 | weights = 'weight' if G.is_weighted() else None 118 | #some datasets might not have ground truth 119 | try: 120 | vc = instance.get_ground_truth(G) 121 | ground_truth_cover = cover_from_membership( vc.membership, G) 122 | except Exception as e: 123 | print("\t++NOTE for ", data['dataset'], ": Ground Truth Not Available") 124 | ground_truth_cover = None 125 | 126 | results_cover = cover_from_membership(data['membership'], G) 127 | 128 | try: 129 | t0 = time.time() 130 | #results are currently stored within the cover object 131 | results_cover.compute_metrics(weights=weights, ground_truth_cover=ground_truth_cover ) 132 | except TimeoutError as t: 133 | print("\t+Timeout ERROR: was analyzing: ", data['job_name']) 134 | signal.alarm(0) 135 | return 136 | except Exception as e: 137 | print(e) 138 | traceback.print_exc(file=sys.stdout) 139 | return 140 | out_dict = { 141 | "name" : data['job_name'], 142 | "elapsed" :data['elapsed'], 143 | "membership" : data['membership'], 144 | "omega": results_cover.compare_omega(ground_truth_cover), 145 | "metrics": results_cover.metrics, 146 | "metrics_elapsed": (time.time() - t0) 147 | } 148 | 149 | 150 | try: 151 | 152 | full_path = os.path.join(worker.output_path,data['job_name'] + ".json") 153 | with open(full_path, 'w') as outfile: 154 | json.dump(out_dict, outfile) 155 | except Exception as e: 156 | traceback.print_exc(file=sys.stdout) 157 | print(e) 158 | 159 | def cover_from_membership(membership, G): 160 | 161 | if(membership is None): 162 | return None 163 | 164 | cluster_dict = {} 165 | 166 | for vertex_id, cluster_id_list in enumerate(membership): 167 | for cluster_id in cluster_id_list: 168 | if(cluster_id not in cluster_dict): 169 | cluster_dict[cluster_id] = [] 170 | cluster_dict[cluster_id].append(vertex_id) 171 | 172 | return VertexCover(G, [v for v in cluster_dict.values()]) 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /circulo/unit_tests/karate.gml: -------------------------------------------------------------------------------- 1 | Creator "Mark Newman on Fri Jul 21 12:39:27 2006" 2 | graph 3 | [ 4 | node 5 | [ 6 | id 1 7 | ] 8 | node 9 | [ 10 | id 2 11 | ] 12 | node 13 | [ 14 | id 3 15 | ] 16 | node 17 | [ 18 | id 4 19 | ] 20 | node 21 | [ 22 | id 5 23 | ] 24 | node 25 | [ 26 | id 6 27 | ] 28 | node 29 | [ 30 | id 7 31 | ] 32 | node 33 | [ 34 | id 8 35 | ] 36 | node 37 | [ 38 | id 9 39 | ] 40 | node 41 | [ 42 | id 10 43 | ] 44 | node 45 | [ 46 | id 11 47 | ] 48 | node 49 | [ 50 | id 12 51 | ] 52 | node 53 | [ 54 | id 13 55 | ] 56 | node 57 | [ 58 | id 14 59 | ] 60 | node 61 | [ 62 | id 15 63 | ] 64 | node 65 | [ 66 | id 16 67 | ] 68 | node 69 | [ 70 | id 17 71 | ] 72 | node 73 | [ 74 | id 18 75 | ] 76 | node 77 | [ 78 | id 19 79 | ] 80 | node 81 | [ 82 | id 20 83 | ] 84 | node 85 | [ 86 | id 21 87 | ] 88 | node 89 | [ 90 | id 22 91 | ] 92 | node 93 | [ 94 | id 23 95 | ] 96 | node 97 | [ 98 | id 24 99 | ] 100 | node 101 | [ 102 | id 25 103 | ] 104 | node 105 | [ 106 | id 26 107 | ] 108 | node 109 | [ 110 
| id 27 111 | ] 112 | node 113 | [ 114 | id 28 115 | ] 116 | node 117 | [ 118 | id 29 119 | ] 120 | node 121 | [ 122 | id 30 123 | ] 124 | node 125 | [ 126 | id 31 127 | ] 128 | node 129 | [ 130 | id 32 131 | ] 132 | node 133 | [ 134 | id 33 135 | ] 136 | node 137 | [ 138 | id 34 139 | ] 140 | edge 141 | [ 142 | source 2 143 | target 1 144 | ] 145 | edge 146 | [ 147 | source 3 148 | target 1 149 | ] 150 | edge 151 | [ 152 | source 3 153 | target 2 154 | ] 155 | edge 156 | [ 157 | source 4 158 | target 1 159 | ] 160 | edge 161 | [ 162 | source 4 163 | target 2 164 | ] 165 | edge 166 | [ 167 | source 4 168 | target 3 169 | ] 170 | edge 171 | [ 172 | source 5 173 | target 1 174 | ] 175 | edge 176 | [ 177 | source 6 178 | target 1 179 | ] 180 | edge 181 | [ 182 | source 7 183 | target 1 184 | ] 185 | edge 186 | [ 187 | source 7 188 | target 5 189 | ] 190 | edge 191 | [ 192 | source 7 193 | target 6 194 | ] 195 | edge 196 | [ 197 | source 8 198 | target 1 199 | ] 200 | edge 201 | [ 202 | source 8 203 | target 2 204 | ] 205 | edge 206 | [ 207 | source 8 208 | target 3 209 | ] 210 | edge 211 | [ 212 | source 8 213 | target 4 214 | ] 215 | edge 216 | [ 217 | source 9 218 | target 1 219 | ] 220 | edge 221 | [ 222 | source 9 223 | target 3 224 | ] 225 | edge 226 | [ 227 | source 10 228 | target 3 229 | ] 230 | edge 231 | [ 232 | source 11 233 | target 1 234 | ] 235 | edge 236 | [ 237 | source 11 238 | target 5 239 | ] 240 | edge 241 | [ 242 | source 11 243 | target 6 244 | ] 245 | edge 246 | [ 247 | source 12 248 | target 1 249 | ] 250 | edge 251 | [ 252 | source 13 253 | target 1 254 | ] 255 | edge 256 | [ 257 | source 13 258 | target 4 259 | ] 260 | edge 261 | [ 262 | source 14 263 | target 1 264 | ] 265 | edge 266 | [ 267 | source 14 268 | target 2 269 | ] 270 | edge 271 | [ 272 | source 14 273 | target 3 274 | ] 275 | edge 276 | [ 277 | source 14 278 | target 4 279 | ] 280 | edge 281 | [ 282 | source 17 283 | target 6 284 | ] 285 | edge 286 | [ 287 | source 17 288 | target 7 289 | ] 290 | edge 291 | [ 292 | source 18 293 | target 1 294 | ] 295 | edge 296 | [ 297 | source 18 298 | target 2 299 | ] 300 | edge 301 | [ 302 | source 20 303 | target 1 304 | ] 305 | edge 306 | [ 307 | source 20 308 | target 2 309 | ] 310 | edge 311 | [ 312 | source 22 313 | target 1 314 | ] 315 | edge 316 | [ 317 | source 22 318 | target 2 319 | ] 320 | edge 321 | [ 322 | source 26 323 | target 24 324 | ] 325 | edge 326 | [ 327 | source 26 328 | target 25 329 | ] 330 | edge 331 | [ 332 | source 28 333 | target 3 334 | ] 335 | edge 336 | [ 337 | source 28 338 | target 24 339 | ] 340 | edge 341 | [ 342 | source 28 343 | target 25 344 | ] 345 | edge 346 | [ 347 | source 29 348 | target 3 349 | ] 350 | edge 351 | [ 352 | source 30 353 | target 24 354 | ] 355 | edge 356 | [ 357 | source 30 358 | target 27 359 | ] 360 | edge 361 | [ 362 | source 31 363 | target 2 364 | ] 365 | edge 366 | [ 367 | source 31 368 | target 9 369 | ] 370 | edge 371 | [ 372 | source 32 373 | target 1 374 | ] 375 | edge 376 | [ 377 | source 32 378 | target 25 379 | ] 380 | edge 381 | [ 382 | source 32 383 | target 26 384 | ] 385 | edge 386 | [ 387 | source 32 388 | target 29 389 | ] 390 | edge 391 | [ 392 | source 33 393 | target 3 394 | ] 395 | edge 396 | [ 397 | source 33 398 | target 9 399 | ] 400 | edge 401 | [ 402 | source 33 403 | target 15 404 | ] 405 | edge 406 | [ 407 | source 33 408 | target 16 409 | ] 410 | edge 411 | [ 412 | source 33 413 | target 19 414 | ] 415 | edge 416 | [ 417 | source 33 418 | target 21 419 | ] 420 | edge 421 | [ 
422 | source 33 423 | target 23 424 | ] 425 | edge 426 | [ 427 | source 33 428 | target 24 429 | ] 430 | edge 431 | [ 432 | source 33 433 | target 30 434 | ] 435 | edge 436 | [ 437 | source 33 438 | target 31 439 | ] 440 | edge 441 | [ 442 | source 33 443 | target 32 444 | ] 445 | edge 446 | [ 447 | source 34 448 | target 9 449 | ] 450 | edge 451 | [ 452 | source 34 453 | target 10 454 | ] 455 | edge 456 | [ 457 | source 34 458 | target 14 459 | ] 460 | edge 461 | [ 462 | source 34 463 | target 15 464 | ] 465 | edge 466 | [ 467 | source 34 468 | target 16 469 | ] 470 | edge 471 | [ 472 | source 34 473 | target 19 474 | ] 475 | edge 476 | [ 477 | source 34 478 | target 20 479 | ] 480 | edge 481 | [ 482 | source 34 483 | target 21 484 | ] 485 | edge 486 | [ 487 | source 34 488 | target 23 489 | ] 490 | edge 491 | [ 492 | source 34 493 | target 24 494 | ] 495 | edge 496 | [ 497 | source 34 498 | target 27 499 | ] 500 | edge 501 | [ 502 | source 34 503 | target 28 504 | ] 505 | edge 506 | [ 507 | source 34 508 | target 29 509 | ] 510 | edge 511 | [ 512 | source 34 513 | target 30 514 | ] 515 | edge 516 | [ 517 | source 34 518 | target 31 519 | ] 520 | edge 521 | [ 522 | source 34 523 | target 32 524 | ] 525 | edge 526 | [ 527 | source 34 528 | target 33 529 | ] 530 | ] 531 | -------------------------------------------------------------------------------- /circulo/unit_tests/metrics.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | import numpy as np 4 | import circulo.metrics 5 | import igraph 6 | 7 | import importlib 8 | import inspect 9 | from circulo.data.databot import CirculoData 10 | from circulo.setup.run_metrics import cover_from_membership 11 | import circulo.metrics.cover 12 | 13 | 14 | class TestMetrics(unittest.TestCase): 15 | def setUp(self): 16 | DATASET='karate' 17 | #load the graph and ground truth in 18 | data_mod = importlib.import_module('circulo.data.'+DATASET+'.run') 19 | 20 | instance = None 21 | 22 | for name,cls in inspect.getmembers(data_mod): 23 | if inspect.isclass(cls) and issubclass(cls, CirculoData) and name != "CirculoData": 24 | instance = cls(DATASET) 25 | 26 | self.G = instance.get_graph() 27 | membership=[[0,1,2,3,7,11,12,13,17,19,21], 28 | [4,5,6,10,16], 29 | [8,9,14,15,18,20,22,23,24,25,26,27,28,29,30,31,32,33]] 30 | self.weights=[5,7,4,5,8,7,2,1,1,6,7,4,9,6,8,2,2,1,2,5,6,5,7,7,3,4,4,6,7,7,5,7,4,8,5,4,5,3,1,6,4,3,3,3,1,6,2,7,8,8,1,7,5,7,5,4,7,3,7,5,8,9,4,2,8,8,6,3,6,6,8,5,6,7,5,7,7,7] 31 | self.G.es['weight'] = self.weights 32 | 33 | self.cover = circulo.metrics.cover.VertexCover(self.G, membership) 34 | 35 | def test_internaldensity(self): 36 | #doesn't apply to weighted graphs 37 | truth = [.4181818, .6, .22875817] 38 | 39 | #Density is an igraph ``metric'' 40 | test = [ s.density() for s in self.cover.subgraphs()] 41 | self.assertListAlmostEquals(truth, test, 2) 42 | 43 | def test_avgdegree(self): 44 | truth = [4.181818182, 2.4, 3.8888889] 45 | 46 | # Average degree is an igraph + python method 47 | from scipy import mean 48 | test = [ mean(s.degree()) for s in self.cover.subgraphs() ] 49 | self.assertListAlmostEquals(truth, test, 2) 50 | 51 | def test_wavgdegree(self): 52 | truth = [24, 15.2, 23.3333334] 53 | 54 | # Average degree is an igraph + python method 55 | from scipy import mean 56 | test = [ mean(s.strength(weights='weight')) for s in self.cover.subgraphs() ] 57 | self.assertListAlmostEquals(truth, test, 2) 58 | 59 | def test_FOMD(self): 60 | truth = [0.545454545, 0, 
0.277777778] 61 | 62 | test = circulo.metrics.cover.fomd(self.cover) 63 | self.assertListAlmostEquals(truth, test, 2) 64 | 65 | def test_WFOMD(self): 66 | truth = [0.545454545, 0.4 , 0.388888889] 67 | 68 | test = circulo.metrics.cover.fomd(self.cover, weights='weight') 69 | self.assertListAlmostEquals(truth, test, 2) 70 | 71 | def test_expansion(self): 72 | truth = [1.272727, 0.8, 0.555556] 73 | 74 | test = self.cover.expansion() 75 | self.assertListAlmostEquals(truth, test, 2) 76 | 77 | def test_wexpansion(self): 78 | truth = [2.181818, 1.2, 1] 79 | 80 | test = self.cover.expansion(weights='weight') 81 | self.assertListAlmostEquals(truth, test, 2) 82 | 83 | def test_cutratio(self): 84 | #not applicable to weighted graphs 85 | truth = [.05534,.02759,.03472,] 86 | 87 | test = circulo.metrics.cover.cut_ratio(self.cover, allow_nan=True) 88 | self.assertListAlmostEquals(truth, test, 2) 89 | 90 | def test_conductance(self): 91 | truth = [0.2333333,0.25, 0.125] 92 | 93 | test = self.cover.conductance() 94 | self.assertListAlmostEquals(truth, test, 2) 95 | 96 | def test_wconductance(self): 97 | truth = [0.083333, 0.0731707, 0.0410959] 98 | 99 | test = self.cover.conductance(weights='weight') 100 | self.assertListAlmostEquals(truth, test, 2) 101 | 102 | def test_normalizedcut(self): 103 | truth = [0.346236559, 0.277027027, 0.229166667] 104 | 105 | test = self.cover.normalized_cut() 106 | self.assertListAlmostEquals(truth, test, 2) 107 | 108 | def test_wnormalizedcut(self): 109 | truth = [0.125586854, 0.081300813, 0.085430866] 110 | 111 | test = self.cover.normalized_cut(weights='weight') 112 | self.assertListAlmostEquals(truth, test, 2) 113 | 114 | def test_TPR(self): 115 | #same for weighted and unweighted graphs 116 | truth = [0.9091,0.6, 0.9444444] 117 | 118 | test = [ s.triangle_participation_ratio() 119 | for s in self.cover.subgraphs() ] 120 | self.assertListAlmostEquals(truth, test, 2) 121 | 122 | def test_MaxODF(self): 123 | truth = [.5,0.3333333, 0.5 ] 124 | 125 | test = circulo.metrics.cover.maximum_out_degree_fraction(self.cover) 126 | self.assertListAlmostEquals(truth, test, 2) 127 | 128 | def test_WMaxODF(self): 129 | truth = [0.222222222, 0.153846154, 0.2] 130 | 131 | test = self.cover.maximum_out_degree_fraction(weights='weight') 132 | self.assertListAlmostEquals(truth, test, 2) 133 | 134 | def test_avgODF(self): 135 | truth = [0.138131313, 0.233333333, 0.117592593] 136 | 137 | test = self.cover.average_out_degree_fraction() 138 | self.assertListAlmostEquals(truth, test, 2) 139 | 140 | def test_wavgODF(self): 141 | truth = [0.064922913, 0.080586081, 0.041399798] 142 | 143 | test = self.cover.average_out_degree_fraction(weights='weight') 144 | self.assertListAlmostEquals(truth, test, 2) 145 | 146 | def test_FlakeODF(self): 147 | truth = [0,0,0] 148 | 149 | test = circulo.metrics.cover.flake_out_degree_fraction(self.cover) 150 | #test = self.cover.flake_out_degree_fraction() 151 | self.assertListAlmostEquals(truth, test, 2) 152 | 153 | def test_WFLakeODF(self): 154 | truth = [0,0,0] 155 | 156 | test = circulo.metrics.cover.flake_out_degree_fraction(self.cover, weights='weight') 157 | self.assertListAlmostEquals(truth, test, 2) 158 | 159 | def test_separability(self): 160 | truth = [1.6428571,1.5, 3.5] 161 | 162 | test = circulo.metrics.cover.separability(self.cover) 163 | self.assertListAlmostEquals(truth, test, 2) 164 | 165 | def test_wseparability(self): 166 | truth = [5.5, 6.3333333333, 11.666666667] 167 | 168 | test = self.cover.separability(weights='weight') 169 | 
self.assertListAlmostEquals(truth, test, 2) 170 | 171 | def test_localclusteringcoefficient(self): 172 | #This averages the local clustering coefficient 173 | #Results are the same for weighted and unweighted graphs 174 | 175 | truth = [0.75310245, 0.33333333, 0.65153920] 176 | 177 | # Local Clustering Coeff is an igraph function 178 | from scipy import mean 179 | test = [ mean(s.transitivity_local_undirected(mode='zero')) 180 | for s in self.cover.subgraphs() ] 181 | self.assertListAlmostEquals(truth, test, 2) 182 | 183 | def test_cohesiveness(self): 184 | # TODO: Calculate cohesiveness "truth" cohesiveness truth 185 | self.skipTest("Not sure what truth values for this should be, skipping for now") 186 | truth = [] 187 | 188 | test = [ s.cohesiveness() for s in self.cover.subgraphs() ] 189 | 190 | self.assertListAlmostEquals(truth, test, 2) 191 | 192 | def assertListAlmostEquals(self, a, b, places=None, msg=None): 193 | self.assertEquals(np.round(a,places).tolist(), 194 | np.round(b,places).tolist(), msg=msg) 195 | 196 | 197 | if __name__ == '__main__' : 198 | unittest.main() 199 | 200 | -------------------------------------------------------------------------------- /circulo/unit_tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | import igraph 4 | from circulo.metrics import VertexCoverMetric 5 | 6 | class TestMetrics(unittest.TestCase): 7 | def setUp(self): 8 | self.G=igraph.load("karate.gml") 9 | 10 | membership=[ 11 | [0,1,2,3,7,11,12,13,17,19,21], 12 | [4,5,6,10,16], 13 | [8,9,14,15,18,20,22,23,24,25,26,27,28,29,30,31,32,33]] 14 | cover=igraph.VertexCover(self.G, membership) 15 | metrics=VertexCoverMetric.run_analysis(cover, weights=None) 16 | metrics.report() 17 | self.comm_metrics=metrics.comm_metrics 18 | 19 | def test_density(self): 20 | self.assertEqual(round(.4181818, 2), round(self.comm_metrics[0].density, 2)) 21 | self.assertEqual(round(.6, 2), round(self.comm_metrics[1].density,2)) 22 | self.assertEqual(round(.22875817, 2), round(self.comm_metrics[2].density,2)) 23 | 24 | def test_avgdegree(self): 25 | self.assertEqual(round(4.181818182, 2), round(self.comm_metrics[0].degree_avg,2)) 26 | self.assertEqual(round(2.4, 2), round(self.comm_metrics[1].degree_avg,2)) 27 | self.assertEqual(round(3.8888889,2), round(self.comm_metrics[2].degree_avg,2)) 28 | 29 | def test_FOMD(self): 30 | self.assertEqual(round(0.545454545,2), round(self.comm_metrics[0].fomd, 2)) 31 | self.assertEqual(round(0, 2), round(self.comm_metrics[1].fomd, 2)) 32 | self.assertEqual(round(0.277777778,2), round(self.comm_metrics[2].fomd,2)) 33 | 34 | def test_expansion(self): 35 | self.assertEqual(round(1.272727, 2), round(self.comm_metrics[0].degree_boundary_avg, 2)) 36 | self.assertEqual(round(0.8, 2), round(self.comm_metrics[1].degree_boundary_avg, 2)) 37 | self.assertEqual(round(0.555556, 2), round(self.comm_metrics[2].degree_boundary_avg,2)) 38 | 39 | def test_cutratio(self): 40 | self.assertEqual(round(.05534, 2), round(self.comm_metrics[0].cut_ratio, 2)) 41 | self.assertEqual(round(.02759, 2), round(self.comm_metrics[1].cut_ratio, 2)) 42 | self.assertEqual(round(.03472, 2), round(self.comm_metrics[2].cut_ratio, 2)) 43 | 44 | def test_conductance(self): 45 | self.assertEqual(round(0.2333333,2), round(self.comm_metrics[0].conductance,2)) 46 | self.assertEqual(round(0.25,2), round(self.comm_metrics[1].conductance,2)) 47 | self.assertEqual(round(0.125,2), round(self.comm_metrics[2].conductance,2)) 48 | 49 | def 
test_normalizedcut(self): 50 | self.assertEqual(round(0.346236559,2), round(self.comm_metrics[0].normalized_cut,2)) 51 | self.assertEqual(round(0.277027027,2), round(self.comm_metrics[1].normalized_cut,2)) 52 | self.assertEqual(round(0.229166667, 2), round(self.comm_metrics[2].normalized_cut,2)) 53 | 54 | def test_TPR(self): 55 | self.assertEqual(round(0.9091, 2), round(self.comm_metrics[0].tpr[1], 2)) 56 | self.assertEqual(round(0.6, 2), round(self.comm_metrics[1].tpr[1], 2)) 57 | self.assertEqual(round(0.9444, 2), round(self.comm_metrics[2].tpr[1], 2)) 58 | 59 | def test_MaxODF(self): 60 | self.assertEqual(round(0.5,2), round(self.comm_metrics[0].odf_dict["max"], 2)) 61 | self.assertEqual(round(0.3333333,2), round(self.comm_metrics[1].odf_dict["max"], 2)) 62 | self.assertEqual(round(0.5, 2), round(self.comm_metrics[2].odf_dict["max"], 2)) 63 | 64 | def test_avgODF(self): 65 | self.assertEqual(round(0.138131313,2), round(self.comm_metrics[0].odf_dict["average"], 2)) 66 | self.assertEqual(round(0.233333333,2), round(self.comm_metrics[1].odf_dict["average"], 2)) 67 | self.assertEqual(round(0.117592593, 2), round(self.comm_metrics[2].odf_dict["average"], 2)) 68 | 69 | def test_FlakeODF(self): 70 | self.assertEqual(round(0, 2), round(self.comm_metrics[0].odf_dict["flake"], 2)) 71 | self.assertEqual(round(0, 2), round(self.comm_metrics[1].odf_dict["flake"], 2)) 72 | self.assertEqual(round(0, 2), round(self.comm_metrics[2].odf_dict["flake"], 2)) 73 | 74 | def test_separability(self): 75 | self.assertEqual(round(1.6428571,2), round(self.comm_metrics[0].separability, 2)) 76 | self.assertEqual(round(1.5, 2), round(self.comm_metrics[1].separability, 2)) 77 | self.assertEqual(round(3.5, 2), round(self.comm_metrics[2].separability, 2)) 78 | 79 | def test_clusteringcoefficient(self): 80 | self.assertEqual(round(0.72503608, 2), round(self.comm_metrics[0].clustering_coefficient, 2)) 81 | self.assertEqual(round(0.66666667, 2), round(self.comm_metrics[1].clustering_coefficient, 2)) 82 | self.assertEqual(round(0.72045177, 2), round(self.comm_metrics[2].clustering_coefficient, 2)) 83 | 84 | 85 | 86 | 87 | 88 | 89 | if __name__ == '__main__' : 90 | unittest.main() 91 | 92 | -------------------------------------------------------------------------------- /circulo/utils/downloader.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import os 3 | import zipfile 4 | import gzip 5 | import sys 6 | import igraph as ig 7 | from collections import defaultdict 8 | 9 | def download_with_notes(url, filename, data_dir, progressbar=True): 10 | """ 11 | Uses urllib to download data from URL. Saves the results in 12 | data_dir/FILENAME. Provides basic logging to stdout. 
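Illustrative example (the URL and paths here are placeholders): download_with_notes("http://example.com/archive.zip", "archive.zip", "/tmp/raw")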
13 | """ 14 | print("Downloading data from " + url + ".....") 15 | try: 16 | if progressbar: 17 | urllib.request.urlretrieve(url, os.path.join(data_dir, filename), progress) 18 | else: 19 | urllib.request.urlretrieve(url, os.path.join(data_dir, filename)) 20 | except Exception as e: 21 | print("Data download failed -- make sure the url is still valid, and that urllib is properly installed.\n\n") 22 | raise(e) 23 | print("Download complete.") 24 | 25 | _unzip(data_dir, filename) 26 | 27 | def _unzip(data_dir, filename): 28 | 29 | zip_path = os.path.join(data_dir, filename) 30 | 31 | if zipfile.is_zipfile(zip_path): 32 | try: 33 | z = zipfile.ZipFile(zip_path) 34 | except zipfile.BadZipFile as e: 35 | print("ZipFile error: {}".format(e)) 36 | sys.exit(0) 37 | print("Extracting from zip...") 38 | z.extractall(path=data_dir) 39 | 40 | else: 41 | unzip_file = os.path.splitext(zip_path)[0] 42 | 43 | with gzip.open(zip_path,'rb') as infile: 44 | try: 45 | file_content = infile.read() 46 | except OSError as e: 47 | print("Neither gzip nor zipfile. No extraction necessary.") 48 | return 49 | 50 | with open(unzip_file, "wb") as f: 51 | print("Extracting from gzip...") 52 | f.write(file_content) 53 | 54 | def progress(blockNum, blockSize, totSize): 55 | """ 56 | Provides an ascii progress bar that is 50 characters wide. 57 | totSize is the total size of the task, blockSize is the size 58 | of each block, and blockNum is the current block being worked on. 59 | 60 | For example: 61 | 62 | for i in range(100): 63 | progress(i + 1, 1, 100) 64 | sleep(1) 65 | 66 | will print a progress bar over 100 seconds. 67 | """ 68 | downloaded = blockNum * blockSize 69 | per = min(100 * downloaded / totSize, 100) 70 | sys.stdout.write("\r%d%%" %per) 71 | for i in range(int(per / 2)): 72 | sys.stdout.write(".") 73 | for i in range(50 - int(per/2)): 74 | sys.stdout.write(" ") 75 | sys.stdout.write("# ") 76 | sys.stdout.flush() 77 | 78 | 79 | def membership_to_clustering_list(membership): 80 | 81 | cluster_dict = {} 82 | 83 | for idx, cluster_id in enumerate(membership): 84 | if cluster_id not in cluster_dict: 85 | cluster_dict[cluster_id] = [] 86 | cluster_dict[cluster_id].append(idx) 87 | 88 | return [v for v in cluster_dict.values()] 89 | 90 | def multigraph_to_weights(G): 91 | """ 92 | Given a multigraph, coalesces all duplicate edges into a single 93 | weighted edge. Removes all other attributes. Assumes all edges 94 | are either weighted or unweighted. 
95 | """ 96 | seen = defaultdict(float) 97 | for e in G.es: 98 | try: 99 | weight = e['weight'] 100 | except KeyError: 101 | weight = 1 102 | seen[e.tuple] += weight 103 | G.delete_edges(None) 104 | es = list(seen.keys()) 105 | weights = list(seen.values()) 106 | G.add_edges(es) 107 | G.es['weight'] = weights 108 | 109 | 110 | -------------------------------------------------------------------------------- /circulo/utils/general.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import describe 2 | from scipy import median 3 | import igraph 4 | import numpy as np 5 | from itertools import combinations 6 | 7 | from circulo.metrics.omega import omega_index 8 | 9 | def aggregate(array, prefix="",axis=0): 10 | 11 | stats = describe(array, axis) 12 | 13 | if len(array) == 1: 14 | variance = -1.0 15 | else: 16 | variance = float(stats[3]) 17 | 18 | 19 | return { 20 | prefix+'Size':int(stats[0]), 21 | prefix+'Min':float(stats[1][0]), 22 | prefix+'Max':float(stats[1][1]), 23 | prefix+'Mean':float(stats[2]), 24 | prefix+'Unbiased Variance':variance, 25 | prefix+'Biased Skewness':float(stats[4]), 26 | prefix+'Biased Kurtosis':float(stats[5]), 27 | prefix+'Median':float(median(array,axis)) 28 | } 29 | 30 | 31 | 32 | def get_largest_component(G, descript="not specified"): 33 | """ 34 | Given a graph, returns the subgraph containing only its largest component". 35 | """ 36 | components = G.components(mode=igraph.WEAK) 37 | if len(components) == 1: 38 | return G 39 | print("[Graph Prep -",descript,"]... Disconnected Graph Detected. Using largest component.") 40 | print("[Graph Prep -",descript,"]... Original graph: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 41 | G = G.subgraph(max(components, key=len)) 42 | print("[Graph Prep -",descript,"]... 
Largest component: {} vertices and {} edges.".format(G.vcount(), G.ecount())) 43 | return G 44 | 45 | 46 | 47 | def run_comparison(memberships, comparator="omega"): 48 | ''' 49 | Given a list of memberships, uses the comparator to compare results 50 | 51 | Args: 52 | membershps: a list of membership arrays 53 | comparator: the algorithm to use at the comparator (default: omega) 54 | 55 | Return: 56 | a symetric matrix containing the results 57 | ''' 58 | 59 | size = len(memberships) 60 | pairs = combinations(range(size), 2) 61 | M = np.zeros((size, size), dtype=float) 62 | np.fill_diagonal(M, 1) 63 | 64 | if comparator == "omega": 65 | comp_func = omega_index 66 | else: 67 | raise NotImplementedError('Unknown comparison function') 68 | 69 | #fill in top right 70 | for i, j in pairs: 71 | score = comp_func(memberships[i], memberships[j]) 72 | M[i,j] = score 73 | M[j,i] = score 74 | 75 | return M 76 | -------------------------------------------------------------------------------- /circulo/utils/snap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import tempfile 4 | import sys 5 | import igraph 6 | from igraph.clustering import VertexCover 7 | from collections import OrderedDict 8 | 9 | from sklearn.feature_extraction import DictVectorizer 10 | import numpy as np 11 | 12 | import circulo 13 | 14 | __author__="""Paul M""" 15 | 16 | __all__ = [] 17 | 18 | 19 | ENV_SNAPPATH_VAR = "SNAPHOME" 20 | 21 | 22 | def read_communities_by_community(f_name, G, delete_file=False): 23 | ''' 24 | Reads a community file in the format where each line represents a community where the line is a list of nodes separated by white space 25 | ''' 26 | 27 | comm_list = list() 28 | 29 | with open(f_name, 'r') as community_file: 30 | 31 | for line in community_file: 32 | if line.startswith('#'): 33 | continue 34 | try: 35 | comm_list.append(map(int, line.split())) 36 | except ValueError as e: 37 | print("Node type is unclear for line: {}".format(line)) 38 | return 39 | 40 | if delete_file: 41 | os.remove(f_name) 42 | 43 | return VertexCover(G, comm_list) 44 | 45 | 46 | def read_communities_by_node(f_name, G): 47 | ''' 48 | Reads a community file where each line is a node and the community to which it belongs 49 | For example 50 | 0 1 51 | 0 4 52 | 0 0 53 | 1 3 54 | 1 4 55 | 2 5 56 | ''' 57 | 58 | #dict with keys as community_id and values are a list of nodes 59 | community_dict = dict() 60 | max_node_id = len(G.vs) 61 | with open(f_name, 'r') as community_file: 62 | for line in community_file: 63 | if line.startswith('#'): 64 | continue 65 | 66 | node_id, community_id = (int(x) for x in line.split()) 67 | if node_id <= max_node_id: 68 | if community_id not in community_dict: 69 | community_dict[community_id] = [] 70 | 71 | community_dict[community_id].append(node_id) 72 | 73 | return VertexCover(G, [v for v in community_dict.values()]) 74 | 75 | 76 | 77 | def divisive(G, algo_id, output): 78 | 79 | snap_home, graph_file = setup(G) 80 | 81 | if graph_file is None: 82 | return 83 | 84 | path_girvan_newman = os.path.join(snap_home, "examples", "community", "community") 85 | 86 | 87 | try: 88 | out = subprocess.Popen([path_girvan_newman, "-i:"+graph_file, "-o:"+output, "-a:"+algo_id]) 89 | except TypeError as e: 90 | print("Error occurred: {}".format(e)) 91 | return 92 | 93 | out.wait() 94 | 95 | os.remove(graph_file) 96 | return read_communities_by_node(output, G) 97 | 98 | 99 | def attribute_setup(G, attrs_of_interest): 100 | """ 101 | Create 
node name and node attribute files. Uses DictVectorizer to encode free form attribute input into set of 102 | binary classes. node_attribute_name_file contains the mapping of binary classes to names 103 | """ 104 | f = tempfile.mkstemp() 105 | node_attribute_name_file = f[1] 106 | 107 | f2 = tempfile.mkstemp() 108 | node_attribute_file = f2[1] 109 | 110 | # Create an array of attributes of interest 111 | attr_array = [] 112 | for node in G.vs: 113 | node_attributes_dict = {} 114 | for attr_name, attr_val in node.attributes().items(): 115 | if attr_name in attrs_of_interest: 116 | node_attributes_dict[attr_name] = attr_val 117 | attr_array.append(node_attributes_dict) 118 | 119 | # TODO: Don't make dense array for sparse input 120 | vec = DictVectorizer(dtype=np.int32) 121 | vectorized_array = vec.fit_transform(attr_array).toarray() 122 | try: 123 | with open(node_attribute_name_file, 'w') as out: 124 | for i, name in enumerate(vec.get_feature_names()): 125 | out.write("{}\t{}\n".format(i, name)) 126 | 127 | with open(node_attribute_file, 'w') as out: 128 | for node_num, bool_feature_array in enumerate(vectorized_array): 129 | for attr_num, val in enumerate(bool_feature_array): 130 | if val != 0: 131 | out.write("{}\t{}\n".format(node_num, attr_num)) 132 | except: 133 | print("Error writing attribute info") 134 | return None 135 | 136 | return (node_attribute_name_file, node_attribute_file) 137 | 138 | 139 | def setup(G, include_header=True): 140 | snap_home = os.path.join(os.path.dirname(circulo.__path__._path[0]), "lib","snap") 141 | 142 | if not os.path.exists(os.path.join(snap_home,"examples","bigclam","bigclam")): 143 | raise Exception("SNAP must be downloaded and built prior to using the snap algorithms") 144 | 145 | f = tempfile.mkstemp() 146 | filename = f[1] 147 | 148 | try: 149 | 150 | #some snap algos can't handle single space edge delimiters, and igraph can't output 151 | #tab delimited edgelist, so we always convert the single spaced output to a tabbed output 152 | with open(filename, 'w') as out: 153 | if include_header: 154 | out.write("# Directed Node Graph\n") 155 | out.write("# Descriptions\n") 156 | out.write("# Nodes: {}\tEdges: {}\n".format(len(G.vs), len(G.es))) 157 | out.write("# SrcNId\tDstNId\n") 158 | for src in G.vs: 159 | for dst in src.neighbors(mode=igraph.ALL): 160 | out.write("{}\t{}\n".format(src.index, dst.index)) 161 | #print(node.neighbors()) 162 | #for u,v in G.get_edgelist(): 163 | # out.write("{}\t{}\n".format(u, v)) 164 | # out.write("{}\t{}\n".format(v, u)) 165 | 166 | except: 167 | print("Error writing edgelist") 168 | return None 169 | 170 | return (snap_home, filename) 171 | -------------------------------------------------------------------------------- /circulo/utils/stochastic_selector.py: -------------------------------------------------------------------------------- 1 | # Now cluster the clusters 2 | from circulo import metrics 3 | from sklearn import metrics as skmetrics 4 | from scipy.spatial.distance import squareform 5 | from scipy.cluster.hierarchy import average,fcluster 6 | import igraph 7 | import numpy as np 8 | import pickle 9 | 10 | 11 | def to_crisp_membership(ovp_membership): 12 | return [ a[0] for a in ovp_membership ] 13 | 14 | 15 | def argmax(array): 16 | return max(zip(array, range(len(array))))[1] 17 | 18 | 19 | def select(covers): 20 | #distance_matrix, y, Z = compute_distance_matrix(covers) 21 | 22 | #pick_representatives(covers, distance_matrix, y, Z) 23 | 24 | #for now just return the first cover. 
TODO: Cluster the covers correctly 25 | return 0 26 | 27 | def pick_representatives(covers, dist_matrix, y, Z): 28 | 29 | mega_clusters = fcluster(Z,.5) 30 | 31 | G = igraph.Graph.Adjacency((dist_matrix < 1).tolist(), 'UNDIRECTED') 32 | 33 | G.vs()['vc'] = covers 34 | 35 | #G.vs()['vc'] = results['vc.orig'] if 'vc.original' in results else results['vc'] 36 | for e in G.es(): 37 | e['weight'] = 1-dist_matrix[e.source, e.target] 38 | 39 | #mega_clusters -= 1 40 | #cluster = igraph.VertexClustering(G, mega_clusters.tolist()) 41 | 42 | #reps = [] 43 | #for s in cluster.subgraphs(): 44 | # rep_id = argmax(s.strength(weights='weight')) 45 | # reps += [ s.vs()[rep_id]['vc'] ] 46 | 47 | 48 | 49 | def compute_distance_matrix(covers): 50 | # Compute stochastic clusters 51 | num_results = len(covers) 52 | distance_matrix= np.zeros((num_results,num_results)) 53 | print('Calculating distance matrix ... ') 54 | for i in range(num_results): 55 | for j in range(i+1,num_results): 56 | #score = metrics.omega_index(results['vc'][i].membership,results['vc'][j].membership) 57 | #score = skmetrics.f1_score(to_crisp_membership(results['vc'][i].membership), 58 | # to_crisp_membership(results['vc'][j].membership)) 59 | score = skmetrics.adjusted_rand_score(to_crisp_membership(covers[i].membership), 60 | to_crisp_membership(covers[j].membership)) 61 | distance_matrix[i,j] = 1-score 62 | distance_matrix[j,i] = 1-score 63 | distance_matrix = np.matrix(distance_matrix) 64 | 65 | y = squareform(distance_matrix) 66 | Z = average(y) 67 | return distance_matrix, y, Z 68 | -------------------------------------------------------------------------------- /circulo/wrappers/community.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import igraph 3 | 4 | import circulo.algorithms 5 | from circulo.algorithms import * 6 | 7 | import statistics 8 | 9 | 10 | from circulo.data.databot import CirculoData 11 | 12 | 13 | def cleanup(G, databot, descript, algo_directed, algo_simple, algo_uses_weights): 14 | ''' 15 | GRAPH Cleaning: Sometimes the graphs need to be cleaned for certain type of algorithms. 16 | The idea here is that we are trying to best match the data to what the algorithm can do. 17 | We start with specific matches and work our way to more general. 18 | ''' 19 | 20 | alterations = [] 21 | 22 | #first we check if algo and data have same directedness and type. 
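#if they already match, the graph is returned unchanged and no alterations are recorded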
23 | if G.is_directed() == algo_directed and G.is_simple() == algo_simple and G.is_weighted() == algo_uses_weights: 24 | weight_attr = "weight" if G.is_weighted() else None 25 | return G, weight_attr, alterations 26 | 27 | if algo_directed and not G.is_directed(): 28 | print("\t[Info - ", descript, "] - Warning: Passing undirected graph to directed algo") 29 | 30 | #make a copy to prevserve original 31 | G_copy = G.copy() 32 | 33 | #add edge weights if not existing 34 | if not G_copy.is_weighted(): 35 | G_copy.es()['weight'] = 1 36 | alterations.append('weighted') 37 | 38 | #if the graph is directed and algo is not directed, we make the graph undirected 39 | if G_copy.is_directed() and not algo_directed: 40 | orig_edge_count = G_copy.ecount() 41 | G_copy.to_undirected(combine_edges={'weight':sum}) 42 | alterations.append('undirected') 43 | edges_removed = orig_edge_count - G_copy.ecount() 44 | print("\t[Info - ", descript, "] Converted directed to undirected: ", edges_removed, " edges collapsed of ", orig_edge_count) 45 | 46 | #if the algo is simple but the data is not, then we have to make the data simple 47 | if algo_simple and not G.is_simple(): 48 | orig_edge_count = G_copy.ecount() 49 | G_copy.simplify(combine_edges={'weight':sum}) 50 | alterations.append('simple') 51 | edges_removed = orig_edge_count - G_copy.ecount() 52 | print("\t[Info - ", descript, "] Simplifying multigraph: ", edges_removed, " edges collapsed of ", orig_edge_count) 53 | 54 | #just quick check to see if the graph is nearly complete. If so we want to warn the user 55 | #since many algos don't do well with nearly complete graphs 56 | if G_copy.is_simple(): 57 | complete_edges = G_copy.vcount()*(G.vcount()-1)/2 58 | 59 | if complete_edges *.8 < G_copy.ecount(): 60 | print("\t[WARNING: ",descript,"] Graph is nearly complete") 61 | 62 | return G_copy, "weight", alterations 63 | 64 | 65 | stochastic_algos = { 66 | "infomap", 67 | "fastgreedy", 68 | "leading_eigenvector", 69 | "multilevel", 70 | "label_propogation", 71 | "walktrap", 72 | "spinglass", 73 | "bigclam", 74 | "clauset_newman_moore" 75 | } 76 | 77 | def comm_infomap(G, databot, descript): 78 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 79 | return alterations, partial(igraph.Graph.community_infomap, G, edge_weights=weights, vertex_weights=None) 80 | 81 | def comm_fastgreedy(G, databot, descript): 82 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 83 | return alterations, partial(igraph.Graph.community_fastgreedy, G, weights=weights) 84 | 85 | def comm_edge_betweenness(G, databot, descript): 86 | #edge betweenness does support undirected and directed, so just say that the algo_directed is the 87 | #same as the data being passed to it 88 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=G.is_directed(), algo_simple=True, algo_uses_weights=True) 89 | return alterations, partial(igraph.Graph.community_edge_betweenness, G, G.is_directed(), weights) 90 | 91 | def comm_leading_eigenvector(G, databot, descript): 92 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 93 | return alterations, partial(igraph.Graph.community_leading_eigenvector, G, weights=weights) 94 | 95 | def comm_multilevel(G, databot, descript): 96 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, 
algo_uses_weights=True) 97 | return alterations, partial(igraph.Graph.community_multilevel, G, weights=weights) 98 | 99 | def comm_label_propagation(G, databot, descript): 100 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 101 | return alterations, partial(igraph.Graph.community_label_propagation, G, weights=weights) 102 | 103 | def comm_walktrap(G, databot, descript): 104 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 105 | return alterations, partial(igraph.Graph.community_walktrap, G, weights=weights) 106 | 107 | def comm_spinglass(G, databot, descript): 108 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=True) 109 | return alterations, partial(igraph.Graph.community_spinglass, G, weights=weights) 110 | 111 | def comm_conga(G, databot, descript): 112 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 113 | return alterations, partial(circulo.algorithms.conga.conga, G) 114 | 115 | def comm_congo(G, databot, descript): 116 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 117 | return alterations, partial(circulo.algorithms.congo.congo, G) 118 | 119 | def comm_radicchi_strong(G, databot, descript): 120 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 121 | return alterations, partial(circulo.algorithms.radicchi.radicchi,G,'strong') 122 | 123 | def comm_radicchi_weak(G, databot, descript): 124 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 125 | return alterations, partial(circulo.algorithms.radicchi.radicchi,G,'weak') 126 | 127 | def comm_clique_percolation(G, databot, descript): 128 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 129 | return alterations, partial(circulo.algorithms.snap_cpm.clique_percolation,G) 130 | 131 | 132 | def comm_bigclam(G, databot, descript): 133 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=True, algo_simple=True, algo_uses_weights=False) 134 | ctx = databot.get_context() 135 | num_comms = -1 # Detect automatically 136 | min_comms = 1 137 | max_comms = len(G.vs) 138 | 139 | return alterations, partial(circulo.algorithms.snap_bigclam.bigclam, G, detect_comm=num_comms, min_comm=min_comms, max_comm=max_comms) 140 | 141 | def comm_cesna(G, databot, descript): 142 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 143 | ctx = databot.get_context() 144 | num_comms = -1 # Detect automatically 145 | 146 | min_comms = 1 147 | max_comms = len(G.vs) 148 | 149 | try: 150 | attrs_to_use = ctx[CirculoData.CONTEXT_ATTRS_TO_USE] 151 | except KeyError: 152 | print("\t[skipping cesna because attributes not provided for ", descript) 153 | return None,None 154 | return alterations, partial(circulo.algorithms.snap_cesna.cesna, G, attrs_to_use, detect_comm=num_comms, min_comm=min_comms, max_comm=max_comms) 155 | 156 | 157 | def comm_coda(G, databot, descript): 158 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 159 | return alterations, 
partial(circulo.algorithms.snap_coda.coda, G) 160 | 161 | def comm_clauset_newman_moore(G, databot, descript): 162 | G, weights, alterations = cleanup(G, databot, descript, algo_directed=False, algo_simple=True, algo_uses_weights=False) 163 | return alterations, partial(circulo.algorithms.snap_cnm.clauset_newman_moore, G) 164 | -------------------------------------------------------------------------------- /experiments/README.md: -------------------------------------------------------------------------------- 1 | #Experiments 2 | 3 | 4 | ### Ground Truth Similarity Test 5 | - __PATH__: [metrics_clustering.py](metrics_clustering.py) 6 | - __GOAL__: Determine whether metrics computed on Ground Truth communities look similar to metrics computed on non Ground Truth communities. The experiment leverages k-means clustering where the features are the metrics and each observation is the set of feature values for a given community. The intuition behind this experiment is that if the ground truth communities fall into the same cluster, then there must exist some combination of metrics that represents the ideal community for this 7 | particular dataset. 8 | - __RUN__: `python experiments/metrics_clustering.py metrics_dir dataset_name` 9 | - __RESULTS__: The results show the _Groundtruth similarity_ (the largest percentage of ground truth communities in the same cluster) and the frequency of ground truth communities in each cluster. For example, you might see the following: 10 | Groundtruth similarity: 0.5833333333333334 11 | Frequency of groundtruth communities as part of centroids [[0 7][1 5]] 12 |
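The core idea can be sketched as follows (an illustrative sketch only, not the actual script; it assumes scikit-learn is installed, that `X` is a numpy matrix with one row of metric values per community, and that `is_ground_truth` is a boolean list marking the rows that came from the ground truth cover):

```python
import numpy as np
from sklearn.cluster import KMeans

# Cluster the communities in metric space.
labels = KMeans(n_clusters=2).fit_predict(X)

# How concentrated are the ground truth communities in a single cluster?
gt_labels = labels[np.array(is_ground_truth)]
counts = np.bincount(gt_labels)
print("Groundtruth similarity:", counts.max() / len(gt_labels))
```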
13 | ### Label Communities 14 | - __PATH__: [community_label.py](community_label.py) 15 | - __GOAL__: Attempt to label communities by using node and edge attributes. For each community, look at attribute values that are common in that community. 16 | - __RUN__: `python community_label.py ` 17 | - __RESULTS__: For each community, the most common label for each attribute is displayed [Note: the most common attribute value is only displayed if it is assigned to more than half of the nodes in the community] 18 | 19 | ![Community Labels](images/community_label_results.png) 20 | 21 | ### Time vs Accuracy (two approaches) 22 | #####Approach 1 23 | - __PATH__: [metricsCharts.R](metricsCharts.R) (Function: plotRunOmega) 24 | - __GOAL__: For a given data set with Ground Truth, measure how result accuracy changes across algorithms, specifically taking into consideration execution time (time complexity). The idea is to test, for example, whether time is correlated with accuracy. 25 | - __RUN__: 26 | - ` metrics <- getMetrics("/path/to/json/metrics", "dataset name (i.e. football)")` 27 | - `plotRunOmega(metrics)` 28 | - __RESULTS__: This example shows the log of the calculation time vs. omega score for all the datasets from https://github.com/Lab41/Circulo-Data/releases/tag/2 29 | 30 | ![Log(time) vs. Omega Score](images/time_vs_omega.png) 31 | #####Approach 2 32 | - __PATH__: [metricsCharts.R](metricsCharts.R) (Function: plotMetrics) 33 | - __GOAL__: Compare result computation time and accuracy across datasets 34 | - __RUN__: 35 | - `metrics <- getMetrics("/path/to/json/metrics", "dataset name (i.e. football)")` 36 | - `plotMetrics(metrics)` 37 | - __RESULTS__: This example shows datasets vs. algorithms. The size of each bubble represents the Omega score and the color represents how long it took to compute that result 38 | 39 | ![Dataset vs. Algorithm](images/bubble_plot.png) 40 | 41 | 42 | ### Similar Algorithms 43 | - __PATH__: [cluster_omega_comparison.py](cluster_omega_comparison.py) 44 | - __GOAL__: Determine which algorithms produce similar results by comparing how similar their respective partitions are to each other. 45 | - __RUN__: `python cluster_omega_comparison.py ` 46 | - __RESULTS__: Counts of how often two algorithms produce similar results. For example: 47 | 48 | ![Counts](images/counts.png) 49 | 50 | 51 | ### Histogram metrics across datasets 52 | - __PATH__: [histogram_metrics.py](histogram_metrics.py) 53 | - __GOAL__: This script allows you to compare metric results across algorithms for a single dataset. It creates a histogram for each metric/algorithm pair showing the number of communities that fall into each bin for that metric. 54 | - __RUN__: `python histogram_metrics.py [Optional: --metrics Density,Cohesiveness]` 55 | - __RESULTS__: This example shows the distributions of five metrics across algorithms for the football data 56 | 57 | ![Histogram of Football Data ](images/football_histogram.png) 58 | 59 | ### Goodness Metrics 60 | - __PATH__: [goodness_indicators.py](goodness_indicators.py) 61 | - __GOAL__: This experiment is based on _Jaewon Yang and Jure Leskovec, http://cs.stanford.edu/people/jure/pubs/comscore-icdm12.pdf, Defining and Evaluating Network Communities based on Ground-truth_. It determines which community metrics are most correlated. 62 | - __RUN__: `python goodness_indicators.py metrics_dir` 63 | - __RESULTS__: An example result for the football ground truth dataset is shown below: 64 | 65 | ![Correlated Metrics](images/football--groundtruth--0.png) 66 | 67 | 68 | #### Plot Community Detection 69 | - __PATH__: [gephi_plot](gephi_plot) 70 | - __GOAL__: The graphml file created by create_graphml.py makes it easy to view the dataset in Gephi and explore the graph. This Java program creates static PDFs of the results from the various algorithms, using Gephi as a layout and plotting engine to visualize the results. 71 | - __COMPILE__: Use Maven to compile the project. `mvn compile assembly:single` will give you a jar that contains all the dependencies needed to run the executable 72 | - __RUN__: 73 | - `python create_graphml.py [--least]` 74 | - `java -jar gephi_plot-0.0.1-SNAPSHOT-jar-with-dependencies.jar ` 75 | - __RESULTS__: A set of PDFs is produced, using the community detection results to color a visualization of the underlying graph laid out with a force-directed layout (Gephi's Force Atlas 2). This plot is the flights data colored using the Infomap community detection results: 76 | 77 | ![Flights data colored using Infomap results](images/flights_algo_infomap.png) 78 | 79 | 80 | -------------------------------------------------------------------------------- /experiments/cluster_omega_comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import numpy as np 18 | from sklearn.cluster import spectral_clustering 19 | import argparse 20 | from math import floor, sqrt 21 | from operator import itemgetter 22 | from itertools import combinations 23 | import os 24 | import glob 25 | import json 26 | import operator 27 | 28 | from circulo.utils.general import run_comparison 29 | 30 | THRESHOLD = .7 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser(description= 'Use Relative Omega Scores to determine similarity of algorithms') 34 | parser.add_argument('results_path', type=str, help='directory containing algorithm results') 35 | args = parser.parse_args() 36 | 37 | if not os.path.exists(args.results_path): 38 | print("Path \"{}\" does not exist".format(args.results_path)) 39 | return 40 | 41 | dataset_groups = {} 42 | algos = set() 43 | 44 | #sets the list of json files to a Key (dataset name) 45 | #Allows us to quickly iterate over all result files for each dataset 46 | #At the same time, we collect the list of algos from the results 47 | for fname in glob.glob(os.path.join(args.results_path, '*.json')): 48 | dataset = os.path.basename(fname).split('--')[0] 49 | algos.add(os.path.basename(fname).split('--')[1]) 50 | if dataset in dataset_groups: 51 | dataset_groups[dataset].append(fname) 52 | else: 53 | dataset_groups[dataset] = [fname] 54 | 55 | #create count dict for all possible pairs of algos (includes groundtruth) 56 | counts=dict.fromkeys(combinations(sorted([a for a in algos]),2),0) 57 | 58 | #now iterate over each dataset name (there json files) and update the 59 | #counts accordingly 60 | for dataset_name, json_files in dataset_groups.items(): 61 | memberships = [] 62 | algo_names = [] 63 | for fjson in json_files: 64 | 65 | algo_names.append(os.path.basename(fjson).split("--")[1]) 66 | 67 | with open(fjson) as f: 68 | memberships.append(json.load(f)['membership']) 69 | 70 | coords = np.argwhere(run_comparison(memberships) > THRESHOLD) 71 | 72 | for v in coords: 73 | x,y = v.flatten() 74 | if x != y and algo_names[x] < algo_names[y]: 75 | counts[(algo_names[x], algo_names[y])]+=1 76 | 77 | sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True) 78 | 79 | print("Total Datasets: ", len(dataset_groups)) 80 | 81 | for s in sorted_counts: 82 | print(s) 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /experiments/gephi_plot/create_graphml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | import multiprocessing 17 | import time 18 | import signal 19 | import os 20 | import errno 21 | import traceback 22 | from collections import namedtuple 23 | import glob 24 | import sys 25 | import igraph 26 | import matplotlib.pyplot as plt 27 | from circulo.wrappers import community 28 | from circulo.metrics import omega 29 | import argparse 30 | import os 31 | import json 32 | import datetime 33 | import multiprocessing 34 | 35 | Worker = namedtuple('Worker', 'json_path raw_graph_path output_path pick_least_frequent pick_most_frequent timeout') 36 | 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser(description='Compute metrics for given cover.') 40 | parser.add_argument('input_path', type=str, help='file or directory containing results') 41 | parser.add_argument('raw_graph_path', type=str, help='File or directory graphml files [typically circulo/data/GRAPHS/]') 42 | parser.add_argument('output_path', type=str, help='output directory to write metric files') 43 | parser.add_argument('--least', action="store_true", help='If you add this flag only keep least frequent community for a given node is kept (useful for plotting)') 44 | parser.add_argument('--most', action="store_true", help='If you add this flag only keep most frequent community for a given node is kept (useful for plotting)') 45 | parser.add_argument('--workers', type=int, default=multiprocessing.cpu_count(), help='Number of workers to process (DEFAULT: number of processors)') 46 | parser.add_argument('--timeout', type=int, default=3600, help="timeout for a work item in seconds (DEFAULT: 3600)") 47 | args = parser.parse_args() 48 | 49 | if args.least and args.most: 50 | print('Cannot select both least and most common community') 51 | return 52 | 53 | if not os.path.exists(args.input_path): 54 | print("Path \"{}\" does not exist".format(args.input_path)) 55 | return 56 | 57 | if not os.path.exists(args.output_path): 58 | os.makedirs(args.output_path) 59 | 60 | workers = [] 61 | json_groups = {} 62 | json_files = glob.glob(os.path.join(args.input_path, '*.json')) 63 | for json_file in json_files: 64 | dataset = os.path.basename(json_file).split('--')[0] 65 | if dataset in json_groups: 66 | json_groups[dataset].append(json_file) 67 | else: 68 | json_groups[dataset] = [json_file] 69 | 70 | raw_graph_files = glob.glob(os.path.join(args.raw_graph_path, '*.graphml')) 71 | for (dataset, json_files) in json_groups.items(): 72 | raw_graph_file_path = None 73 | for raw_graph_file in raw_graph_files: 74 | if os.path.basename(raw_graph_file).startswith(dataset): 75 | raw_graph_file_path = raw_graph_file 76 | workers.append(Worker(json_files, raw_graph_file_path, args.output_path, args.least, args.most, args.timeout)) 77 | 78 | if args.workers is not None: 79 | pool = multiprocessing.Pool(processes = args.workers) 80 | else: 81 | pool = multiprocessing.Pool() 82 | 83 | r = pool.map_async(analyze_json, workers) 84 | r.get() #must call in order to get error from inside the child processes 85 | pool.close() 86 | pool.join() 87 | 88 | 89 | class TimeoutError(Exception): 90 | pass 91 | 92 | 93 | def __handle_timeout(signum, frame): 94 | raise TimeoutError(os.strerror(errno.ETIME)) 95 | 96 | 97 | def __get_least_frequent_community(community_array, community_counts, reverse): 98 | counts = [] 99 | for community in community_array: 100 | counts.append((community_counts[community], community)) 101 | 102 | counts.sort() 103 | if reverse: 104 | counts.reverse() 105 | 106 | for i, (count,community) in enumerate(counts): 107 | if count != 
1 or i == len(counts)-1: 108 | return community 109 | 110 | 111 | def analyze_json(worker): 112 | """ 113 | Take in a set of json community detection results files and a graphml file representing the raw graph and output a 114 | graphml file that contains, as attributes, the results of the algorithms 115 | 116 | Args: 117 | worker: Named tuple of json_path raw_graph_path output_path timeout 118 | """ 119 | signal.signal(signal.SIGALRM, __handle_timeout) 120 | signal.setitimer(signal.ITIMER_REAL, worker.timeout) 121 | 122 | print('Loading raw Graphml file truth file: %s'%worker.raw_graph_path) 123 | if worker.raw_graph_path is not None: 124 | G = igraph.load(worker.raw_graph_path) 125 | else: 126 | print("ERROR: Not able to load graph") 127 | return 128 | 129 | try: 130 | for json_path in worker.json_path: 131 | with open(json_path) as f: 132 | data = json.load(f) 133 | (name, algorithm) = data['job_name'].split('--')[:2] 134 | 135 | algo_name = 'algo_%s'%algorithm 136 | 137 | # Only if we are pulling least frequent 138 | if worker.pick_least_frequent or worker.pick_most_frequent: 139 | # Calculate number of nodes in each community 140 | community_counts = {} 141 | for node in data['membership']: 142 | for community in node: 143 | if community in community_counts: 144 | community_counts[community] += 1 145 | else: 146 | community_counts[community] = 1 147 | 148 | # Add property to graph 149 | for node in G.vs(): 150 | # Get cover Array 151 | # TODO: Fix this hacky way to turn node id (i.e. "n1") into node index (i.e. 1) 152 | try: 153 | community_array = data['membership'][int(node['id'][1:])] 154 | except IndexError: 155 | community_array= [] 156 | 157 | if worker.pick_least_frequent: 158 | least_frequent_community = __get_least_frequent_community(community_array, community_counts, reverse=False) 159 | if least_frequent_community is None: 160 | least_frequent_community = -1 161 | G.vs[node.index][algo_name] = str(least_frequent_community) 162 | elif worker.pick_most_frequent: 163 | least_frequent_community = __get_least_frequent_community(community_array, community_counts, reverse=True) 164 | if least_frequent_community is None: 165 | least_frequent_community = -1 166 | G.vs[node.index][algo_name] = str(least_frequent_community) 167 | else: 168 | G.vs[node.index][algo_name] = ','.join([str(x) for x in community_array]) 169 | 170 | except TimeoutError as t: 171 | print("\t+Timeout ERROR: was analyzing: ", data['job_name']) 172 | signal.alarm(0) 173 | return 174 | except Exception as e: 175 | print(e) 176 | traceback.print_exc(file=sys.stdout) 177 | return 178 | 179 | graphml_file_output = os.path.join(worker.output_path, "%s.graphml"% name) 180 | print("Writing Graph: %s"%graphml_file_output ) 181 | igraph.write(G, graphml_file_output) 182 | 183 | 184 | if __name__ == "__main__": 185 | main() -------------------------------------------------------------------------------- /experiments/gephi_plot/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.lab41.circulo 6 | gephi_plot 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | gephi_plot 11 | http://maven.apache.org 12 | 13 | 14 | 15 | 16 | gephi-snapshots 17 | Gephi Snapshots 18 | http://nexus.gephi.org/nexus/content/repositories/snapshots/ 19 | 20 | 21 | gephi-releases 22 | Gephi Releases 23 | http://nexus.gephi.org/nexus/content/repositories/releases/ 24 | 25 | 26 | 27 | UTF-8 28 | 29 | 30 | 31 | org.gephi 32 | gephi-toolkit 33 | 0.8.2 34 | 35 | 36 | junit 37 | junit 38 | 3.8.1 39 | test 40 
| 41 | 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-compiler-plugin 47 | 3.2 48 | 49 | 1.7 50 | 1.7 51 | 52 | 53 | 54 | maven-assembly-plugin 55 | 56 | 57 | 58 | com.lab41.circulo.gephi_plot.PlotGraphs 59 | 60 | 61 | 62 | jar-with-dependencies 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /experiments/gephi_plot/src/main/java/com/lab41/circulo/gephi_plot/PlotGraphs.java: -------------------------------------------------------------------------------- 1 | package com.lab41.circulo.gephi_plot; 2 | /* 3 | * Based on Gephi Headless Example by Mathieu Bastian (GPL v3) 4 | */ 5 | 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.nio.file.Paths; 9 | import java.util.ArrayList; 10 | 11 | import org.gephi.data.attributes.api.AttributeColumn; 12 | import org.gephi.data.attributes.api.AttributeController; 13 | import org.gephi.data.attributes.api.AttributeModel; 14 | import org.gephi.graph.api.DirectedGraph; 15 | import org.gephi.graph.api.GraphController; 16 | import org.gephi.graph.api.GraphModel; 17 | import org.gephi.io.exporter.api.ExportController; 18 | import org.gephi.io.importer.api.Container; 19 | import org.gephi.io.importer.api.ImportController; 20 | import org.gephi.io.processor.plugin.DefaultProcessor; 21 | import org.gephi.layout.plugin.forceAtlas2.ForceAtlas2; 22 | import org.gephi.layout.plugin.forceAtlas2.ForceAtlas2Builder; 23 | import org.gephi.partition.api.NodePartition; 24 | import org.gephi.partition.api.PartitionController; 25 | import org.gephi.partition.plugin.NodeColorTransformer; 26 | import org.gephi.preview.api.PreviewController; 27 | import org.gephi.preview.api.PreviewModel; 28 | import org.gephi.preview.api.PreviewProperty; 29 | import org.gephi.project.api.ProjectController; 30 | import org.gephi.project.api.Workspace; 31 | import org.openide.util.Lookup; 32 | 33 | public class PlotGraphs { 34 | public static void main(String[] args){ 35 | if(args.length != 2){ 36 | System.err.println("Usage java -jar gephi_plot.jar "); 37 | System.exit(65); 38 | } 39 | 40 | ArrayList filesToProcess = new ArrayList(); 41 | File inputPath = new File(args[0]); 42 | if (inputPath.exists()){ 43 | // If input is a single file add that and continue 44 | if (inputPath.isFile()){ 45 | filesToProcess.add(inputPath.getPath()); 46 | // For each input file process output 47 | }else{ 48 | for (String filePath: inputPath.list()){ 49 | if (filePath.endsWith(".graphml") == true){ 50 | String fullFilePath = Paths.get(inputPath.getPath(), filePath).toString(); 51 | filesToProcess.add(fullFilePath); 52 | } 53 | } 54 | } 55 | }else{ 56 | System.err.println("Input path does not exist: " + args[0]); 57 | System.exit(65); 58 | } 59 | 60 | for (String fileToProcess: filesToProcess){ 61 | PlotGraphs hs = new PlotGraphs(); 62 | hs.script(fileToProcess, args[1]); 63 | } 64 | } 65 | 66 | 67 | public void script(String graphPath, String outputPath) { 68 | // Extract dataset name 69 | String graphFileName = new File(graphPath).getName(); 70 | String datasetName = graphFileName.substring(0, graphFileName.indexOf(".graphml")); 71 | 72 | // Initialize a Gephi project and workspace 73 | ProjectController pc = Lookup.getDefault().lookup(ProjectController.class); 74 | pc.newProject(); 75 | Workspace workspace = pc.getCurrentWorkspace(); 76 | 77 | // Get models and controllers for this new workspace - will be useful later 78 | AttributeModel attributeModel = 
Lookup.getDefault().lookup(AttributeController.class).getModel(); 79 | GraphModel graphModel = Lookup.getDefault().lookup(GraphController.class).getModel(); 80 | PreviewModel model = Lookup.getDefault().lookup(PreviewController.class).getModel(); 81 | ImportController importController = Lookup.getDefault().lookup(ImportController.class); 82 | PartitionController partitionController = Lookup.getDefault().lookup(PartitionController.class); 83 | 84 | // Import file 85 | Container container; 86 | try { 87 | File file = new File(graphPath); 88 | container = importController.importFile(file); 89 | } catch (Exception ex) { 90 | ex.printStackTrace(); 91 | return; 92 | } 93 | 94 | // Append imported data to GraphAPI 95 | importController.process(container, new DefaultProcessor(), workspace); 96 | 97 | // See if graph is well imported 98 | DirectedGraph graph = graphModel.getDirectedGraph(); 99 | 100 | // Do ForceAtlas2 based layout 101 | ForceAtlas2Builder fa2b = new ForceAtlas2Builder(); 102 | ForceAtlas2 fa2Layout = fa2b.buildLayout(); 103 | fa2Layout.setGraphModel(graphModel); 104 | fa2Layout.setThreadsCount(Runtime.getRuntime().availableProcessors()); 105 | fa2Layout.initAlgo(); 106 | int i_max = 1000; // TODO: Look into setting this more intelligently (some sort of convergence metric) 107 | long startTime = System.currentTimeMillis(); 108 | long currentTime = System.currentTimeMillis(); 109 | for (int i = 0; i < i_max && fa2Layout.canAlgo() && currentTime - startTime < 1000*60*10 ; i++) { 110 | // Want to take faster steps at first but then be more careful 111 | if (i < i_max/4.0){ 112 | fa2Layout.setJitterTolerance(1.0); 113 | }else{ 114 | fa2Layout.setJitterTolerance(0.1); 115 | } 116 | fa2Layout.goAlgo(); 117 | currentTime = System.currentTimeMillis(); 118 | } 119 | fa2Layout.endAlgo(); 120 | 121 | // Figure out which algorithms are in the results set 122 | ArrayList algoResultsPresent = new ArrayList(); 123 | for(AttributeColumn ac: attributeModel.getNodeTable().getColumns()){ 124 | String title = ac.getTitle(); 125 | if (title.startsWith("algo")){ 126 | algoResultsPresent.add(ac.getTitle()); 127 | } 128 | } 129 | 130 | // For each algorithm, create an output of the results 131 | for (String algoResult: algoResultsPresent){ 132 | System.out.println("Printing: " + algoResult); 133 | NodePartition p = (NodePartition) partitionController.buildPartition(attributeModel.getNodeTable().getColumn(algoResult), graph); 134 | NodeColorTransformer nodeColorTransformer = new NodeColorTransformer(); 135 | nodeColorTransformer.randomizeColors(p); 136 | partitionController.transform(p, nodeColorTransformer); 137 | 138 | // Don't show node labels, make edges straight lines 139 | model.getProperties().putValue(PreviewProperty.SHOW_NODE_LABELS, Boolean.FALSE); 140 | model.getProperties().putValue(PreviewProperty.EDGE_CURVED, Boolean.FALSE); 141 | 142 | // Export to PDF file 143 | ExportController ec = Lookup.getDefault().lookup(ExportController.class); 144 | try { 145 | String outputFileName = datasetName + "_"+algoResult+".pdf"; 146 | ec.exportFile(Paths.get(outputPath, outputFileName).toFile()); 147 | } catch (IOException ex) { 148 | ex.printStackTrace(); 149 | return; 150 | } 151 | } 152 | 153 | } 154 | } -------------------------------------------------------------------------------- /experiments/histogram_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2014 In-Q-Tel, Inc/Lab41, All Rights Reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | import argparse 17 | import os 18 | import glob 19 | import json 20 | import sys 21 | import numpy as np 22 | import matplotlib.pyplot as plt 23 | 24 | 25 | def analyze_metrics(dataset, output_dir, file_names, metrics_to_evaluate): 26 | """ 27 | Creates histograms of specific metrics across algorithms 28 | 29 | Args: 30 | dataset (string): dataset being processed [used for naming output file] 31 | output_dir (string): output path 32 | file_names (list of strings): Input metrics json files 33 | metrics_to_evaluate (list of strings): Metrics to be histogramed 34 | Return: 35 | None 36 | """ 37 | num_files = len(file_names) 38 | # Load metrics into memory 39 | metrics = [] 40 | for json_path in file_names: 41 | with open(json_path) as f: 42 | metrics.append(json.load(f)) 43 | 44 | # Get min/max for each metric across all datasets 45 | metric_min_max = {} 46 | for column, metric_to_evaluate in enumerate(metrics_to_evaluate): 47 | mins = [] 48 | maxes = [] 49 | for i, data in enumerate(metrics): 50 | mins.append(min(data['metrics'][metric_to_evaluate]['results'])) 51 | maxes.append(max(data['metrics'][metric_to_evaluate]['results'])) 52 | 53 | metric_min_max[metric_to_evaluate] = (min(mins), max(maxes)) 54 | 55 | # Create Plots 56 | plt.clf() 57 | for column, metric_to_evaluate in enumerate(metrics_to_evaluate): 58 | for i, data in enumerate(metrics): 59 | (dataset, algorithm, number) = data['name'].split('--') 60 | print('Processing: ', dataset, algorithm) 61 | 62 | # Create subplot 63 | ax = plt.subplot(num_files, len(metrics_to_evaluate), i*(len(metrics_to_evaluate)) + 1 + column) 64 | plt.hist(data['metrics'][metric_to_evaluate]['results'], bins=20, range=metric_min_max[metric_to_evaluate]) 65 | plt.yticks(ax.get_ylim(), fontsize=8) 66 | 67 | # Set algorithm name on left hand side 68 | if column == 0: 69 | plt.ylabel(algorithm, rotation='horizontal', fontsize=8) 70 | 71 | # Set metric name on top of coluns 72 | if i == 0: 73 | print('Printing Title: ', metric_to_evaluate) 74 | plt.title(metric_to_evaluate, fontsize=8) 75 | 76 | # Only print x axis ticks at bottom of the columns 77 | if i != len(metrics)-1: 78 | plt.xticks(fontsize=0) 79 | else: 80 | plt.xticks(rotation='vertical', fontsize=8) 81 | 82 | plt.savefig(os.path.join(output_dir, '%s.png'%dataset)) 83 | 84 | def main(): 85 | 86 | parser = argparse.ArgumentParser(description= 87 | 'Create side by side histograms for various metrics across algorithms for a given dataset') 88 | parser.add_argument('input_path', type=str, help='file or directory containing metric json files') 89 | parser.add_argument('dataset', type=str, help='Dataset desired (i.e. 
football)') 90 | parser.add_argument('--metrics', type=str, 91 | default=','.join(['Separability', 'Cohesiveness', 'Density', 'Triangle Participation Ratio', 'Conductance']), 92 | help='Metrics to Compare (comma separated)') 93 | parser.add_argument('--output', type=str, default=os.getcwd(), help='Base output directory') 94 | args = parser.parse_args() 95 | 96 | if not os.path.exists(args.input_path): 97 | print("Path \"{}\" does not exist".format(args.input_path)) 98 | return 99 | 100 | 101 | if os.path.isdir(args.input_path): 102 | file_names = glob.glob(os.path.join(args.input_path, '*%s*.json'%args.dataset)) 103 | analyze_metrics(args.dataset, args.output, file_names, args.metrics.split(',')) 104 | else: 105 | analyze_metrics(args.dataset, args.output, [args.input_path], args.metrics.split(',')) 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /experiments/images/bubble_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/bubble_plot.png -------------------------------------------------------------------------------- /experiments/images/community_label_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/community_label_results.png -------------------------------------------------------------------------------- /experiments/images/counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/counts.png -------------------------------------------------------------------------------- /experiments/images/flights_algo_infomap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/flights_algo_infomap.png -------------------------------------------------------------------------------- /experiments/images/football--groundtruth--0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/football--groundtruth--0.png -------------------------------------------------------------------------------- /experiments/images/football_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/football_histogram.png -------------------------------------------------------------------------------- /experiments/images/time_vs_omega.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lab41/Circulo/77692ff21566a721d4bf45c0d88053f9cf2bfa93/experiments/images/time_vs_omega.png -------------------------------------------------------------------------------- /experiments/metricsCharts.R: -------------------------------------------------------------------------------- 1 | #For Lab 41 Circulo metrics json files 2 | #Patrick Wheatley NGA + others :) 3 | #29Aug2014. 
Modified 22 Sep 2014 4 | #reads json files from directory and plots pdf bubble chart of computation time and omega accuracy 5 | 6 | # Sample Usage: 7 | # metrics <- getMetrics(datapath, "dataset name") 8 | # plotMetrics(metrics) 9 | # plotHist(metrics,'omega') 10 | # plotHist(metrics,'time') 11 | # plotRunOmega(metrics) 12 | 13 | library(ggplot2) 14 | #Switched from jsonlite to RJSIONIO for speed reasons 15 | library(RJSONIO) 16 | 17 | # Read metrics from json 18 | getMetrics <- function(datapath='/Users/paulm/Desktop/metrics', dataset="football") { 19 | 20 | #Get file names and load the json files 21 | filenames <- list.files(datapath, pattern=paste(".*",dataset,".*.json", sep=""), full.names=TRUE) 22 | N <- length(filenames) 23 | results <- lapply(filenames, fromJSON) 24 | 25 | #parse filenames to get algorithm names and dataset names 26 | names <- basename(filenames) 27 | names2 <- sapply(1:N, function(x) strsplit(names, "\\.")[[x]][1]) 28 | Datasets <- sapply(1:N, function(x) strsplit(names2, "--")[[x]][1]) 29 | Algorithms <-sapply(1:N, function(x) strsplit(names2, "--")[[x]][2]) 30 | 31 | #Pull computation time and omega from the json files 32 | ComputationTime <- sapply(1:N, function (x) results[[x]]$elapsed) 33 | OmegaAccuracy <-sapply(1:N, function (x) results[[x]]$omega) 34 | 35 | #fussy R data type formatting 36 | metrics <- cbind(Algorithms,Datasets,ComputationTime,OmegaAccuracy) 37 | ind <- which(metrics[,"OmegaAccuracy"] != "NULL") 38 | metrics <-data.frame(metrics[ind,],stringsAsFactors=FALSE) 39 | metrics <- data.frame(lapply(metrics, unlist),stringsAsFactors=FALSE) 40 | metrics$ComputationTime <- as.numeric(metrics$ComputationTime) 41 | # Normalize computation time by dataset 42 | metrics$ComputationTime <- ave(metrics$ComputationTime, list(metrics$Datasets), FUN=function(L) L/min(L)) 43 | metrics$OmegaAccuracy <- as.numeric(metrics$OmegaAccuracy) 44 | 45 | 46 | return(metrics) 47 | } 48 | 49 | # Plot Metrics 50 | plotMetrics <- function(metrics,toPDF=FALSE) { 51 | # Group metrics by Dataset and Algorithm, then summarize 52 | data <- aggregate(metrics[,c('ComputationTime','OmegaAccuracy')],list(metrics$Datasets,metrics$Algorithms),mean) 53 | colnames(data)[1:2] <- c("Datasets","Algorithms") 54 | 55 | keep <- which(data$Algorithms != 'groundtruth') 56 | data <- data[keep,] 57 | 58 | bubbleplot <- ggplot(data, aes(x=Datasets, y=Algorithms))+ 59 | geom_point(aes(size=ComputationTime, colour=OmegaAccuracy), alpha=0.75)+ 60 | scale_size_continuous(range =c(8, 25), trans='log')+ 61 | scale_colour_gradient2(midpoint=0.4, low="red",mid="yellow", high="dark green")+ 62 | theme_bw() + 63 | theme(text = element_text(size=20))+ 64 | ggtitle('Accuracy and Computation Time across Datasets and Algorithms') 65 | 66 | if (toPDF) { 67 | pdffile <- paste(Sys.time(),"metricsGraph.pdf", sep='') 68 | pdf(pdffile,height=10,width=12) 69 | print(bubbleplot) 70 | dev.off() 71 | cat(sprintf('printed to %s \n', pdffile)) 72 | } else { 73 | print(bubbleplot) 74 | } 75 | } 76 | 77 | # Plot chart comparing runtime to accuracy 78 | plotRunOmega <- function(metrics, toPDF=FALSE) { 79 | runtimeplot <- ggplot(metrics, aes(x=log10(ComputationTime), y=OmegaAccuracy)) + 80 | geom_point(size=0) + 81 | theme_bw()+ 82 | geom_text(aes(x=log10(ComputationTime), y=OmegaAccuracy, label=Algorithms, color=Datasets), size=4, angle=45) 83 | 84 | 85 | if (toPDF) { 86 | pdffile <- paste(Sys.time(),"runtimeVsOmegaAccuracy.png", sep='') 87 | png(pdffile, height=10, width=12, units='in', res=300) 88 | print(runtimeplot) 89 | 
dev.off() 90 | cat(sprintf('printed to %s \n', pdffile)) 91 | } else { 92 | print(runtimeplot) 93 | } 94 | } 95 | 96 | # Plots histogram of specified metric (omega or computation time right now) 97 | plotHist <- function(metrics,col='ComputationTime',toPDF=FALSE) { 98 | 99 | data <- metrics[c('Algorithms','Datasets',col)] 100 | colnames(data)[3] <- "value" 101 | 102 | p <- ggplot(data,aes(x=value,fill=Algorithms)) + 103 | facet_grid(. ~ Datasets) + 104 | geom_density(alpha=0.5) + 105 | #geom_histogram(alpha=0.5,position='identity') 106 | xlab(col) + 107 | ylab('Probability Density') + 108 | theme_bw() + 109 | ggtitle(Sys.time()) 110 | 111 | 112 | if (toPDF) { 113 | pdffile <- paste(Sys.time(),"metricsGraph.pdf", sep='') 114 | pdf(pdffile,height=10,width=12) 115 | print(p) 116 | dev.off() 117 | cat(sprintf('printed to %s \n', pdffile)) 118 | } else { 119 | print(p) 120 | } 121 | } 122 | 123 | -------------------------------------------------------------------------------- /experiments/metrics_clustering.py: -------------------------------------------------------------------------------- 1 | import json 2 | import glob 3 | import os 4 | from collections import Counter 5 | import numpy as np 6 | import argparse 7 | 8 | from scipy.stats import itemfreq 9 | from scipy.cluster.vq import kmeans2, whiten 10 | 11 | 12 | 13 | metric_list = [ 14 | "Conductance", 15 | "Cut Ratio", 16 | "Degree StatisticsBiased Kurtosis", 17 | "Density", 18 | "Expansion", 19 | "Cohesiveness", 20 | "Flake Out Degree Fraction", 21 | ] 22 | 23 | 24 | NUM_DIMENSIONS = len(metric_list) 25 | 26 | 27 | def run_experiment(metrics_path, dataset_name): 28 | 29 | num_comms = 0 30 | 31 | files_analyzed = 0 32 | #go through quickly to determine how many communities you have 33 | for f in glob.glob(metrics_path+"/"+dataset_name+"--*--*.json"): 34 | json_f = open(f) 35 | j = json.load(json_f) 36 | json_f.close() 37 | num_comms+=len(j['metrics']['Density']['results']) 38 | files_analyzed+=1 39 | 40 | if(files_analyzed == 0): 41 | print("No files to analyze") 42 | return 43 | 44 | print("Files Analyzed: ", files_analyzed) 45 | 46 | 47 | print("Running kmeans on ", num_comms, " communities") 48 | 49 | matrix = np.zeros((num_comms, NUM_DIMENSIONS)) 50 | comm_count = 0 51 | gt_start = -1 52 | gt_end = -1 53 | 54 | 55 | for i, f in enumerate(glob.glob(metrics_path+"/"+dataset_name+"--*--*.json")): 56 | 57 | print(f) 58 | json_f = open(f) 59 | j = json.load(json_f) 60 | json_f.close() 61 | metrics = j['metrics'] 62 | 63 | #get the number of comms for this file 64 | add_comms = len(metrics['Density']['results']) + comm_count 65 | 66 | if f == metrics_path+"/"+dataset_name+"--groundtruth--0.json": 67 | gt_start = comm_count 68 | gt_end = add_comms 69 | 70 | dim_idx=0 71 | for metric_name in metric_list: 72 | 73 | results = metrics[metric_name]['results'] 74 | 75 | try: 76 | matrix[comm_count:add_comms,dim_idx] = results 77 | except Exception as e: 78 | print(result_dict['results']) 79 | print("Error: ",e) 80 | 81 | dim_idx+=1 82 | if dim_idx == NUM_DIMENSIONS: 83 | break 84 | 85 | comm_count=add_comms 86 | 87 | matrix_norm = whiten(matrix) 88 | centroid, label = kmeans2(matrix_norm, k=3) 89 | 90 | freq = itemfreq(label[gt_start:gt_end]) 91 | 92 | m = max(freq, key=lambda y: y[1]) 93 | 94 | ratio = float(m[1])/(gt_end-gt_start) 95 | 96 | print("Groundtruth similarity: ", ratio) 97 | 98 | print("Frequency of groundtruth communities as part of centroids") 99 | print(freq) 100 | 101 | i = gt_start 102 | 103 | print("GroundTruth Centroids range: 
", gt_start, "-> ", gt_end) 104 | while i < gt_end: 105 | # print(label[i]) 106 | i+=1 107 | 108 | 109 | 110 | def main(): 111 | # Parse user input 112 | parser = argparse.ArgumentParser(description='Experiment clustering community detection results') 113 | parser.add_argument('metrics_dir', help="path to metrics dir") 114 | parser.add_argument('dataset', help='dataset name.') 115 | args = parser.parse_args() 116 | 117 | run_experiment(args.metrics_dir, args.dataset) 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | 123 | -------------------------------------------------------------------------------- /experiments/omega_comparison.py: -------------------------------------------------------------------------------- 1 | import json 2 | import circulo 3 | import circulo.metrics 4 | import numpy 5 | import argparse 6 | import os 7 | import scipy 8 | import csv 9 | 10 | def omega_loop(path, output_filename): 11 | i = 0 12 | j = 0 13 | k = 0 14 | df_dimension = 0 15 | total_omega_fs = 0 16 | omega_fs = 0 17 | 18 | files = sorted(os.listdir(path[0])) 19 | 20 | for filename in files: 21 | df_dimension = df_dimension + 1 22 | omega_df = numpy.ones(shape = [df_dimension, df_dimension])*-1 23 | #omega_list = numpy.empty(shape = [df_dimension*df_dimension+df_dimension, 3], dtype='S100') 24 | omega_list = [] 25 | 26 | 27 | for f in files: 28 | print(f) 29 | json_data_f = open(path[0]+'/'+f) 30 | data_f = json.load(json_data_f) 31 | for s in files: 32 | json_data_s = open(path[0]+'/'+s) 33 | data_s = json.load(json_data_s) 34 | omega_fs = circulo.metrics.omega.omega_index(data_f['membership'], data_s['membership']) 35 | omega_list.append([f, s, omega_fs]) 36 | omega_df[i, j] = omega_fs 37 | if f != s: 38 | total_omega_fs = total_omega_fs + omega_fs 39 | j = j + 1 40 | omega_list.append([f, 'Adjusted_Average', total_omega_fs/(df_dimension-1)]) 41 | total_omega_fs = 0 42 | i = i + 1 43 | j = 0 44 | print(omega_df) 45 | numpy.save(output_filename[0], omega_df) 46 | 47 | with open(output_filename[0],'w') as myfile: 48 | csvwriter = csv.writer(myfile,delimiter='\t') 49 | csvwriter.writerow(['Graph1','Graph2','omega']) 50 | for x in omega_list: 51 | csvwriter.writerow(x) 52 | 53 | if omega_df.min() == 1: 54 | print('All files give identical results') 55 | else: 56 | print('Differences exist among the files') 57 | 58 | def main(): 59 | # Parse user input 60 | parser = argparse.ArgumentParser(description='Run metrics across several algorithms or across iterations of a stochastic algorithm.') 61 | parser.add_argument('path', nargs=1,help='Filepath location') 62 | parser.add_argument('output_filename', nargs=1, help='Output filename') 63 | args = parser.parse_args() 64 | omega_loop(args.path, args.output_filename) 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /experiments/partition_metrics.R: -------------------------------------------------------------------------------- 1 | # Sample Usage: 2 | # metrics <- getMetrics(datapath) 3 | # plotMetrics(metrics) 4 | # plotHist(metrics,'omega') 5 | # plotHist(metrics,'time') 6 | 7 | library(ggplot2) 8 | library(jsonlite) 9 | 10 | # Read metrics from json 11 | getMetrics <- function(datapath) { 12 | 13 | #Get file names and load the json files 14 | filenames <- list.files(datapath, pattern="*.json", full.names=TRUE) 15 | N <- length(filenames) 16 | results <- lapply(filenames, fromJSON) 17 | 18 | #parse filenames to get algorithm names and dataset names 19 | names <- 
basename(filenames) 20 | names2 <- sapply(1:N, function(x) strsplit(names, "\\.")[[x]][1]) 21 | Datasets <- sapply(1:N, function(x) strsplit(names2, "--")[[x]][1]) 22 | Algorithms <-sapply(1:N, function(x) strsplit(names2, "--")[[x]][2]) 23 | 24 | #fussy R data type formatting 25 | metrics <- data.frame(Algorithms,Datasets) 26 | 27 | metric.names <- names(results[[1]]$metrics) 28 | df <- data.frame(sapply(metric.names,function(l){ 29 | sapply(1:N,function(i) {results[[i]]$metrics[[l]]['results'] }) })) 30 | 31 | metrics <- cbind(metrics,df) 32 | 33 | return(metrics) 34 | } 35 | 36 | # Plot one metric across a collection of datasets/algorithms 37 | plotMetric<- function(metrics,column='Conductance',datasets=NULL,algos=NULL,logx=FALSE,logy=FALSE,toPDF=FALSE) { 38 | # Keep only data that matches datasets/algos criteria 39 | data <- metrics 40 | if (is.null(datasets)) {datasets <- unique(metrics$Datasets)} 41 | if (is.null(algos)) {algos <- unique(metrics$Algorithms)} 42 | 43 | keep <- (data$Datasets %in% datasets) & (data$Algorithms %in% algos) 44 | data <- data[keep,] 45 | 46 | # Reformat data columns into "long" format 47 | Algorithms <- rep(data$Algorithms,sapply(data$Conductance,length)) 48 | Datasets <- rep(data$Datasets,sapply(data$Conductance,length)) 49 | value <- unlist(data[column]) 50 | data <- data.frame(Algorithms,Datasets,value) 51 | 52 | # Create density plot 53 | densityplot<- ggplot(data, aes(x=value,colour=Algorithms,fill=Algorithms))+ 54 | facet_grid(. ~ Datasets) + 55 | geom_density(alpha=0.5) + 56 | #geom_histogram(alpha=0.5,position='identity') + 57 | xlab(column) + 58 | ylab('Counts') + 59 | theme_bw()+ 60 | ggtitle(Sys.time()) 61 | 62 | if (logx) {densityplot <- densityplot + scale_x_log10()} 63 | if (logy) {densityplot <- densityplot + scale_y_log10()} 64 | 65 | # Print plot to PDF or screen 66 | if (toPDF) { 67 | pdffile <- paste(Sys.time(),"metricsGraph.pdf", sep='') 68 | pdf(pdffile,height=10,width=12) 69 | print(densityplot) 70 | dev.off() 71 | cat(sprintf('printed to %s \n', pdffile)) 72 | } else { 73 | print(densityplot) 74 | } 75 | } 76 | 77 | # Plot specified metrics for one run (dataset/algorithm) 78 | ## NEEDS TO BE FINISHED. 79 | plotRun <- function(metrics,dataset='karate',algo='fastgreedy',columns=c('Conductance','Expansion'),logx=FALSE,logy=FALSE,toPDF=FALSE) { 80 | # Keep only data that matches datasets/algos criteria 81 | data <- metrics 82 | keep <- (data$Datasets == dataset) & (data$Algorithms == algo) 83 | data <- data[keep,] 84 | 85 | # Reformat data columns into "long" format 86 | df <- sapply(data[,columns],unlist) 87 | df <- data.frame(df,row.names=NULL) 88 | # not done yet.. 89 | } 90 | -------------------------------------------------------------------------------- /support/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | RUN apt-get update && apt-get install --assume-yes git openssl curl \ 4 | gcc g++ gfortran \ 5 | libopenblas-dev liblapack-dev \ 6 | libigraph0 \ 7 | libpng12-dev libfreetype6-dev 8 | 9 | ENV CFLAGS '-Wno-error=declaration-after-statement' 10 | RUN pip3 install numpy scipy scikit-learn matplotlib python-igraph 11 | 12 | ADD . 
/Circulo 13 | WORKDIR /Circulo 14 | RUN pip3 install -r requirements.txt 15 | ENV PYTHONPATH /Circulo 16 | 17 | CMD /bin/bash 18 | -------------------------------------------------------------------------------- /support/requirements.txt: -------------------------------------------------------------------------------- 1 | ipython==2.2.0 2 | matplotlib==1.4.0 3 | networkx==1.9 4 | numpy>=1.8.2 5 | python-igraph>=0.7 6 | scikit-learn>=0.15.1 7 | scipy>=0.14.0 8 | -------------------------------------------------------------------------------- /support/server_scripts/circulo_server.sh: -------------------------------------------------------------------------------- 1 | # Circulo server instructions! 2 | # This script should help with setting up a clean server instance with Circulo 3 | 4 | # Currently it requires root access 5 | 6 | ### 7 | # Requirements before you run this script: 8 | # git 9 | # virtualenv 10 | # virtualenvwrapper 11 | # python3 12 | # pip3 13 | ### 14 | 15 | 16 | if [ "$#" -ne 1 ]; then 17 | echo "Please provide remote for Circulo Git repo" 18 | exit 1 19 | fi 20 | 21 | CIRCULO_GIT_LOC=$1 22 | 23 | 24 | # create virtual environment folder and set the proper permissions 25 | cd /home 26 | sudo mkdir .venvs 27 | sudo chgrp -R admin .venvs/ 28 | sudo chmod -R 770 .venvs/ 29 | 30 | # create circulo folder 31 | sudo mkdir circulo 32 | sudo chgrp -R admin circulo/ 33 | sudo chmod -R 770 circulo/ 34 | echo "created circulo folder" 35 | 36 | # set up virtualenv. Requires virtualenvwrapper 37 | cd circulo 38 | export WORKON_HOME='~/.venvs' 39 | source '/usr/local/bin/virtualenvwrapper.sh' 40 | mkvirtualenv --python=`which python3` circulo 41 | deactivate 42 | 43 | # adding a couple minor files for convenience 44 | echo "export WORKON_HOME='/home/.venvs' 45 | source '/usr/local/bin/virtualenvwrapper.sh' 46 | workon circulo" > setup 47 | 48 | echo "Circulo 49 | 50 | To work on circulo on this virtualenv, run 51 | 52 | source setup 53 | 54 | Your prompt should start with (circulo) if it worked correctly. 55 | 56 | To exit the virtualenv, run 57 | 58 | deactivate 59 | 60 | 61 | If you need to add new packages to the virtualenv, pip3 should work\ 62 | as expected as long as you are within the virtualenv. However, you may\ 63 | have to install unexpected dependencies that this OS didn't ship with. 64 | " > README 65 | 66 | # get circulo! 67 | git clone "$CIRCULO_GIT_LOC" 68 | git clone https://github.com/snap-stanford/snap.git 69 | pushd snap 70 | popd # pushd above leaves us in snap/; return to /home/circulo before sourcing setup 71 | # start using the circulo virtualenv 72 | source setup 73 | 74 | # add paths to virtualenv 75 | add2virtualenv /home/circulo/ /home/circulo/Circulo/ /home/circulo/Circulo/circulo/ 76 | 77 | # install system packages needed to build circulo's dependencies 78 | # (you may have to add more, depending on your machine) 79 | sudo apt-get install gfortran libopenblas-dev liblapack-dev # for scipy 80 | sudo apt-get install libpng12-dev libfreetype6-dev # for matplotlib 81 | sudo apt-get install libxml2-dev libz-dev python3-dev #for igraph 82 | 83 | 84 | # install igraph 85 | pip3 install python-igraph 86 | pip3 install networkx 87 | 88 | # install circulo's dependencies 89 | pip3 install numpy 90 | pip3 install scipy 91 | pip3 install matplotlib 92 | pip3 install scikit-learn 93 | pip3 install ipython 94 | 95 | # finally, to use circulo, just cd into /home/circulo and run 96 | # source setup 97 | # if your prompt begins with (circulo), you're ready to go.
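# Example invocation (illustrative; any reachable Circulo git remote works, run as a user with sudo rights):
#   bash circulo_server.sh https://github.com/Lab41/Circulo.git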
98 | -------------------------------------------------------------------------------- /support/server_scripts/clean_circulo.sh: -------------------------------------------------------------------------------- 1 | rm -rf /home/circulo 2 | rm -rf /home/.venvs 3 | --------------------------------------------------------------------------------