├── .gitignore ├── README.md ├── example.edges ├── example.features ├── example.similar ├── focusco.bat ├── focusco.bib ├── focusco.log ├── focusco.out.dm ├── focusco.out.weighted.edges ├── focusco_main.m ├── java_src ├── pom.xml └── src │ ├── main │ └── java │ │ ├── META-INF │ │ └── MANIFEST.MF │ │ └── edu │ │ └── stonybrook │ │ └── focused │ │ ├── GraphReweighter.java │ │ ├── Vertex.java │ │ ├── community │ │ ├── BookkeepingWeightedGraph.java │ │ ├── GreedyLocalCommunityBuilder.java │ │ ├── ICommunity.java │ │ ├── LocalCommunityBuilder.java │ │ ├── Outlier.java │ │ ├── PrecomputedCommunity.java │ │ ├── UnweightedCommunity.java │ │ ├── WeightedCommunity.java │ │ └── WeightedMeansCommunity.java │ │ ├── io │ │ ├── ascii │ │ │ ├── Clustering.java │ │ │ ├── Outliers.java │ │ │ └── WeightedEdgeList.java │ │ └── graphml │ │ │ ├── AttributeGetter.java │ │ │ ├── AttributeHandler.java │ │ │ ├── AttributeProvider.java │ │ │ ├── AttributeSetter.java │ │ │ ├── AttributeType.java │ │ │ ├── CommunityOutlierGraphMLExporter.java │ │ │ ├── ContinousNumericIDProviders.java │ │ │ └── GraphMLExporter.java │ │ └── main │ │ ├── CommunityClusterer.java │ │ ├── CommunityHolder.java │ │ └── FocuscoOptions.java │ └── test │ └── java │ └── edu │ └── stonybrook │ └── focused │ └── tests │ ├── BookkeepingGraphTests.java │ ├── CommunityTests.java │ └── GraphIOTests.java └── matlab_src ├── PGDM ├── D_constraint.m ├── D_constraint_sparse.m ├── D_objective.m ├── D_objective_sparse.m ├── Newton.m ├── Newton_sparse.m ├── Newton_sparse_top_k.m ├── fD.m ├── fD1.m ├── fS1.m ├── grad_projection.m ├── iter_projection_new2.m ├── opt.m ├── opt_sphere.m ├── packcolume.m ├── testPGDM.m └── unroll.m ├── compute_A_goodness.m ├── distance_metric_learning_manual.m ├── io └── load_edgelist.m ├── normc.m ├── reweigh.m ├── reweigh_sparse.m ├── savesparse.m └── savevector.m /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Mobile Tools for Java 
(J2ME) 4 | .mtj.tmp/ 5 | 6 | # Package Files # 7 | *.jar 8 | *.war 9 | *.ear 10 | 11 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 12 | hs_err_pid* 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Focused Clustering and Outlier Detection in Large Attributed Graphs 2 | 3 | Bryan Perozzi 4 | 5 | ## Disclaimers 6 | 7 | 1. This code is very "research", and so is probably more useful as an example than as a product. 8 | 1. The Distance Metric Learning code is based on original code from Eric Xing, [available here](http://www.cs.cmu.edu/~epxing/papers/Old_papers/code_Metric_online.tar.gz). 9 | 10 | ## Implementation Overview 11 | 12 | There are two programs. The first is a MATLAB script which learns a distance metric and reweighs the input graph. The second is a Java program which extracts communities & outliers from the reweighted graph. 13 | 14 | ## Running 15 | 16 | An example batch file `focusco.bat` shows how to use the MATLAB program from the command line (it does not run the Java program yet, but will soon). It can be run like so: 17 | 18 | `>focusco.bat example.edges example.features example.similar` 19 | 20 | Which will produce `focusco.out.weighted.edges`. 21 | 22 | ## Installation 23 | 24 | ### Requirements 25 | 1. A recent version of MATLAB 26 | 2. Java 7+ (the Maven build in `java_src/pom.xml` compiles for 1.7) 27 | 28 | ### Setup 29 | The only required step should be to build the Java code, e.g. with Maven: 30 | 31 | 1. `$ cd java_src` 32 | 1. 
`$ mvn clean install` 33 | 34 | -------------------------------------------------------------------------------- /example.edges: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 2 1 4 | 2 3 5 | 3 1 6 | 3 2 7 | 3 4 8 | 4 3 9 | 4 5 10 | 4 6 11 | 5 4 12 | 5 6 13 | 6 4 14 | 6 5 -------------------------------------------------------------------------------- /example.features: -------------------------------------------------------------------------------- 1 | 0 1 2 | 0 1 3 | 0 0 4 | 1 0 5 | 1 0 6 | 1 0 -------------------------------------------------------------------------------- /example.similar: -------------------------------------------------------------------------------- 1 | 1 2 -------------------------------------------------------------------------------- /focusco.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set CUR_DIR=%CD% 3 | 4 | set edge_file=%1 5 | set feature_file=%2 6 | set similar_file=%3 7 | if "%~4"=="" goto no_arg4 8 | set output_file=%4 9 | goto done_parse 10 | :no_arg4 11 | set output_file=focusco.out 12 | :done_parse 13 | set intermediate_file=%output_file%.weighted.edges 14 | 15 | echo Focused Cluster and Outliers - Distance Metric Learning 16 | echo %CUR_DIR%\distance_metric_learning(arg1,arg2) 17 | echo edge file=%edge_file% 18 | echo feature file=%feature_file% 19 | echo node similarity file=%similar_file% 20 | echo distance metric file=%intermediate_file% 21 | echo output file=%output_file% 22 | start matlab -nosplash -nodesktop -minimize -r focusco_main('%edge_file%','%feature_file%','%similar_file%','file_output','%intermediate_file%') -logfile focusco.log -------------------------------------------------------------------------------- /focusco.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{2014-kdd-perozzi-focused, 2 | author = {Bryan Perozzi and Leman Akoglu and 
Patricia Iglesias Sanchez and Emmanuel Muller}, 3 | title = {Focused Clustering and Outlier Detection in Large Attributed Graphs}, 4 | booktitle = {Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, 5 | series = {KDD '14}, 6 | year = {2014}, 7 | location = {New York, NY, USA}, 8 | publisher = {ACM}, 9 | address = {New York, NY, USA}, 10 | month = {August}, 11 | } 12 | -------------------------------------------------------------------------------- /focusco.log: -------------------------------------------------------------------------------- 1 | [Warning: Name is nonexistent or not a directory: io] 2 | 3 | To get started, type one of these: helpwin, helpdesk, or demo. 4 | For product information, visit www.mathworks.com. 5 | 6 | 7 | Student License -- for use in conjunction with courses offered at a 8 | degree-granting institution. Professional and commercial use prohibited. 9 | 10 | FocusCO Distance Metric Learning 11 | ------------------------------------- 12 | Gamma: 1.000000 13 | # Dissimilar pairs: 4 14 | Distance Metric Learning Data Type: sparse 15 | # Features to consider (if dml_datatype == sparse): 2 16 | Type of graph reweighting (sparse or dense similarity): sparse 17 | Graph Output File: focusco.out.weighted.edges 18 | 19 | [Warning: Name is nonexistent or not a directory: PGDM] 20 | [> In path at 110 21 | In addpath at 87 22 | In distance_metric_learning_manual at 10 23 | In focusco_main at 49] 24 | Iteration: 1 25 | Objective: 2.825059 26 | Iteration: 2 27 | Objective: 2.871824 28 | Distance metric: 29 | 30 | DM = 31 | 32 | (2,2) 1.2620 33 | 34 | 35 | ans = 36 | 37 | (2,1) 1.0000 38 | (3,1) 0.4709 39 | (1,2) 1.0000 40 | (3,2) 0.4709 41 | (1,3) 0.4709 42 | (2,3) 0.4709 43 | (4,3) 0.4709 44 | (3,4) 0.4709 45 | (5,4) 0.4709 46 | (6,4) 0.4709 47 | (4,5) 0.4709 48 | (6,5) 0.4709 49 | (4,6) 0.4709 50 | (5,6) 0.4709 51 | 52 | -------------------------------------------------------------------------------- 
function [WeightedA] = focusco_main(graph_file, data_file, similar_nodes_file, varargin)
%FOCUSCO_MAIN Learn a distance metric and reweigh a graph with it.
%
%   WeightedA = FOCUSCO_MAIN(graph_file, data_file, similar_nodes_file, ...)
%   loads the feature matrix from data_file and the similar-node list from
%   similar_nodes_file (both with LOAD), loads the graph from graph_file with
%   load_edgelist, learns a distance metric with
%   distance_metric_learning_manual, and returns the graph reweighted by
%   reweigh_sparse or reweigh.
%
%   Optional arguments, registered with addOptional in this order:
%     gamma          - numeric, default 1
%     size_D         - number of dissimilar pairs, default
%                      2*size(similar_pairs,2)
%     top_k_features - numeric, default size(X,2)
%     dml_datatype   - 'sparse' (default) or 'dense'
%     file_output    - when non-empty, WeightedA is also written to this
%                      file with savesparse; default ''
%     reweight_type  - 'sparse' (default) selects reweigh_sparse, any other
%                      string selects reweigh

% Resolve the helper directories relative to this file rather than the
% current working directory, so the function also works when it is called
% from another folder.
base_dir = fileparts(mfilename('fullpath'));
addpath(fullfile(base_dir, 'matlab_src', 'PGDM'))
addpath(fullfile(base_dir, 'matlab_src', 'io'))
addpath(fullfile(base_dir, 'matlab_src'))

% open and load files
X = load(data_file);
A = load_edgelist(graph_file);
similar_pairs = load(similar_nodes_file);

% pull out some useful variables
num_vertices = size(A,1);

% parse remaining arguments
p = inputParser;
defaultGamma = 1;
defaultDissimilarSamples = 2*size(similar_pairs,2);
default_topk_features = size(X,2);
default_dml = 'sparse';
default_file_out = '';
default_reweight_type = 'sparse';

addOptional(p, 'gamma', defaultGamma, @isnumeric);
addOptional(p, 'size_D', defaultDissimilarSamples, @isnumeric);
addOptional(p, 'top_k_features', default_topk_features, @isnumeric);
addOptional(p, 'dml_datatype', default_dml, @(x) strcmp(x, 'sparse') || strcmp(x, 'dense'));
% ischar replaces the deprecated isstr; both accept exactly char arrays.
addOptional(p, 'file_output', default_file_out, @ischar);
addOptional(p, 'reweight_type', default_reweight_type, @ischar);

parse(p, varargin{:});

gamma = p.Results.gamma;
num_dissimilar_pairs = p.Results.size_D;
top_k_features = p.Results.top_k_features;
dml_datatype = p.Results.dml_datatype;
dm_file_out = p.Results.file_output;
reweight_type = p.Results.reweight_type;

fprintf('FocusCO Distance Metric Learning\n-------------------------------------\n')
fprintf('Gamma: %f\n', gamma)
fprintf('# Dissimilar pairs: %d\n', num_dissimilar_pairs)
fprintf('Distance Metric Learning Data Type: %s\n', dml_datatype)
fprintf('# Features to consider (if dml_datatype == sparse): %d\n', top_k_features)
fprintf('Type of graph reweighting (sparse or dense similarity): %s\n', reweight_type)
fprintf('Graph Output File: %s\n\n', dm_file_out)

% Learn the metric. Three outputs are still requested so the callee sees the
% same nargout as before; the second and third are not used here.
[DM, ~, ~] = distance_metric_learning_manual(X, similar_pairs, num_dissimilar_pairs, num_vertices, gamma, top_k_features, dml_datatype);

fprintf('Distance metric:\n');
DM   % intentionally unterminated: prints the learned metric

if strcmp(reweight_type, 'sparse')
    WeightedA = reweigh_sparse(A, X, DM);
else
    WeightedA = reweigh(A, X, DM);
end

% Only write the reweighted graph when an output file was requested.
if ~strcmp(dm_file_out, '')
    savesparse(dm_file_out, WeightedA);
end
%exit
end
args4j 54 | 2.0.16 55 | 56 | 57 | 58 | 59 | net.sf.jgrapht 60 | jgrapht 61 | 0.8.3 62 | 63 | 64 | junit 65 | junit 66 | 4.10 67 | 68 | 69 | 70 | com.google.guava 71 | guava 72 | 15.0 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /java_src/src/main/java/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: edu.stonybrook.focused.main.CommunityClusterer 3 | 4 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/GraphReweighter.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused; 2 | 3 | /** 4 | * User: hubris (Bryan Perozzi) 5 | * 6 | 7 | */ 8 | public class GraphReweighter { 9 | } 10 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/Vertex.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused; 2 | 3 | /** 4 | * Author: hubris(Bryan Perozzi) 5 | * 6 | 7 | */ 8 | public class Vertex { 9 | public Long id; 10 | } 11 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/BookkeepingWeightedGraph.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import org.jgrapht.graph.DefaultWeightedEdge; 4 | import org.jgrapht.graph.SimpleWeightedGraph; 5 | 6 | import java.util.HashMap; 7 | 8 | /** 9 | * Author: Bryan Perozzi 10 | * 11 | 12 | */ 13 | public class BookkeepingWeightedGraph extends SimpleWeightedGraph { 14 | 15 | protected HashMap weightedOutDegree = new HashMap(); 16 | 17 | double totalWeightedVolume = 0.0; 18 | 19 | public BookkeepingWeightedGraph() { 20 | 
super(DefaultWeightedEdge.class); 21 | } 22 | 23 | @Override 24 | public DefaultWeightedEdge addEdge(Integer v, Integer v1) { 25 | double weight = DEFAULT_EDGE_WEIGHT; 26 | weightedOutDegree.put(v, weightedOutDegree.get(v) + weight); 27 | weightedOutDegree.put(v1, weightedOutDegree.get(v1) + weight); 28 | totalWeightedVolume += 2 * weight; 29 | 30 | return super.addEdge(v, v1); 31 | } 32 | 33 | @Override 34 | public boolean addVertex(Integer v){ 35 | if (!weightedOutDegree.containsKey(v)){ 36 | weightedOutDegree.put(v, 0.0); 37 | } 38 | return super.addVertex(v); 39 | } 40 | 41 | @Override 42 | public boolean addEdge(Integer v, Integer v1, DefaultWeightedEdge e) { 43 | double weight = getEdgeWeight(e); 44 | weightedOutDegree.put(v, weightedOutDegree.get(v) + weight); 45 | weightedOutDegree.put(v1, weightedOutDegree.get(v1) + weight); 46 | totalWeightedVolume += 2 * weight; 47 | 48 | return super.addEdge(v, v1, e); 49 | } 50 | 51 | @Override 52 | public void setEdgeWeight(DefaultWeightedEdge e, double val) { 53 | double weight = getEdgeWeight(e); 54 | 55 | Integer v1 = getEdgeSource(e); 56 | Integer v2 = getEdgeTarget(e); 57 | 58 | if (v1 != null || v2 != null){ 59 | weightedOutDegree.put(v1, weightedOutDegree.get(v1) - weight); 60 | weightedOutDegree.put(v2, weightedOutDegree.get(v2) - weight); 61 | totalWeightedVolume -= 2 * weight; 62 | 63 | weightedOutDegree.put(v1, weightedOutDegree.get(v1) + val); 64 | weightedOutDegree.put(v2, weightedOutDegree.get(v2) + val); 65 | totalWeightedVolume += 2 * val; 66 | } 67 | 68 | super.setEdgeWeight(e, val); 69 | } 70 | 71 | public double getWeightedVolume() { 72 | return totalWeightedVolume; 73 | } 74 | 75 | public double getWeightedOutDegreeOf(int v) { 76 | return weightedOutDegree.get(v); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/GreedyLocalCommunityBuilder.java: 
-------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import org.jgrapht.Graph; 4 | import org.jgrapht.Graphs; 5 | import org.jgrapht.graph.DefaultWeightedEdge; 6 | 7 | import java.util.ArrayList; 8 | import java.util.HashSet; 9 | import java.util.logging.Logger; 10 | 11 | /** 12 | * User: hubris (Bryan Perozzi) 13 | * 14 | 15 | */ 16 | public class GreedyLocalCommunityBuilder implements LocalCommunityBuilder { 17 | 18 | Logger logger = Logger.getLogger(GreedyLocalCommunityBuilder.class.getName()); 19 | 20 | public int GREEDY_STRUCTUAL_NODES_REMOVED_CNT = 0; 21 | public int GREEDY_FOCUSED_NODES_REMOVED_CNT = 0; 22 | 23 | final public int MAX_MISTEPS = 0; 24 | 25 | final public double MIN_DELTA = -0.000001; 26 | 27 | final public int MAX_ITER = 5; 28 | 29 | // inputs 30 | Graph structuralGraph; 31 | Graph focusedGraph; 32 | Iterable seedSet; 33 | 34 | // outputs 35 | UnweightedCommunity structuralCommunity; 36 | WeightedCommunity focusedCommunity; 37 | 38 | // types of outliers 39 | HashSet outliers = new HashSet(); 40 | HashSet inliers = new HashSet(); 41 | 42 | public GreedyLocalCommunityBuilder(Graph structural, Graph focused, Iterable seeds, double EDGE_VARIANCE) { 43 | structuralGraph = structural; 44 | focusedGraph = focused; 45 | seedSet = seeds; 46 | 47 | structuralCommunity = new UnweightedCommunity(structural); 48 | focusedCommunity = new WeightedMeansCommunity(focused, EDGE_VARIANCE); 49 | 50 | buildCommunities(); 51 | } 52 | 53 | protected void buildCommunities() { 54 | HashSet bestStructuralNodes = new HashSet(); 55 | 56 | // Greedy Focused UnweightedCommunity Algorithm: 57 | // 1. initialize community with seed set 58 | // 2. iterate over neighbors to expand set 59 | // 2.a. get candidate nodes (neighbors of community for greedy) 60 | // 2.b. find best structural node and best focused node to add 61 | // 2.c. add best nodes if they exist 62 | // 3. 
do the time warp (check to see if removing any node decrease conductance, and remove it) 63 | // 4. record outliers 64 | 65 | // 1. initialize community with seed set 66 | for (Integer i : seedSet) { 67 | structuralCommunity.add(i); 68 | focusedCommunity.add(i); 69 | } 70 | 71 | // 2. iterate over neighbors to expand set 72 | ArrayList candidateNodes = new ArrayList(); 73 | Integer bestNode = null; 74 | Integer bestStructuralNode = null; 75 | 76 | // for backtracking greedy 77 | double minConductanceSeenSoFar = 1.0; 78 | ArrayList backtrackList = new ArrayList(); 79 | 80 | boolean any_focused_community_change = false; 81 | 82 | // found a case where it infitinately adds/removes? 83 | int iter = 0; 84 | 85 | do { 86 | any_focused_community_change = false; 87 | 88 | int missteps = 0; 89 | 90 | do { 91 | bestNode = null; 92 | bestStructuralNode = null; 93 | double bestDeltaPhi = Double.POSITIVE_INFINITY; 94 | double bestDeltaPhi_s = Double.POSITIVE_INFINITY; 95 | 96 | // 2.a. get candidate nodes (neighbors of community for greedy) 97 | neighbors(focusedCommunity, candidateNodes); 98 | 99 | // 2.b. find best structural node and best focused node to add 100 | for (Integer n : candidateNodes) { 101 | // check whether 'n' decreases conductance for either of the sets 102 | double deltaPhi_s = structuralCommunity.getDeltaConductance(n, true); 103 | double deltaPhi = focusedCommunity.getDeltaConductance(n, true); 104 | 105 | if (deltaPhi_s < bestDeltaPhi_s) { 106 | bestStructuralNode = n; 107 | bestDeltaPhi_s = deltaPhi_s; 108 | } 109 | 110 | if (deltaPhi < bestDeltaPhi) { 111 | bestNode = n; 112 | bestDeltaPhi = deltaPhi; 113 | } 114 | } 115 | 116 | // 2.c. 
add best nodes if they exist 117 | if (bestNode != null) { 118 | 119 | // add node, if its good, or if we have backtracking steps left 120 | if (bestDeltaPhi > MIN_DELTA && missteps < MAX_MISTEPS) { 121 | focusedCommunity.add(bestNode); 122 | structuralCommunity.add(bestNode); 123 | missteps++; 124 | backtrackList.add(bestNode); 125 | } else if (bestDeltaPhi <= MIN_DELTA) { 126 | focusedCommunity.add(bestNode); 127 | structuralCommunity.add(bestNode); 128 | if (missteps > 0) { 129 | backtrackList.add(bestNode); 130 | } 131 | any_focused_community_change = true; 132 | } else { 133 | // go back to minimum 134 | for (Integer i : backtrackList) { 135 | focusedCommunity.remove(i); 136 | } 137 | bestNode = null; 138 | } 139 | 140 | // if the backtrack made things better in the long run, reset it 141 | if (focusedCommunity.getConductance() < minConductanceSeenSoFar) { 142 | missteps = 0; 143 | minConductanceSeenSoFar = focusedCommunity.getConductance(); 144 | backtrackList.clear(); 145 | } 146 | 147 | } 148 | if (bestStructuralNode != null) { 149 | bestStructuralNodes.add(bestStructuralNode); 150 | } 151 | 152 | // logger.info("best: [" + bestNode + " : " + bestDeltaPhi + " , " + bestStructuralNode + " : " + bestDeltaPhi_s + "]"); 153 | } while (bestNode != null); 154 | 155 | // System.err.println("Done Adding. Current conductance: " + focusedCommunity.getConductance() + "size: " + focusedCommunity.size()); 156 | 157 | // 3. do the time warp (check to see if removing any node decreases conductance, and remove it) 158 | boolean removed = false; 159 | 160 | do { 161 | removed = false; 162 | bestNode = null; 163 | double bestDeltaPhi = 0.0; 164 | 165 | for (Integer n : focusedCommunity) { 166 | // check whether removing n decreases conductance. 
if so, do it 167 | double deltaPhi = focusedCommunity.getDeltaConductance(n, false); 168 | if (deltaPhi < bestDeltaPhi) { 169 | bestNode = n; 170 | bestDeltaPhi = deltaPhi; 171 | } 172 | } 173 | 174 | if (bestNode != null) { 175 | // logger.info("best: [" + bestNode + " : " + bestDeltaPhi + "]"); 176 | removed = true; 177 | focusedCommunity.remove(bestNode); 178 | any_focused_community_change = true; 179 | GREEDY_FOCUSED_NODES_REMOVED_CNT++; 180 | } 181 | 182 | } while (removed); 183 | 184 | // System.err.println("Done Removing. Current conductance: " + focusedCommunity.getConductance() + "size: " + focusedCommunity.size()); 185 | iter++; 186 | 187 | } while (any_focused_community_change && iter < MAX_ITER); 188 | 189 | // 4. record outliers. 190 | // we define outliers to be nodes that were in the focused community, but were not in the structural community 191 | bestStructuralNodes.removeAll(focusedCommunity); 192 | outliers.addAll(bestStructuralNodes); 193 | 194 | // we define an inlier here as something that was added to the focused community, but then later removed 195 | inliers.addAll(structuralCommunity); 196 | inliers.removeAll(focusedCommunity); 197 | } 198 | 199 | protected void neighbors(HashSet input, ArrayList output) { 200 | output.clear(); 201 | 202 | HashSet added = new HashSet(); 203 | 204 | // TODO (bperozzi) perhaps this should sort by edge weight 205 | for (Integer i : input) { 206 | for (DefaultWeightedEdge e : structuralGraph.edgesOf(i)) { 207 | Integer target = Graphs.getOppositeVertex(structuralGraph, e, i); 208 | 209 | if (!input.contains(target) && !added.contains(target)) { 210 | output.add(target); 211 | added.add(target); 212 | } 213 | } 214 | } 215 | } 216 | 217 | @Override 218 | public UnweightedCommunity getStructuralCommunity() { 219 | return structuralCommunity; 220 | } 221 | 222 | @Override 223 | public WeightedCommunity getFocusedCommunity() { 224 | return focusedCommunity; 225 | } 226 | 227 | @Override 228 | public HashSet 
getOutliers() { 229 | return outliers; 230 | } 231 | 232 | @Override 233 | public HashSet getInliers() { 234 | return inliers; 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/ICommunity.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * Author: Bryan Perozzi 7 | * 8 | 9 | */ 10 | public interface ICommunity extends Set { 11 | double getConductance(); 12 | 13 | double getVolume(); 14 | 15 | double getDeltaConductance(Integer vertex, boolean toAdd); 16 | } 17 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/LocalCommunityBuilder.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import java.util.HashSet; 4 | 5 | /** 6 | * User: hubris 7 | * 8 | 9 | */ 10 | public interface LocalCommunityBuilder { 11 | 12 | public UnweightedCommunity getStructuralCommunity(); 13 | 14 | public WeightedCommunity getFocusedCommunity(); 15 | 16 | public HashSet getOutliers(); 17 | 18 | public HashSet getInliers(); 19 | } 20 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/Outlier.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | /** 4 | * Author: Bryan Perozzi 5 | * 6 | 7 | */ 8 | public class Outlier { 9 | public int id = -1; 10 | public int votesOutlier = 1; 11 | public int votes = 1; 12 | public int notInCommunity = 0; 13 | 14 | public Outlier(int id) { 15 | this.id = id; 16 | } 17 | 18 | public double outlierRatio() { 19 | return (votesOutlier / (double) votes); 20 | } 21 | 22 | @Override 23 
| public boolean equals(Object o) { 24 | Outlier other = (Outlier) o; 25 | if (other != null) { 26 | return other.id == id; 27 | } 28 | return false; 29 | } 30 | 31 | @Override 32 | // TODO this probably needs to be better 33 | public int hashCode() { 34 | return id; 35 | } 36 | 37 | @Override 38 | public String toString() { 39 | return "[" + id + "," + outlierRatio() + "," + notInCommunity + "]"; 40 | } 41 | } -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/PrecomputedCommunity.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import java.util.Collection; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | 7 | /** 8 | * Author: Bryan Perozzi 9 | * 10 | 11 | */ 12 | public class PrecomputedCommunity implements ICommunity{ 13 | 14 | List community; 15 | double conductance; 16 | double volume; 17 | 18 | public PrecomputedCommunity(List community, double conductance, double volume){ 19 | this.community = community; 20 | this.conductance = conductance; 21 | this.volume = volume; 22 | } 23 | 24 | @Override 25 | public double getConductance() { 26 | return conductance; 27 | } 28 | 29 | @Override 30 | public double getVolume() { 31 | return volume; 32 | } 33 | 34 | @Override 35 | public double getDeltaConductance(Integer vertex, boolean toAdd) { 36 | throw new UnsupportedOperationException (); 37 | } 38 | 39 | @Override 40 | public int size() { 41 | return community.size(); 42 | } 43 | 44 | @Override 45 | public boolean isEmpty() { 46 | return community.isEmpty(); 47 | } 48 | 49 | @Override 50 | public boolean contains(Object o) { 51 | throw new UnsupportedOperationException (); 52 | } 53 | 54 | @Override 55 | public Iterator iterator() { 56 | return community.iterator(); 57 | } 58 | 59 | @Override 60 | public Object[] toArray() { 61 | return community.toArray(); 62 | } 63 | 64 | 
@Override 65 | public T[] toArray(T[] a) { 66 | return community.toArray(a); 67 | } 68 | 69 | @Override 70 | public boolean add(Integer integer) { 71 | throw new UnsupportedOperationException (); 72 | } 73 | 74 | @Override 75 | public boolean remove(Object o) { 76 | throw new UnsupportedOperationException (); 77 | } 78 | 79 | @Override 80 | public boolean containsAll(Collection c) { 81 | throw new UnsupportedOperationException (); 82 | } 83 | 84 | @Override 85 | public boolean addAll(Collection c) { 86 | throw new UnsupportedOperationException (); 87 | } 88 | 89 | @Override 90 | public boolean retainAll(Collection c) { 91 | throw new UnsupportedOperationException (); 92 | } 93 | 94 | @Override 95 | public boolean removeAll(Collection c) { 96 | throw new UnsupportedOperationException (); 97 | } 98 | 99 | @Override 100 | public void clear() { 101 | throw new UnsupportedOperationException (); 102 | } 103 | 104 | @Override 105 | public String toString() { 106 | StringBuffer buffer = new StringBuffer(); 107 | 108 | buffer.append("GenericCommunity Container:\n"); 109 | buffer.append("\t volume: " + volume + "\n"); 110 | buffer.append("\t conductance: " + conductance + "\n"); 111 | buffer.append("\t members: " + size() + "\n"); 112 | 113 | if (size() < 50) { 114 | buffer.append("\t {"); 115 | for (Integer i : this) { 116 | buffer.append(i + ", "); 117 | } 118 | buffer.append(" }\n"); 119 | } 120 | 121 | 122 | return buffer.toString(); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/UnweightedCommunity.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.jgrapht.Graph; 5 | import org.jgrapht.graph.DefaultWeightedEdge; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Collections; 9 | import java.util.HashSet; 10 | 
import java.util.Set; 11 | 12 | /** 13 | * Author: Bryan Perozzi 14 | * 15 | 16 | */ 17 | public class UnweightedCommunity extends HashSet implements ICommunity { 18 | 19 | Graph graph; 20 | 21 | public UnweightedCommunity(Graph g) { 22 | graph = g; 23 | 24 | totalEdges = graph.edgeSet().size(); 25 | totalVolume = 2 * totalEdges; 26 | } 27 | 28 | protected double conductance = 1.0; 29 | 30 | protected long external_edges = 0; 31 | protected long denominator = 0; 32 | protected long volume = 0; 33 | 34 | protected long totalEdges = 0; 35 | protected long totalVolume = 0; 36 | 37 | public long countExternalEdges(Integer i, Set neighborSet) { 38 | 39 | int numberExternalEdges = 0; 40 | 41 | // count number of external edges 42 | for (DefaultWeightedEdge edge : neighborSet) { 43 | 44 | // no easy way to get 'other' out of JGraph's undirected graph edge traversal... ugh! 45 | Integer src = graph.getEdgeSource(edge); 46 | Integer dst = graph.getEdgeTarget(edge); 47 | Integer other = src.equals(i) ? 
dst : src; 48 | 49 | if (contains(other)) { 50 | numberExternalEdges -= 1; 51 | } else { 52 | numberExternalEdges += 1; 53 | } 54 | } 55 | 56 | return numberExternalEdges; 57 | } 58 | 59 | @Override 60 | public boolean add(Integer i) { 61 | // maintain unweighted volume 62 | Set neighborSet = graph.edgesOf(i); 63 | volume += neighborSet.size(); 64 | 65 | // maintain correct denominator to compute conductance with 66 | denominator = Math.min(volume, totalVolume - volume); 67 | 68 | // count how many external edges this node has with the community 69 | external_edges += countExternalEdges(i, neighborSet); 70 | 71 | conductance = external_edges / (double) denominator; 72 | 73 | return super.add(i); 74 | } 75 | 76 | @Override 77 | public boolean remove(Object o) { 78 | Integer i = (Integer) o; 79 | // maintain unweighted volume 80 | Set neighborSet = graph.edgesOf(i); 81 | volume -= neighborSet.size(); 82 | 83 | // maintain correct denominator to compute conductance with 84 | denominator = Math.min(volume, totalVolume - volume); 85 | 86 | boolean retValue = super.remove(i); 87 | 88 | // count how many external edges this node has with the community, and remove them (opposite of add) 89 | external_edges -= countExternalEdges(i, neighborSet); 90 | 91 | conductance = external_edges / (double) denominator; 92 | 93 | return retValue; 94 | } 95 | 96 | public long getExternalEdges() { 97 | return external_edges; 98 | } 99 | 100 | @Override 101 | public double getConductance() { 102 | return conductance; 103 | } 104 | 105 | @Override 106 | public double getVolume() { 107 | return volume; 108 | } 109 | 110 | /** 111 | * Return the change in conductance which would occur from adding a particular vertex to the set. 112 | * This is linear in the degree of the vertex. 
ie., O(degree(toAdd)) 113 | */ 114 | @Override 115 | public double getDeltaConductance(Integer vertex, boolean add) { 116 | 117 | Set neighborSet = graph.edgesOf(vertex); 118 | 119 | int degree_U = neighborSet.size(); 120 | long delta_E = 0; 121 | 122 | delta_E = countExternalEdges(vertex, neighborSet); 123 | 124 | if (!add) { 125 | delta_E = -1 * delta_E; 126 | degree_U = -1 * degree_U; 127 | } 128 | 129 | long new_volume = volume + degree_U; 130 | long new_denom = Math.min(new_volume, totalVolume - new_volume); 131 | double rescaled_conductance = ((denominator) / (double) (new_denom)) * conductance; 132 | double final_conductance = rescaled_conductance + (delta_E) / (double) (new_denom); 133 | 134 | // this is the change in conductance if one were to add node u 135 | return final_conductance - conductance; 136 | } 137 | 138 | @Override 139 | public String toString() { 140 | StringBuffer buffer = new StringBuffer(); 141 | 142 | buffer.append("UnweightedCommunity:\n"); 143 | buffer.append("\t outgoing edges: " + external_edges + "\n"); 144 | buffer.append("\t volume: " + volume + "\n"); 145 | buffer.append("\t conductance: " + conductance + "\n"); 146 | buffer.append("\t members: " + size() + "\n"); 147 | 148 | ArrayList members = Lists.newArrayList(this); 149 | Collections.sort(members); 150 | 151 | if (size() < 50) { 152 | buffer.append("\t {"); 153 | for (Integer i : members) { 154 | buffer.append(i + ", "); 155 | } 156 | buffer.append(" }\n"); 157 | } 158 | 159 | return buffer.toString(); 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/WeightedCommunity.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import com.google.common.collect.Lists; 4 | import org.jgrapht.Graph; 5 | import org.jgrapht.graph.DefaultWeightedEdge; 6 | 7 | import java.util.*; 8 | 9 | /** 10 
| * Author: Bryan Perozzi 11 | * 12 | 13 | */ 14 | public class WeightedCommunity extends HashSet implements ICommunity { 15 | 16 | public final double MIN_VERTEX_VOLUME = 0.0000001; 17 | 18 | Graph graph; 19 | 20 | public WeightedCommunity(Graph g) { 21 | graph = g; 22 | 23 | totalEdges = graph.edgeSet().size(); 24 | 25 | // sum up total edge weights 26 | // TODO (bperozzi) find a more stable way to do this? 27 | for (DefaultWeightedEdge edge : graph.edgeSet()) { 28 | totalVolume += graph.getEdgeWeight(edge); 29 | } 30 | totalVolume *= 2; 31 | } 32 | 33 | protected double conductance = 1.0; 34 | 35 | protected double external_edges = 0; 36 | protected double internal_edges_sum = 0; 37 | protected double internal_edges_sum_squares = 0; 38 | protected int internal_edges_cnt = 0; 39 | protected double denominator = 0; 40 | protected double volume = 0; 41 | 42 | protected double totalEdges = 0; 43 | protected double totalVolume = 0; 44 | 45 | @Override 46 | public boolean add(Integer i) { 47 | Set neighborSet = graph.edgesOf(i); 48 | 49 | // maintain unweighted volume and 50 | // count number of external edges 51 | for (DefaultWeightedEdge edge : neighborSet) { 52 | 53 | // no easy way to get 'other' out of JGraph's undirected graph edge traversal... ugh! 54 | Integer src = graph.getEdgeSource(edge); 55 | Integer dst = graph.getEdgeTarget(edge); 56 | 57 | Double weight = graph.getEdgeWeight(edge); 58 | Integer other = src.equals(i) ? dst : src; 59 | 60 | if (weight > MIN_VERTEX_VOLUME) { 61 | if (contains(other)) { 62 | external_edges -= weight; 63 | internal_edges_sum += weight; 64 | internal_edges_sum_squares += Math.pow(weight, 2); 65 | internal_edges_cnt++; 66 | } else { 67 | external_edges += weight; 68 | } 69 | volume += weight; 70 | } 71 | } 72 | 73 | // did floating point mess up the math? 
74 | if (external_edges < 0) { 75 | external_edges = 0.0; 76 | } 77 | 78 | // maintain correct denominator to compute conductance with 79 | denominator = Math.min(volume, totalVolume - volume); 80 | 81 | // what if we've added the entire graph? 82 | if ((totalVolume - volume) < MIN_VERTEX_VOLUME) { 83 | denominator = 1.0; 84 | } 85 | 86 | conductance = external_edges / (double) denominator; 87 | 88 | return super.add(i); 89 | } 90 | 91 | @Override 92 | public boolean remove(Object o) { 93 | Integer i = (Integer) o; 94 | 95 | Set neighborSet = graph.edgesOf(i); 96 | 97 | double weightedDegree_U = 0; 98 | double delta_E = 0L; 99 | 100 | for (DefaultWeightedEdge edge : neighborSet) { 101 | 102 | // no easy way to get 'other' out of JGraph's undirected graph edge traversal... ugh! 103 | Integer src = graph.getEdgeSource(edge); 104 | Integer dst = graph.getEdgeTarget(edge); 105 | Integer other = src.equals(i) ? dst : src; 106 | Double weight = graph.getEdgeWeight(edge); 107 | 108 | if (contains(other)) { 109 | delta_E -= weight; 110 | internal_edges_sum -= weight; 111 | internal_edges_sum_squares -= Math.pow(weight, 2); 112 | internal_edges_cnt--; 113 | } else { 114 | delta_E += weight; 115 | } 116 | weightedDegree_U += weight; 117 | } 118 | 119 | // maintain weighted volume 120 | if (weightedDegree_U > MIN_VERTEX_VOLUME) { 121 | volume -= weightedDegree_U; 122 | } 123 | 124 | // maintain correct denominator to compute conductance with 125 | denominator = Math.min(volume, totalVolume - volume); 126 | 127 | // what if we've added the entire graph? 128 | if ((totalVolume - volume) < MIN_VERTEX_VOLUME) { 129 | denominator = 1.0; 130 | } 131 | 132 | boolean retValue = super.remove(i); 133 | 134 | // count how many external edges this node has with the community, and remove them (opposite of add) 135 | if (Math.abs(delta_E) > MIN_VERTEX_VOLUME) { 136 | external_edges -= delta_E; 137 | } 138 | 139 | // did floating point mess up the math? 
140 | if (external_edges < 0) { 141 | external_edges = 0; 142 | } 143 | 144 | conductance = external_edges / (double) denominator; 145 | 146 | return retValue; 147 | } 148 | 149 | public double getExternalEdges() { 150 | return external_edges; 151 | } 152 | 153 | @Override 154 | public double getConductance() { 155 | return conductance; 156 | } 157 | 158 | @Override 159 | public double getVolume() { 160 | return volume; 161 | } 162 | 163 | /** 164 | * Return the change in conductance which would occur from adding a particular vertex to the set. 165 | * This is linear in the degree of the vertex. ie., O(degree(toAdd)) 166 | */ 167 | @Override 168 | public double getDeltaConductance(Integer vertex, boolean add) { 169 | 170 | Set neighborSet = graph.edgesOf(vertex); 171 | 172 | double weightedDegree_U = 0; 173 | double delta_E = 0L; 174 | 175 | for (DefaultWeightedEdge edge : neighborSet) { 176 | 177 | // no easy way to get 'other' out of JGraph's undirected graph edge traversal... ugh! 178 | Integer src = graph.getEdgeSource(edge); 179 | Integer dst = graph.getEdgeTarget(edge); 180 | Integer other = src.equals(vertex) ? dst : src; 181 | Double weight = graph.getEdgeWeight(edge); 182 | 183 | if (contains(other)) { 184 | delta_E -= weight; 185 | } else { 186 | delta_E += weight; 187 | } 188 | weightedDegree_U += weight; 189 | } 190 | 191 | // what if the node has 0 volume? 
ie, all edges about 0.0 192 | double absVolume = Math.abs(weightedDegree_U); 193 | if (absVolume < MIN_VERTEX_VOLUME) { 194 | return 0.0; 195 | } 196 | 197 | if (!add) { 198 | delta_E = -1 * delta_E; 199 | weightedDegree_U = -1 * weightedDegree_U; 200 | } 201 | 202 | double new_volume = volume + weightedDegree_U; 203 | double new_denom = Math.min(new_volume, totalVolume - new_volume); 204 | double rescaled_conductance = ((denominator) / (double) (new_denom)) * conductance; 205 | double final_conductance = rescaled_conductance + (delta_E) / (double) (new_denom); 206 | 207 | // this is the change in conductance if one were to add node u 208 | return final_conductance - conductance; 209 | } 210 | 211 | @Override 212 | public String toString() { 213 | StringBuffer buffer = new StringBuffer(); 214 | 215 | buffer.append("WeightedCommunity:\n"); 216 | buffer.append("\t outgoing edges: " + external_edges + "\n"); 217 | buffer.append("\t volume: " + volume + "\n"); 218 | buffer.append("\t conductance: " + conductance + "\n"); 219 | buffer.append("\t members: " + size() + "\n"); 220 | 221 | ArrayList members = Lists.newArrayList(this); 222 | Collections.sort(members); 223 | 224 | if (size() < 5000) { 225 | buffer.append("\t {"); 226 | for (Integer i : members) { 227 | buffer.append(i + ", "); 228 | } 229 | buffer.append(" }\n"); 230 | } 231 | 232 | 233 | return buffer.toString(); 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/community/WeightedMeansCommunity.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.community; 2 | 3 | import org.jgrapht.Graph; 4 | import org.jgrapht.graph.DefaultWeightedEdge; 5 | import umontreal.iro.lecuyer.probdist.NormalDistQuick; 6 | 7 | import java.util.Set; 8 | 9 | /** 10 | * Author: Bryan Perozzi 11 | * 12 | 13 | */ 14 | public class WeightedMeansCommunity extends 
WeightedCommunity { 15 | 16 | public final double P_VALUE = 0.05; 17 | public final double CLUSTER_VARIANCE; 18 | 19 | public final double MAP_BETA = 5; 20 | 21 | public WeightedMeansCommunity(Graph g, double variance) { 22 | super(g); 23 | CLUSTER_VARIANCE = variance; 24 | } 25 | 26 | public double getPValue(double x) { 27 | if (internal_edges_cnt == 0) { 28 | return 1.0; 29 | } 30 | 31 | double mean = internal_edges_sum / internal_edges_cnt; 32 | 33 | // double varianceMLE = (internal_edges_sum_squares - (Math.pow(internal_edges_sum, 2)/internal_edges_cnt))/internal_edges_cnt; 34 | // double varianceMAP = CLUSTER_VARIANCE / (1 + internal_edges_cnt) + (internal_edges_cnt * varianceMLE) / (internal_edges_cnt + 1); 35 | // return NormalDistQuick.cdf(mean, varianceMAP, x); 36 | 37 | return NormalDistQuick.cdf(mean, CLUSTER_VARIANCE, x); 38 | } 39 | 40 | // /** 41 | // * We want this to return the weighted means conductance here, so the sorting will perhaps be more meaningful? 42 | // * 43 | // * @return 44 | // */ 45 | // @Override 46 | // public double getConductance() { 47 | // 48 | // double internal = 0; 49 | // double external = 0; 50 | // 51 | // if (internal_edges_cnt > 0) { 52 | // for (Integer vertex : this) { 53 | // Set neighborSet = graph.edgesOf(vertex); 54 | // 55 | // for (DefaultWeightedEdge edge : neighborSet) { 56 | // // no easy way to get 'other' out of JGraph's undirected graph edge traversal... ugh! 57 | // Integer src = graph.getEdgeSource(edge); 58 | // Integer dst = graph.getEdgeTarget(edge); 59 | // Integer other = src.equals(vertex) ? dst : src; 60 | // Double weight = graph.getEdgeWeight(edge); 61 | // 62 | // if (contains(other)) { 63 | // // does this edge fit in the distribution? 
(p < 0.05) 64 | // // if not, don't give credit for including it 65 | // if (getPValue(weight) > P_VALUE) { 66 | // internal += weight; 67 | // } 68 | // } else { 69 | // if (getPValue(weight) > P_VALUE) { 70 | // external += weight; 71 | // } 72 | // } 73 | // } 74 | // } 75 | // 76 | //// internal = Math.min(internal, totalVolume - internal); 77 | // 78 | // if (internal < MIN_VERTEX_VOLUME) 79 | // return 0.0; 80 | // 81 | // return external / internal; 82 | // } 83 | // 84 | // return 0; 85 | // } 86 | 87 | /** 88 | * Return the change in conductance which would occur from adding a particular vertex to the set. 89 | * This is linear in the degree of the vertex. ie., O(degree(toAdd)) 90 | */ 91 | @Override 92 | public double getDeltaConductance(Integer vertex, boolean add) { 93 | 94 | Set neighborSet = graph.edgesOf(vertex); 95 | 96 | double weightedDegree_U = 0; 97 | double delta_E = 0L; 98 | 99 | for (DefaultWeightedEdge edge : neighborSet) { 100 | 101 | // no easy way to get 'other' out of JGraph's undirected graph edge traversal... ugh! 102 | Integer src = graph.getEdgeSource(edge); 103 | Integer dst = graph.getEdgeTarget(edge); 104 | Integer other = src.equals(vertex) ? dst : src; 105 | Double weight = graph.getEdgeWeight(edge); 106 | 107 | if (contains(other)) { 108 | // does this edge fit in the distribution? (p < 0.05) 109 | // if not, don't give credit for including it 110 | if (getPValue(weight) > P_VALUE) { 111 | delta_E -= weight; 112 | } 113 | } else { 114 | delta_E += weight; 115 | } 116 | weightedDegree_U += weight; 117 | } 118 | 119 | // what if the node has 0 volume? 
ie, all edges about 0.0 120 | double absVolume = Math.abs(weightedDegree_U); 121 | if (absVolume < MIN_VERTEX_VOLUME) { 122 | return 0.0; 123 | } 124 | 125 | if (!add) { 126 | delta_E = -1 * delta_E; 127 | weightedDegree_U = -1 * weightedDegree_U; 128 | } 129 | 130 | double new_volume = volume + weightedDegree_U; 131 | double new_denom = Math.min(new_volume, totalVolume - new_volume); 132 | 133 | // what if we've added the entire graph? 134 | if ((totalVolume - new_volume) < MIN_VERTEX_VOLUME) { 135 | new_denom = 1.0; 136 | } 137 | 138 | double rescaled_conductance = ((denominator) / (double) (new_denom)) * conductance; 139 | double final_conductance = rescaled_conductance + (delta_E) / (double) (new_denom); 140 | 141 | // this is the change in conductance if one were to add node u 142 | return final_conductance - conductance; 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/ascii/Clustering.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.io.ascii; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintWriter; 5 | import java.io.Writer; 6 | import java.util.*; 7 | 8 | /** 9 | * Author: Bryan Perozzi 10 | * 11 | 12 | */ 13 | public class Clustering { 14 | 15 | public static void write(Writer writer, List> clustering) throws IOException { 16 | HashMap key = new HashMap(); 17 | 18 | // assign each cluster a label 19 | for(int i=0; i cluster = clustering.get(i); 21 | for(Integer v : cluster){ 22 | key.put(v, i+1); 23 | } 24 | } 25 | 26 | ArrayList vertices = new ArrayList(key.keySet()); 27 | Collections.sort(vertices); 28 | 29 | PrintWriter printWriter = new PrintWriter(writer); 30 | 31 | for(Integer v : vertices){ 32 | printWriter.append(v.toString()); 33 | printWriter.append(" "); 34 | printWriter.append(key.get(v).toString()); 35 | printWriter.append("\n"); 36 | } 37 | 38 | 
printWriter.close(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/ascii/Outliers.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.io.ascii; 2 | 3 | import edu.stonybrook.focused.community.Outlier; 4 | 5 | import java.io.IOException; 6 | import java.io.PrintWriter; 7 | import java.io.Writer; 8 | import java.util.List; 9 | 10 | /** 11 | * Author: Bryan Perozzi 12 | * 13 | 14 | */ 15 | public class Outliers { 16 | public static void write(Writer writer, List clustering) throws IOException { 17 | 18 | PrintWriter printWriter = new PrintWriter(writer); 19 | 20 | int k = 1; 21 | for (Outlier o : clustering) { 22 | printWriter.append(Integer.toString(k)); 23 | printWriter.append(" "); 24 | printWriter.append(Integer.toString(o.id)); 25 | printWriter.append("\n"); 26 | k++; 27 | } 28 | 29 | printWriter.close(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/ascii/WeightedEdgeList.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.io.ascii; 2 | 3 | import edu.stonybrook.focused.community.BookkeepingWeightedGraph; 4 | import org.jgrapht.Graph; 5 | import org.jgrapht.Graphs; 6 | import org.jgrapht.graph.DefaultWeightedEdge; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.IOException; 10 | import java.io.Reader; 11 | 12 | /** 13 | * Author: Bryan Perozzi 14 | * 15 | 16 | */ 17 | public class WeightedEdgeList { 18 | public static Graph read(Reader reader) throws IOException { 19 | 20 | int SELF_LOOPS = 0; 21 | 22 | Graph graph = new BookkeepingWeightedGraph(); 23 | 24 | BufferedReader in = new BufferedReader(reader); 25 | String line = null; 26 | while ((line = in.readLine()) != null) { 27 | String[] entries = 
line.split("\\s+"); 28 | 29 | Integer src = Integer.parseInt(entries[0]); 30 | Integer dst = Integer.parseInt(entries[1]); 31 | 32 | if (src.equals(dst)) { 33 | SELF_LOOPS++; 34 | continue; 35 | } 36 | 37 | Graphs.addEdgeWithVertices(graph, src, dst, Double.parseDouble(entries[2])); 38 | } 39 | 40 | System.err.println("Loaded " + graph.edgeSet().size() + " edges. Self loops removed: " + SELF_LOOPS); 41 | 42 | return graph; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/graphml/AttributeGetter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012, Søren Atmakuri Davidsen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | package edu.stonybrook.focused.io.graphml; 27 | 28 | import java.util.Set; 29 | 30 | /** 31 | * Provides a callback for retrieving items from the XML-backed graphml document. 32 | * 33 | * @author Soren A. Davidsen 34 | */ 35 | public interface AttributeGetter { 36 | /** 37 | * Retrieve an element 38 | * @param clazz the type required 39 | * @param key the attribute name 40 | * @param <T> the type of the requested value 41 | * @return the value of attribute {@code key} as type {@code T} (NOTE(review): behaviour for a missing key is not visible here; see {@link #has}) 42 | */ 43 | public T get(Class clazz, String key); 44 | 45 | /** 46 | * Check if an attribute is available for the given element 47 | * @param clazz the type required 48 | * @param key the attribute name 49 | * @param <T> the required type 50 | * @return whether the attribute is available 51 | */ 52 | public boolean has(Class clazz, String key); 53 | 54 | /** 55 | * Get all available attributes by key. 56 | * @return A set of the keys (presumably empty when there are none; confirm against the implementation). 57 | */ 58 | public Set keys(); 59 | } 60 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/graphml/AttributeHandler.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012, Søren Atmakuri Davidsen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. 
Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | package edu.stonybrook.focused.io.graphml; 27 | 28 | /** Callback that is handed an object, its id and an {@link AttributeGetter}. NOTE(review): presumably invoked per graph element so its attributes can be read through the getter; confirm against the caller. 29 | * @author Soren 30 | */ 31 | public interface AttributeHandler { 32 | public void handle(T obj, String id, AttributeGetter getter); 33 | } 34 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/graphml/AttributeProvider.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012, Søren Atmakuri Davidsen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. 
Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | package edu.stonybrook.focused.io.graphml; 27 | 28 | /** 29 | * Provider for attributes for an object: an implementation is handed the object and an {@link AttributeSetter} through which it can record that object's attributes. 30 | * 31 | * @author Soren A. Davidsen 32 | */ 33 | public interface AttributeProvider { 34 | public void provide(T obj, AttributeSetter setter); 35 | } 36 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/graphml/AttributeSetter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012, Søren Atmakuri Davidsen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. 
Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | package edu.stonybrook.focused.io.graphml; 27 | 28 | /** 29 | * Callback for providing attributes into the backing graphml XML, without modifying XML Document. 30 | * 31 | * @author Soren A. Davidsen 32 | */ 33 | public interface AttributeSetter { 34 | /** 35 | * Set an attribute in the backing graphml XML. 36 | * @param clazz the class of the value being stored 37 | * @param key the attribute name 38 | * @param value the attribute value 39 | * @param <T> the value type 40 | */ 41 | public void set(Class clazz, String key, T value); 42 | } 43 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/graphml/AttributeType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012, Søren Atmakuri Davidsen 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 |
package edu.stonybrook.focused.io.graphml;

/**
 * Representation of attr.type field.
 * <p>
 * Each constant is named {@code type_} followed by the value written to the XML,
 * which {@link #xmlValue()} extracts.
 *
 * @author Soren A. Davidsen
 */
public enum AttributeType {

    // 'attr.type' of <key>. Values of this attribute ('boolean', 'int', 'long', 'float', 'double', and 'string')
    type_boolean,
    type_int,
    type_long,
    type_float,
    type_double,
    type_string;

    /** Prefix shared by every constant name; stripped to obtain the XML value. */
    private static final String NAME_PREFIX = "type_";

    /**
     * @return the constant's name without the leading {@code type_}, e.g. {@code "int"}
     */
    public String xmlValue() {
        return name().substring(NAME_PREFIX.length());
    }
}

-------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/graphml/CommunityOutlierGraphMLExporter.java: --------------------------------------------------------------------------------
package edu.stonybrook.focused.io.graphml;

import edu.stonybrook.focused.community.ICommunity;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultWeightedEdge;

import java.util.HashSet;

/**
 * GraphML exporter that writes each edge's weight and tags each vertex with a
 * "Label" (its id) and a "Community" role.
 * <p>
 * NOTE(review): the generic type arguments were lost in extraction and are reconstructed
 * from usage; in particular {@code GraphMLExporter<Integer, DefaultWeightedEdge>} assumes
 * the base class is parameterised as (vertex, edge) -- confirm against GraphMLExporter.
 * <p>
 * Author: Bryan Perozzi
 */
public class CommunityOutlierGraphMLExporter extends GraphMLExporter<Integer, DefaultWeightedEdge> {

    /**
     * Registers the edge and vertex attribute providers.
     *
     * @param graph     source of the edge weights
     * @param community vertices in it are tagged "Member" (checked first)
     * @param outliers  vertices in it are tagged "Outlier" (checked second)
     * @param inliers   vertices in it are tagged "Other Outlier" (checked third);
     *                  everything else is tagged "Unassigned"
     */
    public CommunityOutlierGraphMLExporter(final Graph<Integer, DefaultWeightedEdge> graph,
                                           final ICommunity community,
                                           final HashSet<Integer> outliers,
                                           final HashSet<Integer> inliers) {
        this.edgeAttributeProvider(new AttributeProvider<DefaultWeightedEdge>() {
            @Override
            public void provide(DefaultWeightedEdge obj, AttributeSetter setter) {
                setter.set(Double.class, "weight", graph.getEdgeWeight(obj));
            }
        });

        this.vertexAttributeProvider(new AttributeProvider<Integer>() {
            @Override
            public void provide(Integer obj, AttributeSetter setter) {
                setter.set(String.class, "Label", obj.toString());

                // First matching set wins, in the order community, outliers, inliers.
                String role;
                if (community.contains(obj)) {
                    role = "Member";
                } else if (outliers.contains(obj)) {
                    role = "Outlier";
                } else if (inliers.contains(obj)) {
                    role = "Other Outlier";
                } else {
                    role = "Unassigned";
                }
                setter.set(String.class, "Community", role);
            }
        });
    }
}
--------------------------------------------------------------------------------
/java_src/src/main/java/edu/stonybrook/focused/io/graphml/ContinousNumericIDProviders.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012, Søren Atmakuri Davidsen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | package edu.stonybrook.focused.io.graphml; 27 | 28 | import org.jgrapht.ext.EdgeNameProvider; 29 | import org.jgrapht.ext.VertexNameProvider; 30 | 31 | import java.util.HashMap; 32 | import java.util.Map; 33 | 34 | /** 35 | * @author Soren A. 
Davidsen 36 | */ 37 | public class ContinousNumericIDProviders { 38 | 39 | public static class ContinousNumericVertexNameProvider implements VertexNameProvider { 40 | private Map map = new HashMap(); 41 | private int i = 0; 42 | @Override 43 | public String getVertexName(V v) { 44 | if (!map.containsKey(v)) 45 | map.put(v, i++); 46 | 47 | return "n" + map.get(v); 48 | } 49 | } 50 | 51 | public static class ContinousNumericEdgeNameProvider implements EdgeNameProvider { 52 | private Map map = new HashMap(); 53 | private int i = 0; 54 | @Override 55 | public String getEdgeName(E e) { 56 | if (!map.containsKey(e)) 57 | map.put(e, i++); 58 | 59 | return "e" + map.get(e); 60 | } 61 | } 62 | 63 | public static class IntegerVertexNameProvider implements VertexNameProvider { 64 | @Override 65 | public String getVertexName(Integer v) { 66 | return "n" + v; 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/io/graphml/GraphMLExporter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012, Søren Atmakuri Davidsen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 
13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | */ 25 | 26 | package edu.stonybrook.focused.io.graphml; 27 | 28 | import com.google.common.collect.Lists; 29 | import org.jgrapht.DirectedGraph; 30 | import org.jgrapht.Graph; 31 | import org.jgrapht.ext.EdgeNameProvider; 32 | import org.jgrapht.ext.VertexNameProvider; 33 | 34 | import javax.xml.stream.XMLOutputFactory; 35 | import javax.xml.stream.XMLStreamException; 36 | import javax.xml.stream.XMLStreamWriter; 37 | import java.io.OutputStream; 38 | import java.util.*; 39 | 40 | /** 41 | * Class for exporting to GraphML. This exporter supports additional attributes of edges and 42 | * vertices to be exported. 43 | * See {@link "http://graphml.graphdrawing.org/primer/graphml-primer.html"} for 44 | * more information on GraphML. 45 | *
    46 | *
  1. Create the exporter. Default is to use "n0..n10" for node IDs, and "e0..e10" for edge IDs.
  2. 47 | *
  3. To make more meaningful IDs use {@link GraphMLExporter#edgeIDProvider(org.jgrapht.ext.EdgeNameProvider)} and {@link GraphMLExporter#vertexIDProvider}
  4. 48 | *
  5. To map GraphML supported attributes, use {@link GraphMLExporter#edgeAttributeProvider(AttributeProvider)} and {@link GraphMLExporter#vertexAttributeProvider(AttributeProvider)}
  6. 49 | *
50 | * 51 | * @author Soren 52 | * @see org.jgrapht.ext.GraphMLExporter 53 | */ 54 | public class GraphMLExporter { 55 | 56 | public static class DummyAttributeProvider implements AttributeProvider { 57 | @Override 58 | public void provide(T obj, AttributeSetter setter) { 59 | // do nothing 60 | } 61 | } 62 | 63 | private static String PREFIX_NODE = "node+"; 64 | private static String PREFIX_EDGE = "edge+"; 65 | 66 | public static String GRAPHML_NS = "http://graphml.graphdrawing.org/xmlns"; 67 | public static String GRAPHML_SCHEMALOCATION = "http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd"; 68 | 69 | private VertexNameProvider vertexIDProvider = new ContinousNumericIDProviders.ContinousNumericVertexNameProvider(); 70 | private EdgeNameProvider edgeIDProvider = new ContinousNumericIDProviders.ContinousNumericEdgeNameProvider(); 71 | 72 | private AttributeProvider vertexAttributeProvider = new DummyAttributeProvider(); 73 | private AttributeProvider edgeAttributeProvider = new DummyAttributeProvider(); 74 | 75 | private Map dMap = new HashMap(); 76 | 77 | /** 78 | * Specify a provider for the "id" attribute of the "node" tag. Perform mapping between the node and the ID representing 79 | * the node in the GraphML XML format. 80 | * 81 | * @param vertexIDProvider 82 | * @return 83 | * @see ContinousNumericIDProviders.ContinousNumericVertexNameProvider 84 | * @see org.jgrapht.ext.StringNameProvider 85 | */ 86 | public GraphMLExporter vertexIDProvider(VertexNameProvider vertexIDProvider) { 87 | this.vertexIDProvider = vertexIDProvider; 88 | return this; 89 | } 90 | 91 | /** 92 | * Specify a provider for the "id" attribute of the "edge" tag. Performs mapping between edge and the ID representing the edge 93 | * in the GraphML XML format. 
94 | * 95 | * @param edgeIDProvider 96 | * @return 97 | * @see ContinousNumericIDProviders.ContinousNumericEdgeNameProvider 98 | * @see EdgeNameProvider 99 | */ 100 | public GraphMLExporter edgeIDProvider(EdgeNameProvider edgeIDProvider) { 101 | this.edgeIDProvider = edgeIDProvider; 102 | return this; 103 | } 104 | 105 | public GraphMLExporter vertexAttributeProvider(AttributeProvider vertexAttributeProvider) { 106 | this.vertexAttributeProvider = vertexAttributeProvider; 107 | return this; 108 | } 109 | 110 | public GraphMLExporter edgeAttributeProvider(AttributeProvider edgeAttributeProvider) { 111 | this.edgeAttributeProvider = edgeAttributeProvider; 112 | return this; 113 | } 114 | 115 | private String mapGraphMLType(Class clazz) { 116 | if (clazz.isAssignableFrom(Boolean.class)) 117 | return "boolean"; 118 | else if (clazz.isAssignableFrom(Double.class)) 119 | return "double"; 120 | else if (clazz.isAssignableFrom(Integer.class)) 121 | return "int"; 122 | else if (clazz.isAssignableFrom(Long.class)) 123 | return "long"; 124 | else if (clazz.isAssignableFrom(Float.class)) 125 | return "float"; 126 | else if (clazz.isAssignableFrom(String.class)) 127 | return "string"; 128 | else 129 | throw new RuntimeException("Unsupported attribute type: " + clazz + " (supported: Boolean, Double, Integer, Long, Float, String)"); 130 | } 131 | 132 | private void declareKey(XMLStreamWriter xmlw, String id, String keyFor, String attrName, Class attrType) throws XMLStreamException { 133 | 134 | // declare the weight attribute 135 | xmlw.writeStartElement("key"); 136 | xmlw.writeAttribute("id", id); 137 | xmlw.writeAttribute("for", keyFor); 138 | xmlw.writeAttribute("attr.name", attrName); 139 | xmlw.writeAttribute("attr.type", mapGraphMLType(attrType)); 140 | xmlw.writeEndElement(); 141 | } 142 | 143 | private void declareData(XMLStreamWriter xmlw, String prefix, Map dMap, Map values) throws XMLStreamException { 144 | 145 | for (Map.Entry value : values.entrySet()) { 146 | 
xmlw.writeStartElement("data"); 147 | xmlw.writeAttribute("key", dMap.get(prefix + value.getKey())); 148 | xmlw.writeCharacters(value.getValue()); 149 | xmlw.writeEndElement(); 150 | } 151 | 152 | } 153 | 154 | private void declareVertex(XMLStreamWriter xmlw, V v, Map dMap, Map values) throws XMLStreamException { 155 | 156 | String id = vertexIDProvider.getVertexName(v); 157 | 158 | xmlw.writeStartElement("node"); 159 | xmlw.writeAttribute("id", id); 160 | declareData(xmlw, PREFIX_NODE, dMap, values); 161 | xmlw.writeEndElement(); 162 | } 163 | 164 | private void declareEdge(XMLStreamWriter xmlw, Graph graph, E e, Map dMap, Map values) throws XMLStreamException { 165 | 166 | String id = edgeIDProvider.getEdgeName(e); 167 | 168 | V src = graph.getEdgeSource(e); 169 | V dst = graph.getEdgeTarget(e); 170 | 171 | String srcID = vertexIDProvider.getVertexName(src); 172 | String dstID = vertexIDProvider.getVertexName(dst); 173 | 174 | xmlw.writeStartElement("edge"); 175 | xmlw.writeAttribute("id", id); 176 | xmlw.writeAttribute("source", srcID); 177 | xmlw.writeAttribute("target", dstID); 178 | 179 | declareData(xmlw, PREFIX_EDGE, dMap, values); 180 | 181 | xmlw.writeEndElement(); 182 | } 183 | 184 | /** 185 | * Export a graph using a writer to represent the output. 186 | * 187 | * @param w the writer where output goes into 188 | * @param graph the graph to export 189 | */ 190 | public void export(OutputStream w, Graph graph) { 191 | 192 | // Create an output factory 193 | XMLOutputFactory xmlof = XMLOutputFactory.newInstance(); 194 | // Set namespace prefix defaulting for all created writers 195 | // xmlof.setProperty("javax.xml.stream.isPrefixDefaulting", Boolean.TRUE); 196 | 197 | String edgeDefault = (graph instanceof DirectedGraph) ? 
"directed" : "undirected"; 198 | 199 | // process nodes and attribute mappings 200 | final Map nodeTypeMap = new HashMap(); 201 | final Map> nodeXml = new HashMap>(); 202 | for (V v : graph.vertexSet()) { 203 | final Map thisNodeXml = new HashMap(); 204 | this.vertexAttributeProvider.provide(v, new AttributeSetter() { 205 | @Override 206 | public void set(Class clazz, String key, T value) { 207 | thisNodeXml.put(key, value.toString()); 208 | if (!nodeTypeMap.containsKey(key)) nodeTypeMap.put(key, clazz); 209 | } 210 | }); 211 | nodeXml.put(v, thisNodeXml); 212 | } 213 | 214 | // process edges and attribute mappings 215 | final Map edgeTypeMap = new HashMap(); 216 | final Map> edgeXml = new HashMap>(); 217 | for (E e : graph.edgeSet()) { 218 | final Map thisEdgeXml = new HashMap(); 219 | this.edgeAttributeProvider.provide(e, new AttributeSetter() { 220 | @Override 221 | public void set(Class clazz, String key, T value) { 222 | thisEdgeXml.put(key, value.toString()); 223 | if (!edgeTypeMap.containsKey(key)) edgeTypeMap.put(key, clazz); 224 | } 225 | }); 226 | edgeXml.put(e, thisEdgeXml); 227 | } 228 | 229 | try { 230 | 231 | // Create an XML stream writer 232 | XMLStreamWriter xmlwP = xmlof.createXMLStreamWriter(w, "UTF-8"); 233 | // IndentingXMLStreamWriter xmlw = new IndentingXMLStreamWriter(xmlwP); 234 | XMLStreamWriter xmlw = xmlwP; 235 | 236 | // Write XML prologue 237 | xmlw.writeStartDocument("utf-8", "1.0"); 238 | 239 | // Now start with root element 240 | xmlw.writeStartElement("graphml"); 241 | xmlw.writeNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance"); 242 | xmlw.setPrefix("xsi", "http://www.w3.org/2001/XMLSchema-instance"); 243 | xmlw.writeDefaultNamespace(GRAPHML_NS); 244 | xmlw.writeAttribute("xsi:schemaLocation", GRAPHML_SCHEMALOCATION); 245 | 246 | // d-idx 247 | int d = 0; 248 | for (String attributeName : edgeTypeMap.keySet()) { 249 | String key = PREFIX_EDGE + attributeName; 250 | if (!dMap.containsKey(key)) 251 | dMap.put(key, "d" + 
(d++)); 252 | String id = dMap.get(key); 253 | declareKey(xmlw, id, "edge", attributeName, edgeTypeMap.get(attributeName)); 254 | } 255 | 256 | for (String attributeName : nodeTypeMap.keySet()) { 257 | String key = PREFIX_NODE + attributeName; 258 | if (!dMap.containsKey(key)) 259 | dMap.put(key, "d" + (d++)); 260 | String id = dMap.get(key); 261 | declareKey(xmlw, id, "node", attributeName, nodeTypeMap.get(attributeName)); 262 | } 263 | 264 | // start the graph 265 | xmlw.writeStartElement("graph"); 266 | xmlw.writeAttribute("id", "G"); 267 | xmlw.writeAttribute("edgedefault", edgeDefault); 268 | 269 | // write vertices 270 | // write vertices in sorted order 271 | ArrayList vertexList = Lists.newArrayList(graph.vertexSet()); 272 | Collections.sort(vertexList, new Comparator() { 273 | @Override 274 | public int compare(V o1, V o2) { 275 | return o1.compareTo(o2); 276 | } 277 | }); 278 | 279 | for (V v : vertexList) 280 | declareVertex(xmlw, v, dMap, nodeXml.get(v)); 281 | 282 | for (E e : graph.edgeSet()) { 283 | declareEdge(xmlw, graph, e, dMap, edgeXml.get(e)); 284 | } 285 | 286 | // Write document end. 
This closes all open structures 287 | xmlw.writeEndDocument(); 288 | // Close the writer to flush the output 289 | xmlw.close(); 290 | } catch (XMLStreamException e) { 291 | throw new RuntimeException("Error writing: " + e.getMessage(), e); 292 | } 293 | 294 | } 295 | 296 | 297 | } -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/main/CommunityClusterer.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.main; 2 | 3 | import com.google.common.collect.Lists; 4 | import com.google.common.collect.Sets; 5 | import edu.stonybrook.focused.community.*; 6 | import edu.stonybrook.focused.io.ascii.Clustering; 7 | import edu.stonybrook.focused.io.ascii.Outliers; 8 | import edu.stonybrook.focused.io.ascii.WeightedEdgeList; 9 | import org.jgrapht.Graph; 10 | import org.jgrapht.graph.DefaultWeightedEdge; 11 | import org.kohsuke.args4j.CmdLineException; 12 | import org.kohsuke.args4j.CmdLineParser; 13 | 14 | import java.io.File; 15 | import java.io.FileReader; 16 | import java.io.FileWriter; 17 | import java.io.FilenameFilter; 18 | import java.util.*; 19 | 20 | /** 21 | * Author: Bryan Perozzi 22 | * 23 | 24 | */ 25 | public class CommunityClusterer { 26 | 27 | public final static double OUTLIER_DETECTION_THRESHOLD = 1.0; 28 | 29 | public static double OVERLAP_THRESHOLD = 0.5; 30 | 31 | public static List> GreedyCommunityMerge(List communities, Graph graph, ArrayList outliers) { 32 | HashSet used = new HashSet(); 33 | ArrayList> ret = new ArrayList>(); 34 | int CONFLICTING_VERTEX_CNT = 0; 35 | 36 | ArrayList sortedCommunities = new ArrayList(communities); 37 | Collections.sort(sortedCommunities, new Comparator() { 38 | @Override 39 | public int compare(ICommunity o1, ICommunity o2) { 40 | return Double.compare(o1.getConductance(), o2.getConductance()); 41 | } 42 | }); 43 | 44 | // add outliers 45 | HashSet outlierSet = new HashSet(); 
46 | for (Outlier i : outliers) { 47 | outlierSet.add(i.id); 48 | used.add(i.id); 49 | } 50 | ret.add(outlierSet); 51 | 52 | // now sorted from lowest to highest conductance 53 | for (ICommunity c : sortedCommunities) { 54 | HashSet temp = new HashSet(); 55 | for (Integer i : c) { 56 | if (!used.contains(i)) { 57 | temp.add(i); 58 | used.add(i); 59 | } else { 60 | CONFLICTING_VERTEX_CNT++; 61 | } 62 | } 63 | if (temp.size() > 0) { 64 | ret.add(temp); 65 | } 66 | } 67 | 68 | HashSet unusedNodes = new HashSet(graph.vertexSet()); 69 | unusedNodes.removeAll(used); 70 | 71 | for (Integer i : unusedNodes) { 72 | HashSet temp = new HashSet(); 73 | temp.add(i); 74 | ret.add(temp); 75 | } 76 | 77 | System.err.println("Conflicting vertex cnt: " + CONFLICTING_VERTEX_CNT); 78 | System.err.println("Orphan vertex cnt: " + unusedNodes.size()); 79 | 80 | return ret; 81 | } 82 | /** Sorts communities by ascending conductance and drops each one whose Jaccard overlap with an earlier kept community exceeds OVERLAP_THRESHOLD. */ 83 | public static List RemoveOverlappingCommunity(List communities) { 84 | ArrayList sortedCommunities = new ArrayList(communities); 85 | Collections.sort(sortedCommunities, new Comparator() { 86 | @Override 87 | public int compare(ICommunity o1, ICommunity o2) { 88 | return Double.compare(o1.getConductance(), o2.getConductance()); 89 | } 90 | }); 91 | 92 | HashSet toRemove = new HashSet(); 93 | HashSet toKeep = new HashSet(); 94 | for (int i = 0; i < sortedCommunities.size(); i++) { 95 | if (!toRemove.contains(i)) { 96 | for (int j = i + 1; j < sortedCommunities.size(); j++) { 97 | HashSet x = Sets.newHashSet(sortedCommunities.get(i)); 98 | x.retainAll(sortedCommunities.get(j)); 99 | // Jaccard = |intersection| / |union|; cast to double so the ratio is not truncated to 0 or 1 by integer division 100 | if (((double) x.size() / (sortedCommunities.get(i).size() + sortedCommunities.get(j).size() - x.size())) > OVERLAP_THRESHOLD) { 101 | toRemove.add(j); 102 | } 103 | } 104 | toKeep.add(i); 105 | } 106 | } 107 | 108 | ArrayList toReturn = new ArrayList(); 109 | for (Integer x : toKeep) { 110 | toReturn.add(sortedCommunities.get(x)); 111 | } 112 | 113 | return toReturn; 114 | } 115 | 116 | // public static void
PrintOverlappingCommunities(List communities) { 117 | // ArrayList sortedCommunities = new ArrayList(communities); 118 | // Collections.sort(sortedCommunities, new Comparator() { 119 | // @Override 120 | // public int compare(ICommunity o1, ICommunity o2) { 121 | // return Double.compare(o1.getConductance(), o2.getConductance()); 122 | // } 123 | // }); 124 | // 125 | // System.err.print("Scanning for overlapping communities...."); 126 | // 127 | // HashSet toRemove = new HashSet(); 128 | // for (int i = 0; i < sortedCommunities.size(); i++) { 129 | // if (!toRemove.contains(i)) { 130 | // for (int j = i + 1; j < sortedCommunities.size(); j++) { 131 | // HashSet x = Sets.newHashSet(sortedCommunities.get(i)); 132 | // x.retainAll(sortedCommunities.get(j)); 133 | // 134 | // if (x.size() > 0) { 135 | // System.out.println("Community " + i + " overlaps with community " + j); 136 | // System.out.println(sortedCommunities.get(i)); 137 | // System.out.println(sortedCommunities.get(j)); 138 | // } 139 | // } 140 | // } 141 | // } 142 | // } 143 | 144 | 145 | public static String[] getDirectories(String path) { 146 | File file = new File(path); 147 | String[] directories = file.list(new FilenameFilter() { 148 | @Override 149 | public boolean accept(File dir, String name) { 150 | return new File(dir, name).isDirectory(); 151 | } 152 | }); 153 | return directories; 154 | } 155 | 156 | public void processOneFile(String input_file, String output_dir, boolean treatNonCommunityAsOutlier, String outputPostfix, boolean debug_Grapml) throws Exception { 157 | 158 | System.err.println("Performing task: " + input_file); 159 | 160 | BookkeepingWeightedGraph graph = (BookkeepingWeightedGraph) WeightedEdgeList.read(new FileReader(new File(input_file))); 161 | 162 | Date startTime = new Date(); 163 | 164 | String outputFilePrefix = new File(output_dir, "task.").toString(); 165 | CommunityHolder communityHolder = new CommunityHolder(outputFilePrefix, graph, debug_Grapml, OPTIONS).invoke(); 
166 | 167 | HashMap outlierHashMap = new HashMap(); 168 | HashSet verticesInCommunities = new HashSet(); 169 | for (LocalCommunityBuilder builder : communityHolder.getCommunityBuildiers()) { 170 | for (Integer i : builder.getOutliers()) { 171 | if (!outlierHashMap.containsKey(i)) { 172 | outlierHashMap.put(i, new Outlier(i)); 173 | } 174 | outlierHashMap.get(i).votesOutlier++; 175 | } 176 | } 177 | 178 | for (LocalCommunityBuilder builder : communityHolder.getCommunityBuildiers()) { 179 | for (Integer i : builder.getFocusedCommunity()) { 180 | if (outlierHashMap.containsKey(i)) { 181 | outlierHashMap.get(i).votes++; 182 | } 183 | } 184 | } 185 | 186 | System.err.println("Focused communities found: " + communityHolder.getFocusedCommunties().size()); 187 | 188 | // print communities that overlap 189 | // PrintOverlappingCommunities(communityHolder.getFocusedCommunties()); 190 | 191 | 192 | // remove communities that overlap too much 193 | List communities = RemoveOverlappingCommunity(communityHolder.getFocusedCommunties()); 194 | System.err.println("Non-overlapping focused communities found: " + communities.size()); 195 | 196 | for (ICommunity community : communities) { 197 | // System.err.println("Focused community found:\n" + community); 198 | verticesInCommunities.addAll(community); 199 | } 200 | 201 | // did any vertices not make it into communities? 
202 | if (treatNonCommunityAsOutlier) { 203 | for (Integer i : graph.vertexSet()) { 204 | if (!verticesInCommunities.contains(i)) { 205 | if (!outlierHashMap.containsKey(i)) { 206 | outlierHashMap.put(i, new Outlier(i)); 207 | outlierHashMap.get(i).votesOutlier++; 208 | } 209 | outlierHashMap.get(i).notInCommunity++; 210 | } 211 | } 212 | } else { 213 | UnweightedCommunity catchAllCommunity = new UnweightedCommunity(graph); 214 | for (Integer i : graph.vertexSet()) { 215 | if (!verticesInCommunities.contains(i)) { 216 | if (outlierHashMap.containsKey(i)) { 217 | outlierHashMap.get(i).notInCommunity++; 218 | } 219 | catchAllCommunity.add(i); 220 | } 221 | } 222 | communities.add(catchAllCommunity); 223 | } 224 | 225 | 226 | ArrayList outliers = Lists.newArrayList(outlierHashMap.values()); 227 | Collections.sort(outliers, new Comparator() { 228 | @Override 229 | public int compare(Outlier o1, Outlier o2) { 230 | return -1 * Double.compare(o1.outlierRatio(), o2.outlierRatio()); 231 | } 232 | }); 233 | System.err.println("Outliers: " + outliers); 234 | 235 | // remove outliers from communities if their outlier RATIO is greater than OUTLIER_DETECTION_THRESHOLD 236 | for (ICommunity community : communities) { 237 | ArrayList toRemove = new ArrayList(); 238 | for (Integer i : community) { 239 | if (outlierHashMap.containsKey(i)) { 240 | if (outlierHashMap.get(i).outlierRatio() >= OUTLIER_DETECTION_THRESHOLD) { 241 | toRemove.add(i); 242 | } 243 | } 244 | } 245 | for (Integer i : toRemove) { 246 | community.remove(i); 247 | } 248 | } 249 | 250 | // remove outliers that don't meet the threshold 251 | ArrayList toKeep = new ArrayList(); 252 | for (Outlier i : outliers) { 253 | if (i.outlierRatio() >= OUTLIER_DETECTION_THRESHOLD) { 254 | toKeep.add(i); 255 | } 256 | } 257 | outliers = toKeep; 258 | 259 | List> clustering = GreedyCommunityMerge(communities, graph, outliers); 260 | 261 | Date stopTime = new Date(); 262 | System.err.println("time:" + (stopTime.getTime() - 
startTime.getTime())); 263 | System.out.println((stopTime.getTime() - startTime.getTime())); 264 | 265 | System.err.println("Clustering: " + clustering); 266 | Clustering.write(new FileWriter(outputFilePrefix + "guess.clustering" + outputPostfix), clustering); 267 | Outliers.write(new FileWriter(outputFilePrefix + "guess.outliers" + outputPostfix), outliers); 268 | } 269 | 270 | // Container to hold all options 271 | FocuscoOptions OPTIONS = new FocuscoOptions(); 272 | 273 | public static void main(String[] args) throws Exception { 274 | new CommunityClusterer().doMain(args); 275 | } 276 | 277 | public void doMain(String[] args) throws Exception { 278 | 279 | 280 | CmdLineParser parser = new CmdLineParser(OPTIONS); 281 | 282 | parser.setUsageWidth(80); 283 | 284 | try { 285 | parser.parseArgument(args); 286 | } catch (CmdLineException e) { 287 | System.err.println(e.getMessage()); 288 | System.err.println("java CommunityClusterer [options...]"); 289 | 290 | // print the list of available options 291 | parser.printUsage(System.err); 292 | System.err.println(); 293 | return; 294 | } 295 | 296 | processOneFile(OPTIONS.inputGraph, OPTIONS.outputDirectory, false, OPTIONS.outputPostfix, OPTIONS.debug_graphml); 297 | } 298 | } 299 | -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/main/CommunityHolder.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.main; 2 | 3 | import com.google.common.collect.Sets; 4 | import edu.stonybrook.focused.community.BookkeepingWeightedGraph; 5 | import edu.stonybrook.focused.community.GreedyLocalCommunityBuilder; 6 | import edu.stonybrook.focused.community.ICommunity; 7 | import edu.stonybrook.focused.community.LocalCommunityBuilder; 8 | import edu.stonybrook.focused.io.graphml.CommunityOutlierGraphMLExporter; 9 | import edu.stonybrook.focused.io.graphml.ContinousNumericIDProviders; 10 | import 
org.jgrapht.Graph; 11 | import org.jgrapht.Graphs; 12 | import org.jgrapht.UndirectedGraph; 13 | import org.jgrapht.alg.ConnectivityInspector; 14 | import org.jgrapht.graph.DefaultWeightedEdge; 15 | import org.jgrapht.graph.SimpleWeightedGraph; 16 | import umontreal.iro.lecuyer.probdist.NormalDistQuick; 17 | 18 | import java.io.BufferedOutputStream; 19 | import java.io.FileOutputStream; 20 | import java.io.IOException; 21 | import java.util.*; 22 | 23 | /** 24 | * Author: Bryan Perozzi 25 | * 26 | 27 | */ 28 | public class CommunityHolder { 29 | 30 | public boolean from_same_distribution(double testValue, double mean, double variance, double p_value) { 31 | return NormalDistQuick.cdf(mean, variance, testValue) > p_value; 32 | } 33 | 34 | public static UndirectedGraph getSubgraph(Graph graph, List highestWeightedEdges) { 35 | UndirectedGraph seedGraph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 36 | for (DefaultWeightedEdge edge : highestWeightedEdges) { 37 | Graphs.addEdgeWithVertices(seedGraph, graph.getEdgeSource(edge), graph.getEdgeTarget(edge), graph.getEdgeWeight(edge)); 38 | } 39 | 40 | return seedGraph; 41 | } 42 | 43 | public List getTopEdges(final Graph graph, double percent) { 44 | 45 | int top_k = (int) (graph.edgeSet().size() * (percent / 100)) + 1; 46 | 47 | return getTopKEdges(graph, top_k); 48 | } 49 | /** Returns the top_k highest-weight edges of graph in ascending weight order; all edges are returned when top_k exceeds the edge count. */ 50 | public List getTopKEdges(final Graph graph, int top_k) { 51 | 52 | ArrayList edgeList = new ArrayList(graph.edgeSet()); 53 | Collections.sort(edgeList, new Comparator() { 54 | @Override 55 | public int compare(DefaultWeightedEdge o1, DefaultWeightedEdge o2) { 56 | Double e1 = graph.getEdgeWeight(o1); 57 | Double e2 = graph.getEdgeWeight(o2); 58 | 59 | return Double.compare(e1, e2); 60 | } 61 | }); 62 | // clamp the start index: callers add 1 to a percentage of |E|, so top_k can exceed |E| and subList would throw on a negative index 63 | return edgeList.subList(Math.max(0, edgeList.size() - top_k), edgeList.size()); 64 | } 65 | 66 | public List minEdgesToKeep(final Graph graph) { 67 | int top_k = (int) (graph.edgeSet().size() * (OPTIONS.PERCENT_OF_EDGES_TO_KEEP / 100)) + 1; 68 |
69 | System.err.println(OPTIONS.PERCENT_OF_EDGES_TO_KEEP + "% of edges: " + top_k); 70 | 71 | return getTopKEdges(graph, Math.min(top_k, OPTIONS.TOP_K_EDGES_TO_KEEP)); 72 | } 73 | /** Sorts edges by descending weight, takes the first INITIAL_SET_BOOTSTRAP_SIZE of them, then keeps extending the prefix while from_same_distribution accepts the next weight against the running mean. */ 74 | public List getTopEdgesByDistributionFitting(final Graph graph) { 75 | 76 | ArrayList edgeList = new ArrayList(graph.edgeSet()); 77 | Collections.sort(edgeList, new Comparator() { 78 | @Override 79 | public int compare(DefaultWeightedEdge o1, DefaultWeightedEdge o2) { 80 | Double e1 = graph.getEdgeWeight(o1); 81 | Double e2 = graph.getEdgeWeight(o2); 82 | 83 | return -1 * Double.compare(e1, e2); 84 | } 85 | }); 86 | 87 | System.err.println("Initial edge bootstrap size: " + OPTIONS.INITIAL_SET_BOOTSTRAP_SIZE); 88 | 89 | double initial_set_sum = 0; 90 | double initial_set_mean = 0; 91 | int initial_set_cutoff = 0; 92 | for (int x = 0; x < edgeList.size(); x++) { 93 | double weight = graph.getEdgeWeight(edgeList.get(x)); 94 | if (x < OPTIONS.INITIAL_SET_BOOTSTRAP_SIZE) { 95 | initial_set_sum += weight; 96 | initial_set_mean = initial_set_sum / (x + 1); 97 | initial_set_cutoff = x; 98 | } else { 99 | if (from_same_distribution(weight, initial_set_mean, OPTIONS.INITIAL_SET_EDGE_VARIANCE, OPTIONS.INITIAL_SET_P_VALUE)) { 100 | initial_set_sum += weight; 101 | initial_set_mean = initial_set_sum / (x + 1); 102 | initial_set_cutoff = x; 103 | } else { 104 | break; 105 | } 106 | } 107 | } 108 | if (edgeList.isEmpty()) return edgeList; // edgeless graph: the get(0) calls below would throw IndexOutOfBoundsException 109 | System.err.println("Starting edge cutoff weight: " + graph.getEdgeWeight(edgeList.get(0))); 110 | System.err.println("Ending edge cutoff weight: " + graph.getEdgeWeight(edgeList.get(initial_set_cutoff))); 111 | // NOTE(review): subList's end index is exclusive, so the accepted edge at initial_set_cutoff (logged above) is not returned - confirm this is intended 112 | return edgeList.subList(0, initial_set_cutoff); 113 | } 114 | 115 | private String outputFilePrefix; 116 | private BookkeepingWeightedGraph graph; 117 | private ArrayList structuralCommunities; 118 | private ArrayList focusedCommunties; 119 | private boolean debug_Graphml = false; 120 | ArrayList builders = new ArrayList(); 121 | 122 | FocuscoOptions OPTIONS; 123 | 124 | public
CommunityHolder(String outputFilePrefix, BookkeepingWeightedGraph graph, boolean debug_Graphml, FocuscoOptions myOptions) { 125 | this.outputFilePrefix = outputFilePrefix; 126 | this.graph = graph; 127 | this.debug_Graphml = debug_Graphml; 128 | this.OPTIONS = myOptions; 129 | } 130 | 131 | public ArrayList getStructuralCommunities() { 132 | return structuralCommunities; 133 | } 134 | 135 | public ArrayList getFocusedCommunties() { 136 | return focusedCommunties; 137 | } 138 | 139 | public ArrayList getCommunityBuildiers() { 140 | return builders; 141 | } 142 | 143 | public CommunityHolder invoke() throws IOException { 144 | // initialize the seed sets to perform expansion around 145 | 146 | List> connectedSets = new ArrayList>(); 147 | 148 | if (OPTIONS.SEED_SET_CSV.isEmpty()) { 149 | System.err.println("Finding candidate seed sets by distribution fitting."); 150 | 151 | // get top edges by fitting a distribution to the top ones and grabbing them 152 | List highestWeightedEdges = getTopEdgesByDistributionFitting(graph); 153 | 154 | System.err.println("Top edges saved: " + highestWeightedEdges.size()); 155 | 156 | // make a lil graph from G and these edges 157 | UndirectedGraph seedGraph = getSubgraph(graph, highestWeightedEdges); 158 | 159 | // get connected components 160 | ConnectivityInspector inspector = new ConnectivityInspector(seedGraph); 161 | 162 | connectedSets = inspector.connectedSets(); 163 | } else { 164 | System.err.println("Using manually entered seed set."); 165 | 166 | // a seed set has been manually specified on the command line 167 | HashSet manual_seed_set = Sets.newHashSet(); 168 | for (String seed_node : OPTIONS.SEED_SET_CSV.split(",")) { 169 | manual_seed_set.add(Integer.parseInt(seed_node)); 170 | } 171 | 172 | connectedSets.add(manual_seed_set); 173 | } 174 | 175 | System.err.println("Number of connected components: " + connectedSets.size()); 176 | 177 | // for each connected component, try to build a community 178 | structuralCommunities 
= new ArrayList(); 179 | focusedCommunties = new ArrayList(); 180 | 181 | long community_size = 0; 182 | 183 | for (Set seedSet : connectedSets) { 184 | GreedyLocalCommunityBuilder builder = new GreedyLocalCommunityBuilder(graph, graph, seedSet, OPTIONS.INTRACLUSTER_EDGE_VARIANCE); 185 | 186 | System.err.println("Conductance:" + builder.getFocusedCommunity().getConductance() + "; size: " + builder.getFocusedCommunity().size()); 187 | 188 | // XXX: beware min conductance cutoff in dense graphs 189 | if (builder.getFocusedCommunity().getConductance() <= OPTIONS.MIN_CONDUCTANCE_CUTOFF) { 190 | if (debug_Graphml) { 191 | CommunityOutlierGraphMLExporter exporter = new CommunityOutlierGraphMLExporter(graph, builder.getFocusedCommunity(), builder.getOutliers(), builder.getInliers()); 192 | exporter.vertexIDProvider(new ContinousNumericIDProviders.IntegerVertexNameProvider()); 193 | BufferedOutputStream graphmlOut = new BufferedOutputStream(new FileOutputStream(outputFilePrefix + "debug.focused.community-" + builders.size() + ".size-" + builder.getFocusedCommunity().size() + ".graphml")); 194 | try { 195 | exporter.export(graphmlOut, graph); 196 | } finally { 197 | // export() closes only its XMLStreamWriter, which leaves the underlying stream open; release the file handle here 198 | graphmlOut.close(); 199 | } 200 | } 201 | 202 | builders.add(builder); 203 | structuralCommunities.add(builder.getStructuralCommunity()); 204 | focusedCommunties.add(builder.getFocusedCommunity()); 205 | community_size += builder.getFocusedCommunity().size(); 206 | } 207 | } 208 | 209 | System.err.println("Number of Communities Found" + focusedCommunties.size()); 210 | System.out.print(focusedCommunties.size() + ","); 211 | 212 | double average_size = ((double) community_size) / focusedCommunties.size(); 213 | 214 | System.err.println("Average commmunity size" + average_size); 215 | System.out.print(average_size + ","); 216 | 217 | return this; 218 | } 219 | } -------------------------------------------------------------------------------- /java_src/src/main/java/edu/stonybrook/focused/main/FocuscoOptions.java: -------------------------------------------------------------------------------- 1 | package
// File: java_src/src/main/java/edu/stonybrook/focused/main/FocuscoOptions.java
package edu.stonybrook.focused.main;

import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.Option;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
 * Command-line options holder, populated through args4j annotations.
 *
 * <p>Fields carrying an {@code @Option} annotation can be set from the command line;
 * the remaining public fields are fixed defaults that can only be changed in code.
 *
 * Author: Bryan Perozzi
 */
public class FocuscoOptions {

    // ------------------------------------------------------------------
    // Program level options
    // ------------------------------------------------------------------

    /** Value of {@code -output_postfix}; empty by default. */
    @Option(name = "-output_postfix")
    public String outputPostfix = "";

    /** Directory that result files are written to; defaults to the empty path. */
    @Option(name = "-output_directory", usage = "directory path to output result files in")
    public String outputDirectory = new File("").toString();

    /** Weighted edge list to read. Declared required, so the default is only a placeholder. */
    @Option(name = "-input", required = true, usage = "weighted input edge list")
    public String inputGraph = "task.edges";

    /** Value of {@code -emit_graphml}; off by default. */
    @Option(name = "-emit_graphml")
    public Boolean debug_graphml = false;

    // ------------------------------------------------------------------
    // FocusCO algorithm options
    // ------------------------------------------------------------------

    /** Comma-separated node ids forming a manual seed set; empty means none was given. */
    @Option(name = "-seed_set", usage = "comma separeted list of nodeids to include in a manually entered seed set")
    public String SEED_SET_CSV = "";

    /** Not exposed on the command line (no {@code @Option}). */
    public double PERCENT_OF_EDGES_TO_KEEP = 5.0;

    /** Not exposed on the command line (no {@code @Option}). */
    public double MIN_CONDUCTANCE_CUTOFF = 1.0;

    /** Not exposed on the command line (no {@code @Option}). */
    public int TOP_K_EDGES_TO_KEEP = 1000;

    @Option(name = "-intra_cluster_variance", usage = "focused cluster variance")
    public double INTRACLUSTER_EDGE_VARIANCE = 0.1;

    @Option(name = "-initial_variance", usage = "initial set variance")
    public double INITIAL_SET_EDGE_VARIANCE = 0.001;

    @Option(name = "-initial_bootstrap_size", usage = "initial set bootstrap size (in # edges)")
    public int INITIAL_SET_BOOTSTRAP_SIZE = 5;

    @Option(name = "-initial_p_val", usage = "initial set p-value")
    public double INITIAL_SET_P_VALUE = 0.05;

    /** Receives any other (positional) command line parameters. */
    @Argument
    public List<String> arguments = new ArrayList<String>();
}
/java_src/src/test/java/edu/stonybrook/focused/tests/BookkeepingGraphTests.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.tests; 2 | 3 | import edu.stonybrook.focused.community.BookkeepingWeightedGraph; 4 | import junit.framework.Assert; 5 | import org.jgrapht.Graphs; 6 | import org.junit.Test; 7 | 8 | import static edu.stonybrook.focused.tests.CommunityTests.addTriangle; 9 | 10 | /** 11 | * Author: Bryan Perozzi 12 | * 13 | 14 | */ 15 | public class BookkeepingGraphTests { 16 | 17 | @Test 18 | public void TestBasicBookkeeping1() { 19 | BookkeepingWeightedGraph graph = new BookkeepingWeightedGraph(); 20 | 21 | // triangle 1 22 | addTriangle(graph, 0, 1.0); 23 | addTriangle(graph, 3, 1.0); 24 | 25 | // bridge 26 | Graphs.addEdgeWithVertices(graph, 0, 3, 1.0); 27 | 28 | Assert.assertEquals("Weighted volume", 14.0, graph.getWeightedVolume(), 0.1); 29 | Assert.assertEquals("Outdegree of 0", 3, graph.getWeightedOutDegreeOf(0), 0.1); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /java_src/src/test/java/edu/stonybrook/focused/tests/CommunityTests.java: -------------------------------------------------------------------------------- 1 | package edu.stonybrook.focused.tests; 2 | 3 | import edu.stonybrook.focused.community.UnweightedCommunity; 4 | import edu.stonybrook.focused.community.WeightedCommunity; 5 | import edu.stonybrook.focused.community.WeightedMeansCommunity; 6 | import junit.framework.Assert; 7 | import org.jgrapht.Graph; 8 | import org.jgrapht.Graphs; 9 | import org.jgrapht.graph.DefaultWeightedEdge; 10 | import org.jgrapht.graph.SimpleWeightedGraph; 11 | import org.junit.Test; 12 | import umontreal.iro.lecuyer.probdist.NormalDistQuick; 13 | 14 | /** 15 | * Author: Bryan Perozzi 16 | * 17 | 18 | */ 19 | public class CommunityTests { 20 | 21 | public static Graph getTwoTriangles(Double weight) { 22 | Graph graph = new 
SimpleWeightedGraph(DefaultWeightedEdge.class); 23 | 24 | // triangle 1 25 | addTriangle(graph, 0, weight); 26 | addTriangle(graph, 3, weight); 27 | 28 | // bridge 29 | Graphs.addEdgeWithVertices(graph, 0, 3, weight); 30 | 31 | return graph; 32 | } 33 | 34 | // public static void addTriangle(Graph graph, Integer startId) { 35 | // // triangle 2 36 | // Graphs.addEdgeWithVertices(graph, startId, startId + 1); 37 | // Graphs.addEdgeWithVertices(graph, startId + 1, startId + 2); 38 | // Graphs.addEdgeWithVertices(graph, startId + 2, startId); 39 | // } 40 | 41 | public static void addTriangle(Graph graph, Integer startId, Double weight) { 42 | // triangle 2 43 | Graphs.addEdgeWithVertices(graph, startId, startId + 1, weight); 44 | Graphs.addEdgeWithVertices(graph, startId + 1, startId + 2, weight); 45 | Graphs.addEdgeWithVertices(graph, startId + 2, startId, weight); 46 | } 47 | 48 | @Test 49 | public void TestStructuralConductance1() { 50 | Graph graph = getTwoTriangles(1.0); 51 | 52 | UnweightedCommunity community = new UnweightedCommunity(graph); 53 | community.add(0); 54 | 55 | Assert.assertEquals("External edges equality", 3, community.getExternalEdges()); 56 | Assert.assertEquals("Conductance", 1.0, community.getConductance()); 57 | } 58 | 59 | @Test 60 | public void TestStructuralConductance2() { 61 | Graph graph = getTwoTriangles(1.0); 62 | 63 | UnweightedCommunity community = new UnweightedCommunity(graph); 64 | community.add(0); 65 | community.add(1); 66 | 67 | Assert.assertEquals("External edges equality", 3, community.getExternalEdges()); 68 | Assert.assertEquals("Conductance", 3.0/5, community.getConductance()); 69 | } 70 | 71 | @Test 72 | public void TestStructuralConductance3() { 73 | Graph graph = getTwoTriangles(1.0); 74 | 75 | UnweightedCommunity community = new UnweightedCommunity(graph); 76 | community.add(0); 77 | community.add(1); 78 | community.add(2); 79 | 80 | Assert.assertEquals("External edges equality", 1, community.getExternalEdges()); 81 
| Assert.assertEquals("Conductance", 1.0/7, community.getConductance()); 82 | } 83 | 84 | @Test 85 | public void TestStructuralConductance4() { 86 | // test when community is larger than Vol(G)/2 87 | Graph graph = getTwoTriangles(1.0); 88 | 89 | UnweightedCommunity community = new UnweightedCommunity(graph); 90 | community.add(0); 91 | community.add(1); 92 | community.add(2); 93 | community.add(3); 94 | 95 | Assert.assertEquals("External edges equality", 2, community.getExternalEdges()); 96 | // note that the conductance should not be 2/10!! 97 | Assert.assertEquals("Conductance", 2.0/4, community.getConductance()); 98 | } 99 | 100 | @Test 101 | public void TestStructuralConductance5() { 102 | Graph graph = getTwoTriangles(1.0); 103 | 104 | UnweightedCommunity community = new UnweightedCommunity(graph); 105 | community.add(0); 106 | community.add(1); 107 | community.add(2); 108 | community.remove(2); 109 | 110 | Assert.assertEquals("External edges equality", 3, community.getExternalEdges()); 111 | Assert.assertEquals("Conductance", 3.0/5, community.getConductance()); 112 | } 113 | 114 | @Test 115 | public void TestStructuralConductance6() { 116 | Graph graph = getTwoTriangles(1.0); 117 | 118 | UnweightedCommunity community = new UnweightedCommunity(graph); 119 | community.add(0); 120 | community.add(1); 121 | community.add(2); 122 | community.remove(2); 123 | community.remove(1); 124 | 125 | Assert.assertEquals("External edges equality", 3, community.getExternalEdges()); 126 | Assert.assertEquals("Conductance", 1.0, community.getConductance()); 127 | } 128 | 129 | @Test 130 | public void TestStructuralConductanceDelta1() { 131 | // test when community is larger than Vol(G)/2 132 | Graph graph = getTwoTriangles(1.0); 133 | 134 | UnweightedCommunity community = new UnweightedCommunity(graph); 135 | community.add(0); 136 | community.add(1); 137 | 138 | Double delta = community.getDeltaConductance(2, true); 139 | 140 | Assert.assertEquals("Conductance change by adding 
2", (1.0/7) - (3.0/5), delta); 141 | } 142 | 143 | @Test 144 | public void TestStructuralConductanceDelta2() { 145 | // test when community is larger than Vol(G)/2 146 | Graph graph = getTwoTriangles(1.0); 147 | 148 | UnweightedCommunity community = new UnweightedCommunity(graph); 149 | community.add(0); 150 | community.add(1); 151 | community.add(2); 152 | 153 | Double delta = community.getDeltaConductance(2, false); 154 | 155 | // removing 2 would cause this to happen 156 | Assert.assertEquals("Conductance change by removing 2", (3.0/5)-(1.0/7), delta); 157 | 158 | // no actual change should be done 159 | Assert.assertEquals("External edges equality", 1, community.getExternalEdges()); 160 | Assert.assertEquals("Conductance", 1.0/7, community.getConductance()); 161 | } 162 | 163 | @Test 164 | public void TestWeightedConductance1() { 165 | Graph graph = getTwoTriangles(0.3); 166 | 167 | WeightedCommunity community = new WeightedCommunity(graph); 168 | community.add(0); 169 | 170 | Assert.assertEquals("External edges weight", 0.9, community.getExternalEdges(), 0.01); 171 | Assert.assertEquals("Conductance", 1.0, community.getConductance()); 172 | } 173 | 174 | @Test 175 | public void TestWeightedConductance2() { 176 | Graph graph = getTwoTriangles(0.3); 177 | 178 | WeightedCommunity community = new WeightedCommunity(graph); 179 | community.add(0); 180 | community.add(1); 181 | 182 | Assert.assertEquals("External edges weight", 0.9, community.getExternalEdges(), 0.01); 183 | Assert.assertEquals("Conductance", 0.6, community.getConductance(), 0.01); 184 | } 185 | 186 | @Test 187 | public void TestWeightedConductance3() { 188 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 189 | 190 | addTriangle(graph, 0, 0.5); 191 | addTriangle(graph, 3, 0.1); 192 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.1); 193 | 194 | 195 | WeightedCommunity community = new WeightedCommunity(graph); 196 | community.add(0); 197 | community.add(1); 198 | 199 | // total volume 
in this graph is 3.8 200 | // community has vol() 2.1, so denominator should be 1.7 201 | Assert.assertEquals("External edges weight", 1.1, community.getExternalEdges(), 0.01); 202 | Assert.assertEquals("Conductance", 1.1/1.7, community.getConductance(), 0.01); 203 | } 204 | 205 | @Test 206 | public void TestWeightedConductance4() { 207 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 208 | 209 | addTriangle(graph, 0, 0.5); 210 | addTriangle(graph, 3, 0.1); 211 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.1); 212 | 213 | 214 | WeightedCommunity community = new WeightedCommunity(graph); 215 | community.add(0); 216 | community.add(1); 217 | community.add(2); 218 | 219 | Assert.assertEquals("External edges weight", 0.10, community.getExternalEdges(), 0.01); 220 | Assert.assertEquals("Conductance", 1.0/7, community.getConductance(), 0.01); 221 | } 222 | 223 | @Test 224 | public void TestWeightedConductance5() { 225 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 226 | 227 | addTriangle(graph, 0, 0.5); 228 | addTriangle(graph, 3, 0.1); 229 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.1); 230 | 231 | 232 | WeightedCommunity community = new WeightedCommunity(graph); 233 | community.add(0); 234 | community.add(1); 235 | community.add(2); 236 | 237 | Assert.assertEquals("External edges weight", 0.10, community.getExternalEdges(), 0.01); 238 | Assert.assertEquals("Conductance", 1.0/7, community.getConductance(), 0.01); 239 | 240 | community.remove(2); 241 | 242 | // total volume in this graph is 3.8 243 | // community has vol() 2.1, so denominator should be 1.7 244 | Assert.assertEquals("External edges weight", 1.1, community.getExternalEdges(), 0.01); 245 | Assert.assertEquals("Conductance", 1.1/1.7, community.getConductance(), 0.01); 246 | } 247 | 248 | @Test 249 | public void TestDeltaWeightedConductance1() { 250 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 251 | 252 | addTriangle(graph, 0, 0.5); 253 | 
addTriangle(graph, 3, 0.5); 254 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.5); 255 | 256 | 257 | WeightedCommunity community = new WeightedCommunity(graph); 258 | community.add(0); 259 | community.add(1); 260 | 261 | Double delta = community.getDeltaConductance(2, true); 262 | Assert.assertEquals("Conductance change by adding 2", (1.0/7) - (3.0/5), delta); 263 | } 264 | 265 | @Test 266 | public void TestDeltaWeightedConductance2() { 267 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 268 | 269 | addTriangle(graph, 0, 0.5); 270 | addTriangle(graph, 3, 0.1); 271 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.1); 272 | 273 | 274 | WeightedCommunity community = new WeightedCommunity(graph); 275 | community.add(0); 276 | community.add(1); 277 | community.add(2); 278 | 279 | Double delta = community.getDeltaConductance(2, false); 280 | 281 | Assert.assertEquals("Conductance change by removing 2", (1.1/1.7)-(1.0/7), delta, 0.01); 282 | Assert.assertEquals("External edges weight", 0.10, community.getExternalEdges(), 0.01); 283 | Assert.assertEquals("Conductance", 1.0/7, community.getConductance(), 0.01); 284 | } 285 | 286 | @Test 287 | public void TestDeltaWeightedConductance3() { 288 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 289 | 290 | addTriangle(graph, 0, 0.5); 291 | addTriangle(graph, 3, 0.0); 292 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.0); 293 | 294 | 295 | WeightedCommunity community = new WeightedCommunity(graph); 296 | community.add(0); 297 | community.add(1); 298 | community.add(2); 299 | 300 | Double delta = community.getDeltaConductance(3, true); 301 | 302 | Assert.assertEquals("Conductance change by adding 3", 0.0, delta, 0.01); 303 | Assert.assertEquals("External edges weight", 0.0, community.getExternalEdges(), 0.01); 304 | Assert.assertEquals("Conductance", 0.0, community.getConductance(), 0.01); 305 | } 306 | 307 | @Test 308 | public void TestWeightedMeansCommunity1() { 309 | Graph graph = new 
SimpleWeightedGraph(DefaultWeightedEdge.class); 310 | 311 | addTriangle(graph, 0, 1.0); 312 | addTriangle(graph, 3, 0.0); 313 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.0); 314 | 315 | WeightedMeansCommunity c = new WeightedMeansCommunity(graph, 0.1); 316 | 317 | c.add(0); 318 | Assert.assertEquals("p values", NormalDistQuick.cdf(0, c.CLUSTER_VARIANCE, 0.5), c.getPValue(0.5), 0.01); 319 | 320 | c.add(1); 321 | Assert.assertEquals("p values", NormalDistQuick.cdf(1.0, c.CLUSTER_VARIANCE, 0.5), c.getPValue(0.5), 0.01); 322 | 323 | c.add(2); 324 | Assert.assertEquals("p values", NormalDistQuick.cdf(1.0, c.CLUSTER_VARIANCE, 0.5), c.getPValue(0.5), 0.01); 325 | 326 | c.remove(2); 327 | Assert.assertEquals("p values", NormalDistQuick.cdf(1.0, c.CLUSTER_VARIANCE, 0.5), c.getPValue(0.5), 0.01); 328 | 329 | c.add(2); 330 | Assert.assertEquals("p values", NormalDistQuick.cdf(1.0, c.CLUSTER_VARIANCE, 0.5), c.getPValue(0.5), 0.01); 331 | 332 | // would this reject a 0 edge? 333 | Assert.assertTrue("p values", c.getPValue(0.0) < 0.05); 334 | 335 | // would this reject a 0.25 edge? 
336 | Assert.assertTrue("p values", c.getPValue(0.25) < 0.05); 337 | } 338 | 339 | @Test 340 | public void TestDeltaWeightedMeansConductance1() { 341 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 342 | 343 | addTriangle(graph, 0, 0.5); 344 | addTriangle(graph, 3, 0.5); 345 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.5); 346 | 347 | 348 | WeightedMeansCommunity community = new WeightedMeansCommunity(graph, 0.1); 349 | community.add(0); 350 | community.add(1); 351 | 352 | Double delta = community.getDeltaConductance(2, true); 353 | Assert.assertEquals("Conductance change by adding 2", (1.0/7) - (3.0/5), delta); 354 | } 355 | 356 | @Test 357 | public void TestDeltaWeightedMeansConductance2() { 358 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 359 | 360 | addTriangle(graph, 0, 0.5); 361 | addTriangle(graph, 3, 0.0); 362 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.0); 363 | 364 | 365 | WeightedMeansCommunity community = new WeightedMeansCommunity(graph, 0.1); 366 | community.add(0); 367 | community.add(1); 368 | community.add(2); 369 | 370 | Double delta = community.getDeltaConductance(3, true); 371 | 372 | Assert.assertEquals("Conductance change by adding 3", 0.0, delta, 0.01); 373 | Assert.assertEquals("External edges weight", 0.0, community.getExternalEdges(), 0.01); 374 | Assert.assertEquals("Conductance", 0.0, community.getConductance(), 0.01); 375 | } 376 | 377 | @Test 378 | public void TestDeltaWeightedMeansConductance3() { 379 | Graph graph = new SimpleWeightedGraph(DefaultWeightedEdge.class); 380 | 381 | addTriangle(graph, 0, 0.5); 382 | addTriangle(graph, 3, 0.0); 383 | Graphs.addEdgeWithVertices(graph, 0, 3, 0.4); 384 | 385 | 386 | WeightedMeansCommunity community = new WeightedMeansCommunity(graph, 0.1); 387 | community.add(0); 388 | community.add(1); 389 | community.add(2); 390 | 391 | Assert.assertEquals("p values", NormalDistQuick.cdf(0.5, community.CLUSTER_VARIANCE, 0.4), community.getPValue(0.4), 0.01); 392 | 
// File: java_src/src/test/java/edu/stonybrook/focused/tests/GraphIOTests.java
package edu.stonybrook.focused.tests;

import edu.stonybrook.focused.io.ascii.WeightedEdgeList;
import junit.framework.Assert;
import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultWeightedEdge;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

/**
 * Smoke test for the ASCII weighted edge-list reader.
 *
 * Author: Bryan Perozzi
 */
public class GraphIOTests {

    /** Two "source target weight" lines must yield a graph with exactly two edges. */
    @Test
    public void TestEdgelistReader() throws IOException {
        final String twoEdges = "1 2 0.5\n1 3 0.1\n";

        Graph<?, ?> parsed = WeightedEdgeList.read(new StringReader(twoEdges));

        Assert.assertEquals(2, parsed.edgeSet().size());
    }
}
% File: matlab_src/PGDM/D_constraint.m
function [fD, fD_1st_d, fD_2nd_d] = D_constraint(X, D, a, N, d)
% D_CONSTRAINT  Value, gradient and Hessian of the dissimilarity constraint
%   gF(sum_ij distance(d_ij, a)), where the metric is diagonal and given as
%   the column vector 'a'.
%
%   X  data matrix, one row per point
%   D  dissimilarity indicator matrix; only entries equal to 1 are used
%   a  d-by-1 vector of diagonal metric weights
%   N  unused; kept so existing callers do not need to change
%   d  number of columns of X
%
%   Returns the value fD (scalar), the first derivative fD_1st_d (1-by-d)
%   and the second derivative fD_2nd_d (d-by-d) of the cover function gF2.
%
%   Every nonzero entry of D is visited, so a D that stores both (i,j) and
%   (j,i) contributes each pair twice.

sum_dist = 0;
sum_deri1 = zeros(1,d);
sum_deri2 = zeros(d,d);

% Iterate over the nonzeros only. find() already returns the stored values,
% so test s_arr directly instead of indexing back into D on every iteration.
[i_arr, j_arr, s_arr] = find(D);
for cnt = 1:numel(i_arr)
    if s_arr(cnt) == 1
        d_ij = X(i_arr(cnt),:) - X(j_arr(cnt),:);   % difference between 'i' and 'j'
        [dist_ij, deri1_d_ij, deri2_d_ij] = distance1(a, d_ij);
        sum_dist = sum_dist + dist_ij;
        sum_deri1 = sum_deri1 + deri1_d_ij;
        sum_deri2 = sum_deri2 + deri2_d_ij;
    end
end

[fD, fD_1st_d, fD_2nd_d] = gF2(sum_dist, sum_deri1, sum_deri2);


% __________cover function 1 (unused alternative)_________
function [fD, fD_1st_d, fD_2nd_d] = gF1(sum_dist, sum_deri1, sum_deri2)
% gF1(y) = y
fD = sum_dist;
fD_1st_d = sum_deri1;
fD_2nd_d = sum_deri2;


% __________cover function 2 (the one in use)_________
function [fD, fD_1st_d, fD_2nd_d] = gF2(sum_dist, sum_deri1, sum_deri2)
% gF2(y) = log(y); chain rule applied to the accumulated derivatives
fD = log(sum_dist);
fD_1st_d = sum_deri1/sum_dist;
fD_2nd_d = sum_deri2/sum_dist - sum_deri1'*sum_deri1/(sum_dist^2);


function [dist_ij, deri1_d_ij, deri2_d_ij] = distance1(a, d_ij)
% distance1: L1 = sqrt(sum_k a_k * d_ij(k)^2), with derivatives w.r.t. a.
% 'fudge' replaces a zero denominator when the distance is exactly 0.
fudge = 0.000001;

dist_ij = sqrt((d_ij.^2)*a);
deri1_d_ij = 0.5*(d_ij.^2)/(dist_ij + (dist_ij==0)*fudge);
deri2_d_ij = -0.25*(d_ij.^2)'*(d_ij.^2)/(dist_ij^3 + (dist_ij==0)*fudge);


function [dist_ij, deri1_d_ij, deri2_d_ij] = distance2(a, d_ij)
% distance2 (unused alternative): sqrt(L1) = (sum_k a_k * d_ij(k)^2)^(1/4)
fudge = 0.000001;

dist_ij = ((d_ij.^2)*a)^(1/4);
deri1_d_ij = 0.25*(d_ij.^2)/(dist_ij^3 + (dist_ij==0)*fudge);
deri2_d_ij = -0.25*0.75*(d_ij.^2)'*(d_ij.^2)/(dist_ij^7+(dist_ij==0)*fudge);


function [dist_ij, deri1_d_ij, deri2_d_ij] = distance3(a, d_ij)
% distance3 (unused alternative): 1-exp(-beta*L1), with beta = 0.5
fudge = 0.000001;

beta = 0.5;
M2_ij = (d_ij.^2)'*(d_ij.^2);
L1 = sqrt((d_ij.^2)*a);
dist_ij = 1 - exp(-beta*L1);
deri1_d_ij = 0.5*beta*exp(-beta*L1)*(d_ij.^2)/(L1+(L1==0)*fudge);
deri2_d_ij = -0.25*beta^2*exp(-beta*L1)*M2_ij/(L1^2+(L1==0)*fudge) - ...
    0.25*beta*exp(-beta*L1)*M2_ij/(L1^3+(L1==0)*fudge);
% File: matlab_src/PGDM/D_constraint_sparse.m
function [fD, fD_1st_d, fD_2nd_d] = D_constraint_sparse(X, D, a, N, d)
% D_CONSTRAINT_SPARSE  Value, gradient and Hessian of the dissimilarity
%   constraint gF(sum_ij distance(d_ij, a)) for a diagonal metric given as
%   the column vector 'a'.
%
%   Differs from D_constraint only in how d_ij is formed: 1 is added in every
%   dimension where BOTH rows are zero (element-wise NOR of the two rows).
%
%   X  data matrix, one row per point
%   D  dissimilarity indicator matrix; only entries equal to 1 are used
%   a  d-by-1 vector of diagonal metric weights
%   N  unused; kept so existing callers do not need to change
%   d  number of columns of X
%
%   Every nonzero entry of D is visited, so a D that stores both (i,j) and
%   (j,i) contributes each pair twice.

sum_dist = 0;
sum_deri1 = zeros(1,d);
sum_deri2 = sparse(d,d);

% Iterate over the nonzeros only. find() already returns the stored values,
% so test s_arr directly instead of indexing back into D on every iteration.
[i_arr, j_arr, s_arr] = find(D);
for cnt = 1:numel(i_arr)
    if s_arr(cnt) == 1
        x_i = X(i_arr(cnt),:);
        x_j = X(j_arr(cnt),:);
        d_ij = full(x_i - x_j + ~(x_i | x_j));   % difference between 'i' and 'j' + NOR(X_i,X_j)
        [dist_ij, deri1_d_ij, deri2_d_ij] = distance1(a, d_ij);
        sum_dist = sum_dist + dist_ij;
        sum_deri1 = sum_deri1 + deri1_d_ij;
        sum_deri2 = sum_deri2 + deri2_d_ij;
    end
end

[fD, fD_1st_d, fD_2nd_d] = gF2(sum_dist, sum_deri1, sum_deri2);


% __________cover function 1 (unused alternative)_________
function [fD, fD_1st_d, fD_2nd_d] = gF1(sum_dist, sum_deri1, sum_deri2)
% gF1(y) = y
fD = sum_dist;
fD_1st_d = sum_deri1;
fD_2nd_d = sum_deri2;


% __________cover function 2 (the one in use)_________
function [fD, fD_1st_d, fD_2nd_d] = gF2(sum_dist, sum_deri1, sum_deri2)
% gF2(y) = log(y); chain rule applied to the accumulated derivatives
fD = log(sum_dist);
fD_1st_d = sum_deri1/sum_dist;
fD_2nd_d = sum_deri2/sum_dist - sum_deri1'*sum_deri1/(sum_dist^2);


function [dist_ij, deri1_d_ij, deri2_d_ij] = distance1(a, d_ij)
% distance1: L1 = sqrt(sum_k a_k * d_ij(k)^2), with derivatives w.r.t. a.
% 'fudge' replaces a zero denominator when the distance is exactly 0.
fudge = 0.000001;

dist_ij = sqrt((d_ij.^2)*a);
deri1_d_ij = 0.5*(d_ij.^2)/(dist_ij + (dist_ij==0)*fudge);
deri2_d_ij = -0.25*(d_ij.^2)'*(d_ij.^2)/(dist_ij^3 + (dist_ij==0)*fudge);


function [dist_ij, deri1_d_ij, deri2_d_ij] = distance2(a, d_ij)
% distance2 (unused alternative): sqrt(L1) = (sum_k a_k * d_ij(k)^2)^(1/4)
fudge = 0.000001;

dist_ij = ((d_ij.^2)*a)^(1/4);
deri1_d_ij = 0.25*(d_ij.^2)/(dist_ij^3 + (dist_ij==0)*fudge);
deri2_d_ij = -0.25*0.75*(d_ij.^2)'*(d_ij.^2)/(dist_ij^7+(dist_ij==0)*fudge);


function [dist_ij, deri1_d_ij, deri2_d_ij] = distance3(a, d_ij)
% distance3 (unused alternative): 1-exp(-beta*L1), with beta = 0.5
fudge = 0.000001;

beta = 0.5;
M2_ij = (d_ij.^2)'*(d_ij.^2);
L1 = sqrt((d_ij.^2)*a);
dist_ij = 1 - exp(-beta*L1);
deri1_d_ij = 0.5*beta*exp(-beta*L1)*(d_ij.^2)/(L1+(L1==0)*fudge);
deri2_d_ij = -0.25*beta^2*exp(-beta*L1)*M2_ij/(L1^2+(L1==0)*fudge) - ...
    0.25*beta*exp(-beta*L1)*M2_ij/(L1^3+(L1==0)*fudge);
% File: matlab_src/PGDM/D_objective.m
function fD = D_objective(X, D, a, N, d)
% D_OBJECTIVE  Value of the dissimilarity term gF(sum_ij distance(d_ij, a))
%   for a diagonal metric given as the column vector 'a'.
%
%   X  data matrix, one row per point
%   D  dissimilarity indicator matrix; only entries equal to 1 are used
%   a  column vector of diagonal metric weights (one per column of X)
%   N, d  unused; kept so existing callers do not need to change
%
%   Every nonzero entry of D is visited, so a D that stores both (i,j) and
%   (j,i) contributes each pair twice.

sum_dist = 0;

% Iterate over the nonzeros only. find() already returns the stored values,
% so test s_arr directly instead of indexing back into D on every iteration.
[i_arr, j_arr, s_arr] = find(D);
for cnt = 1:numel(i_arr)
    if s_arr(cnt) == 1
        d_ij = X(i_arr(cnt),:) - X(j_arr(cnt),:);   % difference between 'i' and 'j'
        sum_dist = sum_dist + distance1(a, d_ij);
    end
end

fD = gF2(sum_dist);


% __________cover function 1 (unused alternative)_________
function fD = gF1(sum_dist)
% gF1(y) = y
fD = sum_dist;


% __________cover function 2 (the one in use)_________
function fD = gF2(sum_dist)
% gF2(y) = log(y)
fD = log(sum_dist);


function dist_ij = distance1(a, d_ij)
% distance1: L1 = sqrt(sum_k a_k * d_ij(k)^2)
dist_ij = sqrt((d_ij.^2)*a);


function dist_ij = distance2(a, d_ij)
% distance2 (unused alternative): sqrt(L1)
dist_ij = ((d_ij.^2)*a)^(1/4);


function dist_ij = distance3(a, d_ij)
% distance3 (unused alternative): 1-exp(-beta*L1), with beta = 0.5
beta = 0.5;
L1 = sqrt((d_ij.^2)*a);
dist_ij = 1 - exp(-beta*L1);
% File: matlab_src/PGDM/D_objective_sparse.m
function fD = D_objective_sparse(X, D, a, N, d)
% D_OBJECTIVE_SPARSE  Value of the dissimilarity term
%   gF(sum_ij distance(d_ij, a)) for a diagonal metric given as the column
%   vector 'a'.
%
%   Differs from D_objective only in how d_ij is formed: 1 is added in every
%   dimension where BOTH rows are zero (element-wise NOR of the two rows).
%
%   X  data matrix, one row per point
%   D  dissimilarity indicator matrix; only entries equal to 1 are used
%   a  column vector of diagonal metric weights (one per column of X)
%   N, d  unused; kept so existing callers do not need to change
%
%   Every nonzero entry of D is visited, so a D that stores both (i,j) and
%   (j,i) contributes each pair twice.

sum_dist = 0;

% Iterate over the nonzeros only. find() already returns the stored values,
% so test s_arr directly instead of indexing back into D on every iteration.
[i_arr, j_arr, s_arr] = find(D);
for cnt = 1:numel(i_arr)
    if s_arr(cnt) == 1
        x_i = X(i_arr(cnt),:);
        x_j = X(j_arr(cnt),:);
        d_ij = x_i - x_j + ~(x_i | x_j);   % difference between 'i' and 'j' + NOR(i,j)
        sum_dist = sum_dist + distance1(a, d_ij);
    end
end

fD = gF2(sum_dist);


% __________cover function 1 (unused alternative)_________
function fD = gF1(sum_dist)
% gF1(y) = y
fD = sum_dist;


% __________cover function 2 (the one in use)_________
function fD = gF2(sum_dist)
% gF2(y) = log(y)
fD = log(sum_dist);


function dist_ij = distance1(a, d_ij)
% distance1: L1 = sqrt(sum_k a_k * d_ij(k)^2)
dist_ij = sqrt((d_ij.^2)*a);


function dist_ij = distance2(a, d_ij)
% distance2 (unused alternative): sqrt(L1)
dist_ij = ((d_ij.^2)*a)^(1/4);


function dist_ij = distance3(a, d_ij)
% distance3 (unused alternative): 1-exp(-beta*L1), with beta = 0.5
beta = 0.5;
L1 = sqrt((d_ij.^2)*a);
dist_ij = 1 - exp(-beta*L1);
% File: matlab_src/PGDM/Newton.m
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% solving constraint optimization problem using Newton-Raphson method
%
% Eric Xing
% UC Berkeley
% Jan 15, 2002
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function A = Newton(data, S, D, C)
% NEWTON  Learn a diagonal metric A = diag(a) by Newton-Raphson steps with a
%   step-halving line search.
%
%   data  N-by-d data matrix, one row per point
%   S     similarity indicator matrix; only entries equal to 1 are used
%   D     dissimilarity indicator matrix (passed to D_constraint/D_objective)
%   C     weight of the dissimilarity term in the objective
%
%   Returns the d-by-d diagonal matrix A. The weights are clipped at 0 after
%   every step. At most 49 outer iterations are run.
%
%   Console output (obj, tt and the final a) is deliberately left unsuppressed
%   so existing logs keep their content.

N = size(data,1);
d = size(data,2);

a = ones(d,1);
X = data;

fudge = 0.000001;      % ridge added to the Hessian diagonal
threshold1 = 0.001;    % stop when the relative objective change drops below this
reduction = 2;         % line-search step divisor

% suppose d is a column vector
% sum(d'Ad) = sum(trace(d'Ad)) = sum(trace(dd'A))
%           = trace(sum(dd'A)) = trace(sum(dd')A)
%
% s_sum(k) accumulates the squared differences in dimension k over the
% similar pairs. The matching sum over D was computed here before but never
% read, so it is no longer accumulated.
s_sum = zeros(1,d);
[i_arr, j_arr, s_arr] = find(S);
for cnt = 1:numel(i_arr)
    if s_arr(cnt) == 1
        d_ij = X(i_arr(cnt),:) - X(j_arr(cnt),:);
        s_sum = s_sum + d_ij.^2;
    end
end

tt = 1;
rel_change = 1;   % named 'error' before, which shadowed the builtin error()
% BP added outer loop constraint, it got stuck in an infinite loop once?
while rel_change > threshold1 && tt < 50

    [fD0, fD_1st_d, fD_2nd_d] = D_constraint(X, D, a, N, d);
    obj_initial = s_sum*a + C*fD0;
    fS_1st_d = s_sum;                        % first derivative of the S constraints

    % NOTE(review): the gradient and Hessian use -C*fD while the objective
    % values use +C*fD; confirm the intended sign before changing either.
    Gradient = fS_1st_d - C*fD_1st_d;        % gradient of the objective
    Hessian = - C*fD_2nd_d + fudge*eye(d);   % Hessian of the objective

    % BP - Matlab suggests using A\b instead of inv(A)*b
    step = Hessian\Gradient';

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % Newton-Raphson update
    % search over optimal lambda

    lambda = 1;                              % initial step-size
    atemp = max(a - lambda*step, 0);

    obj = s_sum*atemp + C*D_objective(X, D, atemp, N, d)

    % Start the line search unconditionally. The previous bootstrap value
    % obj*1.1 is not larger than obj when obj <= 0, so the search was
    % silently skipped in that case.
    obj_previous = Inf;

    % BP without initialization this can bug out
    a_previous = atemp;

    while obj < obj_previous
        obj_previous = obj;
        a_previous = atemp;
        lambda = lambda/reduction;
        atemp = max(a - lambda*step, 0);
        obj = s_sum*atemp + C*D_objective(X, D, atemp, N, d);
    end   % line search for lambda that minimize obj

    a = a_previous;

    rel_change = abs((obj_previous - obj_initial)/obj_previous);
    tt = tt + 1   % outer counter

end
a
A = diag(a);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% modified to treat 0's in feature vectors as absent values.

function A = Newton_sparse(data, S, D, C)
% NEWTON_SPARSE  Diagonal metric learning where 0 means "feature absent".
%
%   data : N x d matrix, one example per row
%   S    : N x N pairwise matrix; entries equal to 1 mark similar pairs
%   D    : pairwise dissimilarity matrix, forwarded unchanged to
%          D_constraint_sparse / D_objective_sparse (defined elsewhere)
%   C    : weight of the dissimilarity term in the objective
%   A    : d x d diagonal matrix holding the learned weights
%
% Differs from Newton only in the pair difference: a dimension that is 0
% in BOTH examples contributes a difference of 1 instead of 0.

N = size(data, 1);
d = size(data, 2);
X = data;
a = ones(d, 1);

fudge = 0.000001;      % ridge added to the Hessian so it can be solved
threshold1 = 0.001;    % relative-change stopping threshold
reduction = 2;         % line-search step divisor
max_outer = 50;        % BP: outer loop cap, it got stuck in an infinite loop once

% suppose d is a column vector
% sum(d'Ad) = sum(trace(d'Ad)) = sum(trace(dd'A))
%           = trace(sum(dd'A)) = trace(sum(dd')A)
s_sum = zeros(1, d);
[i_arr, j_arr] = find(S == 1);   % BP: only visit the stored similar pairs
for cnt = 1:numel(i_arr)
    xi = X(i_arr(cnt), :);
    xj = X(j_arr(cnt), :);
    d_ij = xi - xj + ~(xi | xj);   % +1 where both features are absent
    s_sum = s_sum + d_ij.^2;
end

tt = 1;
rel_change = 1;   % named rel_change so the builtin error() is not shadowed
while rel_change > threshold1 && tt < max_outer
    fprintf('Iteration: %d\n', tt);

    [fD0, fD_1st_d, fD_2nd_d] = D_constraint_sparse(X, D, a, N, d);
    obj_initial = s_sum*a + C*fD0;
    fS_1st_d = s_sum;                        % first derivative of the S constraints

    Gradient = fS_1st_d - C*fD_1st_d;        % gradient of the objective
    Hessian = - C*fD_2nd_d + fudge*eye(d);   % Hessian of the objective
    % BP - Matlab suggests using A\b instead of inv(A)*b
    step = Hessian\Gradient';

    % Newton-Raphson update: search over step size lambda
    lambda = 1;
    atemp = max(a - lambda*step, 0);
    obj = s_sum*atemp + C*D_objective_sparse(X, D, atemp, N, d);
    fprintf('Objective: %f\n', obj);

    % Seed with +inf so the search always starts.  The previous seed,
    % obj*1.1, is SMALLER than obj whenever obj is negative, which skipped
    % the search and fed a made-up value into the convergence test.
    obj_previous = inf;
    a_previous = atemp;   % BP: without initialization this can bug out

    while obj < obj_previous
        obj_previous = obj;
        a_previous = atemp;
        lambda = lambda/reduction;
        atemp = max(a - lambda*step, 0);
        obj = s_sum*atemp + C*D_objective_sparse(X, D, atemp, N, d);
    end   % line search for lambda that minimizes obj

    a = a_previous;

    rel_change = abs((obj_previous - obj_initial)/obj_previous);
    tt = tt + 1;   % outer counter
end

A = diag(a);

--------------------------------------------------------------------------------
/matlab_src/PGDM/Newton_sparse_top_k.m:
--------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% solving constraint optimization problem using Newton-Raphson method
%
% Eric Xing
% UC Berkeley
% Jan 15, 2002
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Bryan Perozzi - modified for sparsity, etc.  See BP comments.

function A = Newton_sparse_top_k(data, S, D, C, topk)
% NEWTON_SPARSE_TOP_K  Newton_sparse restricted to the top-k features.
%
%   data : N x d matrix, one example per row
%   S    : N x N pairwise matrix of similar pairs (nonzero entries select
%          the features, entries equal to 1 enter the objective)
%   D    : pairwise dissimilarity matrix, forwarded unchanged to
%          D_constraint_sparse / D_objective_sparse (defined elsewhere)
%   C    : weight of the dissimilarity term in the objective
%   topk : number of feature dimensions to keep; values larger than d are
%          clamped to d
%   A    : sparse d x d diagonal matrix; dimensions that were not selected
%          get weight 0
%
% Features are ranked by the summed feature values over all similar pairs
% and only the topk highest-ranked columns are optimised.

orig_d = size(data, 2);
X = data;

fudge = 0.000001;      % ridge added to the Hessian so it can be solved
threshold1 = 0.001;    % relative-change stopping threshold
reduction = 2;         % line-search step divisor
max_outer = 50;        % BP: outer loop cap, it got stuck in an infinite loop once

% BP: rank dimensions by how much mass the similar examples put on them
s_total_sum = zeros(1, orig_d);
[i_arr, j_arr] = find(S);
for cnt = 1:numel(i_arr)
    s_total_sum = s_total_sum + X(i_arr(cnt), :) + X(j_arr(cnt), :);
end

% BP only keep the top k dimensions that are important in s
topk = min(topk, orig_d);   % asking for more columns than exist used to error
[~, top_idx] = sort(s_total_sum(1, :), 'descend');
sorted_top_idx = sort(top_idx(1, 1:topk));
X = X(:, sorted_top_idx);

N = size(X, 1);
d = size(X, 2);
a = ones(d, 1);

% suppose d is a column vector
% sum(d'Ad) = sum(trace(d'Ad)) = sum(trace(dd'A))
%           = trace(sum(dd'A)) = trace(sum(dd')A)
s_sum = zeros(1, d);
[i_arr, j_arr] = find(S == 1);
for cnt = 1:numel(i_arr)
    xi = X(i_arr(cnt), :);
    xj = X(j_arr(cnt), :);
    d_ij = xi - xj + ~(xi | xj);   % +1 where both features are absent
    s_sum = s_sum + d_ij.^2;
end

tt = 1;
rel_change = 1;   % named rel_change so the builtin error() is not shadowed
while rel_change > threshold1 && tt < max_outer
    fprintf('Iteration: %d\n', tt)
    [fD0, fD_1st_d, fD_2nd_d] = D_constraint_sparse(X, D, a, N, d);
    obj_initial = s_sum*a + C*fD0;
    fS_1st_d = s_sum;                        % first derivative of the S constraints

    Gradient = fS_1st_d - C*fD_1st_d;        % gradient of the objective
    Hessian = - C*fD_2nd_d + fudge*eye(d);   % Hessian of the objective
    % BP - Replacing inverse (using A\b instead of inv(A)*b)
    step = Hessian\Gradient';

    % Newton-Raphson update: search over step size lambda
    lambda = 1;
    atemp = max(a - lambda*step, 0);
    obj = s_sum*atemp + C*D_objective_sparse(X, D, atemp, N, d);
    fprintf('Objective: %f\n', obj)

    % Seed with +inf so the search always starts.  The previous seed,
    % obj*1.1, is SMALLER than obj whenever obj is negative, which skipped
    % the search and fed a made-up value into the convergence test.
    obj_previous = inf;
    a_previous = atemp;   % BP: without initialization this can bug out

    while obj < obj_previous
        obj_previous = obj;
        a_previous = atemp;
        lambda = lambda/reduction;
        atemp = max(a - lambda*step, 0);
        obj = s_sum*atemp + C*D_objective_sparse(X, D, atemp, N, d);
    end   % line search for lambda that minimizes obj

    a = a_previous;

    rel_change = abs((obj_previous - obj_initial)/obj_previous);
    tt = tt + 1;   % outer counter
end

% reconstruct diagonal for original feature space
full_solution = zeros(1, orig_d);
full_solution(sorted_top_idx) = a;
A = spdiags(full_solution', 0, orig_d, orig_d);

--------------------------------------------------------------------------------
/matlab_src/PGDM/fD.m:
--------------------------------------------------------------------------------
function fd = fD(X, D, A, N, d) %#ok<INUSD>
% FD  Value of the dissimilarity constraint function.
%
%   f = g(\sum_{ij \in D} distance(x_i, x_j)) with
%   distance(x_i, x_j) = sqrt((x_i-x_j) A (x_i-x_j)') and g = log.
%
%   X : N x d data, one example per row
%   D : pairwise matrix; D(i,j) == 1 with i < j marks a dissimilar pair
%       (only the strict upper triangle is read)
%   A : d x d metric
%   N : number of examples;  d : dimensionality (unused, kept for callers)

fd = 0.000001;   % small offset keeps log() finite when no pair is found

% Visit only the marked pairs instead of testing all N^2 entries.
[i_arr, j_arr] = find(triu(D(1:N, 1:N), 1) == 1);
for k = 1:numel(i_arr)
    d_ij = X(i_arr(k), :) - X(j_arr(k), :);
    fd = fd + distance1(A, d_ij);   % constraint defined on dissimilar set
end

fd = gF2(fd);

% ___________sqrt(d*A*d') (labelled "L1 norm" in the original code)____
function kd = distance1(A, d_ij)
kd = (d_ij * A * d_ij')^(1/2);

% ___________sqrt of distance1 (unused alternative)___________
function kd = distance2(A, d_ij)
kd = (d_ij * A * d_ij')^(1/4);

% ___________1-exp(-beta*distance1) (unused alternative)_________
function kd = distance3(A, d_ij)
beta = 0.5;
kd = 1 - exp(-beta*(sqrt(d_ij * A * d_ij')));

% ___________cover function 1: identity (unused alternative)_________
function x = gF1(x1)
x = x1;

% ___________cover function 2: log_________
function x = gF2(x1)
x = log(x1);

--------------------------------------------------------------------------------
/matlab_src/PGDM/fD1.m:
--------------------------------------------------------------------------------
function fd_1st_d = fD1(X, D, A, N, d)
% FD1  Gradient of the dissimilarity constraint function w.r.t. A.
%
%   With f = g(\sum_{ij \in D} \sqrt{(x_i-x_j)A(x_i-x_j)'}) and g = log:
%
%   note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A)
%   so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij
%   df/dA = g'(\sum_{ij \in D} \sqrt{tr(d_ij'*d_ij*A)})
%           * 0.5*(\sum_{ij \in D} (1/sqrt{tr(d_ij'*d_ij*A)})*(d_ij'*d_ij))
%
%   Arguments are as for fD; returns a d x d matrix.

sum_dist = 0.000001;
sum_deri = zeros(d, d);

% Visit only the marked pairs (strict upper triangle: each pair once).
[i_arr, j_arr] = find(triu(D(1:N, 1:N), 1) == 1);
for k = 1:numel(i_arr)
    d_ij = X(i_arr(k), :) - X(j_arr(k), :);
    [dist_ij, deri_d_ij] = distance1(A, d_ij);
    sum_dist = sum_dist + dist_ij;
    sum_deri = sum_deri + deri_d_ij;
end

fd_1st_d = dgF2(sum_dist)*sum_deri;

% ___________derivative of cover function 1 (unused alternative)_________
function z = dgF1(y) %#ok<INUSD>
z = 1;

% ___________derivative of cover function 2 (log)_________
function z = dgF2(y)
z = 1/y;

function [dist_ij, deri_d_ij] = distance1(A, d_ij)
% distance sqrt(d*A*d') and its derivative w.r.t. A
fudge = 0.000001;   % regularizes derivatives a little

M_ij = d_ij'*d_ij;
dist_ij = sqrt(trace(M_ij*A));

% derivative of dist_ij w.r.t. A
deri_d_ij = 0.5*M_ij/(dist_ij+fudge);

function [dist_ij, deri_d_ij] = distance2(A, d_ij)
% square root of distance1 and its derivative (unused alternative)
fudge = 0.000001;   % regularizes derivatives a little

M_ij = d_ij'*d_ij;
L2 = trace(M_ij*A);
dist_ij = sqrt(sqrt(L2));

% derivative of dist_ij w.r.t. A
deri_d_ij = 0.25*M_ij/(L2^(3/4)+fudge);

function [dist_ij, deri_d_ij] = distance3(A, d_ij)
% 1-exp(-beta*distance1) and its derivative (unused alternative)
fudge = 0.000001;   % regularizes derivatives a little

beta = 0.5;
M_ij = d_ij'*d_ij;
L1 = sqrt(trace(M_ij*A));
dist_ij = 1 - exp(-beta*L1);

% derivative of dist_ij w.r.t. A
deri_d_ij = 0.5*beta*exp(-beta*L1)*M_ij/(L1+fudge);

--------------------------------------------------------------------------------
/matlab_src/PGDM/fS1.m:
--------------------------------------------------------------------------------
function fs_1st_d = fS1(X, S, A, N, d) %#ok<INUSL>
% FS1  Gradient of the similarity constraint function w.r.t. A.
%
%   f = \sum_{ij}(x_i-x_j)A(x_i-x_j)' = \sum_{ij}d_ij*A*d_ij'
%   note that d_ij*A*d_ij' = tr(d_ij*A*d_ij') = tr(d_ij'*d_ij*A)
%   so, d(d_ij*A*d_ij')/dA = d_ij'*d_ij
%
%   The gradient does not depend on A; the argument is kept so the
%   signature matches fD1.  Only S(i,j) == 1 with i < j is read.

fs_1st_d = zeros(d, d);

[i_arr, j_arr] = find(triu(S(1:N, 1:N), 1) == 1);
for k = 1:numel(i_arr)
    d_ij = X(i_arr(k), :) - X(j_arr(k), :);
    fs_1st_d = fs_1st_d + d_ij'*d_ij;
end

--------------------------------------------------------------------------------
/matlab_src/PGDM/grad_projection.m:
--------------------------------------------------------------------------------
function gradProj = grad_projection(grad1, grad2, d)
% GRAD_PROJECTION  Unit-norm component of grad1 perpendicular to grad2.
%
%   grad1, grad2 : d x d matrices
%   gradProj     : d x d matrix; all zeros when grad1 has no component
%                  perpendicular to grad2
%
% Zero-length vectors are left un-normalised instead of being divided by
% zero, which previously filled the result with NaN.

g1 = grad1(:);   % column-major unrolling, same layout as unroll()
g2 = grad2(:);

n2 = norm(g2, 2);
if n2 > 0
    g2 = g2/n2;
end

gtemp = g1 - (g2'*g1)*g2;
nt = norm(gtemp, 2);
if nt > 0
    gtemp = gtemp/nt;   % normalize
end

gradProj = reshape(gtemp, d, d);

--------------------------------------------------------------------------------
/matlab_src/PGDM/iter_projection_new2.m:
--------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% solving constraint optimization problem using iterative projection
%
% Eric Xing
% UC Berkeley
% Jan 15, 2002
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [A, converged] = ...
    iter_projection_new2(X, S, D, A, w, t, maxiter)

% ---------------------------------------------------------------------------
% Input
% X: data
% S: similarity constraints (in the form of a pairwise-similarity matrix)
% D: disimilarity constraints (in the form of a pairwise-disimilarity matrix)
% A: initial distance metric matrix
% w: a weight vector originated from similar data (see paper)
% t: upper bound of constraint C1 (the sum of pairwise distance bound)
% maxiter: maximum iterations of the inner C1/C2 projection loop
%
% Output
% A: the solution of distance metric matrix
% converged: indicator of convergence (1 when the last relative update
%            was not larger than the threshold, 0 otherwise)
% ---------------------------------------------------------------------------

N = size(X, 1);       % number of examples
d = size(X, 2);       % dimensionality of examples
threshold2 = 0.01;    % error-bound of main A-update iteration
epsilon = 0.01;       % error-bound of iterative projection on C1 and C2
maxcount = 100;       % cap on A-update cycles

w1 = w/norm(w);       % make 'w' a unit vector
t1 = t/norm(w);       % distance from origin to w^T*x=t plane

count = 1;
alpha = 0.1;          % initial step size along gradient

% Ascent direction: gradient of fD orthogonal to the gradient of fS.
% (The arguments used to be passed the other way round here, unlike the
% in-loop update below and unlike the comment describing M.)
grad1 = fD1(X, D, A, N, d);   % gradient of dissimilarity constraint func.
grad2 = fS1(X, S, A, N, d);   % gradient of similarity constraint function
M = grad_projection(grad1, grad2, d);

A_last = A;           % initial A
done = 0;

while (~done)

    % projection of constraints C1 and C2 _____________________________
    projection_iters = 0;
    satisfy = 0;

    while projection_iters < maxiter && ~satisfy

        % _____________________________________________________________
        % first constraint:
        % f(A) = \sum_{i,j \in S} d_ij' A d_ij <= t              (1)
        % (1) can be rewritten as a linear constraint: w^T x = t,
        % where x is the unrolled matrix of A,
        % w is also an unrolled matrix of W where
        % W_{kl}= \sum_{i,j \in S}d_ij^k * d_ij^l
        x0 = unroll(A);
        if w' * x0 > t
            x = x0 + (t1 - w1'*x0)*w1;   % project onto the plane w^T x = t
            A = packcolume(x, d, d);
        end

        % _____________________________________________________________
        % second constraint:
        % PSD constraint A>=0: clip negative eigenvalues
        A = (A + A')/2;        % enforce A to be symmetric
        [V, L] = eig(A);       % V: eigenvectors, L: diagonal eigenvalues
        L = max(L, 0);
        A = V*L*V';

        fDC2 = w'*unroll(A);

        % loop until the C1 bound still holds after the PSD projection
        error2 = (fDC2 - t)/t;
        projection_iters = projection_iters + 1;
        satisfy = ~(error2 > epsilon);

    end   % end projection on C1 and C2

    % _________________________________________________________________
    % third constraint: Gradient ascent
    % max: g(A)>=1
    % here we suppose g(A) = fD(A) = \sum_{I,J \in D} sqrt(d_ij' A d_ij)

    obj_previous = fD(X, D, A_last, N, d);   % g(A_old)
    obj = fD(X, D, A, N, d);                 % g(A): current A

    if (obj > obj_previous || count == 1) && (satisfy == 1)

        % if projection of 1 and 2 is successful,
        % and such projection improves objective function,
        % slightly increase learning rate, and update from the current A
        alpha = alpha * 1.05;
        A_last = A;
        grad2 = fS1(X, S, A, N, d);
        grad1 = fD1(X, D, A, N, d);
        M = grad_projection(grad1, grad2, d);
        A = A + alpha*M;

    else
        % if projection of 1 and 2 failed,
        % or obj <= obj_previous due to projection of 1 and 2,
        % shrink learning rate, and re-update from the previous A
        alpha = alpha/2;
        A = A_last + alpha*M;

    end

    delta = norm(alpha*M, 'fro')/norm(A_last, 'fro');
    fprintf('A-update cycle %d: projection iters %d, obj %g, delta %g\n', ...
            count, projection_iters, obj, delta);

    count = count + 1;
    % NOTE(review): the next two tests were damaged in this copy of the
    % source ("delta threshold2,"); they are reconstructed as
    % "delta < threshold2" / "delta > threshold2" - confirm against the
    % upstream PGDM code.
    if count == maxcount || delta < threshold2
        done = 1;
    end

end

if delta > threshold2
    converged = 0;
else
    converged = 1;
end

A = A_last;

--------------------------------------------------------------------------------
/matlab_src/PGDM/opt.m:
--------------------------------------------------------------------------------
function A = opt(X, S, D, maxiter)
% OPT  Learn a full metric A by iterative projection (PGDM).
%
%   X       : N x d data, each row is an example
%   S, D    : pairwise similarity / dissimilarity matrices; only
%             S(i,j) == 1 with i < j is read here
%   maxiter : forwarded to iter_projection_new2

N = size(X, 1);   % number of examples
d = size(X, 2);   % dimensionality of examples

A = eye(d, d)*0.1;
W = zeros(d, d);

% W = sum of outer products of the similar-pair differences.  Only the
% marked pairs are visited instead of all N^2 entries.
[i_arr, j_arr] = find(triu(S(1:N, 1:N), 1) == 1);
for k = 1:numel(i_arr)
    d_ij = X(i_arr(k), :) - X(j_arr(k), :);
    W = W + (d_ij'*d_ij);
end

w = unroll(W);
t = w' * unroll(A)/100;

A = iter_projection_new2(X, S, D, A, w, t, maxiter);

--------------------------------------------------------------------------------
/matlab_src/PGDM/opt_sphere.m:
--------------------------------------------------------------------------------
function A = opt_sphere(X, S, D, maxiter)
% OPT_SPHERE  Like opt(), but whitens ("spheres") the data first.
%
%   X       : N x d data, each row is an example
%   S, D    : pairwise similarity / dissimilarity matrices; only
%             S(i,j) == 1 with i < j is read here
%   maxiter : forwarded to iter_projection_new2

% Preprocessing: First sphere the data
sphereMult = cov(X)^-0.5;
sphereX = X*sphereMult;

N = size(sphereX, 1);   % number of examples
d = size(sphereX, 2);   % dimensionality of examples

A = eye(d, d)*0.1;
W = zeros(d, d);

[i_arr, j_arr] = find(triu(S(1:N, 1:N), 1) == 1);
for k = 1:numel(i_arr)
    d_ij = sphereX(i_arr(k), :) - sphereX(j_arr(k), :);
    W = W + d_ij'*d_ij;
end

w = unroll(W);
t = w' * unroll(A)/100;

A = iter_projection_new2(sphereX, S, D, A, w, t, maxiter);

% Now unsphere the distance metric so that we get a version
% for the original, unsphered data.
% Same value as inv(sphereMult)*A*inv(sphereMult), without forming inv().
% NOTE(review): since sphereX = X*sphereMult, distances in the sphered
% space equal d*(sphereMult*A*sphereMult')*d' on the original data, which
% suggests sphereMult*A*sphereMult' here - confirm the intended mapping.
A = sphereMult \ A / sphereMult;

--------------------------------------------------------------------------------
/matlab_src/PGDM/packcolume.m:
--------------------------------------------------------------------------------
% A = pack(av, n, m)
% pack vector 'av' into a nxm matrix 'A' using column concatenation
function A = packcolume(av, n, m)
% Uses the first n*m entries of av; returns an empty n x m matrix instead
% of failing when m is 0.
A = reshape(av(1:n*m), n, m);

--------------------------------------------------------------------------------
/matlab_src/PGDM/testPGDM.m:
--------------------------------------------------------------------------------
% Bryan Perozzi

function [A] = testPGDM()
% TESTPGDM  Smoke test: learn a metric on synthetic data and plot it.
%   Figure 2 shows the learned metric; figure 3 shows the first three
%   dimensions of the data after transforming by A^(1/2).

datasize = 300;

[X,S,D] = test1(datasize, 50, 150);
% [X,S,D] = test2(datasize, 50, 150);
% [X,S,D] = test3(datasize, 10);
% A = PGDM(X,S,D);

A = Newton(X, S, D, 1);

figure(2);
colormap();
imagesc(A);
colorbar();

figure(3);
B = (A^(1/2) * X')';   % row i is A^(1/2) * x_i
scatter3(B(:,1),B(:,2),B(:,3));
end

% test 1: uniform data, similar data from narrow gaussian
function [X,S,D] = test1(num_examples, num_dimensions, num_similar)
    % generate data
    X = rand(num_examples, num_dimensions);

    % make some stuff similar: first feature of the first num_similar rows
    X(1:num_similar, 1) = normal(num_similar, 1, 1, 0.1);

    [S, D] = make_constraints(num_examples, num_similar);
end

% test 2: gaussian data, similar data has a couple dimensions with narrower
% gaussians
function [X,S,D] = test2(num_examples, num_dimensions, num_similar)
    % generate data
    X = zeros(num_examples, num_dimensions);
    for j = 1:num_dimensions
        mu = 1000*rand(1);
        sigma = 100*rand(1);
        X(:,j) = normal(num_examples, 1, mu, sigma);
    end

    % make some stuff similar
    for j = 1:5
        mu = 1000*rand(1);
        sigma = 10*rand(1);
        X(1:num_similar,j) = normal(num_similar, 1, mu, sigma);
    end

    [S, D] = make_constraints(num_examples, num_similar);
end

% test 3: two separate gaussians, just like in the paper
function [X,S,D] = test3(num_examples, num_similar)
    num_dimensions = 3;
    half = floor(num_examples/2) - 1;   % rows moved to the second gaussian

    % generate data
    X = zeros(num_examples, num_dimensions);
    for j = 1:num_dimensions
        mu = 10*rand(1) - 5;
        X(:,j) = normal(num_examples, 1, mu, 1);
    end

    % make some stuff similar
    for j = 1:num_dimensions
        mu = 10*rand(1) + 5;
        X(1:half,j) = normal(half, 1, mu, 1);
    end

    figure(1);
    scatter3(X(:,1),X(:,2),X(:,3))

    [S, D] = make_constraints(num_examples, num_similar);
end

function [X,S,D] = test4(num_examples, num_dimensions, num_similar) %#ok<INUSD,STOUT>
    % placeholder: never implemented
    error('testPGDM:notImplemented', 'test4 is not implemented.');
end

% Shared by test1-3: the first num_S examples are mutually similar
% (diagonal included), the last num_S examples are mutually dissimilar
% (diagonal excluded).
function [S, D] = make_constraints(num_examples, num_S)
    S = zeros(num_examples);
    S(1:num_S, 1:num_S) = ones(num_S);

    D = zeros(num_examples);
    start_dissim = num_examples - num_S + 1;
    D(start_dissim:num_examples, start_dissim:num_examples) = ...
        (ones(num_S) - eye(num_S));
end

function A = PGDM(X,S,D)
    num_dimensions = size(X,2);
    W = makeWfast(X, S);
    w = unroll(W);

    % BP: how to initialize A?
    %   eye() was a bad initialization, ended prematurely, frequently didn't
    %       learn anything useful (sometimes did though)
    %   rand() somehow got complex numbers in the output.  results not as
    %       expected
    %   zeros() works best so far.
    original_A = zeros(num_dimensions);
    A = iter_projection_new2(X, S, D, original_A, w, 1, 1000);
end

function [random] = normal(n, m, mu, sigma)
    % n x m samples from a normal distribution with the given mean / std
    random = mu + sigma.*randn(n,m);
end

function [W] = makeW(X, S)
    % Slow element-wise reference implementation.
    % W is a weight matrix made from S
    % W_{kl}= \sum_{i,j \in S}d_ij^k * d_ij^l
    num_rows = size(X, 1);
    num_feat = size(X, 2);
    W = zeros(num_feat);   % W is matrix for weighting features |f|x|f|
    for i = 1:num_rows
        for j = 1:num_rows
            if S(i,j) == 1
                d_ij = X(i,:) - X(j,:);   % difference between 'i' and 'j'
                for k = 1:num_feat
                    for l = 1:num_feat
                        W(k,l) = W(k,l) + d_ij(k)*d_ij(l);
                    end
                end
            end
        end
    end
end

function W = makeWfast(X,S)
    % Same quantity as makeW, visiting only the nonzero entries of S.
    % find() returns PAIRED subscripts: entry a is (i_arr(a), j_arr(a)).
    % The previous version looped over every combination of i_arr and
    % j_arr, which adds pairs that are not in S.
    [i_arr, j_arr] = find(S);
    diffs = X(i_arr, :) - X(j_arr, :);   % one row per similar pair
    W = zeros(size(X, 2)) + diffs'*diffs;
end

function testW()
    % makeW and makeWfast must agree on a tiny example
    X = [1,2,3;2,4,6];
    S = [0,1;1,0];

    assert(isequal(makeW(X,S), makeWfast(X,S)), 'makeW and makeWfast disagree');
end
--------------------------------------------------------------------------------
/matlab_src/PGDM/unroll.m:
--------------------------------------------------------------------------------
% av = unroll(A)
% column concatenation of matrix 'A' into vector 'av'

function av = unroll(A)
% Returns a column vector.  A(:) does not conjugate complex entries (the
% former loop finished with av', a conjugate transpose) and also handles
% empty input.
av = A(:);

--------------------------------------------------------------------------------
/matlab_src/compute_A_goodness.m:
--------------------------------------------------------------------------------
function [goodness] = compute_A_goodness(A)
% COMPUTE_A_GOODNESS  Score a learned metric.
%   Returns 0 when every diagonal entry of A is below 1.  Otherwise returns
%   the number of diagonal entries that are smaller than 1% of the largest
%   entry of A.

weights = diag(A);
if all(weights < 1)
    goodness = 0;
else
    % only the diagonal is needed, so normalise the diagonal alone
    goodness = nnz(weights ./ max(max(A)) < 0.01);
    % goodness = std(diag(A)) % bad results with this on big feature
    % spaces?
end
end

--------------------------------------------------------------------------------
/matlab_src/distance_metric_learning_manual.m:
--------------------------------------------------------------------------------
function [ DM, S, D ] = distance_metric_learning_manual(X, s, n_dimsim_pairs, num_vertices, C, topk, dml_version)
%DISTANCE_METRIC_LEARNING Builds S from the given similar pairs, samples D
%   randomly from vertices that appear in no similar pair, and learns a
%   diagonal distance metric.
%       X               feature matrix, one vertex per row
%       s               the [|S| x 2] similar rows to make into pairs
%       n_dimsim_pairs  number of dissimilar entries to generate
%       num_vertices    the # vertices in G
%       C               weight of the dissimilarity term (see Newton)
%       topk            # feature dimensions kept by the 'sparse' version
%       dml_version     'sparse' -> Newton_sparse_top_k, otherwise Newton
%   Returns the metric DM and the 0/1 sparse constraint matrices S
%   (symmetric) and D (one entry per sampled ordered pair).

addpath('PGDM')

% Symmetric similarity matrix.  sparse() ADDS values of repeated
% subscripts, so spones() maps every stored entry back to exactly 1; the
% learners only count entries equal to 1.
S = spones(sparse([s(:,1); s(:,2)], [s(:,2); s(:,1)], 1, ...
                  num_vertices, num_vertices));

% sample randomly to create dissimilar pairs, D
% Candidates are the vertices that occur nowhere in s.  The previous
% rejection test "while any(d1 == s)" handed a 1x2 logical to while, which
% is true only if d1 occurs in BOTH columns of s, so similar vertices
% could be drawn as d1.
candidates = setdiff((1:num_vertices)', s(:));
candidates = candidates(:);
num_cand = numel(candidates);

first = zeros(n_dimsim_pairs, 1);
second = zeros(n_dimsim_pairs, 1);
if n_dimsim_pairs > 0
    if num_cand < 2
        % the old rejection loop would spin forever in this situation
        error('distance_metric_learning_manual:tooFewCandidates', ...
              'Need at least 2 vertices outside the similar set to sample dissimilar pairs.');
    end
    first(:) = candidates(randi(num_cand, n_dimsim_pairs, 1));
    second(:) = candidates(randi(num_cand, n_dimsim_pairs, 1));
    clash = (first == second);
    while any(clash)   % a pair must consist of two different vertices
        second(clash) = candidates(randi(num_cand, nnz(clash), 1));
        clash = (first == second);
    end
end

% spones: a pair drawn twice must still be stored as 1, not 2
D = spones(sparse(first, second, 1, num_vertices, num_vertices));

if strcmp(dml_version, 'sparse')
    DM = Newton_sparse_top_k(X, S, D, C, topk);
else
    DM = Newton(X, S, D, C);
end
end

--------------------------------------------------------------------------------
/matlab_src/io/load_edgelist.m:
--------------------------------------------------------------------------------
% Bryan Perozzi

function [A labels] = load_edgelist (filename, mapping)
% LOAD_EDGELIST  Read a whitespace-separated "src dst" edge list.
%   filename : text file with two string tokens per edge
%   mapping  : optional cell array of labels; position i is vertex id i.
%              When omitted, the sorted unique labels in the file are used.
%   A        : sparse adjacency matrix (edge multiplicities are summed)
%   labels   : the mapping that was used (row/column i <-> labels{i})

fid = fopen(filename);
if fid == -1
    error('load_edgelist:openFailed', 'Unable to open %s.', filename);
end
M = textscan(fid, '%s %s');
fclose(fid);

M = [M{1} M{2}];

% relabel vertices
if nargin < 2
    mapping = unique(M(:));
end

[known, edges] = ismember(M, mapping);
if ~all(known(:))
    error('load_edgelist:unknownLabel', ...
          'The edge list contains labels that are missing from the mapping.');
end

elements = numel(mapping);

A = sparse(edges(:,1), edges(:,2), ones(size(edges,1),1), elements, elements);
labels = mapping;   % this output was declared but never assigned before
end

--------------------------------------------------------------------------------
/matlab_src/normc.m:
--------------------------------------------------------------------------------
function n = normc(m)
%NORMC Normalize columns of a matrix.
%
%  Syntax
%
%    normc(M)
%
%  Description
%
%    NORMC(M) normalizes the columns of M to a length of 1.
%    A single-row input returns all ones (signs are not preserved).
%
%  Examples
%
%    m = [1 2; 3 4]
%    n = normc(m)
%
%  See also NORMR

% Mark Beale, 1-31-92
% Copyright 1992-2007 The MathWorks, Inc.
% $Revision: 1.1.6.5 $  $Date: 2007/11/09 20:49:52 $

if nargin < 1,error('NNET:Arguments','Not enough input arguments.'); end

[mr,mc] = size(m);
if (mr == 1)
  n = ones(1,mc);
else
  scale = sqrt(ones./sum(m.*m));   % 1 / column length
  n = (ones(mr,1)*scale).*m;
end
--------------------------------------------------------------------------------
/matlab_src/reweigh.m:
--------------------------------------------------------------------------------
% Bryan Perozzi

function [Weighted] = reweigh(G, X, A)
%REWEIGH Reweigh graph G given features X and distance metric A
%   Every stored edge (g,h) of G gets weight
%   1 / (1 + || A.^(1/2) * (x_g - x_h) ||_2).
%   A.^(1/2) is an element-wise square root - presumably A is diagonal,
%   as produced by the Newton learners; TODO confirm for other callers.
A_half = A.^(1/2);      % sparse, so can do this

[g_arr,h_arr] = find(G);
weights = zeros(numel(g_arr),1);
Xt = X';                % one feature vector per column

for a = 1:numel(g_arr)
    % calculate their difference
    d_ij = Xt(:,g_arr(a)) - Xt(:,h_arr(a));
    % weigh and convert distance d to similarity 1/(1+d)
    weights(a) = 1.0/(1.0 + norm(A_half*d_ij, 2));
end

Weighted = sparse(g_arr, h_arr, weights, size(G,1), size(G,2));
end

--------------------------------------------------------------------------------
/matlab_src/reweigh_sparse.m:
--------------------------------------------------------------------------------
% Bryan Perozzi
function [Weighted] = reweigh_sparse(G, X, A)
%REWEIGH_SPARSE Reweigh graph G given features X and distance metric A
%   Same as REWEIGH, but a feature that is 0 in both endpoints counts as
%   a difference of 1 (zeros are treated as absent values).
A_half = A.^(1/2);      % sparse, so can do this

[g_arr,h_arr] = find(G);
weights = zeros(numel(g_arr),1);
Xt = X';                % one feature vector per column

for a = 1:numel(g_arr)
    xg = Xt(:,g_arr(a));
    xh = Xt(:,h_arr(a));
    % calculate their difference
    d_ij = xg - xh + ~(xg | xh);
    % weigh and convert distance d to similarity 1/(1+d)
    weights(a) = 1.0/(1.0 + norm(A_half*d_ij, 2));
end
Weighted = sparse(g_arr, h_arr, weights, size(G,1), size(G,2));
end

--------------------------------------------------------------------------------
/matlab_src/savesparse.m:
--------------------------------------------------------------------------------
function savesparse( filename, sparse_matrix )
%SAVESPARSE Save a sparse matrix as an edge list
%   Writes one "row col value" line (format '%d %d %f') per nonzero entry
%   of sparse_matrix to filename, replacing any existing file.
[rows, cols, vals] = find(sparse_matrix);

fid = fopen(filename,'W');
if fid == -1
    error('savesparse:openFailed', 'Unable to open %s for writing.', filename);
end
closer = onCleanup(@() fclose(fid)); %#ok<NASGU> closes the file on any exit

for a = 1:numel(rows)
    fprintf( fid,'%d %d %f\n', rows(a), cols(a), vals(a) );
end
end

--------------------------------------------------------------------------------
/matlab_src/savevector.m:
--------------------------------------------------------------------------------
function savevector( filename, vector )
%SAVEVECTOR Saves a vector in a format like an edge list
%   Cell input:    one "index value" line per cell; char values use %s,
%                  whole numbers %d, everything else %f.
%   Numeric input: one "index value" line per NONZERO element, where index
%                  is the position within the vector (row or column).
%   Anything else raises an error before the file is touched.

if ~iscell(vector) && ~isvector(vector)
    error('savevector:badInput', 'Unable to save vector!');
end

fid = fopen(filename,'W');
if fid == -1
    error('savevector:openFailed', 'Unable to open %s for writing.', filename);
end
closer = onCleanup(@() fclose(fid)); %#ok<NASGU> closes the file on any exit

if iscell(vector)
    for a = 1:length(vector)
        entry = vector{a};
        if ischar(entry)
            fprintf( fid,'%d %s\n', a, entry );
        elseif mod(entry,1) == 0
            fprintf( fid,'%d %d\n', a, entry );
        else
            fprintf( fid,'%d %f\n', a, entry );
        end
    end
else
    % Linear indices work for both orientations.  The row subscript from
    % [i,j,val] = find(v) is 1 for every element of a row vector, which
    % used to write index 1 on every line.
    idx = find(vector);
    [~, ~, val] = find(vector);
    for a = 1:numel(idx)
        if mod(val(a),1) == 0
            fprintf( fid,'%d %d\n', idx(a), val(a) );
        else
            fprintf( fid,'%d %f\n', idx(a), val(a) );
        end
    end
end
end
--------------------------------------------------------------------------------