├── MDL ├── ._.svn ├── config.py ├── ._mdl.py ├── ._test.py ├── error.pyc ├── graph.pyc ├── makefile ├── mdl.pyc ├── model.pyc ├── ._config.py ├── ._error.py ├── ._graph.py ├── ._makefile ├── ._model.py ├── ._score.py ├── config.pyc ├── ._mdl_base.py ├── ._mdl_error.py ├── ._readme.txt ├── mdl_base.pyc ├── mdl_error.pyc ├── ._greedyScan.py ├── ._mdl_structs.py ├── mdl_structs.pyc ├── ._pythonOutput.txt ├── ._run_greedyScan.bash ├── ._cliqueStarClique.model ├── ._description_length.py ├── ._greedySearch_nStop.py ├── ._pythonOutput_toyExample.txt ├── pythonOutput.txt ├── pythonOutput_toyExample.txt ├── cliqueStarClique.model ├── run_greedyScan.bash ├── test.py ├── mdl_base.py ├── graph.py ├── mdl_error.py ├── readme.txt ├── score.py ├── greedyScan.py ├── error.py ├── mdl.py ├── greedySearch_nStop.py ├── model.py ├── mdl_structs.py └── description_length.py ├── DATA ├── ._.svn ├── ._cliqueStarClique.out └── cliqueStarClique.out ├── STRUCTURE_DISCOVERY ├── matbg.sh ├── ._.svn ├── ._BFS.m ├── ._BFScoloring.m ├── ._assertEqual.m ├── ._assertFalse.m ├── ._assertTrue.m ├── ._encodeAsBC.m ├── ._encodeAsNB.m ├── ._mdlCostAsBC.m ├── ._printModel.m ├── ._testMDLcost.m ├── ._EncodeSubgraph.m ├── ._ExactStructure.m ├── ._encodeAsChain.m ├── ._encodeAsStar.m ├── ._mdlCostAsChain.m ├── ._mdlCostAsStar.m ├── ._ExtractGccEncode.m ├── ._SlashBurnEncode.m ├── ._encodeAsFClique.m ├── ._encodeAsNClique.m ├── ._mdlCostAsBCorNB.m ├── ._test_error_edges.m ├── ._RemHdegreeGccEncode.m ├── ._encodeAsfANDnClique.m ├── ._compute_encodingCost.m ├── ._mdlCostAsfANDnClique.m ├── ._structureSelectionTop10.m ├── ._printStructureToModelFile.m ├── ._structureSelectionGreedyNforget.m ├── l2cnk.m ├── testMDLcost.m ├── test_error_edges.m ├── printModel.m ├── RemHdegreeGccEncode.m ├── structureSelectionTop10.m ├── printStructureToModelFile.m ├── encodeAsfANDnClique.m ├── assertFalse.m ├── ExtractGccEncode.m ├── BFScoloring.m ├── assertTrue.m ├── encodeAsChain.m ├── encodeAsFClique.m ├── encodeAsStar.m ├── 
encodeAsNClique.m ├── encodeAsBC.m ├── encodeAsNB.m ├── assertEqual.m ├── mdlCostAsfANDnClique.m ├── mdlCostAsBC.m ├── mdlCostAsStar.m ├── mdlCostAsChain.m ├── mdlCostAsBCorNB.m ├── BFS.m ├── EncodeSubgraph.m ├── structureSelectionGreedyNforget.m ├── compute_encodingCost.m ├── SlashBurnEncode.m └── ExactStructure.m ├── makefile ├── run_structureDiscovery.m ├── license.txt ├── README └── demo_vog.bash /MDL/._.svn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._.svn -------------------------------------------------------------------------------- /MDL/config.py: -------------------------------------------------------------------------------- 1 | optModelZeroes = False; 2 | optVerbosity = 1; 3 | optDefaultError = 'TP'; -------------------------------------------------------------------------------- /DATA/._.svn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/DATA/._.svn -------------------------------------------------------------------------------- /MDL/._mdl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._mdl.py -------------------------------------------------------------------------------- /MDL/._test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._test.py -------------------------------------------------------------------------------- /MDL/error.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/error.pyc -------------------------------------------------------------------------------- /MDL/graph.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/graph.pyc -------------------------------------------------------------------------------- /MDL/makefile: -------------------------------------------------------------------------------- 1 | demo: 2 | python score.py cliqueStarClique.graph cliqueStarClique.model 3 | -------------------------------------------------------------------------------- /MDL/mdl.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/mdl.pyc -------------------------------------------------------------------------------- /MDL/model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/model.pyc -------------------------------------------------------------------------------- /MDL/._config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._config.py -------------------------------------------------------------------------------- /MDL/._error.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._error.py -------------------------------------------------------------------------------- /MDL/._graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._graph.py -------------------------------------------------------------------------------- /MDL/._makefile: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._makefile -------------------------------------------------------------------------------- /MDL/._model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._model.py -------------------------------------------------------------------------------- /MDL/._score.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._score.py -------------------------------------------------------------------------------- /MDL/config.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/config.pyc -------------------------------------------------------------------------------- /MDL/._mdl_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._mdl_base.py -------------------------------------------------------------------------------- /MDL/._mdl_error.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._mdl_error.py -------------------------------------------------------------------------------- /MDL/._readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._readme.txt -------------------------------------------------------------------------------- /MDL/mdl_base.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/mdl_base.pyc 
-------------------------------------------------------------------------------- /MDL/mdl_error.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/mdl_error.pyc -------------------------------------------------------------------------------- /MDL/._greedyScan.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._greedyScan.py -------------------------------------------------------------------------------- /MDL/._mdl_structs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._mdl_structs.py -------------------------------------------------------------------------------- /MDL/mdl_structs.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/mdl_structs.pyc -------------------------------------------------------------------------------- /MDL/._pythonOutput.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._pythonOutput.txt -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/matbg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/csh -f 2 | 3 | unsetenv DISPLAY 4 | 5 | nohup matlab < $1 > $2 & 6 | 7 | -------------------------------------------------------------------------------- /MDL/._run_greedyScan.bash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._run_greedyScan.bash 
-------------------------------------------------------------------------------- /DATA/._cliqueStarClique.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/DATA/._cliqueStarClique.out -------------------------------------------------------------------------------- /MDL/._cliqueStarClique.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._cliqueStarClique.model -------------------------------------------------------------------------------- /MDL/._description_length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._description_length.py -------------------------------------------------------------------------------- /MDL/._greedySearch_nStop.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._greedySearch_nStop.py -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._.svn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._.svn -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._BFS.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._BFS.m -------------------------------------------------------------------------------- /MDL/._pythonOutput_toyExample.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/MDL/._pythonOutput_toyExample.txt -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._BFScoloring.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._BFScoloring.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._assertEqual.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._assertEqual.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._assertFalse.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._assertFalse.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._assertTrue.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._assertTrue.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._encodeAsBC.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._encodeAsBC.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._encodeAsNB.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._encodeAsNB.m 
-------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._mdlCostAsBC.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._mdlCostAsBC.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._printModel.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._printModel.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._testMDLcost.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._testMDLcost.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._EncodeSubgraph.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._EncodeSubgraph.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._ExactStructure.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._ExactStructure.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._encodeAsChain.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._encodeAsChain.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._encodeAsStar.m: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._encodeAsStar.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._mdlCostAsChain.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._mdlCostAsChain.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._mdlCostAsStar.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._mdlCostAsStar.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._ExtractGccEncode.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._ExtractGccEncode.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._SlashBurnEncode.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._SlashBurnEncode.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._encodeAsFClique.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._encodeAsFClique.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._encodeAsNClique.m: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._encodeAsNClique.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._mdlCostAsBCorNB.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._mdlCostAsBCorNB.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._test_error_edges.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._test_error_edges.m -------------------------------------------------------------------------------- /MDL/pythonOutput.txt: -------------------------------------------------------------------------------- 1 | L(G,M) L(M) L(E) #E+ #E- #Ex 2 | M_0: 60310 2 60309 0/0 5467/4200651 0 3 | M_x: 57690 6447 51243 27/8992 4507/4191659 0 4 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._RemHdegreeGccEncode.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._RemHdegreeGccEncode.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._encodeAsfANDnClique.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._encodeAsfANDnClique.m -------------------------------------------------------------------------------- /MDL/pythonOutput_toyExample.txt: -------------------------------------------------------------------------------- 1 | L(G,M) L(M) L(E) #E+ #E- #Ex 2 | M_0: 52665 2 52664 0/0 7547/353220 0 3 | M_x: 33922 33066 856 
109/7656 0/345564 0 4 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._compute_encodingCost.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._compute_encodingCost.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._mdlCostAsfANDnClique.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._mdlCostAsfANDnClique.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._structureSelectionTop10.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._structureSelectionTop10.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._printStructureToModelFile.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._printStructureToModelFile.m -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/._structureSelectionGreedyNforget.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GemsLab/VoG_Graph_Summarization/HEAD/STRUCTURE_DISCOVERY/._structureSelectionGreedyNforget.m -------------------------------------------------------------------------------- /MDL/cliqueStarClique.model: -------------------------------------------------------------------------------- 1 | fc 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 2 | fc 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 
46 47 48 49 50 51 3 | st 21, 18 19 20 22 23 24 25 26 27 28 29 4 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/l2cnk.m: -------------------------------------------------------------------------------- 1 | function [nbits] = l2cnk(n,k) 2 | nbits = 0; 3 | for i = n:-1:n-k+1 4 | nbits = nbits + log2(i); 5 | end 6 | 7 | for i = k:-1:1 8 | nbits = nbits - log2(i); 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/testMDLcost.m: -------------------------------------------------------------------------------- 1 | function [] = testMDLcost( mdlCost ) 2 | 3 | if mdlCost < 0 4 | error('The MDL cost is negative...'); 5 | elseif isnan(mdlCost) 6 | error('The MDL cost is NaN...'); 7 | end 8 | 9 | end -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/test_error_edges.m: -------------------------------------------------------------------------------- 1 | function [ ] = test_error_edges(E) 2 | 3 | 4 | for i = 1 : length(E) 5 | if E(i) < 0 6 | error('Negative number of 1s or 0s in the error matrix E...') 7 | end 8 | end 9 | 10 | end -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | VOGFILES=README \ 2 | makefile \ 3 | license.txt \ 4 | DATA \ 5 | STRUCTURE_DISCOVERY \ 6 | MDL \ 7 | demo_vog.bash \ 8 | run_structureDiscovery.m 9 | 10 | all: demo 11 | 12 | demo: 13 | bash demo_vog.bash 14 | 15 | zip: tar 16 | 17 | tar: ${VOGFILES} 18 | tar -cvf vog.tar ${VOGFILES} 19 | 20 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/printModel.m: -------------------------------------------------------------------------------- 1 | function [ ] = printModel( model_ordered, outfile) 2 | %% Select the top 10 substructures to output to the 
user. 3 | % The ranking of the substructures is based on their MDL benefit. 4 | 5 | fid = fopen(outfile, 'w'); 6 | 7 | for i = 1 : length(model_ordered) 8 | printStructureToModelFile( model_ordered(i), fid ); 9 | end 10 | 11 | fclose(fid); 12 | 13 | end 14 | -------------------------------------------------------------------------------- /MDL/run_greedyScan.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | time -p python2.6 greedyScan.py ../../data/WikiUserGraphs/lcrMediaWiki_wholeEdges.graph ../../code/slashburn/wikipedia_files/lcrMediaWiki_whole_SB_noStar3_orderedALL.model > OUTPUT_greedyScan_lcr_whole.out 4 | time -p python2.6 greedyScan.py ../../data/WikiUserGraphs/chocMediaWiki_sentenceEdges.graph ../../code/slashburn/wikipedia_files/chocMediaWiki_sentence_SB_noStar3_orderedALL.model > OUTPUT_greedyScan_choc_sentence.out & 5 | time -p python2.6 greedyScan.py ../../data/WikiUserGraphs/kievMediaWiki_wholeEdges.graph ../../code/slashburn/wikipedia_files/kievMediaWiki_whole_SB_noStar3_orderedALL.model > OUTPUT_greedyScan_kiev_whole.out 6 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/RemHdegreeGccEncode.m: -------------------------------------------------------------------------------- 1 | function [disind,gccind,topind] = RemHdegreeGccEncode(B,k,dir,out_fid, top_gccind, N_tot, info, minSize) 2 | 3 | 4 | if nargin<3 5 | dir=1; 6 | end 7 | 8 | n = size(B,1); 9 | 10 | if (dir == 1) 11 | %D = inout_degree(B); 12 | D = sum(B,2); 13 | D = D + sum(B,1)'; 14 | else 15 | D=sum(B,2); 16 | end 17 | [Dsort,I]=sort(D); 18 | 19 | 20 | topind = flipud(I(n-k+1:n)); 21 | 22 | B(topind, :) = 0; 23 | B(:, topind) = 0; 24 | 25 | [gccind,disind] = ExtractGccEncode(B, out_fid, topind, top_gccind, N_tot, info, minSize ); 26 | %fullind = 1:n; 27 | %disind = setdiff(fullind, gccind); 28 | topind = topind'; 29 | 30 | mask = ismember(disind, topind); 31 | disind = 
disind(~mask); 32 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/structureSelectionTop10.m: -------------------------------------------------------------------------------- 1 | function [ ] = structureSelectionTop10(graphFile, model_ordered, outfile) 2 | %% Select the top 10 substructures to output to the user. 3 | % The ranking of the substructures is based on their MDL benefit. 4 | 5 | fid = fopen(outfile, 'w'); 6 | 7 | for i = 1 : min( 10, length(model_ordered) ) 8 | printStructureToModelFile( model_ordered(i), fid ); 9 | end 10 | 11 | %comm = sprintf('python ../mdl/score.py %s %s > pythonOutput.txt;', ... 12 | % graphFile, outfile ) 13 | %system(comm) 14 | %pythonOutput = importdata('pythonOutput.txt'); 15 | %cost0 = str2num(pythonOutput.textdata{2,2}); 16 | %cost = str2num(pythonOutput.textdata{3,2}); 17 | 18 | fclose(fid); 19 | 20 | end 21 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/printStructureToModelFile.m: -------------------------------------------------------------------------------- 1 | function [] = printStructureToModelFile( structure, fid ) 2 | %% Print a structure to the final model file. 
3 | 4 | switch structure.code 5 | case 'nc' 6 | fprintf(fid, 'nc %d,', structure.edges); 7 | fprintf(fid, ' %d', structure.nodes1 ); 8 | fprintf(fid, '\n'); 9 | case {'fc', 'ch'} 10 | fprintf(fid, '%s', structure.code); 11 | fprintf(fid, ' %d', structure.nodes1 ); 12 | fprintf(fid, '\n'); 13 | case {'bc', 'nb', 'st'} 14 | fprintf(fid, '%s', structure.code); 15 | fprintf(fid, ' %d', structure.nodes1 ); 16 | fprintf(fid, ','); 17 | fprintf(fid, ' %d', structure.nodes2 ); 18 | fprintf(fid, '\n'); 19 | 20 | end 21 | 22 | 23 | 24 | end 25 | -------------------------------------------------------------------------------- /run_structureDiscovery.m: -------------------------------------------------------------------------------- 1 | input_file = 'DATA/cliqueStarClique.out'; 2 | unweighted_graph = input_file; 3 | output_model_greedy = 'DATA'; 4 | output_model_top10 = 'DATA'; 5 | 6 | addpath('STRUCTURE_DISCOVERY'); 7 | 8 | orig = spconvert(load(input_file)); 9 | orig(max(size(orig)),max(size(orig))) = 0; 10 | orig_sym = orig + orig'; 11 | [i,j,k] = find(orig_sym); 12 | orig_sym(i(find(k==2)),j(find(k==2))) = 1; 13 | orig_sym_nodiag = orig_sym - diag(diag(orig_sym)); 14 | 15 | disp('==== Running VoG for structure discovery ====') 16 | global model; 17 | model = struct('code', {}, 'edges', {}, 'nodes1', {}, 'nodes2', {}, 'benefit', {}, 'benefit_notEnc', {}); 18 | global model_idx; 19 | model_idx = 0; 20 | SlashBurnEncode( orig_sym_nodiag, 2, output_model_greedy, false, false, 3, unweighted_graph); 21 | 22 | quit 23 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/encodeAsfANDnClique.m: -------------------------------------------------------------------------------- 1 | function [] = encodeAsfANDnClique( Asmall, curind, top_gccind, out_fid ) 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | %% Encode given graph as clique and near-clique % 4 | % Author: Danai Koutra % 5 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 6 | 7 | n = size(curind, 2); 8 | m = nnz(Asmall); 9 | 10 | % encode as full clique 11 | fprintf(out_fid, 'fc'); 12 | for i=1:size(curind, 2) 13 | fprintf(out_fid, ' %d', top_gccind( curind(i) ) ); 14 | end 15 | fprintf(out_fid, '--- full clique \n'); 16 | 17 | % encode as near clique 18 | fprintf(out_fid, 'nc %d,', m/2); 19 | for i=1:size(curind, 2) 20 | fprintf(out_fid, ' %d', top_gccind( curind(i) ) ); 21 | end 22 | fprintf(out_fid, '--- nearClique \n'); 23 | 24 | end -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/assertFalse.m: -------------------------------------------------------------------------------- 1 | function assertFalse(condition, message) 2 | %assertFalse Assert that input condition is false 3 | % assertFalse(CONDITION, MESSAGE) throws an exception containing the string 4 | % MESSAGE if CONDITION is not false. 5 | % 6 | % MESSAGE is optional. 7 | % 8 | % Examples 9 | % -------- 10 | % assertFalse(isreal(sqrt(-1))) 11 | % 12 | % assertFalse(isreal(sqrt(-1)), ... 13 | % 'Expected isreal(sqrt(-1)) to be false.') 14 | % 15 | % See also assertTrue 16 | 17 | % Steven L. Eddins 18 | % Copyright 2008-2010 The MathWorks, Inc. 19 | 20 | if nargin < 2 21 | message = 'Asserted condition is not false.'; 22 | end 23 | 24 | if ~isscalar(condition) || ~islogical(condition) 25 | throwAsCaller(MException('assertFalse:invalidCondition', ... 
26 | 'CONDITION must be a scalar logical value.')); 27 | end 28 | 29 | if condition 30 | throwAsCaller(MException('assertFalse:trueCondition', '%s', message)); 31 | end 32 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/ExtractGccEncode.m: -------------------------------------------------------------------------------- 1 | function [cur_gccind,cur_disind] = ExtractGccEncode(B, out_fid, topind, top_gccind, N_tot, info, minSize) 2 | 3 | [S,C]=graphconncomp(B, 'WEAK', true); 4 | 5 | maxind=-1; 6 | maxsize=0; 7 | 8 | size_v = zeros(0, S); 9 | 10 | for k=1:S 11 | size_v(k)=size(find(C == k), 2); 12 | end 13 | 14 | [size_sort,I]=sort(size_v, 'descend'); 15 | 16 | cur_gccind = find(C == I(1)); 17 | 18 | cur_disind = zeros(0,0); 19 | 20 | for k=2:S 21 | curind = find(C == I(k)); 22 | 23 | if( size(curind,2) ==1 ) 24 | mask = ismember(curind, topind); 25 | if sum(mask) == 1 26 | continue; 27 | end 28 | end 29 | 30 | if length(curind) > minSize 31 | % EncodeConnComp(B, curind, top_gccind, out_fid); 32 | EncodeSubgraph(B, curind, top_gccind, N_tot, out_fid, info, minSize); 33 | end 34 | cur_disind = [cur_disind curind]; 35 | end 36 | 37 | % fprintf('\tgccsize\t%d\n', size(cur_gccind,2)); 38 | 39 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/BFScoloring.m: -------------------------------------------------------------------------------- 1 | function [ set1, set2 ] = BFScoloring( Asmall ) 2 | %% Given a bipartite graph, find the two node-sets memberships 3 | 4 | n = size(Asmall,2); 5 | [seed ~] = find(Asmall); 6 | queue = [min(seed)]; 7 | % sets = 2 (if unvisited) or 0 (if in set 1) or 1 (if in set 2) 8 | sets = zeros(1,n)+2; 9 | color = 1; 10 | % coloring node 1 with color "0" 11 | sets(min(seed)) = 0; 12 | usedColor = false; 13 | 14 | 15 | while ~( isempty(queue) ) 16 | neighbors = find( Asmall(queue(1),:) ) ; 17 | 18 | for i = 1 : length(neighbors) 19 | if sets( 
function assertTrue(condition, message)
%assertTrue Assert that input condition is true
%   assertTrue(CONDITION, MESSAGE) throws an exception containing the string
%   MESSAGE if CONDITION is not true.
%
%   MESSAGE is optional.
%
%   Examples
%   --------
%   % This call returns silently.
%   assertTrue(rand < 1, 'Expected output of rand to be less than 1')
%
%   % This call throws an error.
%   assertTrue(sum(sum(magic(3))) == 0, ...
%       'Expected sum of elements of magic(3) to be 0')
%
%   See also assertEqual, assertFalse

%   Steven L. Eddins
%   Copyright 2008-2010 The MathWorks, Inc.

% Fall back to the generic message when the caller supplied none.
if nargin < 2
    message = 'Asserted condition is not true.';
end

% Only a scalar logical is accepted as a condition.
isValidCondition = isscalar(condition) && islogical(condition);
if ~isValidCondition
    invalidErr = MException('assertTrue:invalidCondition', ...
        'CONDITION must be a scalar logical value.');
    throwAsCaller(invalidErr);
end

% Nothing to report when the condition holds.
if condition
    return;
end

throwAsCaller(MException('assertTrue:falseCondition', '%s', message));
function [ ] = encodeAsChain( curind, top_gccind, chain, costGain, costGain_notEnc, out_fid, info )
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Write a chain structure to the model file and to the global model      %
%  Model-file line:  ch node_ids_in_order[, costGain | costGain_notEnc]   %
%  The node ids are top_gccind(curind(chain)), i.e. the local chain       %
%  positions mapped through curind and then through top_gccind.           %
%  The two cost figures are written only when info is not false.          %
%  Side effects: increments the global model_idx and stores one record    %
%  with code 'ch' in the global struct array model at that position.      %
%  Author: Danai Koutra                                                   %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
global model;
global model_idx;

% Node ids of the chain, in chain order.
chainIds = top_gccind( curind(chain) );

%% Printing the encoded structure.
fprintf(out_fid, 'ch ');
fprintf(out_fid, ' %d', chainIds);

% The line ends either with a bare newline or with the cost annotation.
if info == false
    lineEnd = sprintf('\n');
else
    lineEnd = sprintf(', %f | %f --- nearChain \n', costGain, costGain_notEnc);
end
fprintf(out_fid, '%s', lineEnd);

%% Recording the structure in the in-memory model.
model_idx = model_idx + 1;
model(model_idx) = struct( ...
    'code', 'ch', ...
    'edges', 0, ...
    'nodes1', chainIds, ...
    'nodes2', [], ...
    'benefit', costGain, ...
    'benefit_notEnc', costGain_notEnc);

end
function [ ] = encodeAsStar( curind, top_gccind, hub, spokes, costGain, costGain_notEnc, out_fid, info )
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Write a star structure to the model file and to the global model       %
%  Model-file line:  st hub, spoke_ids[, costGain | costGain_notEnc]      %
%  hub and spokes are local positions; they are mapped through curind and %
%  then through top_gccind before being written.                          %
%  The two cost figures are written only when info is not false.          %
%  Side effects: increments the global model_idx and stores one record    %
%  with code 'st' (hub in nodes1, spokes in nodes2) in the global struct  %
%  array model at that position.                                          %
%  Author: Danai Koutra                                                   %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
global model;
global model_idx;

% Translate the local hub / spoke positions into written node ids.
hubId = top_gccind( curind(hub) );
spokeIds = top_gccind( curind(spokes) );

fprintf(out_fid, 'st %d,', hubId);
fprintf(out_fid, ' %d', spokeIds);

% The line ends either with a bare newline or with the cost annotation.
if info == false
    lineEnd = sprintf('\n');
else
    lineEnd = sprintf(', %f | %f --- nearStar \n', costGain, costGain_notEnc);
end
fprintf(out_fid, '%s', lineEnd);

% Record the structure in the in-memory model.
model_idx = model_idx + 1;
model(model_idx) = struct( ...
    'code', 'st', ...
    'edges', 0, ...
    'nodes1', hubId, ...
    'nodes2', spokeIds, ...
    'benefit', costGain, ...
    'benefit_notEnc', costGain_notEnc);

end
function [ ] = encodeAsNB( curind, top_gccind, set1, set2, costGain, costGain_notEnc, out_fid, info )
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Write an 'nb' structure to the model file and to the global model      %
%  Model-file line:                                                       %
%    nb node_ids_of_1st_set, node_ids_of_2nd_set[, costGain | notEnc]     %
%  set1 and set2 index into curind; the written ids are                   %
%  top_gccind(curind(set1)) and top_gccind(curind(set2)).                 %
%  The file line is written only when both sets are non-empty; the cost   %
%  figures are appended only when info is not false.                      %
%  Side effects: always increments the global model_idx and stores one    %
%  record with code 'nb' in the global struct array model, even when no   %
%  line was written to the file.                                          %
%  Author: Danai Koutra                                                   %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
global model;
global model_idx;

% Written node ids of the two sides.
idsA = top_gccind( curind(set1) );
idsB = top_gccind( curind(set2) );

bothSidesPresent = ~( isempty(set1) || isempty(set2) );
if bothSidesPresent
    fprintf(out_fid, 'nb');
    fprintf(out_fid, ' %d', idsA);
    fprintf(out_fid, ',');
    fprintf(out_fid, ' %d', idsB);
    % The line ends either with a bare newline or with the cost annotation.
    if info == false
        lineEnd = sprintf('\n');
    else
        lineEnd = sprintf(', %f | %f ------ NB \n', costGain, costGain_notEnc);
    end
    fprintf(out_fid, '%s', lineEnd);
end

% Record the structure in the in-memory model.
model_idx = model_idx + 1;
model(model_idx) = struct( ...
    'code', 'nb', ...
    'edges', 0, ...
    'nodes1', idsA, ...
    'nodes2', idsB, ...
    'benefit', costGain, ...
    'benefit_notEnc', costGain_notEnc);

end
function assertEqual(A, B, custom_message)
%assertEqual Assert that inputs are equal
%   assertEqual(A, B) throws an exception if A and B are not equal.  A and B
%   must have the same class and sparsity to be considered equal.
%
%   assertEqual(A, B, MESSAGE) prepends the string MESSAGE to the assertion
%   message if A and B are not equal.
%
%   Examples
%   --------
%   % This call returns silently.
%   assertEqual([1 NaN 2], [1 NaN 2]);
%
%   % This call throws an error.
%   assertEqual({'A', 'B', 'C'}, {'A', 'foo', 'C'});
%
%   See also assertElementsAlmostEqual, assertVectorsAlmostEqual

%   Steven L. Eddins
%   Copyright 2008-2010 The MathWorks, Inc.

if nargin < 3
    custom_message = '';
end

% The checks run in a fixed order (sparsity, class, value); the first one
% that fails decides the error identifier and the reason text.
if issparse(A) ~= issparse(B)
    errId = 'assertEqual:sparsityNotEqual';
    reason = 'One input is sparse and the other is not.';
elseif ~strcmp(class(A), class(B))
    errId = 'assertEqual:classNotEqual';
    reason = 'The inputs differ in class.';
elseif ~isequalwithequalnans(A, B)
    errId = 'assertEqual:nonEqual';
    reason = 'Inputs are not equal.';
else
    % All checks passed: the inputs are considered equal.
    return;
end

message = xunit.utils.comparisonMessage(custom_message, reason, A, B);
throwAsCaller(MException(errId, '%s', message));
### basic functions

def CalcCliqueNumPosEdges(numEdges):
    """Return n*n - n for n = `numEdges`.

    Despite its name, the argument is used as a node count: the result is
    the number of ordered node pairs without self-loops (n^2 - n).
    """
    return numEdges * numEdges - numEdges


def choose(n, k):
    """Return the binomial coefficient (n choose k) as an exact integer.

    Returns 0 when `k` is negative or larger than `n`.
    """
    if not (0 <= k <= n):
        return 0
    p = 1
    # `range` instead of the Python-2-only `xrange`, so the module also runs
    # on Python 3; the loop runs at most min(k, n-k) times either way.
    for t in range(min(k, n - k)):
        # p * (n - t) is always divisible by (t + 1) here, so the floor
        # division is exact and p stays an integer.
        p = (p * (n - t)) // (t + 1)
    return p


def composition(n, k):
    """Return choose(n-1, k-1)."""
    return choose(n - 1, k - 1)


def LC(n, k):
    """Return log2 of composition(n, k).

    Raises ValueError (from math.log) when composition(n, k) is 0.
    """
    return log(composition(n, k), 2)


def weakcomposition(n, k):
    """Return choose(n+k-1, k-1)."""
    return choose(n + k - 1, k - 1)


def LwC(n, k):
    """Return log2 of weakcomposition(n, k).

    Raises ValueError (from math.log) when weakcomposition(n, k) is 0.
    """
    return log(weakcomposition(n, k), 2)


def LnU(n, k):
    """Encoded length of `n` 0/1 entries with `k` 1s (aka, Naive Uniform).

    Each 1 costs -log2(k/n) bits and each 0 costs -log2((n-k)/n) bits.
    Returns 0 when n == 0, k == 0 or k == n.
    """
    if n == 0 or k == 0 or k == n:
        return 0
    costOne = -log(k / float(n), 2)
    costZero = -log((n - k) / float(n), 2)
    return k * costOne + (n - k) * costZero


def LU(n, k):
    """Encoded length of `n` 0/1 entries with `k` 1s (aka, Uniform).

    Computed as log2(choose(n, k)); returns 0 when n == 0 or k == 0.
    Raises ValueError (from math.log) when choose(n, k) is 0, i.e. k > n.
    """
    if n == 0 or k == 0:
        return 0
    return log(choose(n, k), 2)
class Graph :
    """Undirected graph read from a comma-separated edge list.

    Node ids in the input file and in `hasEdge` are 1-based; internally
    they are stored 0-based. Each edge is stored once, under its
    smaller endpoint.
    """

    def __init__(self):
        # largest node id seen in the loaded file
        self.numNodes = 0
        # number of distinct (unordered) node pairs in the loaded file
        self.numEdges = 0
        # per node i a frozenset of node-ids j > i for which (i,j) \in E
        self.edges = [frozenset()]

    def hasEdge(self, i, j):
        """Return True if the 1-based nodes `i` and `j` are connected."""
        lo, hi = (i, j) if i <= j else (j, i)
        return hi - 1 in self.edges[lo - 1]

    def load(self, fullpath):
        """Replace the graph with the edge list stored in `fullpath`.

        Every line is split on ','; lines with fewer than two fields are
        skipped and the first two fields are parsed as integer node ids.
        Duplicate pairs (in either orientation) count as one edge.

        The object is only updated after the whole file has been read, so
        loading a second file starts from scratch instead of adding to the
        counters of the first one.
        """
        numNodes = 0
        edgeList = []
        # `with` guarantees the file is closed, also when parsing fails.
        with open(fullpath) as fg:
            for line in fg:
                tmp = line.strip().split(',')
                if len(tmp) < 2:
                    continue
                i = int(tmp[0])
                j = int(tmp[1])
                numNodes = max(numNodes, i, j)
                edgeList.append((min(i, j), max(i, j)))

        tmpAdj = [set() for _ in range(numNodes)]
        for (i, j) in edgeList:
            # sets ignore repeated pairs, so each edge is stored once
            tmpAdj[i - 1].add(j - 1)

        self.numNodes = numNodes
        self.numEdges = sum(len(adj) for adj in tmpAdj)
        # finalize edges into frozensets
        self.edges = [frozenset(adj) for adj in tmpAdj]
        return

    def plot(self):
        """Print the upper triangle of the adjacency matrix, one row per node.

        Row idx starts with idx+1 dots (lower triangle and diagonal),
        followed by '1'/'0' for every later node.
        """
        numRows = len(self.edges)
        for idx in range(numRows):
            cells = ["1" if idy in self.edges[idx] else "0"
                     for idy in range(idx + 1, numRows)]
            # single-argument print(...) behaves the same on Python 2 and 3
            print("." * (idx + 1) + "".join(cells))

    # test.py in this repository calls g.Load(...) and g.Plot(); keep those
    # capitalised spellings working alongside the lower-case methods.
    Load = load
    Plot = plot
#!/bin/bash
# Demo of the VoG pipeline on the cliqueStarClique toy graph:
#   Steps 1 & 2 run the MATLAB structure discovery,
#   Step 3 scores the top-10 structures and then runs the greedy selection.

# The MDL scripts are Python 2 code (print statements), and the greedy step
# below has always been invoked with python2.7. Use the same interpreter for
# both Python calls; override with PYTHON_BIN=... if it lives elsewhere.
PYTHON_BIN="${PYTHON_BIN:-python2.7}"

echo ''
echo -e "\e[34m======== Steps 1 & 2: Subgraph Generation and Labeling ==========\e[0m"
matlab -r run_structureDiscovery
echo ''
echo 'Structure discovery finished.'

unweighted_graph='DATA/cliqueStarClique.out'
model='DATA/cliqueStarClique_orderedALL.model'
modelFile='cliqueStarClique_orderedALL.model'
modelTop10='DATA/cliqueStarClique_top10ordered.model'

echo ''
echo -e "\e[34m=============== Step 3: Summary Assembly ===============\e[0m"
echo ''
echo -e "\e[31m=============== TOP 10 structures ===============\e[0m"
# Expansions are quoted so the script keeps working if a path ever contains
# whitespace or glob characters.
head -n 10 "$model" > "$modelTop10"
echo 'Computing the encoding cost...'
echo ''
"$PYTHON_BIN" MDL/score.py "$unweighted_graph" "$modelTop10"

echo ''
echo 'Explanation of the above output:'
echo 'L(G,M): Number of bits to describe the data given a model M.'
echo 'L(M): Number of bits to describe only the model.'
echo 'L(E): Number of bits to describe only the error.'
echo ': M_0 is the zero-model where the graph is encoded as noise (no structure is assumed).'
echo ': M_x is the model of the graph as represented by the top-10 structures.'
echo ''
cat DATA/encoding_top10.out
echo ''
echo ''

echo -e "\e[31m========= Greedy selection of structures =========\e[0m"
echo 'Computing the encoding cost...'
echo ''
# Output of the greedy search is deliberately discarded; its results are the
# heuristic* files that are moved into DATA/ right after.
"$PYTHON_BIN" MDL/greedySearch_nStop.py "$unweighted_graph" "$model" >/dev/null 2>&1
mv heuristic* DATA/
echo '>> Outputs saved in DATA/. To interpret the structures that are selected, check the file MDL/readme.txt.'
echo ": DATA/heuristicSelection_nStop_ALL_$modelFile has the lines of the $model structures included in the summary."
echo ": DATA/heuristic_Selection_costs_ALL_$modelFile has the encoding cost of the considered model at each time step."
echo ''
echo ''
function [ MDLcost, idxMaxDeg, satellitesIdx ] = mdlCostAsStar( Asmall, curind, N_tot )
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Encode given graph as star                                             %
%  Find the highest degree node and set it as the hub. The rest nodes     %
%  will be encoded as spokes.                                             %
%  INPUT                                                                  %
%   Asmall: adjacency matrix of the subgraph                              %
%   curind: not used by this function (kept for the callers' signature)   %
%   N_tot : forwarded unchanged to compute_encodingCost                   %
%  OUTPUT                                                                 %
%   MDLcost      : the cost of encoding Asmall as a star                  %
%   idxMaxDeg    : index (into Asmall) of the hub                         %
%   satellitesIdx: indices (into Asmall) of all the other nodes           %
%  For fewer than 3 nodes no star is built: MDLcost is Inf and the two    %
%  index outputs are empty.                                               %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

n = size(Asmall, 2);

if n < 3
    % Every output must be assigned before returning; otherwise MATLAB
    % raises "output argument not assigned" as soon as a caller captures
    % a result.
    % NOTE(review): Inf assumes callers pick the cheapest encoding, so a
    % too-small subgraph is simply never chosen as a star -- confirm.
    MDLcost = Inf;
    idxMaxDeg = [];
    satellitesIdx = [];
    return;
end

deg = full(sum(Asmall));
[ ~, idxMaxDeg ] = max(deg);

% Every node except the hub. The two ranges are empty when the hub is the
% first / last node, so no special cases are needed.
satellitesIdx = [ 1:(idxMaxDeg-1), (idxMaxDeg+1):n ];

hubDeg = nnz(Asmall(idxMaxDeg,:));

E = zeros(1, 2);
% 1s in the error matrix
% missing edges in star + extra edges not in star
E(1) = 2*(n - 1 - hubDeg) + nnz(Asmall(satellitesIdx, satellitesIdx));
% 0s in the error matrix
% NOTE(review): counted over all n^2 cells, diagonal included -- confirm
% that this is what compute_encodingCost expects.
E(2) = n^2 - E(1);

if E(1) < 0 || E(2) < 0
    % Diagnostic only; the cost is still computed below.
    warning('mdlCostAsStar:negativeErrorCount', ...
        'Negative error count: E = [%d %d], n = %d, hub degree = %d.', ...
        E(1), E(2), n, hubDeg);
end

%% MDL cost of encoding given substructure as a star
MDLcost = compute_encodingCost( 'st', N_tot, n, E);

end
### Encoding the Error

def _possibleEdges(G) :
    """Possible number of edges in an undirected, non-self-connected graph of N nodes."""
    # N*N - N is always even, so floor division is exact; unlike `/` it also
    # stays an integer on Python 3, which the binomial code in LU needs.
    return (G.numNodes * G.numNodes - G.numNodes) // 2


def LErrorNaiveBinom(G, M, E) :
    """Encode all errors together, uniformly by a binomial (untyped).

    Uses LU over the possible edges minus E.numCellsExcluded, with
    E.numUnmodelledErrors + E.numModellingErrors ones. `M` is not used.
    """
    cost = LU(_possibleEdges(G) - E.numCellsExcluded,
              E.numUnmodelledErrors + E.numModellingErrors)
    # single-argument print(...) prints the same text on Python 2 and 3
    if config.optVerbosity > 1 : print(' - L_nb(E) %s' % (cost,))
    return cost


def LErrorNaivePrefix(G, M, E) :
    """Encode all errors together with the naive-uniform prefix code (untyped).

    Same cells and error count as LErrorNaiveBinom, but costed with LnU.
    `M` is not used.
    """
    cost = LnU(_possibleEdges(G) - E.numCellsExcluded,
               E.numModellingErrors + E.numUnmodelledErrors)
    if config.optVerbosity > 1 : print(' - L_np(E) %s' % (cost,))
    return cost


def LErrorTypedBinom(G, M, E) :
    """Encode modelling and unmodelled errors separately, each by a binomial.

    Returns the sum of the two LU costs. `M` is not used.
    """
    # First encode the modelling errors
    costM = LU(E.numCellsCovered - E.numCellsExcluded, E.numModellingErrors)
    if config.optVerbosity > 1 : print(' - L_tb(E+) %s' % (costM,))

    # Second encode the unmodelled errors (excluded cells are always covered!)
    costU = LU(_possibleEdges(G) - E.numCellsCovered, E.numUnmodelledErrors)
    if config.optVerbosity > 1 : print(' - L_tb(E-) %s' % (costU,))
    return costM + costU


def LErrorTypedPrefix(G, M, E) :
    """Encode modelling and unmodelled errors separately with LnU.

    Returns the sum of the two LnU costs. `M` is not used.
    """
    costM = LnU(E.numCellsCovered - E.numCellsExcluded, E.numModellingErrors)
    if config.optVerbosity > 1 : print(' - L_tp(E+) %s' % (costM,))
    costU = LnU(_possibleEdges(G) - E.numCellsCovered, E.numUnmodelledErrors)
    if config.optVerbosity > 1 : print(' - L_tp(E-) %s' % (costU,))
    return costM + costU
Later, when we introduce structure-Typed-error encoding this may matter. 34 | 35 | The different structure types (and hence encodings) are: 36 | 37 | 38 | Full clique: 39 | fc [node ids] 40 | e.g. 41 | fc 1 2 3 4 for a full-clique over nodes 1 to 4. 42 | This encoding is great for full cliques, and near-cliques with high connectivity 43 | E+={ (1,2), (1,3), (1,4), (2,3), (2,4), (3,4) } 44 | 45 | Near clique: 46 | nc <# of edges>, [node ids] 47 | e.g. 48 | nc 5, 1 2 3 4 for a near-clique over nodes 1 to 4, with 5 edges among them. 49 | This encoding gives the exact connections, without making any errors. 50 | ! in certain cases the full-clique encoding may be more efficient: 51 | depending on the error encoding, encoding some superfluous edges can 52 | be cheaper. Formal analysis needed. 53 | E= exactly what is in the data, using (locally optimal) prefix codes 54 | 55 | Chain 56 | ch [node ids] 57 | e.g. 58 | ch 4 2 1 3 for a chain from node 4 to 2 to 1 to 3 59 | E+={ (4,2), (2,1), (1,3) } 60 | 61 | Star 62 | st <hub node id>, [node ids] 63 | e.g. 64 | st 1, 2 3 4 for a star with node 1 as hub, and spokes to nodes 2, 3 and 4 65 | E+={ (1,2), (1,3), (1,4) } 66 | - equivalent to a BiPartiteCore whose set A has size 1 67 | 68 | 69 | BiPartiteCore 70 | bc [node id set A], [node id set B] 71 | e.g. 72 | bc 1 2 3, 4 5 for a fully connected bi-partite graph between node sets 1,2,3 and 4,5 73 | ! also means there are no edges between nodes 1,2,3, nor between nodes 4,5 74 | E+={ (1,4), (1,5), (2,4), (2,5), (3,4), (3,5) } 75 | 76 | NearBiPartiteCore 77 | nb [node id set A], [node id set B] 78 | e.g. 79 | nb 1 2 3, 4 5 for a possibly not fully connected bi-partite graph between node sets 1,2,3 and 4,5 80 | ! 
implies there are no edges between nodes 1,2,3, nor between nodes 4,5 --- errors are pushed into error matrix 81 | E= edges within sets A and B, no errors between A and B 82 | 83 | 84 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/mdlCostAsChain.m: --------------------------------------------------------------------------------
function [ MDLcost, chainExt ] = mdlCostAsChain( Asmall, N_tot )
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Encode given graph as chain
% Start from a node with degree 1 (p_rand; if there is none, a node of
% minimum degree) and find its furthest node using BFS. Then, starting
% from the found node (p_init), redo BFS with path reporting and end-point
% extension switched on, and take the extended chain that BFS returns.
%
% INPUT
%   Asmall  : adjacency matrix of the subgraph to encode (used as n x n,
%             with n = size(Asmall, 2))
%   N_tot   : passed through unchanged to compute_encodingCost
% OUTPUT
%   MDLcost : the cost of encoding Asmall as a chain, as returned by
%             compute_encodingCost; Inf when Asmall has fewer than 3 nodes
%   chainExt: node indices (into Asmall) of the extended chain; [] when
%             Asmall has fewer than 3 nodes
% DESCRIPTION OF SOME VARS:
%   E = [ #ones, #zeros ] of the error matrix M xor Asmall, where M is the
%       adjacency matrix of the noise-free chain chainExt
% Author: Danai Koutra
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

n = size(Asmall, 2);

% Assign both outputs before the early return: a caller that asks for them
% (e.g. [ c, ch ] = mdlCostAsChain(...)) would otherwise get MATLAB's
% "output argument not assigned" error for graphs with fewer than 3 nodes.
% Inf marks "cannot be encoded as a chain" and never wins a min() over costs.
MDLcost = Inf;
chainExt = [];

if n < 3
    return;
end

deg = full(sum(Asmall));
deg1_nodes = find(deg==1);
if isempty( deg1_nodes )
    % no end point candidates: fall back to the nodes of minimum degree
    deg1_nodes = find(deg==min(deg));
end
p_rand = deg1_nodes(1);  % pick as p_rand the first candidate

% First sweep: furthest node from p_rand. Second sweep: (extended) path
% starting at that node. Only the extended chain is needed here.
p_init = BFS( Asmall, p_rand, false );
[ ~, ~, ~, chainExt ] = BFS( Asmall, p_init, true, true );

%% Counting the entries of the error matrix without building it
% 1s in the error matrix:
%   chain edges that are missing in Asmall + edges of Asmall that are not
%   on the chain (both counted twice, once per direction, since nnz(Asmall)
%   counts every stored entry)
missing = 0;
existing = 0;
for i = 1 : length(chainExt)-1
    if Asmall( chainExt(i), chainExt(i+1) ) == 0
        missing = missing+1;
    else
        existing = existing+1;
    end
end
E = zeros(1,2);
E(1) = 2* missing + (nnz(Asmall) - 2*existing );
% 0s in the error matrix
E(2) = n^2 - E(1);

fprintf('E(1)=%d, E(2)=%d\n', E(1), E(2));

%% MDL cost of encoding given substructure as a chain
MDLcost = compute_encodingCost( 'ch', N_tot, n, E);

end
-------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/mdlCostAsBCorNB.m: -------------------------------------------------------------------------------- 1 | function [ MDLcostBC, MDLcostNB, set1, set2 ] = mdlCostAsBCorNB( Asmall, N_tot ) 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | %% Encode given graph as bipartite core % 4 | % max cut problem --> 
NP hard %
% Heuristic: we use FaBP with heterophily and we initialize %
% two nodes that are connected by an edge in opposite classes %
% (in the code below: the highest-degree node is seeded as positive and %
% all of its neighbors as negative) %
% Returns the cost of the exact bipartite-core encoding (MDLcostBC), the %
% cost of the near-bipartite-core encoding (MDLcostNB) and the two node %
% sets as logical masks over the nodes of Asmall (set1, set2). %
% Author: Danai Koutra %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Constants and variables of FaBP
% heterophily factor
h = -0.01;
% prior belief for belonging in the pos/neg class
positive = 0.01;
negative = -0.01;

% coefficients of the linear system solved below, both derived from h
a = 4*h^2/(1-4*h^2);
c = 2*h/(1-4*h^2);

%% setting up the matrices and vectors involved in FaBP
n = size(Asmall, 2);
deg = full(sum(Asmall));
%D = diag(deg);
% sparse diagonal degree matrix
D = spdiags(deg', 0, n, n);
matI = speye(n);

%% Initialization: pick high degree node, and initialize as positive.
% Set all its neighbors in the opposite class.
phi = zeros(n,1);
[ ~, idx ] = max(deg);
neighbors = find(Asmall(idx,:));
phi(idx) = positive;
phi(neighbors) = negative;

%% FaBP: main equation
% final beliefs b solve (I + a*D - c*A) * b = phi
b = ( matI + a * D - c * Asmall ) \ phi;

%% Find the members of the two sets
% nodes whose belief is exactly 0 end up in neither set
set1 = b > 0;
set2 = b < 0;

%% Creating the adjacency matrix for the bc model (w/o noise).
% According to this model, all the nodes in set1 are connected to all the
% nodes in set2.
% (reference implementation with an explicit error matrix, kept for
%  documentation; the counts below give the same numbers without it)
% % M(n,n) = 0;
% % M( set1, set2 ) = 1;
% % % M should be symmetric
% % M( set2, set1 ) = 1;
% % % Error matrix
% % %E = xor(M,Asmall);
% % E1 = xor(M,Asmall);
% %
% % Einc1 = nnz(E1)
% % Eexc1 = sum(E1(:)==0)

% 1s in the error matrix
% missing edges in bc (counted in both directions) + extra edges within sets
E(1) = 2* (sum(set1)*sum(set2)-nnz(Asmall(set1,set2))) + nnz(Asmall(set1, set1)) + nnz(Asmall(set2, set2));
% 0s in the error matrix
E(2) = n^2 - E(1);

%% MDL cost of encoding given substructure as a bipartite core
MDLcostBC = compute_encodingCost( 'bc', N_tot, sum(set1), E, sum(set2) );

%% Creating the adjacency matrix for the nb model (w/o noise).
% % % According to this model, some nodes in set1 are connected to some of the
% % % nodes in set2.
% % M(n,n) = 0;
% % B(n,n) = 0;
% % B(set1, set2) = Asmall(set1, set2);
% % M = B + B';
% % % Error matrix
% % E1 = xor(M,Asmall);
% %
% % Einc1 = nnz(E1)
% % Eexc1 = sum(E1(:)==0)

% 1s in the error matrix
% extra edges within sets (edges between the sets are described exactly by
% the nb model, so they cause no error)
E(1) = nnz(Asmall(set1, set1)) + nnz(Asmall(set2, set2));
% 0s in the error matrix
E(2) = n^2 - E(1);

%% MDL cost of encoding given substructure as a near-bipartite core
% the extra last argument is [ #existing, #missing ] edges between the sets
MDLcostNB = compute_encodingCost( 'nb', N_tot, sum(set1), E, sum(set2), [nnz(Asmall(set1,set2)), sum(set1)*sum(set2)-nnz(Asmall(set1,set2))] );

% % if nargin == 4 && ~isempty(set1) && ~isempty(set2)
% % fprintf(out_fid, 'bc');
% % fprintf(out_fid, ' %d', top_gccind( curind(set1) ));
% % fprintf(out_fid, ',');
% % fprintf(out_fid, ' %d', top_gccind( curind(set2) ) );
% % fprintf(out_fid, ' --- nearBC \n');
% % end

end
-------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/BFS.m: -------------------------------------------------------------------------------- 
function [ furthest_node, chain, extra_nodes, ...
    chainExt, extra_nodesExt ] = BFS( Asmall, start, path, extend )
%% Given a graph and a node, find the node that is furthest away from it
% (the node that breadth-first search reaches last). Also report the path
% to it if the variable 'path' is set to true.
% DISCLAIMER: This will not give the longest chain in the graph, but
% the shortest path between the furthest apart nodes.
% Finding the longest path in a graph is NP-complete in
% graphs with cycles. It is polynomial for DAGs.
%
% INPUT
%   Asmall : adjacency matrix; nonzero Asmall(u,v) means v is a neighbor of u
%   start  : index of the node to start the search from
%   path   : (optional, default false) also reconstruct the path
%   extend : (optional, default false) additionally try to lengthen the
%            path at both of its end points; only used when path is true
% OUTPUT
%   furthest_node  : last node reached by the search
%   chain          : nodes on the path start -> furthest_node ([] if ~path)
%   extra_nodes    : nodes of Asmall that are not on chain ([] if ~path)
%   chainExt       : chain after the extension heuristic ([] if ~extend)
%   extra_nodesExt : extra_nodes that are not on chainExt ([] if ~extend)

% Trailing flags are optional, so BFS(A, s) and BFS(A, s, true) work too.
if nargin < 3
    path = false;
end
if nargin < 4
    extend = false;
end

n = size(Asmall,2);
extra_nodes = [];
extra_nodesExt = [];
chain = [];
chainExt = [];

% parent(v) = 0 (if unvisited) or the id of v's parent (if visited)
parent = zeros(1,n);
parent(start) = start;  % the start node is its own parent

% visitOrder(head:tail) is the queue; nothing is ever removed from the
% array, so its last filled entry is the node that was reached last.
visitOrder = zeros(1,n);
visitOrder(1) = start;
head = 1;
tail = 1;
while head <= tail
    current = visitOrder(head);
    neighbors = find( Asmall(current,:) );
    fresh = neighbors( parent(neighbors) == 0 );  % unvisited neighbors
    parent(fresh) = current;
    visitOrder(tail+1 : tail+numel(fresh)) = fresh;
    tail = tail + numel(fresh);
    head = head + 1;
end
furthest_node = visitOrder(tail);

if path
    % walk the parent pointers back from the furthest node to the start
    curr = furthest_node;
    while curr ~= start
        chain = [ curr, chain ];
        curr = parent(curr);
    end
    chain = [ start, chain ];

    onChain = false(1,n);
    onChain(chain) = true;
    extra_nodes = find(~onChain);

    % heuristic: check for the extra nodes if they are neighboring with one
    % of the end points - then we can update our chain and make it longer.
    % Do BFS in the graph from which all chain nodes except one end point
    % have been disconnected. Repeat for the other end point. If a chain is
    % returned, then make the previously found path longer.
    if extend

        % chain except from its last node
        chain_head = chain(1:length(chain)-1);
        % chain except from its first node
        chain_tail = chain(2:length(chain));

        % extend chain from start point (if possible)
        indSub_start = Asmall;
        indSub_start(chain_tail, :) = 0;
        indSub_start(:, chain_tail) = 0;
        [ ~, chain1 ] = BFS( indSub_start, chain(1), true, false );

        % extend chain from end point (if possible)
        indSub_end = Asmall;
        indSub_end(chain_head, :) = 0;
        indSub_end(:, chain_head) = 0;
        [ ~, chain2 ] = BFS( indSub_end, chain(end), true, false );

        % checking if the chains have been extended to the same nodes.
        % This happened when I tried to encode a clique as a chain.
        firstShared = find( ismember(chain1(2:end), chain2), 1 );
        if isempty(firstShared)
            % merging locally extended chains
            chainExt = [ chain1(end:-1:2), chain, chain2(2:end) ];
        else
            % we include the nodes from chain1 up to the overlapped node
            % (excluding the overlapped node); firstShared indexes into
            % chain1(2:end), so the overlapped node is chain1(firstShared+1)
            chainExt = [ chain1(firstShared:-1:2), chain, chain2(2:end) ];
        end
        extra_nodesExt = extra_nodes( ~ismember(extra_nodes, chainExt) );

    end

end

end
-------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/EncodeSubgraph.m: -------------------------------------------------------------------------------- 1 | function [] = EncodeSubgraph( B, curind, top_gccind, N_tot, out_fid, info, minSize ) 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | %% Encode the connected component from SlashBurn: % 4 | % find whether it is clique, near-clique, star, chain or bipartite-core % 5 | % info: true (output the mdl benefit at the model file) / % 6 | % false (Jilles' format 
for model file) %
% minSize: smallest size of reported structures (number of nodes) %
% Author: Danai Koutra %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% adjacency matrix restricted to the nodes of this component
Asmall = B(curind,curind);

% n: number of nodes in the component
n = size(curind, 2);
% m: number of nonzero entries of Asmall
m = nnz(Asmall);

% If the structure has fewer than minSize nodes, do not report it in the
% model file
if n < minSize
    return;
end
%fprintf('n=%d, m=%d\n', n, m);


%% First try to find one of the synthetic structures (vocab words):
% clique, star, chain, bipartite core
exact_found = ExactStructure( Asmall, curind, top_gccind, N_tot, out_fid, info, minSize );

%% If it is not, try encoding it as near-structure (misspelled word)
% and compute the MDL cost of each encoding.
%%%%% TO DO: add some heuristics before we try to encode as chain for
%%%%% instance -- check the degree distribution.
% maxint = 2147483647
% placeholder costs; overwritten below when no exact structure was found
MDLcosts = ones(1,5) * 2147483647;
if ( exact_found == false )
    [ MDLcostFC, MDLcostNC ] = mdlCostAsfANDnClique( Asmall, N_tot );
    [ MDLcostST, hub, spokes ] = mdlCostAsStar( Asmall, curind, N_tot );
    [ MDLcostBC, MDLcostNB, set1, set2 ] = mdlCostAsBCorNB( Asmall, N_tot );
    % order matters: the position in MDLcosts selects the structure type
    % later on (1 = fc, 2 = nc, 3 = st, 4 = bc, 5 = nb, 6 = ch)
    MDLcosts = [ MDLcostFC, MDLcostNC, MDLcostST, MDLcostBC, MDLcostNB];

    % the chain encoding is only tried for sparse components; otherwise
    % MDLcosts keeps 5 entries and position 6 cannot be selected
    if m < 1.5*n
        [ MDLcostCH, chain ] = mdlCostAsChain( Asmall, N_tot );
        MDLcosts = [ MDLcosts, MDLcostCH ];
    end

    %% Find which structure best describes the given submatrix: i.e., find the
    % structure that has the minimum MDL cost. Then output to the model file
    % this structure and its encoding gain in bits (mdlcostNC -
    % mdlCostStructure).
49 | [ ~, idxMin ] = min(MDLcosts); 50 | 51 | cost_notEnc = compute_encodingCost( 'err', 0, 0, [nnz(Asmall) n^2-nnz(Asmall)]); 52 | 53 | if isinf(MDLcosts(idxMin)) || isinf(MDLcosts(2)) 54 | costGain_notEnc = cost_notEnc - MDLcostNC; 55 | encodeAsNClique( curind, top_gccind, m, 0, costGain_notEnc, out_fid, info ); 56 | %fprintf(out_fid, ' nan\n'); 57 | else 58 | switch idxMin 59 | case 1 60 | costGain = MDLcostNC - MDLcostFC; 61 | costGain_notEnc = cost_notEnc - MDLcostFC; 62 | encodeAsFClique( curind, top_gccind, costGain, costGain_notEnc, out_fid, info ); 63 | case 2 64 | costGain = MDLcostNC - MDLcostNC; 65 | costGain_notEnc = cost_notEnc - MDLcostNC; 66 | m = nnz(Asmall); 67 | encodeAsNClique( curind, top_gccind, m, costGain, costGain_notEnc, out_fid, info ); 68 | case 3 69 | costGain = MDLcostNC - MDLcostST; 70 | costGain_notEnc = cost_notEnc - MDLcostST; 71 | encodeAsStar( curind, top_gccind, hub, spokes, costGain, costGain_notEnc, out_fid, info ); 72 | case 4 73 | costGain = MDLcostNC - MDLcostBC; 74 | costGain_notEnc = cost_notEnc - MDLcostBC; 75 | encodeAsBC( curind, top_gccind, set1, set2, costGain, costGain_notEnc, out_fid, info ); 76 | case 5 77 | costGain = MDLcostNC - MDLcostNB; 78 | costGain_notEnc = cost_notEnc - MDLcostNB; 79 | encodeAsNB( curind, top_gccind, set1, set2, costGain, costGain_notEnc, out_fid, info ); 80 | case 6 81 | costGain = MDLcostNC - MDLcostCH; 82 | costGain_notEnc = cost_notEnc - MDLcostCH; 83 | encodeAsChain( curind, top_gccind, chain, costGain, costGain_notEnc, out_fid, info ); 84 | otherwise 85 | error_message = 'error: impossible to get this error...\n' 86 | end 87 | end 88 | 89 | end 90 | 91 | end 92 | 93 | -------------------------------------------------------------------------------- /STRUCTURE_DISCOVERY/structureSelectionGreedyNforget.m: --------------------------------------------------------------------------------
function [ cost_noModel historyCosts historyCostsInc] = structureSelectionGreedyNforget(A, graphFile, model_ordered, cost_ALLencoded_struct, outfile)
%% Select the substructures to output to the user ("greedy and forget").
% The substructures are visited in the given order (ranked by their MDL
% benefit). One substructure at a time is appended to the model file and
% the MDL cost of encoding the whole graph is recomputed by the external
% scorer. A substructure that makes the cost go up is removed from the
% model file again ("forgotten"); the scan then continues with the next one.
%
% Inputs:
%   A: adjacency matrix of the whole graph (only used when graphFile is
%      'none')
%   graphFile: edge file of the input graph (csv file, without weights)
%      if 'none', create the edge file from matrix A
%   model_ordered: the substructures ordered in decreasing mdl benefit
%   cost_ALLencoded_struct: cost of the model that contains all structures.
%      Kept for interface compatibility; every structure in model_ordered
%      is tried exactly once regardless of this value.
%   outfile: the model file that receives the selected substructures
% Outputs:
%   cost_noModel: MDL cost of the empty model
%   historyCosts: cost after trying each structure (entry 1 = empty model)
%   historyCostsInc: cost after each structure that was kept
%      (entry 1 = empty model)
% Side effects: writes outfile, pythonOutput.txt, a scratch file 'tmp' and
% three .mat files named after the graph file into the current directory.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

cost = zeros(1,2);

if strcmp(graphFile, 'none')
    %% Creation of graph from adjacency matrix (may skip if already have the
    %% file)
    [~, fname, ~] = fileparts(outfile);
    graphFile = sprintf('%s.graph', fname);
    [i, j] = find( A );
    graph_fid = fopen( graphFile, 'w' );
    % fprintf consumes its data arguments one after the other, column by
    % column, so the pairs have to be interleaved into a 2 x nnz matrix to
    % get one "i,j" edge per row.
    fprintf( graph_fid, '%d,%d\n', [ i(:), j(:) ]' );
    fclose( graph_fid );
end

fid = fopen(outfile, 'w');

comm = sprintf('python2.6 ../mdl/score.py %s > pythonOutput.txt;', graphFile );
system( comm );

pythonOutput = importdata('pythonOutput.txt');
% Initial cost: the MDL cost of the Empty Model (total of the M_0 row).
cost(1) = str2num(pythonOutput.textdata{2,2});
cost_noModel = cost(1);
cost(2) = cost(1);

cnt = 0;
historyCosts(cnt+1) = cost(1);
historyCostsInc(cnt+1) = cost(1);

% number of structures currently kept in the model file
cnt_structsInc = 0;

[~,graphname,~] = fileparts(graphFile);

% Try every structure once. (The loop used to keep running past the last
% structure while the cost was still above cost_ALLencoded_struct, which
% indexed model_ordered out of bounds.)
numStructs = length(model_ordered);
while cnt < numStructs
    cnt = cnt + 1;
    if mod(cnt,10) == 0
        disp(cnt);
    end
    cost(1) = cost(2);
    printStructureToModelFile(model_ordered(cnt), fid);
    comm = sprintf('time -p python2.6 ../mdl/score.py %s %s > pythonOutput.txt;', ...
        graphFile, outfile );
    system(comm);
    pythonOutput = importdata('pythonOutput.txt');
    % total cost of the M_x row
    cost(2) = str2num(pythonOutput.textdata{3,2});
    historyCosts(cnt+1) = cost(2);
    if cost(2) > cost(1)
        % removing the structure that caused the increase in the encoding
        % cost: keep only the first cnt_structsInc lines of the model file.
        % The file is closed while the shell rewrites it and reopened for
        % appending afterwards.
        fclose(fid);
        comm = sprintf('head -n %d %s > tmp; cp tmp %s', cnt_structsInc, outfile, outfile);
        system(comm);
        fid = fopen(outfile, 'a');
        % update the cost to its last value
        cost(2) = historyCostsInc(cnt_structsInc+1);
    else
        cnt_structsInc = cnt_structsInc + 1;
        historyCostsInc(cnt_structsInc+1) = cost(2);
    end
    % checkpoint the results after every structure
    cnm=sprintf('%s_cost_nomodel_gnf', graphname);
    save(cnm, 'cost_noModel')
    cnm=sprintf('%s_all_costs_gnf', graphname);
    save(cnm, 'historyCosts')
    cnm=sprintf('%s_all_costs_incStruct_gnf', graphname);
    save(cnm, 'historyCostsInc')

end

fclose(fid);

end
-------------------------------------------------------------------------------- /MDL/score.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python2.6 2 | 3 | import sys 4 | import os 5 | import config 6 | 7 | from time import time 8 | 9 | #from mdl import * 10 | from error import Error; 11 | from graph import Graph; 12 | from model import *; 13 | from mdl import *; 14 | 15 | if len(sys.argv) <= 1 : 16 | print 'at least: [model.model] [-pC] [-lC] [-pE] [-lE] [-e{NP,NB,TP,TB}]'; 17 | print ' optional argument model = file to read model from, otherwise only empty model'; 18 | print ' optional argument -vX = verbosity (1, 2, or 3)'; 19 | print ' optional argument -pG = plot Graph adjacency matrix'; 20 | print ' optional argument -pC = plot Cover matrix'; 21 | print ' optional argument -pE = plot Error matrix'; 22 | print ' optional argument -lC = list Cover entries'; 23 | print ' optional argument -lE = list Error entries'; 24 | print ' optional argument -eXX = encode error resp. 
untyped using prefix (NP), or'; 25 | print ' binomial (NB) codes, or using typed'; 26 | print ' prefix (TP) or binomial (TB, default) codes'; 27 | exit(); 28 | 29 | if (len(sys.argv) > 1 and ("-v1" in sys.argv)) : 30 | config.optVerbosity = 1; 31 | elif (len(sys.argv) > 1 and ("-v2" in sys.argv)) : 32 | config.optVerbosity = 2; 33 | if (len(sys.argv) > 1 and ("-v3" in sys.argv)) : 34 | config.optVerbosity = 3; 35 | 36 | t0 = time() 37 | 38 | gFilename = sys.argv[1]; 39 | g = Graph(); 40 | g.load(gFilename); 41 | 42 | 43 | if config.optVerbosity > 1 : print "- graph loaded." 44 | 45 | m = Model(); 46 | 47 | errorEnc = config.optDefaultError; 48 | if (len(sys.argv) > 1 and ("-eNP" in sys.argv or "-NP'" in sys.argv)) : 49 | errorEnc = "NP"; 50 | elif (len(sys.argv) > 1 and ("-eNB" in sys.argv or "-NB" in sys.argv)) : 51 | errorEnc = "NB"; 52 | elif (len(sys.argv) > 1 and ("-eTP" in sys.argv or "-TP" in sys.argv)) : 53 | errorEnc = "TP"; 54 | elif (len(sys.argv) > 1 and ("-eTB" in sys.argv or "-TB" in sys.argv)) : 55 | errorEnc = "TB"; 56 | 57 | if config.optVerbosity > 1 : print "- calculating L(M_0,G)" 58 | (l_total_0, l_model_0, l_error_0, E_0) = L(g,m, errorEnc); 59 | if config.optVerbosity > 1 : print "- calculated L(M_0,G)" 60 | print " \t" + "L(G,M)" + "\tL(M)" + "\tL(E)" + "\t#E+" + "\t#E-" + "\t\t#Ex"; 61 | print "M_0:\t" + '%.0f' % l_total_0 + "\t" + '%.0f' % l_model_0 + "\t" + '%.0f' % l_error_0 + "\t" + str(E_0.numModellingErrors) + '/' + str(E_0.numCellsCovered) + '\t' + str(E_0.numUnmodelledErrors) + '/' + str(((E_0.numNodes * E_0.numNodes)-E_0.numNodes)/2 - E_0.numCellsCovered) + '\t' + str(E_0.numCellsExcluded); 62 | 63 | if len(sys.argv) > 2 and sys.argv[2][0] != '-' : 64 | mFilename = sys.argv[2]; 65 | m.load(mFilename); 66 | if config.optVerbosity > 1 : print "- M_x loaded." 
67 | (l_total_x, l_model_x, l_error_x, E_x) = L(g,m, errorEnc); 68 | #print "M_x:\t", l_total_x, "\t" + str(l_model_x), "\t" + str(l_error_x), "\t" + str(E_x.numModellingErrors), "\t" + str(E_x.numUnmodelledErrors); 69 | print "M_x:\t" + '%.0f' % l_total_x + "\t" + '%.0f' % l_model_x + "\t" + '%.0f' % l_error_x + "\t" + str(E_x.numModellingErrors) + '/' + str(E_x.numCellsCovered-E_x.numCellsExcluded) + '\t' + str(E_x.numUnmodelledErrors) + '/' + str(((E_x.numNodes * E_x.numNodes)-E_x.numNodes)/2 - E_x.numCellsCovered) + '\t\t' + str(E_x.numCellsExcluded); 70 | #return l_total_x; 71 | 72 | #print " -= ", l_total_0 - l_total_x, "\t" + str(l_model_0 - l_model_x), "\t" + str(l_error_0 - l_error_x), "\t" + str(E_0.numModellingErrors - E_x.numModellingErrors), "\t" + str(E_0.numUnmodelledErrors - E_x.numUnmodelledErrors); 73 | #print " %= ", "%.2f\t\t%.2f\t%.2f\t\t%.2f" % ((l_total_x / l_total_0 * 100), (l_model_x / l_model_0 * 100), (l_error_x / l_error_0 * 100), (E_x.numModellingErrors / E_0.numModellingErrors * 100)); 74 | 75 | if (len(sys.argv) > 3 and "-pG" in sys.argv) : 76 | print "Adjacency matrix:"; 77 | g.plot(); 78 | 79 | if (len(sys.argv) > 3 and "-pC" in sys.argv) : 80 | print "Cover matrix:"; 81 | E_x.plotCover(); 82 | 83 | if (len(sys.argv) > 3 and "-pE" in sys.argv) : 84 | print "Error matrix:"; 85 | E_x.plotError(); 86 | 87 | if (len(sys.argv) > 3 and "-lC" in sys.argv) : 88 | print "Cover list:"; 89 | E_x.listCover(); 90 | 91 | if (len(sys.argv) > 3 and "-lE" in sys.argv) : 92 | print "Error list:"; 93 | E_x.listError(); 94 | 95 | t1 = time() 96 | #print 'function vers1 takes %f' %(t1-t0) 97 | -------------------------------------------------------------------------------- /MDL/greedyScan.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python2.6 2 | 3 | import sys 4 | import os 5 | import config 6 | 7 | from time import time 8 | 9 | #from mdl import * 10 | from error import Error; 11 | from graph 
import Graph; 12 | from model import *; 13 | from mdl import *; 14 | 15 | if len(sys.argv) <= 1 : 16 | print 'at least: [model.model] [-pC] [-lC] [-pE] [-lE] [-e{NP,NB,TP,TB}]'; 17 | print ' optional argument model = file to read model from, otherwise only empty model'; 18 | print ' optional argument -vX = verbosity (1, 2, or 3)'; 19 | print ' optional argument -pG = plot Graph adjacency matrix'; 20 | print ' optional argument -pC = plot Cover matrix'; 21 | print ' optional argument -pE = plot Error matrix'; 22 | print ' optional argument -lC = list Cover entries'; 23 | print ' optional argument -lE = list Error entries'; 24 | print ' optional argument -eXX = encode error resp. untyped using prefix (NP), or'; 25 | print ' binomial (NB) codes, or using typed'; 26 | print ' prefix (TP) or binomial (TB, default) codes'; 27 | exit(); 28 | 29 | if (len(sys.argv) > 1 and ("-v1" in sys.argv)) : 30 | config.optVerbosity = 1; 31 | elif (len(sys.argv) > 1 and ("-v2" in sys.argv)) : 32 | config.optVerbosity = 2; 33 | if (len(sys.argv) > 1 and ("-v3" in sys.argv)) : 34 | config.optVerbosity = 3; 35 | 36 | t0 = time() 37 | 38 | gFilename = sys.argv[1]; 39 | g = Graph(); 40 | g.load(gFilename); 41 | 42 | 43 | if config.optVerbosity > 1 : print "- graph loaded." 
44 | 45 | m = Model(); 46 | 47 | errorEnc = config.optDefaultError; 48 | if (len(sys.argv) > 1 and ("-eNP" in sys.argv or "-NP'" in sys.argv)) : 49 | errorEnc = "NP"; 50 | elif (len(sys.argv) > 1 and ("-eNB" in sys.argv or "-NB" in sys.argv)) : 51 | errorEnc = "NB"; 52 | elif (len(sys.argv) > 1 and ("-eTP" in sys.argv or "-TP" in sys.argv)) : 53 | errorEnc = "TP"; 54 | elif (len(sys.argv) > 1 and ("-eTB" in sys.argv or "-TB" in sys.argv)) : 55 | errorEnc = "TB"; 56 | 57 | if config.optVerbosity > 1 : print "- calculating L(M_0,G)" 58 | (l_total_0, l_model_0, l_error_0, E_0) = L(g,m, errorEnc); 59 | if config.optVerbosity > 1 : print "- calculated L(M_0,G)" 60 | print " \t" + "L(G,M)" + "\tL(M)" + "\tL(E)" + "\t#E+" + "\t#E-" + "\t\t#Ex"; 61 | print "M_0:\t" + '%.0f' % l_total_0 + "\t" + '%.0f' % l_model_0 + "\t" + '%.0f' % l_error_0 + "\t" + str(E_0.numModellingErrors) + '/' + str(E_0.numCellsCovered) + '\t' + str(E_0.numUnmodelledErrors) + '/' + str(((E_0.numNodes * E_0.numNodes)-E_0.numNodes) - E_0.numCellsCovered) + '\t' + str(E_0.numCellsExcluded); 62 | 63 | 64 | if len(sys.argv) > 2 and sys.argv[2][0] != '-' : 65 | mFilename = sys.argv[2]; 66 | m.load(mFilename); 67 | if config.optVerbosity > 1 : print "- M_x loaded." 
68 | (l_total_x, l_model_x, l_error_x, E_x) = L(g,m, errorEnc); 69 | print "M_x:\t" + '%.0f' % l_total_x + "\t" + '%.0f' % l_model_x + "\t" + '%.0f' % l_error_x + "\t" + str(E_x.numModellingErrors) + '/' + str(E_x.numCellsCovered) + '\t' + str(E_x.numUnmodelledErrors) + '/' + str(((E_x.numNodes * E_x.numNodes)-E_x.numNodes) - E_x.numCellsCovered) + '\t\t' + str(E_x.numCellsExcluded); 70 | 71 | l_total_all = l_total_x; 72 | lines = []; 73 | lines_all = []; 74 | l_total_prev = l_total_0; 75 | times = 1; 76 | maxStructs = m.numStructs; 77 | 78 | mFilename_list = mFilename.split('/'); 79 | mFilename_main = mFilename_list[len(mFilename_list) - 1]; 80 | print '%s' % mFilename_main 81 | #mFilenameGreedy = 'greedySelection_' + mFilename_main; 82 | #fgreedy = open(mFilenameGreedy,'w') 83 | #mFilenameGreedyCost = 'greedySelection_costs_' + mFilename_main; 84 | mFilenameTotalCost = 'greedyScan_totalCosts_' + mFilename_main; 85 | #fgreedyCost = open(mFilenameGreedyCost,'w') 86 | ftotalCost = open(mFilenameTotalCost,'w') 87 | 88 | #fgreedyCost.write("%.0f\n" % l_total_0 ) 89 | ftotalCost.write("%.0f\n" % l_total_0 ) 90 | 91 | while times <= maxStructs : 92 | print "time\t" + '%.0f' % times; 93 | lines.append(times); 94 | m = Model(); 95 | m.loadLines(mFilename, lines); 96 | (l_total_x, l_model_x, l_error_x, E_x) = L(g,m, errorEnc); 97 | print "M_x:\t" + '%.0f' % l_total_x + "\t" + '%.0f' % l_model_x + "\t" + '%.0f' % l_error_x + "\t" + str(E_x.numModellingErrors) + '/' + str(E_x.numCellsCovered) + '\t' + str(E_x.numUnmodelledErrors) + '/' + str(((E_x.numNodes * E_x.numNodes)-E_x.numNodes) - E_x.numCellsCovered) + '\t\t' + str(E_x.numCellsExcluded); 98 | ftotalCost.write("%.0f\n" % l_total_x ) 99 | times = times + 1; 100 | 101 | ftotalCost.close(); 102 | #return l_total_x; 103 | 104 | #print " -= ", l_total_0 - l_total_x, "\t" + str(l_model_0 - l_model_x), "\t" + str(l_error_0 - l_error_x), "\t" + str(E_0.numModellingErrors - E_x.numModellingErrors), "\t" + 
str(E_0.numUnmodelledErrors - E_x.numUnmodelledErrors); 105 | #print " %= ", "%.2f\t\t%.2f\t%.2f\t\t%.2f" % ((l_total_x / l_total_0 * 100), (l_model_x / l_model_0 * 100), (l_error_x / l_error_0 * 100), (E_x.numModellingErrors / E_0.numModellingErrors * 100)); 106 | 107 | if (len(sys.argv) > 3 and "-pG" in sys.argv) : 108 | print "Adjacency matrix:"; 109 | g.plot(); 110 | 111 | if (len(sys.argv) > 3 and "-pC" in sys.argv) : 112 | print "Cover matrix:"; 113 | E_x.plotCover(); 114 | 115 | if (len(sys.argv) > 3 and "-pE" in sys.argv) : 116 | print "Error matrix:"; 117 | E_x.plotError(); 118 | 119 | if (len(sys.argv) > 3 and "-lC" in sys.argv) : 120 | print "Cover list:"; 121 | E_x.listCover(); 122 | 123 | if (len(sys.argv) > 3 and "-lE" in sys.argv) : 124 | print "Error list:"; 125 | E_x.listError(); 126 | 127 | t1 = time() 128 | #print 'function vers1 takes %f' %(t1-t0) 129 | -------------------------------------------------------------------------------- /DATA/cliqueStarClique.out: -------------------------------------------------------------------------------- 1 | 1,2,1 2 | 1,3,1 3 | 1,4,1 4 | 1,5,1 5 | 1,6,1 6 | 1,7,1 7 | 1,8,1 8 | 1,9,1 9 | 1,10,1 10 | 1,11,1 11 | 1,12,1 12 | 1,13,1 13 | 1,14,1 14 | 1,15,1 15 | 1,16,1 16 | 1,17,1 17 | 1,18,1 18 | 1,19,1 19 | 1,20,1 20 | 2,3,1 21 | 2,4,1 22 | 2,5,1 23 | 2,6,1 24 | 2,7,1 25 | 2,8,1 26 | 2,9,1 27 | 2,10,1 28 | 2,11,1 29 | 2,12,1 30 | 2,13,1 31 | 2,14,1 32 | 2,15,1 33 | 2,16,1 34 | 2,17,1 35 | 2,18,1 36 | 2,19,1 37 | 2,20,1 38 | 3,4,1 39 | 3,5,1 40 | 3,6,1 41 | 3,7,1 42 | 3,8,1 43 | 3,9,1 44 | 3,10,1 45 | 3,11,1 46 | 3,12,1 47 | 3,13,1 48 | 3,14,1 49 | 3,15,1 50 | 3,16,1 51 | 3,17,1 52 | 3,18,1 53 | 3,19,1 54 | 3,20,1 55 | 4,5,1 56 | 4,6,1 57 | 4,7,1 58 | 4,8,1 59 | 4,9,1 60 | 4,10,1 61 | 4,11,1 62 | 4,12,1 63 | 4,13,1 64 | 4,14,1 65 | 4,15,1 66 | 4,16,1 67 | 4,17,1 68 | 4,18,1 69 | 4,19,1 70 | 4,20,1 71 | 5,6,1 72 | 5,7,1 73 | 5,8,1 74 | 5,9,1 75 | 5,10,1 76 | 5,11,1 77 | 5,12,1 78 | 5,13,1 79 | 5,14,1 80 | 5,15,1 81 
| 5,16,1 82 | 5,17,1 83 | 5,18,1 84 | 5,19,1 85 | 5,20,1 86 | 6,7,1 87 | 6,8,1 88 | 6,9,1 89 | 6,10,1 90 | 6,11,1 91 | 6,12,1 92 | 6,13,1 93 | 6,14,1 94 | 6,15,1 95 | 6,16,1 96 | 6,17,1 97 | 6,18,1 98 | 6,19,1 99 | 6,20,1 100 | 7,8,1 101 | 7,9,1 102 | 7,10,1 103 | 7,11,1 104 | 7,12,1 105 | 7,13,1 106 | 7,14,1 107 | 7,15,1 108 | 7,16,1 109 | 7,17,1 110 | 7,18,1 111 | 7,19,1 112 | 7,20,1 113 | 8,9,1 114 | 8,10,1 115 | 8,11,1 116 | 8,12,1 117 | 8,13,1 118 | 8,14,1 119 | 8,15,1 120 | 8,16,1 121 | 8,17,1 122 | 8,18,1 123 | 8,19,1 124 | 8,20,1 125 | 9,10,1 126 | 9,11,1 127 | 9,12,1 128 | 9,13,1 129 | 9,14,1 130 | 9,15,1 131 | 9,16,1 132 | 9,17,1 133 | 9,18,1 134 | 9,19,1 135 | 9,20,1 136 | 10,11,1 137 | 10,12,1 138 | 10,13,1 139 | 10,14,1 140 | 10,15,1 141 | 10,16,1 142 | 10,17,1 143 | 10,18,1 144 | 10,19,1 145 | 10,20,1 146 | 11,12,1 147 | 11,13,1 148 | 11,14,1 149 | 11,15,1 150 | 11,16,1 151 | 11,17,1 152 | 11,18,1 153 | 11,19,1 154 | 11,20,1 155 | 12,13,1 156 | 12,14,1 157 | 12,15,1 158 | 12,16,1 159 | 12,17,1 160 | 12,18,1 161 | 12,19,1 162 | 12,20,1 163 | 13,14,1 164 | 13,15,1 165 | 13,16,1 166 | 13,17,1 167 | 13,18,1 168 | 13,19,1 169 | 13,20,1 170 | 14,15,1 171 | 14,16,1 172 | 14,17,1 173 | 14,18,1 174 | 14,19,1 175 | 14,20,1 176 | 15,16,1 177 | 15,17,1 178 | 15,18,1 179 | 15,19,1 180 | 15,20,1 181 | 16,17,1 182 | 16,18,1 183 | 16,19,1 184 | 16,20,1 185 | 17,18,1 186 | 17,19,1 187 | 17,20,1 188 | 18,19,1 189 | 18,20,1 190 | 19,20,1 191 | 21,20,1 192 | 21,19,1 193 | 21,18,1 194 | 21,22,1 195 | 21,23,1 196 | 21,24,1 197 | 21,25,1 198 | 21,26,1 199 | 21,27,1 200 | 21,28,1 201 | 21,29,1 202 | 27,28,1 203 | 27,29,1 204 | 27,30,1 205 | 27,31,1 206 | 27,32,1 207 | 27,33,1 208 | 27,34,1 209 | 27,35,1 210 | 27,36,1 211 | 27,37,1 212 | 27,38,1 213 | 27,39,1 214 | 27,40,1 215 | 27,41,1 216 | 27,42,1 217 | 27,43,1 218 | 27,44,1 219 | 27,45,1 220 | 27,46,1 221 | 27,47,1 222 | 27,48,1 223 | 27,49,1 224 | 27,50,1 225 | 27,51,1 226 | 28,29,1 227 | 28,30,1 228 | 28,31,1 229 | 
28,32,1 230 | 28,33,1 231 | 28,34,1 232 | 28,35,1 233 | 28,36,1 234 | 28,37,1 235 | 28,38,1 236 | 28,39,1 237 | 28,40,1 238 | 28,41,1 239 | 28,42,1 240 | 28,43,1 241 | 28,44,1 242 | 28,45,1 243 | 28,46,1 244 | 28,47,1 245 | 28,48,1 246 | 28,49,1 247 | 28,50,1 248 | 28,51,1 249 | 29,30,1 250 | 29,31,1 251 | 29,32,1 252 | 29,33,1 253 | 29,34,1 254 | 29,35,1 255 | 29,36,1 256 | 29,37,1 257 | 29,38,1 258 | 29,39,1 259 | 29,40,1 260 | 29,41,1 261 | 29,42,1 262 | 29,43,1 263 | 29,44,1 264 | 29,45,1 265 | 29,46,1 266 | 29,47,1 267 | 29,48,1 268 | 29,49,1 269 | 29,50,1 270 | 29,51,1 271 | 30,31,1 272 | 30,32,1 273 | 30,33,1 274 | 30,34,1 275 | 30,35,1 276 | 30,36,1 277 | 30,37,1 278 | 30,38,1 279 | 30,39,1 280 | 30,40,1 281 | 30,41,1 282 | 30,42,1 283 | 30,43,1 284 | 30,44,1 285 | 30,45,1 286 | 30,46,1 287 | 30,47,1 288 | 30,48,1 289 | 30,49,1 290 | 30,50,1 291 | 30,51,1 292 | 31,32,1 293 | 31,33,1 294 | 31,34,1 295 | 31,35,1 296 | 31,36,1 297 | 31,37,1 298 | 31,38,1 299 | 31,39,1 300 | 31,40,1 301 | 31,41,1 302 | 31,42,1 303 | 31,43,1 304 | 31,44,1 305 | 31,45,1 306 | 31,46,1 307 | 31,47,1 308 | 31,48,1 309 | 31,49,1 310 | 31,50,1 311 | 31,51,1 312 | 32,33,1 313 | 32,34,1 314 | 32,35,1 315 | 32,36,1 316 | 32,37,1 317 | 32,38,1 318 | 32,39,1 319 | 32,40,1 320 | 32,41,1 321 | 32,42,1 322 | 32,43,1 323 | 32,44,1 324 | 32,45,1 325 | 32,46,1 326 | 32,47,1 327 | 32,48,1 328 | 32,49,1 329 | 32,50,1 330 | 32,51,1 331 | 33,34,1 332 | 33,35,1 333 | 33,36,1 334 | 33,37,1 335 | 33,38,1 336 | 33,39,1 337 | 33,40,1 338 | 33,41,1 339 | 33,42,1 340 | 33,43,1 341 | 33,44,1 342 | 33,45,1 343 | 33,46,1 344 | 33,47,1 345 | 33,48,1 346 | 33,49,1 347 | 33,50,1 348 | 33,51,1 349 | 34,35,1 350 | 34,36,1 351 | 34,37,1 352 | 34,38,1 353 | 34,39,1 354 | 34,40,1 355 | 34,41,1 356 | 34,42,1 357 | 34,43,1 358 | 34,44,1 359 | 34,45,1 360 | 34,46,1 361 | 34,47,1 362 | 34,48,1 363 | 34,49,1 364 | 34,50,1 365 | 34,51,1 366 | 35,36,1 367 | 35,37,1 368 | 35,38,1 369 | 35,39,1 370 | 35,40,1 371 | 35,41,1 372 
| 35,42,1 373 | 35,43,1 374 | 35,44,1 375 | 35,45,1 376 | 35,46,1 377 | 35,47,1 378 | 35,48,1 379 | 35,49,1 380 | 35,50,1 381 | 35,51,1 382 | 36,37,1 383 | 36,38,1 384 | 36,39,1 385 | 36,40,1 386 | 36,41,1 387 | 36,42,1 388 | 36,43,1 389 | 36,44,1 390 | 36,45,1 391 | 36,46,1 392 | 36,47,1 393 | 36,48,1 394 | 36,49,1 395 | 36,50,1 396 | 36,51,1 397 | 37,38,1 398 | 37,39,1 399 | 37,40,1 400 | 37,41,1 401 | 37,42,1 402 | 37,43,1 403 | 37,44,1 404 | 37,45,1 405 | 37,46,1 406 | 37,47,1 407 | 37,48,1 408 | 37,49,1 409 | 37,50,1 410 | 37,51,1 411 | 38,39,1 412 | 38,40,1 413 | 38,41,1 414 | 38,42,1 415 | 38,43,1 416 | 38,44,1 417 | 38,45,1 418 | 38,46,1 419 | 38,47,1 420 | 38,48,1 421 | 38,49,1 422 | 38,50,1 423 | 38,51,1 424 | 39,40,1 425 | 39,41,1 426 | 39,42,1 427 | 39,43,1 428 | 39,44,1 429 | 39,45,1 430 | 39,46,1 431 | 39,47,1 432 | 39,48,1 433 | 39,49,1 434 | 39,50,1 435 | 39,51,1 436 | 40,41,1 437 | 40,42,1 438 | 40,43,1 439 | 40,44,1 440 | 40,45,1 441 | 40,46,1 442 | 40,47,1 443 | 40,48,1 444 | 40,49,1 445 | 40,50,1 446 | 40,51,1 447 | 41,42,1 448 | 41,43,1 449 | 41,44,1 450 | 41,45,1 451 | 41,46,1 452 | 41,47,1 453 | 41,48,1 454 | 41,49,1 455 | 41,50,1 456 | 41,51,1 457 | 42,43,1 458 | 42,44,1 459 | 42,45,1 460 | 42,46,1 461 | 42,47,1 462 | 42,48,1 463 | 42,49,1 464 | 42,50,1 465 | 42,51,1 466 | 43,44,1 467 | 43,45,1 468 | 43,46,1 469 | 43,47,1 470 | 43,48,1 471 | 43,49,1 472 | 43,50,1 473 | 43,51,1 474 | 44,45,1 475 | 44,46,1 476 | 44,47,1 477 | 44,48,1 478 | 44,49,1 479 | 44,50,1 480 | 44,51,1 481 | 45,46,1 482 | 45,47,1 483 | 45,48,1 484 | 45,49,1 485 | 45,50,1 486 | 45,51,1 487 | 46,47,1 488 | 46,48,1 489 | 46,49,1 490 | 46,50,1 491 | 46,51,1 492 | 47,48,1 493 | 47,49,1 494 | 47,50,1 495 | 47,51,1 496 | 48,49,1 497 | 48,50,1 498 | 48,51,1 499 | 49,50,1 500 | 49,51,1 501 | 50,51,1 502 | -------------------------------------------------------------------------------- /MDL/error.py: -------------------------------------------------------------------------------- 
class Error:
    """Bookkeeping of how a model M describes (and mis-describes) a graph G.

    Only the upper triangle of the adjacency matrix is tracked.  A 1-based
    node pair (i, j) is stored as the 0-based column ``max(i, j) - 1`` inside
    the set at row index ``min(i, j) - 1``; every per-cell container below is
    a list of such sets.

    Constructor arguments:
      graph -- object providing ``numNodes``, ``numEdges`` and ``edges``
               (a list of iterables of 0-based column indices per row)
      err   -- optional Error instance; when given, its state is copied and
               ``graph`` is not read
    """

    numNodes = 0
    # stays None until saveOld() (or a caller) stores a snapshot
    numNodesOld = None

    # 1s present in G but not in M
    numUnmodelledErrors = 0
    unmodelled = []
    numUnmodelledErrorsOld = 0
    unmodelledOld = []

    # incorrect cell values in M wrt G
    numModellingErrors = 0
    modelled = []
    numModellingErrorsOld = 0
    modelledOld = []

    # number of unique cells in M
    numCellsCovered = 0
    covered = []
    numCellsCoveredOld = 0
    coveredOld = []

    # number of cells directly encoded by M, no error possible
    numCellsExcluded = 0
    excluded = []
    numCellsExcludedOld = 0
    excludedOld = []

    def __init__(self, graph, err=None):
        if err is None:
            # initially every edge of G is an unmodelled error, nothing is covered
            self.numNodes = graph.numNodes

            self.unmodelled = [set(x) for x in graph.edges]
            self.numUnmodelledErrors = graph.numEdges

            self.modelled = [set() for _ in range(len(graph.edges))]
            self.numModellingErrors = 0

            self.covered = [set() for _ in range(self.numNodes)]
            self.numCellsCovered = 0

            self.excluded = [set() for _ in range(self.numNodes)]
            self.numCellsExcluded = 0
        else:
            # copy constructor: fresh sets so the two instances do not alias
            self.numNodes = err.numNodes

            self.unmodelled = [set(x) for x in err.unmodelled]
            self.numUnmodelledErrors = err.numUnmodelledErrors

            self.modelled = [set(x) for x in err.modelled]
            self.numModellingErrors = err.numModellingErrors

            self.covered = [set(x) for x in err.covered]
            self.numCellsCovered = err.numCellsCovered

            self.excluded = [set(x) for x in err.excluded]
            self.numCellsExcluded = err.numCellsExcluded

        # per-instance snapshot slots; the class-level lists would otherwise
        # be shared by every instance
        self.numNodesOld = None
        self.unmodelledOld = []
        self.numUnmodelledErrorsOld = 0
        self.modelledOld = []
        self.numModellingErrorsOld = 0
        self.coveredOld = []
        self.numCellsCoveredOld = 0
        self.excludedOld = []
        self.numCellsExcludedOld = 0

    @staticmethod
    def _cell(i, j):
        """Map a 1-based node pair to its (row, column) in the upper triangle."""
        return min(i, j) - 1, max(i, j) - 1

    def saveOld(self):
        """Store a copy of the current state so recoverOld() can restore it."""
        self.numNodesOld = self.numNodes

        self.unmodelledOld = [set(x) for x in self.unmodelled]
        self.numUnmodelledErrorsOld = self.numUnmodelledErrors

        self.modelledOld = [set(x) for x in self.modelled]
        self.numModellingErrorsOld = self.numModellingErrors

        self.coveredOld = [set(x) for x in self.covered]
        self.numCellsCoveredOld = self.numCellsCovered

        self.excludedOld = [set(x) for x in self.excluded]
        self.numCellsExcludedOld = self.numCellsExcluded

    def recoverOld(self):
        """Restore the state stored in the ``*Old`` fields.

        Raises AttributeError when no snapshot is available (previously this
        always failed on the undefined ``numNodesOld`` attribute).
        """
        if self.numNodesOld is None:
            raise AttributeError(
                "recoverOld() called without a snapshot; call saveOld() first")
        self.numNodes = self.numNodesOld

        # copy, so later edits do not corrupt the stored snapshot
        self.unmodelled = [set(x) for x in self.unmodelledOld]
        self.numUnmodelledErrors = self.numUnmodelledErrorsOld

        self.modelled = [set(x) for x in self.modelledOld]
        self.numModellingErrors = self.numModellingErrorsOld

        self.covered = [set(x) for x in self.coveredOld]
        self.numCellsCovered = self.numCellsCoveredOld

        self.excluded = [set(x) for x in self.excludedOld]
        self.numCellsExcluded = self.numCellsExcludedOld

    # checks whether edge (i,j) is covered
    def isModelled(self, i, j):
        row, col = self._cell(i, j)
        return col in self.covered[row]

    def isCovered(self, i, j):
        """Alias of isModelled()."""
        return self.isModelled(i, j)

    # annotates edge (i,j) as covered
    # ! (i,j) does not have to be in E of G(V,E)
    def cover(self, i, j):
        # the counter is incremented unconditionally, as before; callers are
        # expected to check isCovered() first
        row, col = self._cell(i, j)
        self.covered[row].add(col)
        self.numCellsCovered += 1

    # annotates edge (i,j) as both covered, and error-free
    # ! (i,j) does not have to be in E of G(V,E)
    def coverAndExclude(self, i, j):
        self.cover(i, j)
        self.exclude(i, j)

    def exclude(self, i, j):
        """Mark cell (i,j) as excluded and bump the excluded-cell counter."""
        row, col = self._cell(i, j)
        self.excluded[row].add(col)
        self.numCellsExcluded += 1

    def isError(self, i, j):
        """True when (i,j) is recorded as an unmodelled or a modelling error."""
        row, col = self._cell(i, j)
        return col in self.unmodelled[row] or col in self.modelled[row]

    def isExcluded(self, i, j):
        row, col = self._cell(i, j)
        return col in self.excluded[row]

    def isUnmodelledError(self, i, j):
        row, col = self._cell(i, j)
        return col in self.unmodelled[row]

    def isUnmodelledEdge(self, i, j):
        """Alias of isUnmodelledError()."""
        return self.isUnmodelledError(i, j)

    def isModellingError(self, i, j):
        row, col = self._cell(i, j)
        return col in self.modelled[row]

    # annotates edge (i,j) as correct
    def delError(self, i, j):
        # raises KeyError (from set.remove) when (i,j) is no error at all
        if self.isUnmodelledError(i, j):
            self.delUnmodelledError(i, j)
        else:
            self.delModellingError(i, j)

    # annotates edge (i,j) as not-modelled
    def addUnmodelledError(self, i, j):
        row, col = self._cell(i, j)
        self.unmodelled[row].add(col)
        self.numUnmodelledErrors += 1

    # annotates edge (i,j) as correctly modelled
    def delUnmodelledError(self, i, j):
        row, col = self._cell(i, j)
        self.unmodelled[row].remove(col)
        self.numUnmodelledErrors -= 1

    # annotates edge (i,j) as erroneously modelled
    def addModellingError(self, i, j):
        row, col = self._cell(i, j)
        self.modelled[row].add(col)
        self.numModellingErrors += 1

    # removes the modelling-error annotation of edge (i,j)
    def delModellingError(self, i, j):
        row, col = self._cell(i, j)
        self.modelled[row].remove(col)
        self.numModellingErrors -= 1

    def plotCover(self):
        """Print the upper triangle of the cover matrix ('1' covered, '-' not)."""
        size = len(self.covered)
        for idx in range(size):
            row = self.covered[idx]
            cells = ["1" if idy in row else "-" for idy in range(idx + 1, size)]
            print("." * (idx + 1) + "".join(cells))

    def plotError(self):
        """Print the upper triangle of the error matrix.

        Covered cells: '*' excluded, '+' modelling error, '-' otherwise.
        Uncovered cells: '1' unmodelled edge, '0' otherwise.
        """
        size = len(self.unmodelled)  # 'unmodelled' only supplies the row count
        for idx in range(size):
            cells = []
            for idy in range(idx + 1, size):
                if idy in self.covered[idx]:
                    if idy in self.excluded[idx]:
                        cells.append("*")
                    elif idy in self.modelled[idx]:
                        cells.append("+")
                    else:
                        cells.append("-")
                elif idy in self.unmodelled[idx]:
                    cells.append("1")
                else:
                    cells.append("0")
            print("." * (idx + 1) + "".join(cells))

    def plotExcluded(self):
        """Print the upper triangle of the excluded matrix ('1' excluded, '0' not)."""
        size = len(self.excluded)
        for idx in range(size):
            row = self.excluded[idx]
            cells = ["1" if idy in row else "0" for idy in range(idx + 1, size)]
            print("." * (idx + 1) + "".join(cells))

    def listCover(self):
        print(self.covered)

    def listError(self):
        """Print, on one line, the 1-based error lists of every listed row.

        The entries were previously written with a trailing-comma print, which
        left the line unterminated; the same text is now emitted as a single
        newline-terminated line (nothing is printed when there is no entry).
        """
        # NOTE(review): rows that only have modelling errors are skipped,
        # as in the original condition -- confirm this is intended.
        entries = []
        for idx in range(len(self.unmodelled)):
            if len(self.unmodelled[idx]) > 0:
                entries.append("%s +: %s -: %s" % (
                    idx + 1,
                    str([x + 1 for x in self.unmodelled[idx]]),
                    str([x + 1 for x in self.modelled[idx]])))
        if entries:
            print(" ".join(entries))

    def listExcluded(self):
        print(self.excluded)
function [ MDLcost ] = compute_encodingCost( subgraph, N_tot, n_sub, E, n_sub2, nb_edges, ca)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Computation of the local encoding cost of a given substructure:       %
%  INPUTS:                                                                %
%   subgraph: 'fc', 'nc', 'st', 'ch', 'bc', 'nb', 'err'                   %
%   N_tot: total number of nodes in the whole graph                       %
%   n_sub: number of nodes in the given substructure OR number of nodes   %
%          in the first set of a 'bc' / 'nb' (k)                          %
%   E : error description. For 'nc' it is the adjacency matrix of the     %
%       substructure; for all other codes it is a 2-vector                %
%       [# included edges, # excluded edges]                              %
%   n_sub2: optional - number of nodes in the second set of a 'bc' (l)    %
%   nb_edges: optional - 2-vector [included, excluded] for the edges      %
%             *between* the two sets of the near-bipartite core           %
%   ca: optional, default false - cross-association flag                  %
%  ---------------                                                        %
%  OUTPUTS:                                                               %
%   MDLcost = model_cost + error cost                                     %
%  ERRORS:                                                                %
%   raised for an unknown structure code and for an 'err' request whose   %
%   two counts are both zero (both cases used to fail later on an         %
%   undefined MDLcost)                                                    %
%  Author: Danai Koutra                                                   %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if nargin < 7
    ca = false; %#ok<NASGU> kept for interface compatibility, see notes below
end

test_error_edges(E);

switch subgraph
    case 'fc'
        MDLcost = LN( n_sub ) + l2cnk(N_tot, n_sub);
        % the error term is only encoded when both counts are non-zero
        if E(1) ~= 0 && E(2) ~= 0
            MDLcost = MDLcost + Lnu_opt( E );
        end

    case 'nc'
        % for the near clique: E is the Asmall matrix
        edges_inc = nnz(E);
        % The original computed this identically for ca == true and
        % ca == false, so the flag has no effect here.
        edges_exc = size(E,1)*size(E,2) - edges_inc;
        if edges_exc ~= 0 && edges_inc ~= 0
            MDLcost = LN( n_sub ) + l2cnk(N_tot, n_sub) + ...
                log2( n_sub^2 ) + edges_inc * NLL( edges_inc, edges_exc, 1) + ...
                edges_exc * NLL( edges_inc, edges_exc, 0);
        else
            % use l2cnk like every other case instead of
            % log2(nchoosek(vpi(...))), which needs an external toolbox
            MDLcost = LN( n_sub ) + l2cnk(N_tot, n_sub);
        end

    case 'st'
        MDLcost = LN( n_sub-1 ) + log2( N_tot ) + l2cnk(N_tot-1, n_sub-1);
        if E(1) ~= 0 && E(2) ~= 0
            MDLcost = MDLcost + Lnu_opt( E );
        end

    case 'ch'
        % the nodes of the chain are identified one by one, in order
        x = 0:(n_sub-1);
        MDLcost = LN( n_sub-1 ) + sum( log2(N_tot - x) );
        if E(1) ~= 0 && E(2) ~= 0
            MDLcost = MDLcost + Lnu_opt( E );
        end

    case 'bc'
        k = n_sub;
        l = n_sub2;
        MDLcost = LN(k) + LN(l) + l2cnk(N_tot, k) + l2cnk(N_tot-k, l);
        if E(1) ~= 0 && E(2) ~= 0
            MDLcost = MDLcost + Lnu_opt( E );
        end

    case 'nb'
        k = n_sub;
        l = n_sub2;
        MDLcost = LN(k) + LN(l) + l2cnk(N_tot, k) + l2cnk(N_tot-k, l);
        % NOTE(review): the original condition also contained
        % strcmp('ca', true), which is always false. If the intent was to
        % skip the error cost when ca is set, add "|| isequal(ca, true)"
        % to the test below -- behaviour is left unchanged here.
        if E(1) ~= 0 && E(2) ~= 0
            % The error matrix E has only the edges *within* the two sets
            % of nodes.
            edges_inc = nb_edges(1);
            edges_exc = nb_edges(2);
            if edges_inc == 0 || edges_exc == 0
                MDLcost = MDLcost + Lnu_opt( E );
            else
                MDLcost = MDLcost + ...
                    log2( (n_sub+n_sub2)^2 ) + ...
                    edges_inc * NLL( edges_inc, edges_exc, 1) + ...
                    edges_exc * NLL( edges_inc, edges_exc, 0) + Lnu_opt( E );
            end
        end

    case 'err'
        if E(1) ~= 0 && E(2) ~= 0
            MDLcost = Lnu_opt( E );
        elseif E(1) ~= 0
            MDLcost = LN( E(1) );
        elseif E(2) ~= 0
            MDLcost = LN( E(2) );
        else
            error('compute_encodingCost:emptyError', ...
                'Cannot encode an error description whose two counts are both zero.');
        end

    otherwise
        error('compute_encodingCost:invalidStructure', ...
            'Invalid structure code ''%s''.', subgraph);
end

testMDLcost(MDLcost);

%% encoded size of an integer >=1 as by Rissanen's 1983 Universal code for integers
    function [ c ] = LN( n )
        c0 = 2.865064;
        c = log2(c0);
        logTerm = log2(n);
        while logTerm > 0
            c = c + logTerm;
            logTerm = log2(logTerm);
        end
    end

%% error per structure: Naive Uniform (E is a full error matrix)
    function [ c_err ] = Lnu( E ) %#ok<DEFNU>
        Einc = nnz(E);
        Eexc = sum(E(:)==0);
        c_err = LN( Einc ) + ...
            Einc * NLL( Einc, Eexc, 1) + ...
            Eexc * NLL( Einc, Eexc, 0);
    end

%% error per structure: Naive Uniform on precomputed counts
    function [ c_err ] = Lnu_opt( E )
        % E has two entries: # of included edges, # of excluded edges
        Einc = E(1);
        Eexc = E(2);
        c_err = LN( Einc ) + ...
            Einc * NLL( Einc, Eexc, 1) + ...
            Eexc * NLL( Einc, Eexc, 0);
    end

%% Alternative error per structure: Naive Data-to-Model (full error matrix)
    function [ c_err ] = Lnd( E ) %#ok<DEFNU>
        Einc = nnz(E);
        c_err = LN( Einc ) + l2cnk(N_tot^2, Einc);
    end

%% Alternative error per structure: Naive Data-to-Model on precomputed counts
    function [ c_err ] = Lnd_opt( E ) %#ok<DEFNU>
        Einc = E(1);
        c_err = LN( Einc ) + l2cnk(N_tot^2, Einc);
    end

%% Negative log-likelihood
% If sub = 0: p0 = -log2(excl / (incl + excl))
% if sub = 1: p1 = -log2(incl / (incl + excl))
    function [ l ] = NLL( incl, excl, sub )
        if sub == 0
            l = -log2(excl / (incl + excl));
        elseif sub == 1
            l = -log2(incl / (incl + excl));
        else
            error('compute_encodingCost:invalidNLLSelector', ...
                'Can only compute l0 or l1 (negative log-likelihood).');
        end
    end

end
import config;
import mdl_base;
import mdl_structs;
import mdl_error;
from copy import deepcopy;

from math import log,factorial;
from error import Error;
from graph import Graph;
from model import Model;

from mdl_base import *;
from mdl_structs import *;
from mdl_error import *;


### Our Encoding Starts Here ###

# Model attributes holding the number of structures of each type.
_STRUCT_TYPE_COUNTS = (
    "numFullCliques",
    "numNearCliques",
    "numChains",
    "numStars",
    # off-diagonals
    "numFullOffDiagonals",
    "numNearOffDiagonals",
    # bipartite-cores
    "numBiPartiteCores",
    "numNearBiPartiteCores",
    "numJellyFishes",
    "numCorePeripheries",
)


def _modelHeaderCost(M):
    """Return the bits for the structure counts and the type identifiers of M."""
    cost = LN(M.numStructs + 1)                   # encode number of structures we're encoding with
    cost += LwC(M.numStructs, M.numStrucTypes)    # encode the number per structure

    # encode the structure-type identifier per type: each of the n_t
    # structures of a type costs -log2(n_t / |M|) bits.  The term used to be
    # added with the opposite sign, which *lowered* the model cost.
    numStructs = float(M.numStructs)
    for attr in _STRUCT_TYPE_COUNTS:
        count = getattr(M, attr)
        if count > 0:
            cost -= count * log(count / numStructs, 2)
    return cost


def _structureCost(struc, M, G, E):
    """Return the bits for one structure; the encoders also update E.

    Returns 0 when the structure matches none of the known types.
    """
    if struc.isFullClique():
        return LfullClique(struc, M, G, E)
    if struc.isNearClique():
        return LnearClique(struc, M, G, E)
    if struc.isChain():
        return Lchain(struc, M, G, E)
    if struc.isStar():
        return Lstar(struc, M, G, E)
    if struc.isCorePeriphery():
        return LcorePeriphery(struc, M, G, E)
    if struc.isJellyFish():
        return LjellyFish(struc, M, G, E)
    if struc.isBiPartiteCore():
        return LbiPartiteCore(struc, M, G, E)
    if struc.isNearBiPartiteCore():
        return LnearBiPartiteCore(struc, M, G, E)
    if struc.isFullOffDiagonal():
        return LfullOffDiagonal(struc, M, G, E)
    if struc.isNearOffDiagonal():
        return LnearOffDiagonal(struc, M, G, E)
    return 0


def _errorCost(G, M, E, errorEnc):
    """Return the bits for the error E under the encoding named by errorEnc.

    errorEnc is one of "NP", "NB", "TP", "TB"; any other value only yields
    the two count terms.  Must be called after the structures were encoded,
    because those calls fill in E.
    """
    cost = 0
    # encode number of additive Errors
    if E.numCellsCovered != 0:
        cost += log(E.numCellsCovered, 2)
    # encode number of Errors (over the cells of the upper triangle left uncovered)
    numUncovered = (G.numNodes * G.numNodes - G.numNodes) // 2 - E.numCellsCovered
    if numUncovered > 0:
        cost += log(numUncovered, 2)

    if errorEnc == "NP":
        cost += LErrorNaivePrefix(G, M, E)
    elif errorEnc == "NB":
        cost += LErrorNaiveBinom(G, M, E)
    elif errorEnc == "TP":
        cost += LErrorTypedPrefix(G, M, E)
    elif errorEnc == "TB":
        cost += LErrorTypedBinom(G, M, E)
    return cost


### Total Encoded Size
def L(G, M, errorEnc):
    """Compute the total description length of graph G under model M.

    Returns (total_cost, model_cost, error_cost, E), where E is the Error
    object filled in while encoding the structures of M.
    """
    E = Error(G)    # initially, everything is error, nothing is covered

    model_cost = _modelHeaderCost(M)
    # encode the structures
    for struc in M.structs:
        model_cost += _structureCost(struc, M, G, E)

    # encode the error
    error_cost = _errorCost(G, M, E, errorEnc)

    total_cost = model_cost + error_cost
    return (total_cost, model_cost, error_cost, E)


### Total Encoded Size for the greedy heuristic -- incrementally update the MDL cost
## for the newly added structure 'struc'
def Lgreedy(G, M, errorEnc, time, struc, totalCostOld, Eold, model_cost_struct):
    """Update the description length after adding 'struc' to the model.

    time              -- 1 on the first call (start from a fresh Error),
                         anything else continues from a copy of Eold
    totalCostOld      -- unused; kept for interface compatibility
    Eold              -- Error state before 'struc' was added
    model_cost_struct -- accumulated cost of the structures encoded so far

    Returns (total_cost, model_cost_total, model_cost2, error_cost, E), where
    model_cost2 is the accumulated per-structure cost including 'struc'.
    """
    if time == 1:
        E = Error(G)    # initially, everything is error, nothing is covered
        # the cost for encoding each structure (to avoid recomputing it for the greedy updates)
        model_cost2 = 0
    else:
        E = Error(G, Eold)
        # just update the up-to-now cost by adding the cost of the new structure
        model_cost2 = model_cost_struct

    model_cost = _modelHeaderCost(M)

    # encode the new structure only
    model_cost2 += _structureCost(struc, M, G, E)

    # encode the error
    error_cost = _errorCost(G, M, E, errorEnc)

    total_cost = model_cost + model_cost2 + error_cost
    model_cost_total = model_cost + model_cost2

    return (total_cost, model_cost_total, model_cost2, error_cost, E)
% Author: Danai Koutra
% Adaptation and extension of U Kang's code for SlashBurn
% (http://www.cs.cmu.edu/~ukang/papers/sb_icdm2011.pdf)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                                           %
% SlashBurn Encode: encode graph using SlashBurn                            %
%                                                                           %
% Parameter                                                                 %
%   AOrig : adjacency matrix of a graph. We assume symmetric matrix with    %
%           both upper- and lower- diagonal elements are set.               %
%   k : # of nodes to cut in SlashBurn                                      %
%   outFolder : folder in which the model files are written                 %
%           (<name>_ALL.model, <name>_orderedALL.model,                     %
%            <name>_runtime.txt, <name> taken from graphFile)               %
%   info : true for detailed output (encoding gain reported)                %
%          false for brief output (no encoding gain reported)               %
%          NOTE: a true value is overridden to false, see below.            %
%   starOption: true for encoding the vicinities of top degree nodes as     %
%                    stars                                                  %
%               false for encoding these vicinities as stars, nc or fc      %
%                     (depending on the smallest mdl cost)                  %
%   minSize: minimum size of structure that we want to encode               %
%   graphFile: path to the edge file                                        %
%                                                                           %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

function [ ] = SlashBurnEncode(AOrig, k, outFolder, info, starOption, minSize, graphFile )

%% Definition of global variables:
% model: struct array of all structures found so far, model_idx: its length
global model;
global model_idx;
if isempty(model_idx)
    % an uninitialised global is [], and []+1 stays [], which would
    % silently drop every structure
    model_idx = 0;
end

% forwarded unchanged to RemHdegreeGccEncode (was named 'dir', which
% shadows the built-in function)
dirFlag = 0;

[~, fname, ~] = fileparts(graphFile);
allOutFile = sprintf('%s/%s_ALL.model', outFolder, fname);
outfile_ordered = sprintf('%s/%s_orderedALL.model', outFolder, fname);

% Open the model file for writing; it is closed when this function exits,
% including the early return below and any error raised by a helper.
out_fid = fopen(allOutFile, 'w');
if out_fid == -1
    error('SlashBurnEncode:cannotOpenFile', 'Cannot open ''%s'' for writing.', allOutFile);
end
closeOutFile = onCleanup(@() fclose(out_fid)); %#ok<NASGU>

% Initialize variables
niter = 0;
n = max(size(AOrig,1), size(AOrig,2));
AOrig(n,n) = 0;                 % pad to a square n-by-n matrix
totalind = zeros(1,n);
cur_lpos = 1;
cur_rpos = n;
gccind = 1:n;
cur_gccsize = n;
total_SB_stars = 0;             %#ok<NASGU> statistics, not reported
encoded_SB_stars = 0;           %#ok<NASGU> statistics, not reported

if info == true
    info = false;
    disp('Setting info to false, so that we can compute the encoding cost of all the found structures');
end

tic

while niter == 0 || cur_gccsize > k
    niter = niter + 1;
    fprintf('Iteration %d...\n', niter);

    A = AOrig(gccind, gccind);
    [disind, newgccind, topind] = RemHdegreeGccEncode(A, k, dirFlag, out_fid, gccind, n, info, minSize);

    % save 'star' structures: one candidate per removed top-degree node
    star_cores = topind;
    for i = 1:size(star_cores, 2)
        cur_center = star_cores(i);
        satellites = find(A(cur_center, :) > 0);

        % If the structure has less than minSize nodes, do not report it
        % in the model file
        if length(satellites) < minSize
            continue;
        end

        n_star = length(satellites) + 1;
        Asmall = A([cur_center, satellites], [cur_center, satellites]);
        MDLcostNC = compute_encodingCost( 'nc', n, n_star, Asmall);

        E = zeros(1,2);
        % 1s in the error matrix
        % missing edges in star + extra edges not in star
        E(1) = 2 * (n_star-1-nnz(A(cur_center,satellites))) + nnz(A(satellites, satellites));
        % 0s in the error matrix
        E(2) = n_star^2 - E(1);

        cost_notEnc = compute_encodingCost( 'err', 0, 0, [nnz(Asmall) n_star^2-nnz(Asmall)]);

        test_error_edges(E);

        % MDL cost of encoding given substructure as a star
        MDLcostST = compute_encodingCost( 'st', n, n_star, E);
        total_SB_stars = total_SB_stars + 1;

        if isinf(MDLcostNC) || isinf(MDLcostST)
            costGain = 0;
            costGain_notEnc = 0;
        else
            costGain = MDLcostNC - MDLcostST;
            costGain_notEnc = cost_notEnc - MDLcostST;
        end

        if starOption == true
            % encode the vicinities of high-deg nodes as stars
            appendStar( out_fid, gccind(cur_center), gccind(satellites), costGain, costGain_notEnc, info );
            encoded_SB_stars = encoded_SB_stars + 1;

        elseif starOption == false
            % check which of the structures is best for encoding: star, fc, nc
            % 0s in the error matrix --- edges included in the structure (full clique)
            E(2) = nnz(Asmall);
            % 1s in the error matrix --- edges excluded from the structure (full clique)
            E(1) = n_star^2 - n_star - E(2);

            % MDL cost of encoding given substructure as a full clique
            MDLcostFC = compute_encodingCost( 'fc', n, n_star, E);
            [~, minIdx] = min([ MDLcostFC, MDLcostNC, MDLcostST ]);
            top_gccind = sort([gccind(cur_center), gccind(satellites)]);
            curind = 1:size(top_gccind,2);
            switch minIdx
                case 1
                    costGain = MDLcostNC - MDLcostFC;
                    costGain_notEnc = cost_notEnc - MDLcostFC;
                    encodeAsFClique( curind, top_gccind, costGain, costGain_notEnc, out_fid, info );
                case 2
                    % gain is measured against the near-clique cost, so it is 0 here
                    costGain = MDLcostNC - MDLcostNC;
                    costGain_notEnc = cost_notEnc - MDLcostNC;
                    m = nnz(Asmall);
                    encodeAsNClique( curind, top_gccind, m, costGain, costGain_notEnc, out_fid, info );
                case 3
                    appendStar( out_fid, gccind(cur_center), gccind(satellites), costGain, costGain_notEnc, info );
                    encoded_SB_stars = encoded_SB_stars + 1;
            end

        else
            disp('starOption should be true or false. Invalid value given.');
            return
        end
    end

    % reorganize the matrix: removed hubs go to the front, disconnected
    % nodes to the back (bookkeeping only; totalind is not used further in
    % this function)
    topind_size = size(topind, 2);

    totalind(cur_lpos:cur_lpos + topind_size - 1) = gccind(topind);
    cur_lpos = cur_lpos + topind_size;
    totalind(cur_rpos - size(disind,2) + 1:cur_rpos) = gccind(disind);
    cur_rpos = cur_rpos - size(disind,2);

    % continue with the remaining giant connected component
    gccind = gccind(newgccind);
    cur_gccsize = size(gccind, 2);

end

if k > 1 && cur_gccsize >= 2
    EncodeSubgraph(AOrig(gccind,gccind), 1:size(gccind,2), gccind, n, out_fid, info, minSize);
end

%% Selection of structures: write all structures ordered by benefit_notEnc
[~, order] = sort([model(:).benefit_notEnc], 'descend');
model_ordered = model(order);
printModel(model_ordered, outfile_ordered);

runtime = toc    % left unsuppressed on purpose: echoes the runtime
time_stored = sprintf('%s/%s_runtime.txt', outFolder, fname);
save(time_stored, 'runtime', '-ascii');

disp('=== Graph decomposition and structure labeling: finished! ===')

end

%% Write one star to the model file and append it to the global model
function appendStar( out_fid, center, sats, costGain, costGain_notEnc, info )
global model;
global model_idx;

fprintf( out_fid, 'st %d,', center );
fprintf( out_fid, ' %d', sats );
if info == false
    fprintf( out_fid, '\n');
else
    fprintf( out_fid, ', %f | %f -- SB \n', costGain, costGain_notEnc);
end

model_idx = model_idx + 1;
model(model_idx) = struct('code', 'st', 'edges', 0, 'nodes1', center, 'nodes2', sats, 'benefit', costGain, 'benefit_notEnc', costGain_notEnc);
end
#!/usr/local/bin/python2.6

#########################################################################
#                                                                       #
# Implementation of the GreedyNForget heuristic described in the paper  #
# VOG: Summarizing and Understanding Large Graphs                       #
# by Danai Koutra, U Kang, Jilles Vreeken, Christos Faloutsos           #
# http://www.cs.cmu.edu/~dkoutra/papers/VoG.pdf                         #
#                                                                       #
# v1.1: faster implementation using dynamic programming and the         #
#       technique of memoization. Now, we can add to the summary many   #
#       more structures much faster (structLim = 10000+)                #
#                                                                       #
# v1: very slow, naive implementation, computing the MDL encoding cost  #
#     from scratch for each structure addition (structLim = 200)        #
#                                                                       #
#########################################################################
#
# Usage: <graph file> [model file] [options]
#
# The structures of the model file are tried one by one, in file order.
# A structure is kept only when adding it does not increase the total
# encoding cost L(G,M); otherwise it is removed from the model again.
#
# Output files (written to the current directory):
#   heuristicSelection_nStop_ALL_<model>   line numbers of the kept structures
#   heuristic_Selection_costs_ALL_<model>  total cost after every kept structure
#   greedySelection_<k>_<model>            snapshot of the kept line numbers
#                                          after 50, 100, 200, ... tried lines


import sys
import os
import config
import copy

from time import time

from mdl import *
from error import Error
from graph import Graph
from model import *
from random import shuffle
#from description_length import *;


def _format_cost_row(label, l_total, l_model, l_error, E, gap):
    """Return one row of the cost table printed by this script.

    label   -- row name, e.g. "M_0:" or "M_x:"
    l_total, l_model, l_error -- the three costs, printed without decimals
    E       -- error object; only its counters are read
    gap     -- separator printed in front of the number of excluded cells
    """
    uncoveredCells = ((E.numNodes * E.numNodes) - E.numNodes) - E.numCellsCovered
    return (label + "\t" + '%.0f' % l_total + "\t" + '%.0f' % l_model
            + "\t" + '%.0f' % l_error
            + "\t" + str(E.numModellingErrors) + '/' + str(E.numCellsCovered)
            + '\t' + str(E.numUnmodelledErrors) + '/' + str(uncoveredCells)
            + gap + str(E.numCellsExcluded))


if len(sys.argv) <= 1 :
    print('at least: [model.model] [-pC] [-lC] [-pE] [-lE] [-e{NP,NB,TP,TB}]')
    print(' optional argument model = file to read model from, otherwise only empty model')
    print(' optional argument -vX = verbosity (1, 2, or 3)')
    print(' optional argument -pG = plot Graph adjacency matrix')
    print(' optional argument -pC = plot Cover matrix')
    print(' optional argument -pE = plot Error matrix')
    print(' optional argument -lC = list Cover entries')
    print(' optional argument -lE = list Error entries')
    print(' optional argument -eXX = encode error resp. untyped using prefix (NP), or')
    print(' binomial (NB) codes, or using typed')
    print(' prefix (TP) or binomial (TB, default) codes')
    exit()

# -v3 is checked separately on purpose: it wins over -v1 / -v2.
if "-v1" in sys.argv :
    config.optVerbosity = 1
elif "-v2" in sys.argv :
    config.optVerbosity = 2
if "-v3" in sys.argv :
    config.optVerbosity = 3

t0 = time()

gFilename = sys.argv[1]
g = Graph()
g.load(gFilename)

if config.optVerbosity > 1 : print("- graph loaded.")

m = Model()

# error encoding; both the "-eXX" and the short "-XX" spelling are accepted
# (the short form of NP used to be misspelled "-NP'" and never matched)
errorEnc = config.optDefaultError
if "-eNP" in sys.argv or "-NP" in sys.argv :
    errorEnc = "NP"
elif "-eNB" in sys.argv or "-NB" in sys.argv :
    errorEnc = "NB"
elif "-eTP" in sys.argv or "-TP" in sys.argv :
    errorEnc = "TP"
elif "-eTB" in sys.argv or "-TB" in sys.argv :
    errorEnc = "TB"

# cost of the empty model
if config.optVerbosity > 1 : print("- calculating L(M_0,G)")
(l_total_0, l_model_0, l_error_0, E_0) = L(g, m, errorEnc)
if config.optVerbosity > 1 : print("- calculated L(M_0,G)")
print(" \t" + "L(G,M)" + "\tL(M)" + "\tL(E)" + "\t#E+" + "\t#E-" + "\t\t#Ex")
print(_format_cost_row("M_0:", l_total_0, l_model_0, l_error_0, E_0, '\t'))

# cost of the complete model, when a model file is given
mFilename = None
if len(sys.argv) > 2 and sys.argv[2][0] != '-' :
    mFilename = sys.argv[2]
    m.load(mFilename)
    print("Number of structures in the model: %.0f" % m.numStructs)
    if config.optVerbosity > 1 : print("- M_x loaded.")
    (l_total_x, l_model_x, l_error_x, E_x) = L(g, m, errorEnc)
    print(_format_cost_row("M_x:", l_total_x, l_model_x, l_error_x, E_x, '\t\t'))

# E_x is what the -pC/-pE/-lC/-lE options show; it starts as the error of
# the empty model so those options also work without a model file.
E_x = E_0

if mFilename is None :
    print("no model file given: nothing to select from.")
else :
    # reinitialize the model for the greedy approach
    m = Model()

    # maximum number of structures kept in the summary
    structLim = 10000
    # read the candidate structures; the handle is closed right away
    with open(mFilename, 'r') as mHandle :
        mContent = mHandle.readlines()
    maxStructs = len(mContent)

    l_total_prev = l_total_0
    # the encoding cost per structure is 0 initially
    lmodel_struct_prev = 0
    E_x_old = E_0
    structsInSummary = []
    times = 1

    mFilename_main = os.path.basename(mFilename)
    print('%s' % mFilename_main)
    mFilenameGreedy = 'heuristicSelection_nStop_ALL_' + mFilename_main
    mFilenameGreedyCost = 'heuristic_Selection_costs_ALL_' + mFilename_main

    # number of structures in the summary
    kept_struct = 0

    fgreedy = open(mFilenameGreedy, 'w')
    fgreedyCost = open(mFilenameGreedyCost, 'w')
    try :
        fgreedyCost.write("l_total_0: %.0f\n" % l_total_0)

        # try every line of the model file, or stop after structLim kept ones
        while times <= maxStructs :
            print("time\t" + '%.0f' % times)
            # add to the model the new structure
            newStruct = m.loadLine(mContent, times-1)

            if not isinstance(newStruct, Structure) :
                # blank / comment / unrecognized line: loadLine added
                # nothing, so there is nothing to evaluate or to remove
                print("skipped line %.0f of the model file (no structure on it)" % times)
            else :
                (l_total_x, l_model_x, l_model_struct, l_error_x, E_x) = Lgreedy(g, m, errorEnc, times, newStruct, l_total_prev, E_x_old, lmodel_struct_prev)
                print(_format_cost_row("M_x:", l_total_x, l_model_x, l_error_x, E_x, '\t\t'))

                if l_total_x > l_total_prev :
                    print("dropped the structure")
                    l_total_x = l_total_prev
                    # remove the last added structure
                    m.rmStructure(newStruct)
                    # NOTE(review): E_x still refers to the error object
                    # returned for the dropped structure - confirm this is
                    # what the plot/list options below should show.
                    print("-----------------------------------------------------------")
                else :
                    kept_struct += 1
                    # remember the error state and costs of the accepted model
                    E_x_old = E_x
                    l_total_prev = l_total_x
                    # update the up-to-now cost per structure
                    lmodel_struct_prev = l_model_struct
                    structsInSummary.append(times)
                    fgreedyCost.write("Time %.0f" % times + "\t%.0f\n" % l_total_x)
                    print("-----------------------------------------------------------")
                    if kept_struct == structLim :
                        break

            # periodic snapshot of the summary built so far
            if times == 50 or times % 100 == 0 :
                mFilenameGreedyTemp = 'greedySelection_' + str(times) + '_' + mFilename_main
                with open(mFilenameGreedyTemp, 'w') as fgreedyTemp :
                    fgreedyTemp.write("Structures of model in the summary (each number is the corresponding line number of the structure in the model file)\n")
                    for line in structsInSummary :
                        fgreedyTemp.write("%s\n" % line)
            times += 1

        print("structs in model %.0f " % m.numStructs)

        for line in structsInSummary :
            fgreedy.write("%s\n" % line)
    finally :
        fgreedy.close()
        fgreedyCost.close()


if (len(sys.argv) > 3 and "-pG" in sys.argv) :
    print("Adjacency matrix:")
    g.plot()

if (len(sys.argv) > 3 and "-pC" in sys.argv) :
    print("Cover matrix:")
    E_x.plotCover()

if (len(sys.argv) > 3 and "-pE" in sys.argv) :
    print("Error matrix:")
    E_x.plotError()

if (len(sys.argv) > 3 and "-lC" in sys.argv) :
    print("Cover list:")
    E_x.listCover()

if (len(sys.argv) > 3 and "-lE" in sys.argv) :
    print("Error list:")
    E_x.listError()

print(time()-t0)
print("Total running time %.2f" % (time()-t0))
function [ exact_found ] = ExactStructure( Asmall, curind, top_gccind, N_tot, out_fid, info, minSize )
% EXACTSTRUCTURE  Encode the connected component from SlashBurn when it is
% exactly (or, for bipartite graphs, approximately) one of the structure types.
%
%   Asmall      adjacency matrix of the component
%   curind      row vector of indices into top_gccind, one per node of Asmall
%   top_gccind  lookup from those indices to the node ids that are printed
%   N_tot       passed to compute_encodingCost as the size of the whole graph
%   out_fid     file identifier of the model file the structure is written to
%   info        false: write only the structure; otherwise also the two gains
%   minSize     bipartite structures with fewer nodes are not written
%
%   exact_found is true when the component was handled here, i.e. a structure
%   was written, or a bipartite component was skipped for being too small.
%
% Every written structure is also appended to the global struct array
% "model" (see recordStructure at the end of this file).

exact_found = false;
n = size(curind, 2);
m = nnz(Asmall);

if n == 1
    return;
end

% cost of encoding the structure as near-clique
MDLcost_nc = compute_encodingCost( 'nc', N_tot, n, Asmall);
% cost of not encoding the structure at all (noise)
cost_notEnc = compute_encodingCost( 'err', 0, 0, [m n^2-m]);

if ( m == n*n - n )                                   % full clique
    if n ~= 2
        code = 'fc';
    else
        % two connected nodes are written as a chain
        code = 'ch';
    end
    MDLcost = compute_encodingCost( code, N_tot, n, zeros(n,n));
    recordStructure( out_fid, info, code, top_gccind(curind), [], ...
                     MDLcost_nc - MDLcost, cost_notEnc - MDLcost, 'exact');
    exact_found = true;

elseif ( m == 2*(n-1) )                               % chain (stars are not handled here)
    degree = sum(Asmall);
    d1ind = find(degree == 1);
    d2ind = find(degree == 2);

    if length(d1ind) == 2 && length(d2ind) == n-2     % chain
        MDLcost_ch = compute_encodingCost( 'ch', N_tot, n, zeros(n,n));
        % NOTE(review): the inner nodes are listed in index order
        % (find(degree == 2)), not in the order they appear along the path.
        chainNodes = [top_gccind(curind(d1ind(1))) top_gccind(curind(d2ind)) top_gccind(curind(d1ind(2)))];
        recordStructure( out_fid, info, 'ch', chainNodes, [], ...
                         MDLcost_nc - MDLcost_ch, cost_notEnc - MDLcost_ch, 'exact');
        exact_found = true;
    end

else
    opts.tol = 1e-2;
    evals = eigs(Asmall, 2, 'lm', opts);              % the eigenvalues with maximum magnitude

    if ( max(evals) == - min(evals) )                 % bipartite graph (special case: star)
        [ set1, set2 ] = BFScoloring( Asmall );
        if length(set1) + length(set2) < minSize
            % too small to be reported, but nothing else should be tried
            exact_found = true;
            return;
        end

        nodes1 = top_gccind(curind(set1));
        nodes2 = top_gccind(curind(set2));

        if length(set1) == 1 && length(set2) == 1     % single edge: chain
            MDLcost = compute_encodingCost( 'ch', N_tot, n, zeros(n,n));
            recordStructure( out_fid, info, 'ch', top_gccind(curind([set1, set2])), [], ...
                             MDLcost_nc - MDLcost, cost_notEnc - MDLcost, 'exact');

        elseif length(set1) == 1                      % star with hub set1
            MDLcost = compute_encodingCost( 'st', N_tot, n, zeros(n,n));
            % fixed: the model entry used to be stored as 'fc' with all nodes
            recordStructure( out_fid, info, 'st', nodes1, nodes2, ...
                             MDLcost_nc - MDLcost, cost_notEnc - MDLcost, 'exact');

        elseif length(set2) == 1                      % star with hub set2
            MDLcost = compute_encodingCost( 'st', N_tot, n, zeros(n,n));
            % fixed: the model entry used to store set1 as hub AND as spokes
            recordStructure( out_fid, info, 'st', nodes2, nodes1, ...
                             MDLcost_nc - MDLcost, cost_notEnc - MDLcost, 'exact');

        else                                          % bipartite graph
            degrees = full(sum(Asmall,2));
            % Bipartite core: every node of one set is connected to all
            % nodes of the other set.
            % fixed: the first half of this test was missing its "== 0".
            isCore = all(degrees(set1) == length(set2)) && ...
                     all(degrees(set2) == length(set1));

            MDLcost_bc = compute_encodingCost( 'bc', N_tot, length(set1), zeros(n,n), length(set2));
            if isCore
                recordStructure( out_fid, info, 'bc', nodes1, nodes2, ...
                                 MDLcost_nc - MDLcost_bc, cost_notEnc - MDLcost_bc, 'exact');
            else
                % Bipartite, but not complete: keep whichever of bipartite
                % core / near-bipartite core is cheaper to encode.
                MDLcost_nb = compute_encodingCost( 'nb', N_tot, length(set1), zeros(n,n), length(set2));
                if MDLcost_bc <= MDLcost_nb
                    recordStructure( out_fid, info, 'bc', nodes1, nodes2, ...
                                     MDLcost_nc - MDLcost_bc, cost_notEnc - MDLcost_bc, 'not exact');
                else
                    recordStructure( out_fid, info, 'nb', nodes1, nodes2, ...
                                     MDLcost_nc - MDLcost_nb, cost_notEnc - MDLcost_nb, 'not exact');
                end
            end
        end
        exact_found = true;
    end
end
end


function recordStructure( out_fid, info, code, nodes1, nodes2, costGain, costGain_notEnc, label )
% Write one structure to the model file and append it to the global model.
%
% File format: "<code> <nodes1...>[, <nodes2...>]", followed by a newline, or,
% when info is not false, by ", <costGain> | <costGain_notEnc> -- <label> ".

global model;
global model_idx;

fprintf(out_fid, '%s', code);
fprintf(out_fid, ' %d', nodes1);
if ~isempty(nodes2)
    fprintf(out_fid, ',');
    fprintf(out_fid, ' %d', nodes2);
end

if info == false
    fprintf(out_fid, '\n');
else
    fprintf(out_fid, ', %f | %f -- %s \n', costGain, costGain_notEnc, label);
end

model_idx = model_idx + 1;
model(model_idx) = struct('code', code, 'edges', 0, 'nodes1', nodes1, 'nodes2', nodes2, 'benefit', costGain, 'benefit_notEnc', costGain_notEnc);
end
class Model :
    """A graph summary: an ordered list of structures plus per-type counts.

    Structures are objects offering getType() and the is<Type>() predicates
    (see class Structure). The counters below are class-level defaults; the
    first += / -= on an instance creates the instance's own counter.
    """

    strucTypes = []
    numStrucTypes = 0
    structs = []
    numStructs = 0

    numFullCliques = 0
    numNearCliques = 0
    numFullOffDiagonals = 0
    numNearOffDiagonals = 0
    numChains = 0
    numStars = 0
    numBiPartiteCores = 0
    numNearBiPartiteCores = 0
    numCorePeripheries = 0
    numJellyFishes = 0

    # (predicate of the structure, counter of the model), tested in this order
    _COUNTERS = (
        ("isFullClique", "numFullCliques"),
        ("isNearClique", "numNearCliques"),
        ("isFullOffDiagonal", "numFullOffDiagonals"),
        ("isNearOffDiagonal", "numNearOffDiagonals"),
        ("isChain", "numChains"),
        ("isStar", "numStars"),
        ("isBiPartiteCore", "numBiPartiteCores"),
        ("isNearBiPartiteCore", "numNearBiPartiteCores"),
        ("isCorePeriphery", "numCorePeripheries"),
        ("isJellyFish", "numJellyFishes"),
    )

    def __init__(self):
        self.strucTypes = ["fc","nc","ch","st","bc","nb"] #,"cp","jf","fod","nod"];
        self.numStrucTypes = len(self.strucTypes)
        self.structs = []
        self.numStructs = 0

    def setStrucTypes(self, st) :
        """Replace the list of declared structure type codes."""
        self.strucTypes = st
        self.numStrucTypes = len(self.strucTypes)

    def _updateCounter(self, struct, delta) :
        """Warn about undeclared types and add delta to the counter of struct's type."""
        if struct.getType() not in self.strucTypes :
            print("structure type not declared")

        for predicate, counter in self._COUNTERS :
            if getattr(struct, predicate)() :
                setattr(self, counter, getattr(self, counter) + delta)
                break

    @staticmethod
    def _skipLine(line) :
        """True for lines that cannot hold a structure (too short, or '#' comment)."""
        return len(line) < 4 or line[0] == "#"

    @staticmethod
    def _isStructure(struct) :
        """True when Structure.load produced a structure.

        The typed loaders return 0 on a prefix mismatch, and Structure.load
        returns None for a prefix it does not know; neither must be added.
        """
        return struct is not None and struct != 0

    # struct of type Struct
    def addStructure(self, struct) :
        """Append struct to the model and count it."""
        self.structs.append(struct)
        self.numStructs += 1
        self._updateCounter(struct, +1)

    # remove structure struct
    def rmStructure(self, struct) :
        """Remove struct from the model and uncount it.

        Raises ValueError (from list.remove) when struct is not in the model.
        """
        self.structs.remove(struct)
        self.numStructs -= 1
        self._updateCounter(struct, -1)

    def load(self, fullpath):
        """Add every structure found in the model file fullpath."""
        # 'with' closes the file; it used to stay open
        with open(fullpath) as fg :
            for line in fg :
                if self._skipLine(line) :
                    continue
                struct = Structure.load(line)
                if self._isStructure(struct) :
                    self.addStructure(struct)
        return

    def loadLine(self, content, lineNo):
        """Add the structure on content[lineNo] (0-based) and return it.

        Returns -1 for a too short or comment line. Otherwise returns what
        Structure.load returned, which is not a structure (and is not added)
        when the line's type prefix is not recognized.
        """
        line = content[lineNo] # line of the model to be added
        if self._skipLine(line) :
            return -1
        struct = Structure.load(line)
        if self._isStructure(struct) :
            self.addStructure(struct)
        return struct

    def loadLines(self, fullpath, lineList):
        """Add the structures on the given 1-based line numbers of fullpath.

        lineList may be empty (nothing is read) and need not be sorted.
        """
        if not lineList :
            return
        wanted = set(lineList)
        lastWanted = max(wanted)
        with open(fullpath) as fg :
            lineNo = 0
            for line in fg :
                lineNo = lineNo + 1
                if lineNo > lastWanted :
                    break
                if lineNo not in wanted or self._skipLine(line) :
                    continue
                struct = Structure.load(line)
                if self._isStructure(struct) :
                    self.addStructure(struct)
        return
class FullClique(Clique) :
    """Full clique over a list of node ids (type code "fc")."""

    def __init__(self, nodes):
        self.nodes = nodes
        self.numNodes = len(nodes)

    @staticmethod
    def getType():
        return "fc"

    def isFullClique(self):
        return True

    @staticmethod
    def load(line) :
        """Parse a line such as "fc 1 2 3 4" or "fc 1-4 9".

        Returns 0 when the line does not start with "fc"; otherwise a
        FullClique holding the node ids in ascending order. A token "a-b"
        stands for all ids from a to b inclusive.
        """
        if line[:2] != FullClique.getType() :
            return 0

        members = []
        for token in line[3:].strip().split(' ') :
            if token.find('-') > 0 :
                bounds = token.strip().split('-')
                members.extend(range(int(bounds[0]), int(bounds[1]) + 1))
            else :
                members.append(int(token))
        members.sort()
        return FullClique(members)
class NearOffDiagonal(Rectangle) :
    """Near off-diagonal block between a left and a right node list (type code "nod")."""

    def __init__(self, left, right):
        Rectangle.__init__(self, left, right)

    @staticmethod
    def getType():
        return "nod"

    def isNearOffDiagonal(self):
        return True

    @staticmethod
    def load(line) :
        """Parse a line of the form "nod [left ids], [right ids]".

        Returns 0 when the line does not start with "nod"; otherwise a
        NearOffDiagonal with both id lists in ascending order. A token
        "a-b" stands for all ids from a to b inclusive.
        """
        if line[:3] != NearOffDiagonal.getType() :
            return 0

        def readIds(field) :
            # ids of one comma-separated field, in file order
            ids = []
            for token in field.strip().split(' ') :
                if token.find('-') > 0 :
                    bounds = token.strip().split('-')
                    ids.extend(range(int(bounds[0]), int(bounds[1]) + 1))
                else :
                    ids.append(int(token))
            return ids

        fields = line[4:].strip().split(',')
        leftIds = readIds(fields[0])
        rightIds = readIds(fields[1])
        return NearOffDiagonal(sorted(leftIds), sorted(rightIds))
class Star(Structure) :
    """Star: one hub node and a list of spoke nodes (type code "st")."""

    cNode = -1
    sNodes = []
    numSpokes = 0

    def __init__(self, hub, spokes):
        self.cNode = hub
        self.sNodes = spokes
        self.numSpokes = len(spokes)

    @staticmethod
    def getType():
        return "st"

    def isStar(self):
        return True

    @staticmethod
    def load(line) :
        """Parse a line of the form "st [hub id], [spoke ids ...]".

        Returns 0 when the line does not start with "st". Otherwise returns
        a Star whose hub is the first id in front of the comma and whose
        spokes are the ids after it, in ascending order. A token "a-b"
        stands for all ids from a to b inclusive.
        """
        if line[:2] != Star.getType() :
            return 0

        def readIds(field) :
            # ids of one comma-separated field, in file order
            ids = []
            for token in field.strip().split(' ') :
                if token.find('-') > 0 :
                    bounds = token.split('-')
                    ids.extend(range(int(bounds[0]), int(bounds[1]) + 1))
                else :
                    ids.append(int(token))
            return ids

        fields = line[3:].strip().split(',')
        hubIds = readIds(fields[0])
        spokeIds = readIds(fields[1])
        return Star(hubIds[0], sorted(spokeIds))
class NearBiPartiteCore(Structure) :
    """Near-bipartite core between a left and a right node list (type code "nb")."""

    # class-level defaults, overwritten per instance in __init__
    lNodes = []
    numNodesLeft = 0
    rNodes = []
    numNodesRight = 0

    def __init__(self, left, right):
        self.lNodes = left
        self.numNodesLeft = len(left)
        self.rNodes = right
        # Fixed: the size of the right side used to be stored only as
        # "numRightNodes", so numNodesRight kept the class default 0.
        self.numNodesRight = len(right)
        # old attribute name, kept for code that may already read it
        self.numRightNodes = self.numNodesRight

    def getType():
        return "nb"
    getType = staticmethod(getType)

    def isNearBiPartiteCore(self):
        return True

    def load(line) :
        """Parse a line of the form "nb [left ids], [right ids]".

        Returns 0 when the line does not start with "nb"; otherwise a
        NearBiPartiteCore with both id lists in ascending order. A token
        "a-b" stands for all ids from a to b inclusive.
        """
        if line[:2] != NearBiPartiteCore.getType() :
            return 0
        parts = line[3:].strip().split(',')

        lNodes = []
        for x in parts[0].strip().split(' ') :
            if x.find('-') > 0 :
                y = x.strip().split('-')
                lNodes.extend(range(int(y[0]), int(y[1]) + 1))
            else :
                lNodes.append(int(x))

        rNodes = []
        for x in parts[1].strip().split(' ') :
            if x.find('-') > 0 :
                y = x.strip().split('-')
                rNodes.extend(range(int(y[0]), int(y[1]) + 1))
            else :
                rNodes.append(int(x))

        return NearBiPartiteCore(sorted(lNodes), sorted(rNodes))
    load = staticmethod(load)
# Encoded Size of a Full-Clique
def LfullClique(c, M, G, E):
    """Return the encoded size of full clique `c` and update `E` for it.

    The cost is LN(number of nodes) plus, when both the graph and the
    clique are non-empty, LU(...) for identifying the node ids.
    `M` is accepted for signature parity and is not read here.
    """
    # update Error
    coverFullClique(G, E, c)

    bits = LN(c.numNodes)  # encode number of nodes
    if not (G.numNodes > 0 and c.numNodes > 0):
        return bits
    return bits + LU(G.numNodes, c.numNodes)  # encode node ids
# Encoded Size of a Near-Clique
def LnearClique(c, M, G, E):
    """Return the encoded size of near clique `c` and update `E` for it.

    Cost = LN(#nodes) + LU(node ids) and, when at least one cell was
    described, log2(#cells) for the edge density plus LnU(...) for the
    cells themselves.  `M` is not read here.
    """
    # update Error, count coverage
    zeros, ones = coverNearClique(G, E, c)

    bits = LN(c.numNodes)  # encode number of nodes
    bits += LU(G.numNodes, c.numNodes)  # encode node ids
    described = zeros + ones
    if described > 0:
        bits += log(described, 2)  # encode probability of a 1
        bits += LnU(described, ones)  # encode the edges
    return bits


def coverNearClique(G, E, c):
    """Mark every node pair of `c` as exactly described; count 0s and 1s.

    Pairs are taken in the order of `c.nodes` (first index < second index).
    A pair that is already excluded is skipped.  Otherwise any pending error
    for the pair is removed, the pair is excluded, and it is counted as a 1
    when `G` has the edge and as a 0 when it has not.

    Returns the tuple (number of 0-cells, number of 1-cells).

    NOTE(review): the statement nesting was reconstructed from a
    whitespace-collapsed listing - confirm against the original file.
    """
    zeros = 0
    ones = 0
    members = c.nodes
    size = c.numNodes
    for a in range(size):
        u = members[a]
        for b in range(a + 1, size):
            v = members[b]

            if E.isExcluded(u, v):
                # already modelled perfectly, nothing to do
                continue

            if E.isCovered(u, v):
                # cell is already modelled; if wrongly so, undo that error
                if E.isModellingError(u, v):
                    E.delModellingError(u, v)
                E.exclude(u, v)
            else:
                # cell is not modelled yet; a real edge stops being an error
                if G.hasEdge(u, v):
                    E.delUnmodelledError(u, v)
                E.coverAndExclude(u, v)

            if G.hasEdge(u, v):
                ones += 1
            else:
                zeros += 1

    return (zeros, ones)
LU(G.numNodes-c.numNodesLeft, c.numNodesRight); # encode node ids 106 | return cost; 107 | 108 | def coverFullOffDiagonal(G, E, c): 109 | # c.nodeListLeft is ordered 110 | for i_idx in range(c.numNodesLeft) : 111 | i = c.lNodeList[i_idx]; 112 | for j_idx in range(c.numNodesRight) : 113 | j = c.rNodeList[j_idx]; 114 | 115 | if not E.isExcluded(i,j) : 116 | # only if (i,j) is not modelled perfectly 117 | 118 | if not E.isCovered(i,j) : 119 | # edge is not modelled yet 120 | if G.hasEdge(i,j) : 121 | # yet there is a real edge, so now we undo an error 122 | E.delUnmodelledError(i,j); 123 | else : 124 | # there is no real edge, but now we say there is, so we introduce error 125 | E.addModellingError(i,j); 126 | E.cover(i,j); 127 | 128 | else : 129 | # edge is already modelled 130 | if G.hasEdge(i,j) and E.isModellingError(i,j) : 131 | # edge exists, but model denied 132 | E.delModellingError(i,j); 133 | elif not G.hasEdge(i,j) and not E.isModellingError(i,j) : 134 | # edge does not exist, but now we say it does 135 | E.addModellingError(i,j); 136 | return; 137 | 138 | 139 | # Encoded Size of a Near-Off Diagonal 140 | def LnearOffDiagonal(c, M, G, E) : 141 | # update Error, count coverage 142 | (cnt0,cnt1) = coverNearOffDiagonal(G, E, c) 143 | 144 | cost = LN(c.numNodesLeft) + LN(c.numNodesRight); # encode number of nodes 145 | cost += LU(G.numNodes, c.numNodesLeft); # encode node ids 146 | cost += LU(G.numNodes-c.numNodesLeft, c.numNodesRight); # encode node ids 147 | 148 | if cnt0+cnt1 > 0 : 149 | cost += log(cnt0+cnt1, 2); # encode probability of a 1 (cnt0+cnt1 is number of cells we describe, upperbounded by numnodes 2) 150 | cost += LnU(cnt0+cnt1, cnt1); # encode the edges 151 | return cost; 152 | 153 | def coverNearOffDiagonal(G, E, c) : 154 | # c.nodes is ordered 155 | cnt0 = 0; 156 | cnt1 = 0; 157 | for i_idx in range(c.numNodesLeft) : 158 | i = c.lNodeList[i_idx]; 159 | for j_idx in range(c.numNodesRight) : 160 | j = c.rNodeList[j_idx]; 161 | 162 | if not 
# Encoded Size of a Chain
def Lchain(ch, M, G, E):
    """Return the encoded size of chain `ch` and update `E` for it.

    Cost = LN(#nodes - 1) + LU(node ids) + log2(#nodes!) for their order.
    `M` is not read here.
    """
    # update Error
    coverChain(G, E, ch)

    lengthBits = LN(ch.numNodes - 1)  # we know chain is at least 2 nodes
    idBits = LU(G.numNodes, ch.numNodes)  # identify the nodes
    orderBits = log(factorial(ch.numNodes), 2)  # identify their order

    # LU + log(factorial) equals summing log2(G.numNodes - k) over the
    # chain positions k, i.e. naming the node ids one by one in order.
    return lengthBits + idBits + orderBits
# Encoded Size of a Star
def Lstar(star, M, G, E):
    """Return the encoded size of `star` and update `E` for it.

    Cost = LN(#spokes) + log2(#graph nodes) for the hub
    + LU(#graph nodes - 1, #spokes) for the spoke ids.
    `M` is not read here.
    """
    # update Error
    coverStar(G, E, star)

    spokeCountBits = LN(star.numSpokes)  # we know there is exactly one hub
    hubBits = log(G.numNodes, 2)  # identify the hub-node
    spokeIdBits = LU(G.numNodes - 1, star.numSpokes)  # identify the spokes

    return spokeCountBits + hubBits + spokeIdBits
# Encoded Size of a bi-partite core
def LbiPartiteCore(bc, M, G, E):
    """Return the encoded size of bi-partite core `bc` and update `E`.

    Cost = LN(|left|) + LN(|right|) + LU(ids of the left set)
    + LU(ids of the right set, drawn from the nodes not in the left set).
    `M` is not read here.
    """
    # update Error
    coverBiPartiteCore(G, E, bc)

    sizeBits = LN(bc.numNodesLeft) + LN(bc.numNodesRight)
    leftIdBits = LU(G.numNodes, bc.numNodesLeft)
    rightIdBits = LU(G.numNodes - bc.numNodesLeft, bc.numNodesRight)
    return sizeBits + leftIdBits + rightIdBits
fill in the 1s between the parts 341 | for i in bc.lNodes : 342 | for j in bc.rNodes : 343 | if not E.isExcluded(i,j) : 344 | # only if (i,j) is not already modelled perfectly 345 | if G.hasEdge(i,j) : 346 | # there is an edge 347 | if E.isCovered(i,j) : 348 | if E.isModellingError(i,j) : 349 | # model says 0, we fix to 1 350 | E.delModellingError(i,j); 351 | else : 352 | # model didnt say anything, we fix it 353 | E.delUnmodelledError(i,j); 354 | E.cover(i,j); 355 | else : 356 | # there is no edge 357 | if E.isCovered(i,j) : 358 | # but the cell is modelled 359 | if not E.isModellingError(i,j) : 360 | E.addModellingError(i,j); # we make a boo-boo 361 | else : 362 | # the cell is not modelled, yet 363 | E.addModellingError(i,j); 364 | E.cover(i, j); 365 | # print E.numCellsCovered; 366 | 367 | # 2. fill in the 0s in left part 368 | for i_idx in range(len(bc.lNodes)-1) : 369 | i = bc.lNodes[i_idx]; 370 | for j_idx in range(i_idx+1,len(bc.lNodes)) : 371 | j = bc.lNodes[j_idx]; 372 | 373 | if not E.isExcluded(i,j) and not E.isCovered(i,j) : 374 | # only if (i,j) is not covered or already modelled perfectly 375 | if E.isUnmodelledError(i,j) : 376 | # edge exists! 377 | E.delUnmodelledError(i,j); # we now model this cell 378 | E.addModellingError(i,j); # but do so wrongly 379 | E.cover(i,j); 380 | # print E.numCellsCovered; 381 | 382 | # 3. fill in the 0s in right part 383 | for i_idx in range(len(bc.rNodes)-1) : 384 | i = bc.rNodes[i_idx]; 385 | for j_idx in range(i_idx+1,len(bc.rNodes)) : 386 | j = bc.rNodes[j_idx]; 387 | 388 | if not E.isExcluded(i,j) and not E.isCovered(i,j) : 389 | # only if (i,j) is not covered or already modelled perfectly 390 | if E.isUnmodelledError(i,j) : 391 | # edge exists! 
# Encoded Size of a near bi-partite core
def LnearBiPartiteCore(nb, M, G, E):
    """Return the encoded size of near bi-partite core `nb` and update `E`.

    Cost = LN(|A|) + LN(|B|) + LU(ids of A) + LU(ids of B) and, when at
    least one cell between A and B was described, log2(#cells) for the
    edge density plus LnU(...) for the cells.  `M` is not read here.
    """
    # update Error
    (cnt0, cnt1) = coverNearBiPartiteCore(G, E, nb)

    # Take the set sizes from the node lists themselves.  The cached count
    # attributes are spelled inconsistently across this code base
    # (numNodesRight vs numRightNodes), and reading the wrong spelling
    # silently yields the class default 0 for the right-hand set.
    numLeft = len(nb.lNodes)
    numRight = len(nb.rNodes)

    # encode number of nodes in sets A and B
    cost = LN(numLeft) + LN(numRight)
    # encode node ids of sets A and B
    cost += LU(G.numNodes, numLeft)
    cost += LU(G.numNodes - numLeft, numRight)

    numCells = cnt0 + cnt1
    if numCells > 0:
        # encode probability of a 1 between sets A and B
        cost += log(numCells, 2)
        # encode the actual edges between A and B
        cost += LnU(numCells, cnt1)
    return cost
fill in the 0s in left part 451 | for i_idx in range(len(nb.lNodes)-1) : 452 | i = nb.lNodes[i_idx]; 453 | for j_idx in range(i_idx+1,len(nb.lNodes)) : 454 | j = nb.lNodes[j_idx]; 455 | 456 | if not E.isExcluded(i,j) and not E.isCovered(i,j) : 457 | # only if (i,j) is not covered or already modelled perfectly 458 | if E.isUnmodelledError(i,j) : 459 | # edge exists! 460 | E.delUnmodelledError(i,j); # we now model this cell 461 | E.addModellingError(i,j); # but do so wrongly 462 | E.cover(i,j); 463 | 464 | # 3. fill in the 0s in right part 465 | for i_idx in range(len(nb.rNodes)-1) : 466 | i = nb.rNodes[i_idx]; 467 | for j_idx in range(i_idx+1,len(nb.rNodes)) : 468 | j = nb.rNodes[j_idx]; 469 | 470 | if not E.isExcluded(i,j) and not E.isCovered(i,j) : 471 | # only if (i,j) is not covered or already modelled perfectly 472 | if E.isUnmodelledError(i,j) : 473 | # edge exists! 474 | E.delUnmodelledError(i,j); # we now model this cell 475 | E.addModellingError(i,j); # but do so wrongly 476 | E.cover(i,j); 477 | 478 | return (cnt0,cnt1); 479 | 480 | 481 | # Encoded Size of a jellyfish structure 482 | def LjellyFish(jf, M, G, E) : 483 | # update Error 484 | coverJellyFish(G, E, jf); 485 | 486 | cost = LN(jf.numCores); # number of core nodes 487 | cost += LU(G.numNodes, jf.numCores); # core node ids 488 | 489 | cost += LN(jf.numSpokeSum) + LC(jf.numSpokeSum, jf.numCores); # number of spokes per core node 490 | cost += LU(G.numNodes - jf.numCores, jf.numSpokeSum); # spoke ids (-no- overlap between sets!) 
491 | return cost; 492 | 493 | def coverJellyFish(G, E, jf) : 494 | 495 | # first link up the nodes in the core 496 | for i_idx in range(len(jf.cNodes)) : 497 | i = jf.cNodes[i_idx]; 498 | for j_idx in range(i_idx+1,len(jf.cNodes)) : 499 | j = jf.cNodes[j_idx]; 500 | 501 | if not E.isExcluded(i,j) : 502 | # only if (i,j) is not already modelled perfectly 503 | 504 | if G.hasEdge(i,j) : 505 | # there is an edge 506 | if E.isCovered(i,j) : 507 | if E.isModellingError(i,j) : 508 | E.delModellingError(i,j); # model said 0, but we say 1 509 | else : 510 | # edge is there, but not covered, we fix it! 511 | E.delUnmodelledError(i,j); 512 | E.cover(i,j); 513 | else : 514 | # there is no edge 515 | if E.isCovered(i,j) : 516 | if not E.isModellingError(i,j) : 517 | E.addModellingError(i,j); # model said 0, we say 1 518 | else : 519 | E.addModellingError(i,j); 520 | E.cover(i,j); 521 | 522 | # 2. link up the core nodes up to their respective spokes 523 | for i_idx in range(len(jf.cNodes)) : 524 | i = jf.cNodes[i_idx]; 525 | for j_idx in range(len(jf.sNodes[i_idx])) : 526 | j = jf.sNodes[i_idx][j_idx]; 527 | 528 | if not E.isExcluded(i,j) : 529 | # only if (i,j) is not already modelled perfectly 530 | 531 | if G.hasEdge(i,j) : 532 | # there is an edge 533 | if E.isCovered(i,j) : 534 | if E.isModellingError(i,j) : 535 | E.delModellingError(i,j); # model said 0, we fix to 1 536 | else : 537 | # edge is there, but not covered, we fix it 538 | E.delUnmodelledError(i,j); 539 | E.cover(i,j); 540 | else : 541 | # there is no edge 542 | if E.isCovered(i,j) : 543 | if not E.isModellingError(i,j) : 544 | E.addModellingError(i,j); # model said 0, but we say 1 545 | else : 546 | E.addModellingError(i,j); 547 | E.cover(i,j); 548 | 549 | if config.optModelZeroes == True : 550 | # 3. model that the spokes within a set are not connected 551 | # !!! 
# Encoded Size of a core periphery
def LcorePeriphery(cp, M, G, E):
    """Return the encoded size of core-periphery `cp` and update `E`.

    Cost = LN(#cores) + LN(#spokes) + #cores * log2(#graph nodes)
    + #spokes * log2(#graph nodes - #cores).  `M` is not read here.
    """
    # update Error
    coverCorePeriphery(G, E, cp)

    cost = LN(cp.numCores)  # number of core-nodes
    cost += LN(cp.numSpokes)  # number of spoke-nodes
    # A term with a zero count contributes nothing; skipping it also avoids
    # evaluating log(0), which raises ValueError (e.g. no spokes while every
    # graph node is a core).
    if cp.numCores > 0:
        cost += cp.numCores * log(G.numNodes, 2)  # identify core-nodes
    if cp.numSpokes > 0:
        # identify spoke-nodes among the nodes that are not cores
        cost += cp.numSpokes * log(G.numNodes - cp.numCores, 2)
    return cost


# check whether ok
def coverCorePeriphery(G, E, cp):
    """Cover every (core, spoke) cell of `cp` that is not modelled yet.

    For such a cell a real edge stops being an unmodelled error, a missing
    edge becomes a modelling error, and the cell is marked covered.  Cells
    that are already modelled are left untouched.

    NOTE(review): the statement nesting was reconstructed from a
    whitespace-collapsed listing - confirm against the original file.
    """
    for core in cp.cNodes:
        for spoke in cp.sNodes:
            if E.isModelled(core, spoke):
                continue
            if G.hasEdge(core, spoke):
                E.delUnmodelledError(core, spoke)
            else:
                E.addModellingError(core, spoke)
            E.cover(core, spoke)
    return


# Encoded Size of a core periphery (a bit smarter)
def LcorePeripheryA(cp, M, G, E):
    """Return the encoded size of `cp`, naming node ids as sets via LU.

    Unlike LcorePeriphery this variant does not touch `E`.
    `M` and `E` are not read here.
    """
    # Fix: the core count lives in `numCores` (the attribute LcorePeriphery
    # reads as well); the former `numCoreNodes` raised AttributeError.
    numCores = cp.numCores
    cost = LN(numCores)  # number of core-nodes
    cost += LN(cp.numSpokes)  # number of spoke-nodes
    cost += LU(G.numNodes, numCores)  # identify core-nodes
    cost += LU(G.numNodes - numCores, cp.numSpokes)  # identify spoke-nodes
    return cost
### basic functions
# determine possible number of edges between `numEdges' nodes
def CalcCliqueNumPosEdges(numEdges):
    """Return n*n - n for n = `numEdges`.

    Despite its name the argument is a node count: the result is the
    number of ordered node pairs without self-loops (directed graph).
    """
    return numEdges * numEdges - numEdges


# (n choose k)
def choose(n, k):
    """Return the binomial coefficient C(n, k), or 0 unless 0 <= k <= n."""
    if not (0 <= k <= n):
        return 0
    p = 1
    # `range` instead of the Python-2-only `xrange`, so this runs on both
    # Python 2 and Python 3; the loop has at most min(k, n - k) steps.
    for t in range(min(k, n - k)):
        # exact at every step: p * (n - t) is divisible by (t + 1)
        p = (p * (n - t)) // (t + 1)
    return p


def composition(n, k):
    """Return the number of compositions of `n` into `k` positive parts."""
    if n == 0 and k == 0:
        # the empty composition; choose(-1, -1) would wrongly report 0
        return 1
    return choose(n - 1, k - 1)


def LC(n, k):
    """Return log2 of composition(n, k).

    Raises ValueError (math domain error) when no such composition exists.
    """
    return log(composition(n, k), 2)


def weakcomposition(n, k):
    """Return the number of compositions of `n` into `k` parts >= 0."""
    if n == 0 and k == 0:
        # the empty composition; choose(-1, -1) would wrongly report 0
        return 1
    return choose(n + k - 1, k - 1)


def LwC(n, k):
    """Return log2 of weakcomposition(n, k).

    Raises ValueError (math domain error) when no such composition exists.
    """
    return log(weakcomposition(n, k), 2)


# Encoded length of `n` 0/1 entries with `k` 1s (aka, Naive Uniform)
def LnU(n, k):
    """Return the bits for `n` binary cells of which `k` are 1.

    Each 1 costs -log2(k/n) and each 0 costs -log2((n-k)/n); the result
    is 0 when there are no cells or when all cells share one value.
    """
    if n == 0 or k == 0 or k == n:
        return 0
    oneCost = -log(k / float(n), 2)
    zeroCost = -log((n - k) / float(n), 2)
    return k * oneCost + (n - k) * zeroCost


# Encoded length of `n` 0/1 entries with `k` 1s (aka, Uniform)
def LU(n, k):
    """Return log2 C(n, k), or 0 when `n` or `k` is 0.

    Raises ValueError (math domain error) when k > n, as C(n, k) is 0.
    """
    if n == 0 or k == 0:
        return 0
    return log(choose(n, k), 2)


# encoded size of an integer >=1 as by Rissanen's 1983 Universal code for integers
def LN(z):
    """Return the universal-code length of integer `z`, or 0 for z <= 0.

    Computed as log2(2.865064) + log2(z) + log2(log2(z)) + ..., adding
    terms for as long as they are positive.
    """
    if z <= 0:
        return 0
    c = log(2.865064, 2)
    i = log(z, 2)
    while i > 0:
        c = c + i
        i = log(i, 2)
    return c
structure-type identifier per type 76 | if M.numFullCliques > 0 : 77 | model_cost += M.numFullCliques * log(M.numFullCliques / float(M.numStructs), 2); 78 | if M.numNearCliques > 0 : 79 | model_cost += M.numNearCliques * log(M.numNearCliques / float(M.numStructs), 2); 80 | if M.numChains > 0 : 81 | model_cost += M.numChains * log(M.numChains / float(M.numStructs), 2); 82 | if M.numStars > 0 : 83 | model_cost += M.numStars * log(M.numStars / float(M.numStructs), 2); 84 | if M.numBiPartiteCores > 0 : 85 | model_cost += M.numBiPartiteCores * log(M.numBiPartiteCores / float(M.numStructs), 2); 86 | if M.numNearBiPartiteCores > 0 : 87 | model_cost += M.numNearBiPartiteCores * log(M.numNearBiPartiteCores / float(M.numStructs), 2); 88 | if M.numJellyFishes > 0 : 89 | model_cost += M.numJellyFishes * log(M.numJellyFishes / float(M.numStructs), 2); 90 | if M.numCorePeripheries > 0 : 91 | model_cost += M.numCorePeripheries * log(M.numCorePeripheries / float(M.numStructs), 2); 92 | 93 | 94 | # encode the structures 95 | for struc in M.structs : 96 | if struc.isFullClique() : 97 | model_cost += LfullClique(struc,M,G,E); 98 | elif struc.isNearClique() : 99 | model_cost += LnearClique(struc,M,G,E); 100 | elif struc.isChain() : 101 | model_cost += Lchain(struc,M,G,E); 102 | elif struc.isStar() : 103 | model_cost += Lstar(struc,M,G,E); 104 | elif struc.isCorePeriphery() : 105 | model_cost += LcorePeriphery(struc,M,G,E); 106 | elif struc.isBiPartiteCore() : 107 | model_cost += LbiPartiteCore(struc,M,G,E); 108 | elif struc.isNearBiPartiteCore() : 109 | model_cost += LnearBiPartiteCore(struc,M,G,E); 110 | elif struc.isJellyFish() : 111 | model_cost += LjellyFish(struc,M,G,E); 112 | 113 | 114 | # encode the error 115 | error_cost += 0 if E.numCellsCovered == 0 else log(E.numCellsCovered, 2); # encode number of additive Errors 116 | if ((G.numNodes * G.numNodes - G.numNodes) / 2) - E.numCellsCovered > 0 : 117 | error_cost += log(((G.numNodes * G.numNodes - G.numNodes) / 2) - 
E.numCellsCovered, 2); # encode number of Errors 118 | 119 | if errorEnc == "NP" : 120 | error_cost += LErrorNaivePrefix(G,M,E); 121 | elif errorEnc == "NB" : 122 | error_cost += LErrorNaiveBinom(G,M,E); 123 | elif errorEnc == "TP" : 124 | error_cost += LErrorTypedPrefix(G,M,E); 125 | elif errorEnc == "TB" : 126 | error_cost += LErrorTypedBinom(G,M,E); 127 | 128 | total_cost = model_cost + error_cost; 129 | 130 | return (total_cost, model_cost, error_cost, E); 131 | 132 | # Encoded Size of a Full-Clique 133 | def LfullClique(c, M, G, E): 134 | # update Error 135 | coverFullClique(G, E, c); 136 | 137 | cost = LN(c.numNodes); # encode number of nodes 138 | if G.numNodes > 0 and c.numNodes > 0 : 139 | cost += LU(G.numNodes, c.numNodes); # encode node ids 140 | return cost; 141 | 142 | def coverFullClique(G, E, c): 143 | # c.nodes is ordered 144 | for i_idx in range(c.numNodes) : 145 | i = c.nodes[i_idx]; 146 | for j_idx in range(i_idx+1,c.numNodes) : 147 | j = c.nodes[j_idx]; 148 | 149 | if not E.isExcluded(i,j) : 150 | # only if (i,j) is not modelled perfectly 151 | 152 | if not E.isCovered(i,j) : 153 | # edge is not modelled yet 154 | if G.hasEdge(i,j) : 155 | # yet there is a real edge, so now we undo an error 156 | E.delUnmodelledError(i,j); 157 | else : 158 | # there is no real edge, but now we say there is, so we introduce error 159 | E.addModellingError(i,j); 160 | E.cover(i,j); 161 | 162 | else : 163 | # edge is already modelled 164 | if G.hasEdge(i,j) and E.isModellingError(i,j) : 165 | # edge exists, but model denied 166 | E.delModellingError(i,j); 167 | elif not G.hasEdge(i,j) and not E.isModellingError(i,j) : 168 | # edge does not exist, but now we say it does 169 | E.addModellingError(i,j); 170 | return; 171 | 172 | 173 | # Encoded Size of a Near-Clique 174 | def LnearClique(c, M, G, E) : 175 | # update Error, count coverage 176 | (cnt0,cnt1) = coverNearClique(G, E, c) 177 | 178 | cost = LN(c.numNodes); # encode number of nodes 179 | cost += 
LU(G.numNodes, c.numNodes); # encode node ids 180 | if cnt0+cnt1 > 0 : 181 | cost += log(cnt0+cnt1, 2); # encode probability of a 1 (cnt0+cnt1 is number of cells we describe, upperbounded by numnodes 2) 182 | cost += LnU(cnt0+cnt1, cnt1); # encode the edges 183 | return cost; 184 | 185 | def coverNearClique(G, E, c) : 186 | # c.nodes is ordered 187 | cnt0 = 0; 188 | cnt1 = 0; 189 | for i_idx in range(c.numNodes) : 190 | i = c.nodes[i_idx]; 191 | for j_idx in range(i_idx+1, c.numNodes) : 192 | j = c.nodes[j_idx]; 193 | 194 | if not E.isExcluded(i,j) : 195 | # only if (i,j) is not already modelled perfectly 196 | 197 | if not E.isCovered(i,j) : 198 | # edge is not modelled yet 199 | if G.hasEdge(i,j) : 200 | # yet there is a real edge, so now we undo an error 201 | E.delUnmodelledError(i,j); 202 | E.coverAndExclude(i,j); 203 | 204 | else : 205 | # edge is already modelled 206 | if E.isModellingError(i,j) : 207 | # but wrongly, we undo that error 208 | E.delModellingError(i,j); 209 | E.exclude(i,j) 210 | 211 | if G.hasEdge(i,j) : 212 | cnt1 += 1; 213 | else: 214 | cnt0 += 1; 215 | 216 | return (cnt0,cnt1); 217 | 218 | 219 | # Encoded Size of a Chain 220 | def Lchain(ch, M, G, E) : 221 | # update Error 222 | coverChain(G,E,ch); 223 | 224 | cost = LN(ch.numNodes-1); # we know chain is at least 2 nodes 225 | cost += LU(G.numNodes,ch.numNodes); # identify the nodes 226 | cost += log(factorial(ch.numNodes),2) # identify their order 227 | 228 | ## same as LU + log(factorial) 229 | #for nid in range(ch.numNodes) : 230 | # cost += log(G.numNodes - nid, 2); # identify the node ids in order 231 | return cost; 232 | 233 | def coverChain(G, E, ch) : 234 | # model chain 235 | for i_idx in range(ch.numNodes-1) : 236 | i = ch.nodes[i_idx]; 237 | j = ch.nodes[i_idx+1]; 238 | 239 | if not E.isExcluded(i,j) : 240 | # only if (i,j) is not already modelled perfectly 241 | if not E.isCovered(i, j) : 242 | # edge is not modelled yet 243 | 244 | if G.hasEdge(i, j) : 245 | 
E.delUnmodelledError(i, j); 246 | else : 247 | E.addModellingError(i, j); 248 | E.cover(i,j); 249 | 250 | else : 251 | # edge is already modelled 252 | 253 | if G.hasEdge(i,j) and E.isModellingError(i,j) : 254 | # model is wrong in saying no edge 255 | E.delModellingError(i,j); 256 | # elif G.hasEdge(i,j) and not E.isModellingError(i,j) : 257 | # there is an edge, and we knew that 258 | # elif not G.hasEdge(i,j) and E.isModellingError(i,j) : 259 | # there is no edge, but we keep saying there is 260 | elif not G.hasEdge(i,j) and not E.isModellingError(i,j) : 261 | # there is no edge, but now we say there is 262 | E.addModellingError(i,j); 263 | 264 | if config.optModelZeroes == True : 265 | # model non-shortcuts 266 | for i_idx in range(ch.numNodes) : 267 | i = ch.nodes[i_idx]; 268 | for j_idx in range(i_idx+2, ch.numNodes) : # skip the direct neighbour 269 | j = ch.nodes[j_idx]; 270 | 271 | if not E.isExcluded(i,j) : 272 | # only if (i,j) is not already modelled perfectly 273 | if not E.isCovered(i,j) : 274 | # edge not yet modelled 275 | if G.hasEdge(i,j) : 276 | # oops, there is an edge, but we say there aint 277 | E.addModellingError(i,j); 278 | #else : 279 | # there is no edge, so we're good 280 | E.cover(i,j); 281 | #else : 282 | # edge is modelled 283 | #if G.hasEdge(i,j) and E.isModellingError(i,j) : 284 | # model incorrect in saying there is no edge - no change 285 | #if G.hasEdge(i,j) and not E.isModellingError(i,j) : 286 | # model correct in saying there is an edge, no change 287 | # ... 
# return;   (trailing no-op statement of the preceding function, whose body lies above this span)


# NOTE(review): this section was recovered from a whitespace-collapsed listing;
# the block nesting below is reconstructed from the statements and their inline
# comments -- confirm against the original file before relying on it.


def _coverCellAsEdge(G, E, i, j):
    """Update E for one cell (i, j) that the structure claims is an edge (a 1).

    Nothing happens when the cell is excluded, i.e. already modelled
    perfectly.  Otherwise:

    * edge present, cell covered     -> drop a modelling error if one is recorded
    * edge present, cell not covered -> drop the unmodelled error, cover the cell
    * edge absent,  cell covered     -> record a modelling error unless already there
    * edge absent,  cell not covered -> record a modelling error, cover the cell
    """
    if E.isExcluded(i, j):
        return

    if G.hasEdge(i, j):
        if E.isCovered(i, j):
            if E.isModellingError(i, j):
                # previously modelled as 0, we fix the error
                E.delModellingError(i, j)
        else:
            # edge is there, but not covered yet: we fix it
            E.delUnmodelledError(i, j)
            E.cover(i, j)
    else:
        if E.isCovered(i, j):
            if not E.isModellingError(i, j):
                # the cell was modelled as 0, now we wrongly say 1
                E.addModellingError(i, j)
        else:
            E.addModellingError(i, j)
            E.cover(i, j)


def _coverPairsAsNonEdges(E, nodes):
    """Model every unordered pair within `nodes` as a non-edge (a 0).

    Only cells that are neither excluded nor covered are touched.  A cell that
    is currently an unmodelled error (an edge exists) is turned into a
    modelling error; the cell is then marked as covered.

    NOTE(review): E.cover() is placed after the inner `if`, so that edge-free
    cells are covered as well -- the collapsed listing does not show the
    nesting; confirm.
    """
    for a_idx in range(len(nodes) - 1):
        a = nodes[a_idx]
        for b_idx in range(a_idx + 1, len(nodes)):
            b = nodes[b_idx]

            if not E.isExcluded(a, b) and not E.isCovered(a, b):
                if E.isUnmodelledError(a, b):
                    # edge exists!
                    E.delUnmodelledError(a, b)  # we now model this cell
                    E.addModellingError(a, b)   # but do so wrongly
                E.cover(a, b)


# Encoded Size of a Star
def Lstar(star, M, G, E):
    """Return the encoded size of a star and update E for the cells it covers.

    star -- structure providing cNode, sNodes and numSpokes
    M    -- unused here
    G    -- graph providing numNodes and hasEdge()
    E    -- error bookkeeping object, updated in place through coverStar()
    """
    # update Error
    coverStar(G, E, star)

    cost = LN(star.numSpokes)                    # number of spokes (we know there's one hub)
    cost += log(G.numNodes, 2)                   # identify the hub-node
    cost += LU(G.numNodes - 1, star.numSpokes)   # identify the spoke-nodes
    return cost


def coverStar(G, E, st):
    """Update E for a star: hub-spoke cells are 1s, spoke-spoke cells optionally 0s.

    The hub-spoke cells are addressed by the ordered pair (min, max); the
    exclusion test now uses that same ordered pair (it used the raw pair
    before), so all bookkeeping calls for one cell agree.
    """
    hub = st.cNode
    for spoke in st.sNodes:
        _coverCellAsEdge(G, E, min(hub, spoke), max(hub, spoke))

    if config.optModelZeroes == True:
        # model non-shortcuts: spokes are not connected to each other
        for i_idx in range(st.numSpokes):
            i = st.sNodes[i_idx]
            for j_idx in range(i_idx + 1, st.numSpokes):
                j = st.sNodes[j_idx]

                # only if (i,j) is not already modelled perfectly, and not yet modelled
                if not E.isExcluded(i, j) and not E.isCovered(i, j):
                    if G.hasEdge(i, j):
                        # oops, there is an edge, but we say there aint
                        # NOTE(review): unlike the bi-partite and jellyfish
                        # zero-filling, no E.delUnmodelledError(i, j) is issued
                        # here -- check whether this error is counted twice.
                        E.addModellingError(i, j)
                    E.cover(i, j)
                # cells that are already modelled are left unchanged
    return


# Encoded Size of a bi-partite core
def LbiPartiteCore(bc, M, G, E):
    """Return the encoded size of a bi-partite core and update E accordingly.

    bc -- structure providing lNodes, rNodes, numLeftNodes and numRightNodes
    M  -- unused here
    """
    # update Error
    coverBiPartiteCore(G, E, bc)

    cost = LN(bc.numLeftNodes) + LN(bc.numRightNodes)
    cost += LU(G.numNodes, bc.numLeftNodes)
    cost += LU(G.numNodes - bc.numLeftNodes, bc.numRightNodes)
    return cost


def coverBiPartiteCore(G, E, bc):
    """Update E for a bi-partite core: 1s between the parts, 0s within each part."""
    # 1. fill in the 1s between the parts
    for i in bc.lNodes:
        for j in bc.rNodes:
            _coverCellAsEdge(G, E, i, j)

    # 2. fill in the 0s in left part
    _coverPairsAsNonEdges(E, bc.lNodes)

    # 3. fill in the 0s in right part
    _coverPairsAsNonEdges(E, bc.rNodes)
    return


# Encoded Size of a near bi-partite core
def LnearBiPartiteCore(nb, M, G, E):
    """Return the encoded size of a near bi-partite core and update E accordingly.

    On top of the node counts and node ids, the cells between the two parts
    are encoded explicitly from the (cnt0, cnt1) counts reported by
    coverNearBiPartiteCore(); that term is skipped when no cell was counted.
    """
    # update Error
    (cnt0, cnt1) = coverNearBiPartiteCore(G, E, nb)

    # encode number of nodes in sets A and B
    cost = LN(nb.numLeftNodes) + LN(nb.numRightNodes)
    # encode node ids of sets A and B
    cost += LU(G.numNodes, nb.numLeftNodes)
    cost += LU(G.numNodes - nb.numLeftNodes, nb.numRightNodes)

    if cnt0 + cnt1 > 0:
        # encode probability of a 1 between sets A and B
        cost += log(cnt0 + cnt1, 2)
        # encode the actual edges between A and B
        # NOTE(review): assumed to sit inside the `if` together with the line above
        cost += LnU(cnt0 + cnt1, cnt1)
    return cost


def coverNearBiPartiteCore(G, E, nb):
    """Update E for a near bi-partite core and count the cells between the parts.

    Every not-yet-excluded cell between the parts is excluded (its value is
    encoded exactly by the caller), undoing any error recorded for it.  Cells
    within each part are modelled as 0s.

    Returns (cnt0, cnt1): the number of cells between the parts without and
    with an edge, counted over the cells that were not excluded beforehand.
    NOTE(review): the counting is assumed to sit inside the exclusion test.
    """
    # first encode the edges between the parts
    cnt0 = 0
    cnt1 = 0
    for i_idx in range(nb.numLeftNodes):
        i = nb.lNodes[i_idx]
        for j_idx in range(nb.numRightNodes):
            j = nb.rNodes[j_idx]

            if E.isExcluded(i, j):
                # (i,j) is already modelled perfectly
                continue

            if not E.isCovered(i, j):
                # edge is not modelled yet
                if G.hasEdge(i, j):
                    # yet there is a real edge, so now we undo an error
                    E.delUnmodelledError(i, j)
                E.coverAndExclude(i, j)
            else:
                # edge is already modelled
                if E.isModellingError(i, j):
                    # but wrongly, we undo that error
                    E.delModellingError(i, j)
                E.exclude(i, j)

            if G.hasEdge(i, j):
                cnt1 += 1
            else:
                cnt0 += 1

    # 2. fill in the 0s in left part
    _coverPairsAsNonEdges(E, nb.lNodes)

    # 3. fill in the 0s in right part
    _coverPairsAsNonEdges(E, nb.rNodes)

    return (cnt0, cnt1)


# Encoded Size of a jellyfish structure
def LjellyFish(jf, M, G, E):
    """Return the encoded size of a jellyfish structure and update E accordingly.

    jf -- structure providing cNodes, sNodes (one spoke list per core node),
          numCores and numSpokeSum
    M  -- unused here
    """
    # update Error
    coverJellyFish(G, E, jf)

    cost = LN(jf.numCores)                                          # number of core nodes
    cost += LU(G.numNodes, jf.numCores)                             # core node ids

    cost += LN(jf.numSpokeSum) + LC(jf.numSpokeSum, jf.numCores)    # number of spokes per core node
    cost += LU(G.numNodes - jf.numCores, jf.numSpokeSum)            # spoke ids (-no- overlap between sets!)
    return cost


def coverJellyFish(G, E, jf):
    """Update E for a jellyfish: linked core, core-spoke links, optional spoke 0s."""
    # first link up the nodes in the core
    for i_idx in range(len(jf.cNodes)):
        i = jf.cNodes[i_idx]
        for j_idx in range(i_idx + 1, len(jf.cNodes)):
            _coverCellAsEdge(G, E, i, jf.cNodes[j_idx])

    # 2. link up the core nodes up to their respective spokes
    for i_idx in range(len(jf.cNodes)):
        i = jf.cNodes[i_idx]
        for j_idx in range(len(jf.sNodes[i_idx])):
            _coverCellAsEdge(G, E, i, jf.sNodes[i_idx][j_idx])

    if config.optModelZeroes == True:
        # 3. model that the spokes within a set are not connected
        # !!! code can be made more efficient, by incorporating it in previous loop
        for i_idx in range(len(jf.cNodes)):
            spokes = jf.sNodes[i_idx]

            for j_idx in range(len(spokes) - 1):
                j = spokes[j_idx]

                for k_idx in range(j_idx + 1, len(spokes)):
                    k = spokes[k_idx]

                    # only if (j,k) is not already modelled perfectly;
                    # we don't change previous modelling either
                    if not E.isExcluded(j, k) and not E.isModelled(j, k):
                        # cell not yet modelled, and should be a 0
                        if G.hasEdge(j, k):
                            # but, it has a 1, change it to modelling error
                            E.delUnmodelledError(j, k)
                            E.addModellingError(j, k)
                        E.cover(j, k)
    return


# Encoded Size of a core periphery
def LcorePeriphery(cp, M, G, E):
    """Return the encoded size of a core periphery and update E accordingly.

    cp -- structure providing cNodes, sNodes, numCores and numSpokes
    M  -- unused here
    """
    # update Error
    coverCorePeriphery(G, E, cp)

    cost = LN(cp.numCores)                                        # number of core-nodes
    cost += LN(cp.numSpokes)                                      # number of spoke-nodes
    cost += cp.numCores * log(G.numNodes, 2)                      # identify core-nodes
    cost += cp.numSpokes * log(G.numNodes - cp.numCores, 2)       # identify spoke-nodes
    return cost


# check whether ok
def coverCorePeriphery(G, E, cp):
    """Update E for a core periphery: every core-spoke cell is modelled as a 1.

    Only cells that E does not report as modelled yet are touched; exclusion
    is not consulted here (the original author marked this routine
    "check whether ok").
    """
    for i in cp.cNodes:
        for j in cp.sNodes:
            if not E.isModelled(i, j):
                if G.hasEdge(i, j):
                    E.delUnmodelledError(i, j)
                else:
                    E.addModellingError(i, j)
                E.cover(i, j)
    return


# Encoded Size of a core periphery (a bit smarter)
def LcorePeripheryA(cp, M, G, E):
    """Return the encoded size of a core periphery, identifying nodes via LU().

    The core-node count is read from cp.numCoreNodes as before; when the
    structure does not have that attribute, cp.numCores -- the name
    LcorePeriphery() uses for the same structure -- is used instead.

    NOTE(review): unlike the other L*() routines this one does not update E;
    M and E are unused.
    """
    numCores = getattr(cp, 'numCoreNodes', None)
    if numCores is None:
        numCores = cp.numCores

    cost = LN(numCores)                                  # number of core-nodes
    cost += LN(cp.numSpokes)                             # number of spoke-nodes
    cost += LU(G.numNodes, numCores)                     # identify core-nodes
    cost += LU(G.numNodes - numCores, cp.numSpokes)      # identify spoke-nodes
    return cost


### Encoding the Error

# here I encode all errors uniformly by a binomial -- hence, not yet the
# typed advanced stuff yet!
def LErrorNaiveBinom(G, M, E):
    """Return the cost of all errors, encoded together with LU().

    Modelling and unmodelled errors are pooled; the number of candidate cells
    is the number of possible edges minus the excluded cells.  M is unused.
    The cost is printed when config.optVerbosity > 1.
    """
    cost = LU(_numPossibleEdges(G) - E.numCellsExcluded,
              E.numUnmodelledErrors + E.numModellingErrors)
    _reportCost(' - L_nb(E)', cost)
    return cost


def LErrorNaivePrefix(G, M, E):
    """Return the cost of all errors, pooled as in LErrorNaiveBinom() but using LnU().

    M is unused.  The cost is printed when config.optVerbosity > 1.
    """
    cost = LnU(_numPossibleEdges(G) - E.numCellsExcluded,
               E.numModellingErrors + E.numUnmodelledErrors)
    _reportCost(' - L_np(E)', cost)
    return cost


# here the two error types are encoded separately, each with LU()
def LErrorTypedBinom(G, M, E):
    """Return the summed cost of modelling errors and unmodelled errors.

    Modelling errors are encoded over the covered cells that are not excluded,
    unmodelled errors over the cells that are not covered.  M is unused.
    Both partial costs are printed when config.optVerbosity > 1.
    """
    # First encode the modelling errors
    costM = LU(E.numCellsCovered - E.numCellsExcluded, E.numModellingErrors)
    _reportCost(' - L_tb(E+)', costM)

    # Second encode the unmodelled errors (excluded cells are always covered!)
    costU = LU(_numPossibleEdges(G) - E.numCellsCovered, E.numUnmodelledErrors)
    _reportCost(' - L_tb(E-)', costU)
    return costM + costU


def LErrorTypedPrefix(G, M, E):
    """Return the typed error cost as in LErrorTypedBinom(), but using LnU().

    M is unused.  Both partial costs are printed when config.optVerbosity > 1.
    """
    costM = LnU(E.numCellsCovered - E.numCellsExcluded, E.numModellingErrors)
    _reportCost(' - L_tp(E+)', costM)
    costU = LnU(_numPossibleEdges(G) - E.numCellsCovered, E.numUnmodelledErrors)
    _reportCost(' - L_tp(E-)', costU)
    return costM + costU


def _numPossibleEdges(G):
    """Possible number of edges in an undirected, non-self-connected graph of N nodes."""
    # N*N - N is always even; `//` keeps the result an integer on Python 3
    # and equals the former `/` for integer node counts on Python 2.
    return (G.numNodes * G.numNodes - G.numNodes) // 2


def _reportCost(label, cost):
    """Print `label` and `cost` separated by one space when verbosity exceeds 1."""
    if config.optVerbosity > 1:
        # Single pre-formatted argument: same output as the former
        # `print label, cost` statement, and valid on Python 2 and Python 3.
        print('%s %s' % (label, cost))

# --------------------------------------------------------------------------------