0001 function cutsize = validate4 (A, clusters, Verif, verbose)
0002 % VALIDATE4 : Validate output from SSCA#2 Kernel 4 -- Graph Clustering
0003 %
0004 % cutsize = validate4 (G, clusters, Verif, verbose)
0005 %
0006 % Input:  G - directed, weighted multigraph (see kernel1 for details)
0007 %         clusters - vector that labels each vertex of G with a
0008 %                    cluster number, as output from kernel4
0009 %         Verif   - verification struct from LL data generator (optional)
0010 %         maxsize - max allowed cluster size in vertices (default Inf)
0011 %         verbose - output level, as follows:
0012 %                   0: no output or plots
0013 %                   1: print out statistics, no plots (default)
0014 %                   2: print and plot statistics
0015 %                   3: print and plot statistics and spy plot of graph
0016 %
0017 % Output: cutsize - number of edges that join different clusters
0018 %
0019 % This does several tests on the output from an implementation of
0020 % kernel 4 to verify that it's correct. 
0021 %
0022 % John R. Gilbert, 20 Mar 2005.
0023 % Viral Shah, 22 May 2006.
0024 
0025 if nargin < 3
0026     Verif = [];
0027 end;
0028 if nargin < 4
0029     verbose = 1;
0030 end;
0031 if verbose > 0
0032     fprintf('\n** Validating output from SSCA#2 kernel 4 ** \n');
0033 end;
0034 if isstruct(A)
0035     if isfield(A,'graph')
0036         A = A.graph;
0037     else 
0038         A = undirect(A.edgeWeights{1});
0039     end;
0040 end;
0041 cutsize = Inf;
0042 [nv, m] = size(A);
0043 if nv ~= m || nnz(A|A') ~= nnz(A)
0044     error('Input matrix is not square or not symmetric in structure.');
0045 end;
0046 if ~ isempty(Verif) && Verif.totVertices ~= nv
0047     fprintf('Warning: verification struct disagrees with number of vertices:\n');
0048     fprintf('         Verif.totVertices = %d, nv = %d\n\n', Verif.totVertices, nv);
0049 end;
0050 if min(size(clusters)) ~= 1 || max(size(clusters)) ~= nv
0051     if verbose > 0
0052         fprintf('Bad output: cluster vector is wrong size.\n');
0053     end;
0054     return;
0055 end;
0056 if min(clusters) < 1 || any(round(clusters) ~= clusters)
0057     if verbose > 0
0058         fprintf('Bad output: cluster vector is not all positive integers.\n');
0059     end;
0060     return;
0061 end;
0062 maxsize = Inf;
0063 clusters = fixuplabels (clusters);      % number clusters 1:nclusters
0064 nclusters = max(clusters);              % number of clusters
0065 csizes = histc(clusters,1:nclusters);   % csizes(i) is # vtxs in cluster i
0066 maxcluster = max(csizes);               % # vtxs in largest cluster
0067 ncbysize = histc(csizes,1:maxcluster);  % ncbysize(j) is # clusters with j vtxs
0068 A = A - diag(diag (A));                 % Make sure no self edges
0069 nedges = nnz(A)/2;                      % # edges in graph 
0070 [i,j] = find(A);
0071 condensation = sparse(clusters(i),clusters(j),1,nclusters,nclusters);
0072 edgesinside = sum(full(diag(condensation)))/2; % total # intracluster edges
0073 edgesoutside = nedges - edgesinside;           % total # intercluster edges
0074 densityinside = 2*edgesinside / sum(csizes .* (csizes-1));
0075 if ~ isempty(Verif)
0076     maxclique = Verif.maxCliqueSize;
0077     ncliques = max(Verif.cliqueID);
0078     maxsize = maxclique;
0079     if isfield(Verif,'numInterCliqueLinks')
0080         intercliqueedges = Verif.numInterCliqueLinks;
0081     else
0082         intercliqueedges = NaN;
0083     end;
0084     M = edgesoutside / intercliqueedges;
0085 else
0086     maxsize = Inf;
0087     maxclique = NaN;
0088     ncliques = NaN;
0089     intercliqueedges = NaN;
0090     M = NaN; 
0091 end;
0092 if verbose > 0
0093     fprintf('\n');
0094 
0095     fprintf('Scale                          : %d\n', fix(log2(nv)));
0096     fprintf('Number of graph vertices       : %d\n', nv);
0097     fprintf('Number of graph edges          : %d\n', nedges);
0098     fprintf('Intercluster/Interclique edges : %f\n', M);
0099     
0100     fprintf('\n\t\t\t\tINPUT     OUTPUT      CONFORM\n');
0101 
0102     if (maxcluster <= maxclique)
0103       conform = 'YES';
0104     else
0105       conform = sprintf ('NO: %d extra vertices', maxcluster - maxclique);
0106     end
0107     fprintf('Max Cluster Size         : %10d %10d      %s\n', ...
0108         maxclique,  maxcluster, conform);
0109 
0110     fprintf('Number of clusters       : %10d %10d\n', ...
0111         ncliques, nclusters);
0112 
0113     fprintf('Density inside clusters  : %10.2f %10.2f\n', ...
0114         1, densityinside);
0115 
0116     if edgesoutside < floor(1.05 * intercliqueedges)
0117       conform = 'YES';
0118     else
0119       conform = sprintf ('NO: %d%% extra edges', ...
0120              ceil ((M - 1.05) * 100));
0121     end
0122     fprintf('Edges between clusters   : %10d %10d      %s\n', ...
0123         intercliqueedges, edgesoutside, conform);
0124 
0125     fprintf('\n');
0126 end;
0127 if verbose >= 2
0128 %    figure; 
0129 %    bar(1:maxcluster,ncbysize,'histc');
0130 %    title('Histogram of cluster sizes'); xlabel('size'); ylabel('clusters');
0131     
0132     % In the input graph
0133     Vnclusters = max(Verif.cliqueID);            % number of clusters
0134     Vcsizes = histc(Verif.cliqueID,1:Vnclusters); % csizes(i) is # vtxs in cluster i
0135     Vmaxcluster = max(Vcsizes);               % # vtxs in largest cluster
0136     Vncbysize = histc(Vcsizes,1:Vmaxcluster);  % ncbysize(j) is # clusters with j vtx
0137 %    figure;
0138 %    bar(1:Vmaxcluster,Vncbysize,'histc');
0139 %    title('Histogram of clique sizes'); xlabel('size'); ylabel('cliques');
0140     
0141     figure;
0142     Vmaxcluster = max (maxcluster, Vmaxcluster);
0143     l1 = length(ncbysize); 
0144     l2 = length(Vncbysize);
0145     if l1 > l2
0146         Vncbysize = [Vncbysize zeros(l1-l2,1)'];
0147         Vmaxcluster = length (Vncbysize);
0148     else
0149         ncbysize = [ncbysize zeros(l2-l1,1)'];
0150         Vmaxcluster = length (ncbysize);
0151     end
0152     if size(ncbysize, 2) == 1
0153         ncbysize = ncbysize';
0154     end    
0155     bar (1:Vmaxcluster, [Vncbysize; ncbysize]', 'grouped');
0156     colormap cool
0157     legend ('Input', 'Output', 'Location', 'BestOutside');
0158     title('Histogram of cluster sizes'); xlabel('size'); ylabel('clusters');
0159 end;
0160 if verbose >= 3
0161     figure; 
0162     spy(A); title('Input graph in original order');
0163     [ignore,p] = sort(clusters);
0164     figure;
0165     spy(A(p,p)|speye(nv)); title('Input graph in cluster order');
0166     figure; 
0167     spy(condensation);
0168     title('Graph of intercluster connections');
0169 end;
0170 cutsize = edgesoutside;
0171 %if maxcluster > maxsize
0172 %    if verbose >= 1
0173 %        fprintf('Bad output: largest cluster exceeds maximum size.\n');
0174 %    end;
0175 %end;
0176 %allowed = floor(1.05 * intercliqueedges);
0177 %if cutsize > allowed
0178 %    fprintf ('Bad output: Too many edges between clusters. Allowed: %d, Excess: %d (%d%%)\n', ... 
0179 %        allowed, cutsize-allowed, ...
0180 %        ceil(((cutsize-allowed)/intercliqueedges)*100) );
0181 %end