0001 function label = kernel4d (G, nseeds)
0002 % KERNEL4D : SSCA#2 Kernel 4 -- Graph Clustering
0003 %
0004 % label = kernel4d (G, nseeds)
0005 % Input:  G - directed, weighted multigraph (see kernel1 for details)
0006 %         attach - if nonzero, attach 1-clusters to big ones (default 0)
0007 %         nseeds - number of clusters to seed per iteration (default n/50)
0008 % Output: label - vector that labels each vertex of G with a
0009 %                    cluster number (an integer >= 1)
0010 %
0011 % I will make several versions of kernel4 with different algorithms.
0012 % Kernel4d is an experiment using seed-growing breadth-first search.
0013 %
0014 % This is a concise sequential Matlab code by John R. Gilbert, 
0015 % based loosely on findSubgraphs in V1.0 of the executable spec 
0016 % by Bill Mann and Theresa Meuse.
0017 % Version of 8 Sep 2005
0018 % Modified 25 Oct 2005 by JRG
0019 
0020 fprintf('kernel4d version of 25 oct 2005\n\n');
0021 
0022 % Strip G down to an undirected 0-1 adjacency matrix.
0023 if isstruct(G)
0024     if isfield(G,'graph')
0025         G = G.graph;
0026     else 
0027         G = undirect(G.edgeWeights{1});
0028     end;
0029 end;
0030 if nnz(diag(G)) > 0
0031     G = G - diag(diag(G));
0032 end;
0033 if max(max(G)) > 1 || sum(sum(G)) ~= nnz(G)
0034     G = G~=0;
0035 end;
0036 
0037 n = size(G,1);
0038 if nargin < 2
0039     attach = 0;
0040 end;
0041 if nargin < 3
0042     nseeds = ceil(n/50);    % number of clusters to seed per iteration
0043 end;
0044 
0045 label = zeros(1,n);
0046 base = 0;
0047 news = 1;
0048 niters = 0;
0049 
0050 % iterate until all vertices labelled or no new vertices are labelled
0051 while (nnz(label) < n) && (nnz(news) > 0) 
0052     niters = niters+1;
0053     
0054     % seed new clusters from the first nseeds unlabelled vertices
0055     f = find(label==0);
0056     nseeds = min(nseeds, length(f));
0057     fprintf('iteration: %d, new seeds: %d, ', niters, double(nseeds));
0058     v = f(1:nseeds);
0059     
0060     % grow each cluster to include vertices reachable by at least two
0061     %    paths of length one or two from the seed
0062     C = sparse(v, 1:nseeds, 1, n, nseeds);
0063     C = G*C;
0064     C = C + G*C;
0065     C = C > 5;
0066     
0067     % assign each vertex to the first cluster that claimed it
0068     news = firstinrow(C);
0069     
0070     % omit vertices that were labelled at an earlier iteration
0071     news(label~=0) = 0;
0072       
0073     % label new vertices with their cluster numbers
0074     fprintf('new cluster vertices: %d, remaining: %d\n', ...
0075         nnz(news), double(n-nnz(label)-nnz(news)));
0076     label(news~=0) = news(news~=0) + base;
0077     base = base + nseeds;
0078 end;
0079 
0080 % Put each leftover vertex in its own cluster
0081 f = find(label==0);
0082 fprintf('leftover vertices: %d\n',nnz(f));
0083 label(f) = (base+1 : base+length(f));
0084 
0085 % make the labels consecutive integers starting from 1
0086 label = fixuplabels(label);
0087 
0088 % Attach each singleton cluster to an adjacent cluster
0089 %if attach
0090 if 0
0091     csize = full(sparse(label, 1, 1));
0092     fprintf('reattaching %d singleton clusters\n',nnz(csize==1));
0093     mysize = csize(label); % The size of the cluster I belong to.
0094     v = find(mysize==1);
0095     [I,J] = find(G(:,v));
0096     S = sparse(label(I),J,1,max(label),length(v));
0097     [ignore,newl] = max (S);
0098     label(v) = newl;    
0099 end;
0100 
0101 
0102 % plot histogram of final cluster sizes
0103 %if n < 10000
0104 %    dlabel = pp2matlab(label);
0105 %    csize = histc(dlabel,1:max(dlabel));
0106 %    figure; 
0107 %    hist(csize,min(csize):max(csize)); 
0108 %    title('Histogram of Cluster Sizes');
0109 %end;