Graphviz 14.1.3~dev.20260201.2050
Loading...
Searching...
No Matches
clustering.c
Go to the documentation of this file.
1/*************************************************************************
2 * Copyright (c) 2011 AT&T Intellectual Property
3 * All rights reserved. This program and the accompanying materials
4 * are made available under the terms of the Eclipse Public License v1.0
5 * which accompanies this distribution, and is available at
6 * https://www.eclipse.org/legal/epl-v10.html
7 *
8 * Contributors: Details at https://graphviz.org
9 *************************************************************************/
10
11#include "config.h"
12
13#define STANDALONE
14#include <math.h>
15#include <sparse/general.h>
16#include <sparse/SparseMatrix.h>
17#include <sparse/clustering.h>
18#include <stdbool.h>
19#include <util/alloc.h>
20
22 int n = A->n, i, j;
23
24 assert(A->type == MATRIX_TYPE_REAL);
25 assert(SparseMatrix_is_symmetric(A, false));
26
27 if (!A) return NULL;
28 assert(A->m == n);
30 grid->level = level;
31 grid->n = n;
32 grid->A = A;
33 grid->P = NULL;
34 grid->next = NULL;
35 grid->prev = NULL;
36 grid->delete_top_level_A = false;
37 grid->matching = gv_calloc(n, sizeof(double));
38 grid->deg = NULL;
39 grid->agglomerate_regardless = false;
40
41 if (level == 0){
42 double modularity = 0;
43 int *ia = A->ia, *ja = A->ja;
44 double deg_total = 0;
45 double *deg, *a = A->a;
46
47 grid->deg_total = 0.;
48 grid->deg = gv_calloc(n, sizeof(double));
49 deg = grid->deg;
50
51 double *indeg = gv_calloc(n, sizeof(double));
52 for (i = 0; i < n; i++){
53 deg[i] = 0;
54 indeg[i] = 0.;
55 for (j = ia[i]; j < ia[i+1]; j++){
56 deg[i] += a[j];
57 if (ja[j] == i) indeg[i] = a[j];
58 }
59 deg_total += deg[i];
60 }
61 deg_total = fmax(deg_total, 1);
62 for (i = 0; i < n; i++){
63 modularity += (indeg[i] - deg[i]*deg[i]/deg_total)/deg_total;
64 }
65 grid->deg_total = deg_total;
66 grid->deg = deg;
67 grid->modularity = modularity;
68 free(indeg);
69 }
70
71
72 return grid;
73}
74
76 if (!grid) return;
77 if (grid->A){
78 if (grid->level == 0) {
79 if (grid->delete_top_level_A) SparseMatrix_delete(grid->A);
80 } else {
82 }
83 }
85 free(grid->matching);
86 free(grid->deg);
87
89 free(grid);
90}
91
93 int *matching = grid->matching;
94 SparseMatrix A = grid->A;
95 int n = grid->n, level = grid->level, nc = 0;
96 double modularity = 0;
97 int *ia = A->ia, *ja = A->ja;
98 double *deg = grid->deg;
99 int i, j, jj, jc, jmax;
100 double inv_deg_total = 1./ grid->deg_total;
101 double gain;
102 double maxgain;
103 double total_gain = 0;
104 modularity = grid->modularity;
105
106 double *deg_new = gv_calloc(n, sizeof(double));
107 double *deg_inter = gv_calloc(n, sizeof(double));
108 int *mask = gv_calloc(n, sizeof(int));
109 for (i = 0; i < n; i++) mask[i] = -1;
110
111 assert(n == A->n);
112 for (i = 0; i < n; i++) matching[i] = UNMATCHED;
113
114 /* gain in merging node i into cluster j is
115 deg(i,j)/deg_total - 2*deg(i)*deg(j)/deg_total^2
116 */
117 double *a = A->a;
118 for (i = 0; i < n; i++){
119 if (matching[i] != UNMATCHED) continue;
120 /* accumulate connections between i and clusters */
121 for (j = ia[i]; j < ia[i+1]; j++){
122 jj = ja[j];
123 if (jj == i) continue;
124 if ((jc=matching[jj]) != UNMATCHED){
125 if (mask[jc] != i) {
126 mask[jc] = i;
127 deg_inter[jc] = a[j];
128 } else {
129 deg_inter[jc] += a[j];
130 }
131 }
132 }
133
134 maxgain = 0;
135 jmax = -1;
136 for (j = ia[i]; j < ia[i+1]; j++){
137 jj = ja[j];
138 if (jj == i) continue;
139 if ((jc=matching[jj]) == UNMATCHED){
140 /* the first 2 is due to the fact that deg_iter gives edge weight from i to jj, but there are also edges from jj to i */
141 gain = (2*a[j] - 2*deg[i]*deg[jj]*inv_deg_total)*inv_deg_total;
142 } else {
143 if (deg_inter[jc] > 0){
144 /* the first 2 is due to the fact that deg_iter gives edge weight from i to jc, but there are also edges from jc to i */
145 gain = (2*deg_inter[jc] - 2*deg[i]*deg_new[jc]*inv_deg_total)*inv_deg_total;
146 // printf("mod = %f deg_inter[jc] =%f, deg[i] = %f, deg_new[jc]=%f, gain = %f\n",modularity, deg_inter[jc],deg[i],deg_new[jc],gain);
147 deg_inter[jc] = -1; /* so that we do not redo the calulation when we hit another neighbor in cluster jc */
148 } else {
149 gain = -1;
150 }
151 }
152 if (jmax < 0 || gain > maxgain){
153 maxgain = gain;
154 jmax = jj;
155 }
156
157 }
158
159 /* now merge i and jmax */
160 if (maxgain > 0 || grid->agglomerate_regardless){
161 total_gain += maxgain;
162 jc = matching[jmax];
163 if (jc == UNMATCHED){
164 //fprintf(stderr, "maxgain=%f, merge %d, %d\n",maxgain, i, jmax);
165 matching[i] = matching[jmax] = nc;
166 deg_new[nc] = deg[i] + deg[jmax];
167 nc++;
168 } else {
169 //fprintf(stderr, "maxgain=%f, merge with existing cluster %d, %d\n",maxgain, i, jc);
170 deg_new[jc] += deg[i];
171 matching[i] = jc;
172 }
173 } else {
174 assert(maxgain <= 0);
175 matching[i] = nc;
176 deg_new[nc] = deg[i];
177 nc++;
178 }
179
180 }
181
182 if (Verbose) fprintf(stderr,"modularity = %f new modularity = %f level = %d, n = %d, nc = %d, gain = %g\n", modularity, modularity + total_gain,
183 level, n, nc, total_gain);
184
185 if (ncluster_target > 0){
186 if (nc <= ncluster_target && n >= ncluster_target){
187 if (n - ncluster_target > ncluster_target - nc){/* ncluster = nc */
188
189 } else if (n - ncluster_target <= ncluster_target - nc){/* ncluster_target close to n */
190 fprintf(stderr,"ncluster_target = %d, close to n=%d\n", ncluster_target, n);
191 for (i = 0; i < n; i++) matching[i] = i;
192 free(deg_new);
193 goto RETURN;
194 }
195 } else if (n < ncluster_target){
196 fprintf(stderr,"n < target\n");
197 for (i = 0; i < n; i++) matching[i] = i;
198 free(deg_new);
199 goto RETURN;
200 }
201 }
202
203 if (nc >= 1 && (total_gain > 0 || nc < n)){
204 /* now set up restriction and prolongation operator */
205 SparseMatrix P, R, R0, B, cA;
206 double one = 1.;
208
210 for (i = 0; i < n; i++){
211 jj = matching[i];
213 }
219 if (!B) {
220 free(deg_new);
221 goto RETURN;
222 }
223 cA = SparseMatrix_multiply(B, P);
225 if (!cA) {
226 free(deg_new);
227 goto RETURN;
228 }
229 grid->P = P;
230 level++;
231 cgrid = Multilevel_Modularity_Clustering_init(cA, level);
232 cgrid->deg = deg_new;
233 cgrid->modularity = grid->modularity + total_gain;
234 cgrid->deg_total = grid->deg_total;
235 cgrid = Multilevel_Modularity_Clustering_establish(cgrid, ncluster_target);
236 grid->next = cgrid;
237 cgrid->prev = grid;
238 } else {
239 /* if we want a small number of cluster but right now we have too many, we will force agglomeration */
240 if (ncluster_target > 0 && nc > ncluster_target && !(grid->agglomerate_regardless)){
242 free(deg_inter);
243 free(mask);
244 free(deg_new);
245 return Multilevel_Modularity_Clustering_establish(grid, ncluster_target);
246 }
247 /* no more improvement, stop and final clustering found */
248 for (i = 0; i < n; i++) matching[i] = i;
249 free(deg_new);
250 }
251
252 RETURN:
253 free(deg_inter);
254 free(mask);
255 return grid;
256}
257
259 /* ncluster_target is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
260 is desired. The resulting clustering will give as close to this number as possible.
261 If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
262 . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
263 . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
264 . selected among nc or nc2, which ever is closer to ncluster_target.
265 Default: ncluster_target <= 0 */
266
268 SparseMatrix A = A0;
269
270 if (!SparseMatrix_is_symmetric(A, false) || A->type != MATRIX_TYPE_REAL){
272 }
274
276
277 if (A != A0) grid->delete_top_level_A = true; // be sure to clean up later
278 return grid;
279}
280
281
282static void hierachical_modularity_clustering(SparseMatrix A, int ncluster_target,
283 int *nclusters, int **assignment, double *modularity){
284 /* find a clustering of vertices by maximize modularity
285 A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
286
287 ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
288 is desired. The resulting clustering will give as close to this number as possible.
289 If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
290 . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
291 . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
292 . selected among nc or nc2, which ever is closer to ncluster_target.
293 Default: ncluster_target <= 0
294
295 nclusters: on output the number of clusters
296 assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
297 */
298
300 int *matching, i;
301 SparseMatrix P;
302 assert(A->m == A->n);
303
304 *modularity = 0.;
305
306 grid = Multilevel_Modularity_Clustering_new(A, ncluster_target);
307
308 /* find coarsest */
309 cgrid = grid;
310 while (cgrid->next){
311 cgrid = cgrid->next;
312 }
313
314 /* project clustering up */
315 double *u = gv_calloc(cgrid->n, sizeof(double));
316 for (i = 0; i < cgrid->n; i++) u[i] = (double) (cgrid->matching)[i];
317 *nclusters = cgrid->n;
318 *modularity = cgrid->modularity;
319
320 while (cgrid->prev){
321 double *v = NULL;
322 P = cgrid->prev->P;
324 free(u);
325 u = v;
326 cgrid = cgrid->prev;
327 }
328
329 if (*assignment){
330 matching = *assignment;
331 } else {
332 matching = gv_calloc(grid->n, sizeof(int));
333 *assignment = matching;
334 }
335 for (i = 0; i < grid->n; i++) (matching)[i] = (int) u[i];
336 free(u);
337
339}
340
341
342
343void modularity_clustering(SparseMatrix A, bool inplace, int ncluster_target,
344 int *nclusters, int **assignment, double *modularity){
345 /* find a clustering of vertices by maximize modularity
346 A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
347 inplace: whether A can e modified. If true, A will be modified by removing diagonal.
348 ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
349 is desired. The resulting clustering will give as close to this number as possible.
350 If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
351 . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
352 . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
353 . selected among nc or nc2, which ever is closer to ncluster_target.
354 Default: ncluster_target <= 0
355 nclusters: on output the number of clusters
356 assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
357 */
359
360 assert(A->m == A->n);
361
362 B = SparseMatrix_symmetrize(A, false);
363
364 if (!inplace && B == A) {
366 }
367
369
371
372 hierachical_modularity_clustering(B, ncluster_target, nclusters, assignment, modularity);
373
374 if (B != A) SparseMatrix_delete(B);
375
376}
SparseMatrix SparseMatrix_from_coordinate_format(SparseMatrix A)
SparseMatrix SparseMatrix_transpose(SparseMatrix A)
SparseMatrix SparseMatrix_symmetrize(SparseMatrix A, bool pattern_symmetric_only)
bool SparseMatrix_is_symmetric(SparseMatrix A, bool test_pattern_symmetry_only)
void SparseMatrix_multiply_vector(SparseMatrix A, double *v, double **res)
SparseMatrix SparseMatrix_multiply(SparseMatrix A, SparseMatrix B)
void SparseMatrix_delete(SparseMatrix A)
SparseMatrix SparseMatrix_copy(SparseMatrix A)
SparseMatrix SparseMatrix_set_entries_to_real_one(SparseMatrix A)
SparseMatrix SparseMatrix_get_real_adjacency_matrix_symmetrized(SparseMatrix A)
SparseMatrix SparseMatrix_remove_diagonal(SparseMatrix A)
SparseMatrix SparseMatrix_new(int m, int n, size_t nz, int type, int format)
@ MATRIX_TYPE_REAL
@ FORMAT_COORD
#define SparseMatrix_coordinate_form_add_entry(A, irn, jcn, val)
wrap SparseMatrix_coordinate_form_add_entry_ for type safety
Memory allocation wrappers that exit on failure.
static void * gv_calloc(size_t nmemb, size_t size)
Definition alloc.h:26
static void * gv_alloc(size_t size)
Definition alloc.h:47
static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_establish(Multilevel_Modularity_Clustering grid, int ncluster_target)
Definition clustering.c:92
void modularity_clustering(SparseMatrix A, bool inplace, int ncluster_target, int *nclusters, int **assignment, double *modularity)
Definition clustering.c:343
static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_new(SparseMatrix A0, int ncluster_target)
Definition clustering.c:258
static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_init(SparseMatrix A, int level)
Definition clustering.c:21
static void hierachical_modularity_clustering(SparseMatrix A, int ncluster_target, int *nclusters, int **assignment, double *modularity)
Definition clustering.c:282
static void Multilevel_Modularity_Clustering_delete(Multilevel_Modularity_Clustering grid)
Definition clustering.c:75
#define A(n, t)
Definition expr.h:76
@ UNMATCHED
Definition general.h:77
static bool Verbose
Definition gml2gv.c:26
void free(void *)
node NULL
Definition grammar.y:181
@ grid
Definition gvgen.c:34
#define B
Definition hierarchy.c:120
#define RETURN(v)
Definition strmatch.c:146
Multilevel_Modularity_Clustering prev
Definition clustering.h:27
Multilevel_Modularity_Clustering next
Definition clustering.h:26