xref: /petsc/src/mat/impls/sbaij/mpi/sbaijov.c (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix.
4    Used for finding submatrices that were shared across processors.
5 */
6 #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
7 #include <petscbt.h>
8 
9 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat, PetscInt, IS *);
10 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat, PetscInt *, PetscInt, PetscInt *, PetscBT *);
11 
12 PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C, PetscInt is_max, IS is[], PetscInt ov) {
13   PetscInt        i, N = C->cmap->N, bs = C->rmap->bs, M = C->rmap->N, Mbs = M / bs, *nidx, isz, iov;
14   IS             *is_new, *is_row;
15   Mat            *submats;
16   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ *)C->data;
17   Mat_SeqSBAIJ   *asub_i;
18   PetscBT         table;
19   PetscInt       *ai, brow, nz, nis, l, nmax, nstages_local, nstages, max_no, pos;
20   const PetscInt *idx;
21   PetscBool       flg;
22 
23   PetscFunctionBegin;
24   PetscCall(PetscMalloc1(is_max, &is_new));
25   /* Convert the indices into block format */
26   PetscCall(ISCompressIndicesGeneral(N, C->rmap->n, bs, is_max, is, is_new));
27   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
28 
29   /* ----- previous non-scalable implementation ----- */
30   flg = PETSC_FALSE;
31   PetscCall(PetscOptionsHasName(NULL, NULL, "-IncreaseOverlap_old", &flg));
32   if (flg) { /* previous non-scalable implementation */
33     printf("use previous non-scalable implementation...\n");
34     for (i = 0; i < ov; ++i) { PetscCall(MatIncreaseOverlap_MPISBAIJ_Once(C, is_max, is_new)); }
35   } else { /* implementation using modified BAIJ routines */
36 
37     PetscCall(PetscMalloc1(Mbs + 1, &nidx));
38     PetscCall(PetscBTCreate(Mbs, &table)); /* for column search */
39 
40     /* Create is_row */
41     PetscCall(PetscMalloc1(is_max, &is_row));
42     PetscCall(ISCreateStride(PETSC_COMM_SELF, Mbs, 0, 1, &is_row[0]));
43 
44     for (i = 1; i < is_max; i++) { is_row[i] = is_row[0]; /* reuse is_row[0] */ }
45 
46     /* Allocate memory to hold all the submatrices - Modified from MatCreateSubMatrices_MPIBAIJ() */
47     PetscCall(PetscMalloc1(is_max + 1, &submats));
48 
49     /* Determine the number of stages through which submatrices are done */
50     nmax = 20 * 1000000 / (c->Nbs * sizeof(PetscInt));
51     if (!nmax) nmax = 1;
52     nstages_local = is_max / nmax + ((is_max % nmax) ? 1 : 0);
53 
54     /* Make sure every processor loops through the nstages */
55     PetscCall(MPIU_Allreduce(&nstages_local, &nstages, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
56 
57     for (iov = 0; iov < ov; ++iov) {
58       /* 1) Get submats for column search */
59       for (i = 0, pos = 0; i < nstages; i++) {
60         if (pos + nmax <= is_max) max_no = nmax;
61         else if (pos == is_max) max_no = 0;
62         else max_no = is_max - pos;
63         c->ijonly = PETSC_TRUE; /* only matrix data structures are requested */
64         /* The resulting submatrices should be BAIJ, not SBAIJ, hence we change this value to trigger that */
65         PetscCall(PetscStrcpy(((PetscObject)c->A)->type_name, MATSEQBAIJ));
66         PetscCall(MatCreateSubMatrices_MPIBAIJ_local(C, max_no, is_row + pos, is_new + pos, MAT_INITIAL_MATRIX, submats + pos));
67         PetscCall(PetscStrcpy(((PetscObject)c->A)->type_name, MATSEQSBAIJ));
68         pos += max_no;
69       }
70 
71       /* 2) Row search */
72       PetscCall(MatIncreaseOverlap_MPIBAIJ_Once(C, is_max, is_new));
73 
74       /* 3) Column search */
75       for (i = 0; i < is_max; i++) {
76         asub_i = (Mat_SeqSBAIJ *)submats[i]->data;
77         ai     = asub_i->i;
78 
79         /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */
80         PetscCall(PetscBTMemzero(Mbs, table));
81 
82         PetscCall(ISGetIndices(is_new[i], &idx));
83         PetscCall(ISGetLocalSize(is_new[i], &nis));
84         for (l = 0; l < nis; l++) {
85           PetscCall(PetscBTSet(table, idx[l]));
86           nidx[l] = idx[l];
87         }
88         isz = nis;
89 
90         /* add column entries to table */
91         for (brow = 0; brow < Mbs; brow++) {
92           nz = ai[brow + 1] - ai[brow];
93           if (nz) {
94             if (!PetscBTLookupSet(table, brow)) nidx[isz++] = brow;
95           }
96         }
97         PetscCall(ISRestoreIndices(is_new[i], &idx));
98         PetscCall(ISDestroy(&is_new[i]));
99 
100         /* create updated is_new */
101         PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz, nidx, PETSC_COPY_VALUES, is_new + i));
102       }
103 
104       /* Free tmp spaces */
105       for (i = 0; i < is_max; i++) { PetscCall(MatDestroy(&submats[i])); }
106     }
107 
108     PetscCall(PetscBTDestroy(&table));
109     PetscCall(PetscFree(submats));
110     PetscCall(ISDestroy(&is_row[0]));
111     PetscCall(PetscFree(is_row));
112     PetscCall(PetscFree(nidx));
113   }
114 
115   for (i = 0; i < is_max; i++) PetscCall(ISDestroy(&is[i]));
116   PetscCall(ISExpandIndicesGeneral(N, N, bs, is_max, is_new, is));
117 
118   for (i = 0; i < is_max; i++) PetscCall(ISDestroy(&is_new[i]));
119   PetscCall(PetscFree(is_new));
120   PetscFunctionReturn(0);
121 }
122 
123 typedef enum {
124   MINE,
125   OTHER
126 } WhoseOwner;
127 /*  data1, odata1 and odata2 are packed in the format (for communication):
128        data[0]          = is_max, no of is
129        data[1]          = size of is[0]
130         ...
131        data[is_max]     = size of is[is_max-1]
132        data[is_max + 1] = data(is[0])
133         ...
134        data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i])
135         ...
136    data2 is packed in the format (for creating output is[]):
137        data[0]          = is_max, no of is
138        data[1]          = size of is[0]
139         ...
140        data[is_max]     = size of is[is_max-1]
141        data[is_max + 1] = data(is[0])
142         ...
143        data[is_max + 1 + Mbs*i) = data(is[i])
144         ...
145 */
146 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C, PetscInt is_max, IS is[]) {
147   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ *)C->data;
148   PetscMPIInt     size, rank, tag1, tag2, *len_s, nrqr, nrqs, *id_r1, *len_r1, flag, len, *iwork;
149   const PetscInt *idx_i;
150   PetscInt        idx, isz, col, *n, *data1, **data1_start, *data2, *data2_i, *data, *data_i;
151   PetscInt        Mbs, i, j, k, *odata1, *odata2;
152   PetscInt        proc_id, **odata2_ptr, *ctable = NULL, *btable, len_max, len_est;
153   PetscInt        proc_end = 0, len_unused, nodata2;
154   PetscInt        ois_max; /* max no of is[] in each of processor */
155   char           *t_p;
156   MPI_Comm        comm;
157   MPI_Request    *s_waits1, *s_waits2, r_req;
158   MPI_Status     *s_status, r_status;
159   PetscBT        *table; /* mark indices of this processor's is[] */
160   PetscBT         table_i;
161   PetscBT         otable; /* mark indices of other processors' is[] */
162   PetscInt        bs = C->rmap->bs, Bn = c->B->cmap->n, Bnbs = Bn / bs, *Bowners;
163   IS              garray_local, garray_gl;
164 
165   PetscFunctionBegin;
166   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
167   size = c->size;
168   rank = c->rank;
169   Mbs  = c->Mbs;
170 
171   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
172   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
173 
174   /* create tables used in
175      step 1: table[i] - mark c->garray of proc [i]
176      step 3: table[i] - mark indices of is[i] when whose=MINE
177              table[0] - mark incideces of is[] when whose=OTHER */
178   len = PetscMax(is_max, size);
179   PetscCall(PetscMalloc2(len, &table, (Mbs / PETSC_BITS_PER_BYTE + 1) * len, &t_p));
180   for (i = 0; i < len; i++) { table[i] = t_p + (Mbs / PETSC_BITS_PER_BYTE + 1) * i; }
181 
182   PetscCall(MPIU_Allreduce(&is_max, &ois_max, 1, MPIU_INT, MPI_MAX, comm));
183 
184   /* 1. Send this processor's is[] to other processors */
185   /*---------------------------------------------------*/
186   /* allocate spaces */
187   PetscCall(PetscMalloc1(is_max, &n));
188   len = 0;
189   for (i = 0; i < is_max; i++) {
190     PetscCall(ISGetLocalSize(is[i], &n[i]));
191     len += n[i];
192   }
193   if (!len) {
194     is_max = 0;
195   } else {
196     len += 1 + is_max; /* max length of data1 for one processor */
197   }
198 
199   PetscCall(PetscMalloc1(size * len + 1, &data1));
200   PetscCall(PetscMalloc1(size, &data1_start));
201   for (i = 0; i < size; i++) data1_start[i] = data1 + i * len;
202 
203   PetscCall(PetscMalloc4(size, &len_s, size, &btable, size, &iwork, size + 1, &Bowners));
204 
205   /* gather c->garray from all processors */
206   PetscCall(ISCreateGeneral(comm, Bnbs, c->garray, PETSC_COPY_VALUES, &garray_local));
207   PetscCall(ISAllGather(garray_local, &garray_gl));
208   PetscCall(ISDestroy(&garray_local));
209   PetscCallMPI(MPI_Allgather(&Bnbs, 1, MPIU_INT, Bowners + 1, 1, MPIU_INT, comm));
210 
211   Bowners[0] = 0;
212   for (i = 0; i < size; i++) Bowners[i + 1] += Bowners[i];
213 
214   if (is_max) {
215     /* hash table ctable which maps c->row to proc_id) */
216     PetscCall(PetscMalloc1(Mbs, &ctable));
217     for (proc_id = 0, j = 0; proc_id < size; proc_id++) {
218       for (; j < C->rmap->range[proc_id + 1] / bs; j++) ctable[j] = proc_id;
219     }
220 
221     /* hash tables marking c->garray */
222     PetscCall(ISGetIndices(garray_gl, &idx_i));
223     for (i = 0; i < size; i++) {
224       table_i = table[i];
225       PetscCall(PetscBTMemzero(Mbs, table_i));
226       for (j = Bowners[i]; j < Bowners[i + 1]; j++) { /* go through B cols of proc[i]*/
227         PetscCall(PetscBTSet(table_i, idx_i[j]));
228       }
229     }
230     PetscCall(ISRestoreIndices(garray_gl, &idx_i));
231   } /* if (is_max) */
232   PetscCall(ISDestroy(&garray_gl));
233 
234   /* evaluate communication - mesg to who, length, and buffer space */
235   for (i = 0; i < size; i++) len_s[i] = 0;
236 
237   /* header of data1 */
238   for (proc_id = 0; proc_id < size; proc_id++) {
239     iwork[proc_id]        = 0;
240     *data1_start[proc_id] = is_max;
241     data1_start[proc_id]++;
242     for (j = 0; j < is_max; j++) {
243       if (proc_id == rank) {
244         *data1_start[proc_id] = n[j];
245       } else {
246         *data1_start[proc_id] = 0;
247       }
248       data1_start[proc_id]++;
249     }
250   }
251 
252   for (i = 0; i < is_max; i++) {
253     PetscCall(ISGetIndices(is[i], &idx_i));
254     for (j = 0; j < n[i]; j++) {
255       idx                = idx_i[j];
256       *data1_start[rank] = idx;
257       data1_start[rank]++; /* for local proccessing */
258       proc_end = ctable[idx];
259       for (proc_id = 0; proc_id <= proc_end; proc_id++) {                        /* for others to process */
260         if (proc_id == rank) continue;                                           /* done before this loop */
261         if (proc_id < proc_end && !PetscBTLookup(table[proc_id], idx)) continue; /* no need for sending idx to [proc_id] */
262         *data1_start[proc_id] = idx;
263         data1_start[proc_id]++;
264         len_s[proc_id]++;
265       }
266     }
267     /* update header data */
268     for (proc_id = 0; proc_id < size; proc_id++) {
269       if (proc_id == rank) continue;
270       *(data1 + proc_id * len + 1 + i) = len_s[proc_id] - iwork[proc_id];
271       iwork[proc_id]                   = len_s[proc_id];
272     }
273     PetscCall(ISRestoreIndices(is[i], &idx_i));
274   }
275 
276   nrqs = 0;
277   nrqr = 0;
278   for (i = 0; i < size; i++) {
279     data1_start[i] = data1 + i * len;
280     if (len_s[i]) {
281       nrqs++;
282       len_s[i] += 1 + is_max; /* add no. of header msg */
283     }
284   }
285 
286   for (i = 0; i < is_max; i++) { PetscCall(ISDestroy(&is[i])); }
287   PetscCall(PetscFree(n));
288   PetscCall(PetscFree(ctable));
289 
290   /* Determine the number of messages to expect, their lengths, from from-ids */
291   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrqr));
292   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, len_s, &id_r1, &len_r1));
293 
294   /*  Now  post the sends */
295   PetscCall(PetscMalloc2(size, &s_waits1, size, &s_waits2));
296   k = 0;
297   for (proc_id = 0; proc_id < size; proc_id++) { /* send data1 to processor [proc_id] */
298     if (len_s[proc_id]) {
299       PetscCallMPI(MPI_Isend(data1_start[proc_id], len_s[proc_id], MPIU_INT, proc_id, tag1, comm, s_waits1 + k));
300       k++;
301     }
302   }
303 
304   /* 2. Receive other's is[] and process. Then send back */
305   /*-----------------------------------------------------*/
306   len = 0;
307   for (i = 0; i < nrqr; i++) {
308     if (len_r1[i] > len) len = len_r1[i];
309   }
310   PetscCall(PetscFree(len_r1));
311   PetscCall(PetscFree(id_r1));
312 
313   for (proc_id = 0; proc_id < size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0;
314 
315   PetscCall(PetscMalloc1(len + 1, &odata1));
316   PetscCall(PetscMalloc1(size, &odata2_ptr));
317   PetscCall(PetscBTCreate(Mbs, &otable));
318 
319   len_max = ois_max * (Mbs + 1); /* max space storing all is[] for each receive */
320   len_est = 2 * len_max;         /* estimated space of storing is[] for all receiving messages */
321   PetscCall(PetscMalloc1(len_est + 1, &odata2));
322   nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */
323 
324   odata2_ptr[nodata2] = odata2;
325 
326   len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max  */
327 
328   k = 0;
329   while (k < nrqr) {
330     /* Receive messages */
331     PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag1, comm, &flag, &r_status));
332     if (flag) {
333       PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &len));
334       proc_id = r_status.MPI_SOURCE;
335       PetscCallMPI(MPI_Irecv(odata1, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
336       PetscCallMPI(MPI_Wait(&r_req, &r_status));
337 
338       /*  Process messages */
339       /*  make sure there is enough unused space in odata2 array */
340       if (len_unused < len_max) { /* allocate more space for odata2 */
341         PetscCall(PetscMalloc1(len_est + 1, &odata2));
342 
343         odata2_ptr[++nodata2] = odata2;
344 
345         len_unused = len_est;
346       }
347 
348       PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, odata1, OTHER, odata2, &otable));
349       len = 1 + odata2[0];
350       for (i = 0; i < odata2[0]; i++) len += odata2[1 + i];
351 
352       /* Send messages back */
353       PetscCallMPI(MPI_Isend(odata2, len, MPIU_INT, proc_id, tag2, comm, s_waits2 + k));
354       k++;
355       odata2 += len;
356       len_unused -= len;
357       len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */
358     }
359   }
360   PetscCall(PetscFree(odata1));
361   PetscCall(PetscBTDestroy(&otable));
362 
363   /* 3. Do local work on this processor's is[] */
364   /*-------------------------------------------*/
365   /* make sure there is enough unused space in odata2(=data) array */
366   len_max = is_max * (Mbs + 1); /* max space storing all is[] for this processor */
367   if (len_unused < len_max) {   /* allocate more space for odata2 */
368     PetscCall(PetscMalloc1(len_est + 1, &odata2));
369 
370     odata2_ptr[++nodata2] = odata2;
371   }
372 
373   data = odata2;
374   PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, data1_start[rank], MINE, data, table));
375   PetscCall(PetscFree(data1_start));
376 
377   /* 4. Receive work done on other processors, then merge */
378   /*------------------------------------------------------*/
379   /* get max number of messages that this processor expects to recv */
380   PetscCall(MPIU_Allreduce(len_s, iwork, size, MPI_INT, MPI_MAX, comm));
381   PetscCall(PetscMalloc1(iwork[rank] + 1, &data2));
382   PetscCall(PetscFree4(len_s, btable, iwork, Bowners));
383 
384   k = 0;
385   while (k < nrqs) {
386     /* Receive messages */
387     PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag2, comm, &flag, &r_status));
388     if (flag) {
389       PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &len));
390 
391       proc_id = r_status.MPI_SOURCE;
392 
393       PetscCallMPI(MPI_Irecv(data2, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
394       PetscCallMPI(MPI_Wait(&r_req, &r_status));
395       if (len > 1 + is_max) { /* Add data2 into data */
396         data2_i = data2 + 1 + is_max;
397         for (i = 0; i < is_max; i++) {
398           table_i = table[i];
399           data_i  = data + 1 + is_max + Mbs * i;
400           isz     = data[1 + i];
401           for (j = 0; j < data2[1 + i]; j++) {
402             col = data2_i[j];
403             if (!PetscBTLookupSet(table_i, col)) data_i[isz++] = col;
404           }
405           data[1 + i] = isz;
406           if (i < is_max - 1) data2_i += data2[1 + i];
407         }
408       }
409       k++;
410     }
411   }
412   PetscCall(PetscFree(data2));
413   PetscCall(PetscFree2(table, t_p));
414 
415   /* phase 1 sends are complete */
416   PetscCall(PetscMalloc1(size, &s_status));
417   if (nrqs) PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status));
418   PetscCall(PetscFree(data1));
419 
420   /* phase 2 sends are complete */
421   if (nrqr) PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status));
422   PetscCall(PetscFree2(s_waits1, s_waits2));
423   PetscCall(PetscFree(s_status));
424 
425   /* 5. Create new is[] */
426   /*--------------------*/
427   for (i = 0; i < is_max; i++) {
428     data_i = data + 1 + is_max + Mbs * i;
429     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, data[1 + i], data_i, PETSC_COPY_VALUES, is + i));
430   }
431   for (k = 0; k <= nodata2; k++) { PetscCall(PetscFree(odata2_ptr[k])); }
432   PetscCall(PetscFree(odata2_ptr));
433   PetscFunctionReturn(0);
434 }
435 
436 /*
437    MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
438        the work on the local processor.
439 
440      Inputs:
441       C      - MAT_MPISBAIJ;
442       data   - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
443       whose  - whose is[] to be processed,
444                MINE:  this processor's is[]
445                OTHER: other processor's is[]
446      Output:
447        nidx  - whose = MINE:
448                      holds input and newly found indices in the same format as data
449                whose = OTHER:
450                      only holds the newly found indices
451        table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'.
452 */
453 /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
454 static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C, PetscInt *data, PetscInt whose, PetscInt *nidx, PetscBT *table) {
455   Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
456   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)(c->A)->data;
457   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ *)(c->B)->data;
458   PetscInt      row, mbs, Mbs, *nidx_i, col, col_max, isz, isz0, *ai, *aj, *bi, *bj, *garray, rstart, l;
459   PetscInt      a_start, a_end, b_start, b_end, i, j, k, is_max, *idx_i, n;
460   PetscBT       table0;  /* mark the indices of input is[] for look up */
461   PetscBT       table_i; /* poits to i-th table. When whose=OTHER, a single table is used for all is[] */
462 
463   PetscFunctionBegin;
464   Mbs    = c->Mbs;
465   mbs    = a->mbs;
466   ai     = a->i;
467   aj     = a->j;
468   bi     = b->i;
469   bj     = b->j;
470   garray = c->garray;
471   rstart = c->rstartbs;
472   is_max = data[0];
473 
474   PetscCall(PetscBTCreate(Mbs, &table0));
475 
476   nidx[0] = is_max;
477   idx_i   = data + is_max + 1;   /* ptr to input is[0] array */
478   nidx_i  = nidx + is_max + 1;   /* ptr to output is[0] array */
479   for (i = 0; i < is_max; i++) { /* for each is */
480     isz = 0;
481     n   = data[1 + i]; /* size of input is[i] */
482 
483     /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */
484     if (whose == MINE) { /* process this processor's is[] */
485       table_i = table[i];
486       nidx_i  = nidx + 1 + is_max + Mbs * i;
487     } else { /* process other processor's is[] - only use one temp table */
488       table_i = table[0];
489     }
490     PetscCall(PetscBTMemzero(Mbs, table_i));
491     PetscCall(PetscBTMemzero(Mbs, table0));
492     if (n == 0) {
493       nidx[1 + i] = 0; /* size of new is[i] */
494       continue;
495     }
496 
497     isz0    = 0;
498     col_max = 0;
499     for (j = 0; j < n; j++) {
500       col = idx_i[j];
501       PetscCheck(col < Mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index col %" PetscInt_FMT " >= Mbs %" PetscInt_FMT, col, Mbs);
502       if (!PetscBTLookupSet(table_i, col)) {
503         PetscCall(PetscBTSet(table0, col));
504         if (whose == MINE) nidx_i[isz0] = col;
505         if (col_max < col) col_max = col;
506         isz0++;
507       }
508     }
509 
510     if (whose == MINE) isz = isz0;
511     k = 0; /* no. of indices from input is[i] that have been examined */
512     for (row = 0; row < mbs; row++) {
513       a_start = ai[row];
514       a_end   = ai[row + 1];
515       b_start = bi[row];
516       b_end   = bi[row + 1];
517       if (PetscBTLookup(table0, row + rstart)) { /* row is on input is[i]:
518                                                 do row search: collect all col in this row */
519         for (l = a_start; l < a_end; l++) {      /* Amat */
520           col = aj[l] + rstart;
521           if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
522         }
523         for (l = b_start; l < b_end; l++) { /* Bmat */
524           col = garray[bj[l]];
525           if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
526         }
527         k++;
528         if (k >= isz0) break;               /* for (row=0; row<mbs; row++) */
529       } else {                              /* row is not on input is[i]:
530                   do col serach: add row onto nidx_i if there is a col in nidx_i */
531         for (l = a_start; l < a_end; l++) { /* Amat */
532           col = aj[l] + rstart;
533           if (col > col_max) break;
534           if (PetscBTLookup(table0, col)) {
535             if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
536             break; /* for l = start; l<end ; l++) */
537           }
538         }
539         for (l = b_start; l < b_end; l++) { /* Bmat */
540           col = garray[bj[l]];
541           if (col > col_max) break;
542           if (PetscBTLookup(table0, col)) {
543             if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
544             break; /* for l = start; l<end ; l++) */
545           }
546         }
547       }
548     }
549 
550     if (i < is_max - 1) {
551       idx_i += n;    /* ptr to input is[i+1] array */
552       nidx_i += isz; /* ptr to output is[i+1] array */
553     }
554     nidx[1 + i] = isz; /* size of new is[i] */
555   }                    /* for each is */
556   PetscCall(PetscBTDestroy(&table0));
557   PetscFunctionReturn(0);
558 }
559