xref: /petsc/src/mat/impls/sbaij/mpi/sbaijov.c (revision fdfbdca60871d1230bda7d5d096e2bd2d2a23f7a)
1632d0f97SHong Zhang 
2632d0f97SHong Zhang /*
3632d0f97SHong Zhang    Routines to compute overlapping regions of a parallel MPI matrix.
4632d0f97SHong Zhang    Used for finding submatrices that were shared across processors.
5632d0f97SHong Zhang */
6c6db04a5SJed Brown #include <../src/mat/impls/sbaij/mpi/mpisbaij.h>
7c6db04a5SJed Brown #include <petscbt.h>
8632d0f97SHong Zhang 
913f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat, PetscInt, IS *);
1013f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat, PetscInt *, PetscInt, PetscInt *, PetscBT *);
11632d0f97SHong Zhang 
12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C, PetscInt is_max, IS is[], PetscInt ov)
13d71ae5a4SJacob Faibussowitsch {
14b11de2a9SHong Zhang   PetscInt        i, N = C->cmap->N, bs = C->rmap->bs, M = C->rmap->N, Mbs = M / bs, *nidx, isz, iov;
15b11de2a9SHong Zhang   IS             *is_new, *is_row;
16b11de2a9SHong Zhang   Mat            *submats;
17b11de2a9SHong Zhang   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ *)C->data;
18b11de2a9SHong Zhang   Mat_SeqSBAIJ   *asub_i;
19b11de2a9SHong Zhang   PetscBT         table;
20b11de2a9SHong Zhang   PetscInt       *ai, brow, nz, nis, l, nmax, nstages_local, nstages, max_no, pos;
21b11de2a9SHong Zhang   const PetscInt *idx;
22b829d874SHong Zhang   PetscBool       flg;
23632d0f97SHong Zhang 
24632d0f97SHong Zhang   PetscFunctionBegin;
259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(is_max, &is_new));
26632d0f97SHong Zhang   /* Convert the indices into block format */
279566063dSJacob Faibussowitsch   PetscCall(ISCompressIndicesGeneral(N, C->rmap->n, bs, is_max, is, is_new));
2808401ef6SPierre Jolivet   PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified");
29db41cccfSHong Zhang 
302ef1f0ffSBarry Smith   /*  previous non-scalable implementation  */
31a0769a71SSatish Balay   flg = PETSC_FALSE;
329566063dSJacob Faibussowitsch   PetscCall(PetscOptionsHasName(NULL, NULL, "-IncreaseOverlap_old", &flg));
337a868f3eSHong Zhang   if (flg) { /* previous non-scalable implementation */
347a868f3eSHong Zhang     printf("use previous non-scalable implementation...\n");
3548a46eb9SPierre Jolivet     for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPISBAIJ_Once(C, is_max, is_new));
361c4982ebSHong Zhang   } else { /* implementation using modified BAIJ routines */
371c4982ebSHong Zhang 
389566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(Mbs + 1, &nidx));
399566063dSJacob Faibussowitsch     PetscCall(PetscBTCreate(Mbs, &table)); /* for column search */
40db41cccfSHong Zhang 
41db41cccfSHong Zhang     /* Create is_row */
429566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(is_max, &is_row));
439566063dSJacob Faibussowitsch     PetscCall(ISCreateStride(PETSC_COMM_SELF, Mbs, 0, 1, &is_row[0]));
4426fbe8dcSKarl Rupp 
459371c9d4SSatish Balay     for (i = 1; i < is_max; i++) { is_row[i] = is_row[0]; /* reuse is_row[0] */ }
46db41cccfSHong Zhang 
477dae84e0SHong Zhang     /* Allocate memory to hold all the submatrices - Modified from MatCreateSubMatrices_MPIBAIJ() */
489566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(is_max + 1, &submats));
49b11de2a9SHong Zhang 
50db41cccfSHong Zhang     /* Determine the number of stages through which submatrices are done */
51db41cccfSHong Zhang     nmax = 20 * 1000000 / (c->Nbs * sizeof(PetscInt));
52db41cccfSHong Zhang     if (!nmax) nmax = 1;
53db41cccfSHong Zhang     nstages_local = is_max / nmax + ((is_max % nmax) ? 1 : 0);
54db41cccfSHong Zhang 
55db41cccfSHong Zhang     /* Make sure every processor loops through the nstages */
561c2dc1cbSBarry Smith     PetscCall(MPIU_Allreduce(&nstages_local, &nstages, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C)));
57b11de2a9SHong Zhang 
58c6a7a370SJeremy L Thompson     {
59c6a7a370SJeremy L Thompson       const PetscObject obj = (PetscObject)c->A;
60c6a7a370SJeremy L Thompson       size_t            new_len, cur_len, max_len;
61c6a7a370SJeremy L Thompson 
62c6a7a370SJeremy L Thompson       PetscCall(PetscStrlen(MATSEQBAIJ, &new_len));
63c6a7a370SJeremy L Thompson       PetscCall(PetscStrlen(MATSEQSBAIJ, &cur_len));
64c6a7a370SJeremy L Thompson       max_len = PetscMax(cur_len, new_len) + 1;
65c6a7a370SJeremy L Thompson       PetscCall(PetscRealloc(max_len * sizeof(*(obj->type_name)), &obj->type_name));
66c6a7a370SJeremy L Thompson       /* The resulting submatrices should be BAIJ, not SBAIJ, hence we change this value to
67c6a7a370SJeremy L Thompson          trigger that */
68b11de2a9SHong Zhang       for (iov = 0; iov < ov; ++iov) {
69b11de2a9SHong Zhang         /* 1) Get submats for column search */
70c6a7a370SJeremy L Thompson         PetscCall(PetscStrncpy(obj->type_name, MATSEQBAIJ, max_len));
71db41cccfSHong Zhang         for (i = 0, pos = 0; i < nstages; i++) {
72db41cccfSHong Zhang           if (pos + nmax <= is_max) max_no = nmax;
73db41cccfSHong Zhang           else if (pos == is_max) max_no = 0;
74db41cccfSHong Zhang           else max_no = is_max - pos;
75b829d874SHong Zhang           c->ijonly = PETSC_TRUE; /* only matrix data structures are requested */
76c6a7a370SJeremy L Thompson 
77*fdfbdca6SPierre Jolivet           PetscCall(MatCreateSubMatrices_MPIBAIJ_local(C, max_no, is_row + pos, is_new + pos, MAT_INITIAL_MATRIX, submats + pos, PETSC_TRUE));
78db41cccfSHong Zhang           pos += max_no;
79db41cccfSHong Zhang         }
80c6a7a370SJeremy L Thompson         PetscCall(PetscStrncpy(obj->type_name, MATSEQSBAIJ, max_len));
81db41cccfSHong Zhang 
82b11de2a9SHong Zhang         /* 2) Row search */
839566063dSJacob Faibussowitsch         PetscCall(MatIncreaseOverlap_MPIBAIJ_Once(C, is_max, is_new));
84db41cccfSHong Zhang 
85b11de2a9SHong Zhang         /* 3) Column search */
86db41cccfSHong Zhang         for (i = 0; i < is_max; i++) {
87db41cccfSHong Zhang           asub_i = (Mat_SeqSBAIJ *)submats[i]->data;
88feb237baSPierre Jolivet           ai     = asub_i->i;
89db41cccfSHong Zhang 
90db41cccfSHong Zhang           /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */
919566063dSJacob Faibussowitsch           PetscCall(PetscBTMemzero(Mbs, table));
92db41cccfSHong Zhang 
939566063dSJacob Faibussowitsch           PetscCall(ISGetIndices(is_new[i], &idx));
949566063dSJacob Faibussowitsch           PetscCall(ISGetLocalSize(is_new[i], &nis));
95db41cccfSHong Zhang           for (l = 0; l < nis; l++) {
969566063dSJacob Faibussowitsch             PetscCall(PetscBTSet(table, idx[l]));
97db41cccfSHong Zhang             nidx[l] = idx[l];
98db41cccfSHong Zhang           }
99db41cccfSHong Zhang           isz = nis;
100db41cccfSHong Zhang 
101b11de2a9SHong Zhang           /* add column entries to table */
102db41cccfSHong Zhang           for (brow = 0; brow < Mbs; brow++) {
103db41cccfSHong Zhang             nz = ai[brow + 1] - ai[brow];
104db41cccfSHong Zhang             if (nz) {
105db41cccfSHong Zhang               if (!PetscBTLookupSet(table, brow)) nidx[isz++] = brow;
106db41cccfSHong Zhang             }
107db41cccfSHong Zhang           }
1089566063dSJacob Faibussowitsch           PetscCall(ISRestoreIndices(is_new[i], &idx));
1099566063dSJacob Faibussowitsch           PetscCall(ISDestroy(&is_new[i]));
110db41cccfSHong Zhang 
111db41cccfSHong Zhang           /* create updated is_new */
1129566063dSJacob Faibussowitsch           PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz, nidx, PETSC_COPY_VALUES, is_new + i));
113db41cccfSHong Zhang         }
114db41cccfSHong Zhang 
115db41cccfSHong Zhang         /* Free tmp spaces */
11648a46eb9SPierre Jolivet         for (i = 0; i < is_max; i++) PetscCall(MatDestroy(&submats[i]));
117db41cccfSHong Zhang       }
118b829d874SHong Zhang 
1199566063dSJacob Faibussowitsch       PetscCall(PetscBTDestroy(&table));
1209566063dSJacob Faibussowitsch       PetscCall(PetscFree(submats));
1219566063dSJacob Faibussowitsch       PetscCall(ISDestroy(&is_row[0]));
1229566063dSJacob Faibussowitsch       PetscCall(PetscFree(is_row));
1239566063dSJacob Faibussowitsch       PetscCall(PetscFree(nidx));
124db41cccfSHong Zhang     }
125c6a7a370SJeremy L Thompson   }
126bd38d1d0SPierre Jolivet   for (i = 0; i < is_max; i++) {
127bd38d1d0SPierre Jolivet     PetscCall(ISDestroy(&is[i]));
128bd38d1d0SPierre Jolivet     PetscCall(ISGetLocalSize(is_new[i], &nis));
129bd38d1d0SPierre Jolivet     PetscCall(ISGetIndices(is_new[i], &idx));
130bd38d1d0SPierre Jolivet     PetscCall(ISCreateBlock(PetscObjectComm((PetscObject)is_new[i]), bs, nis, idx, PETSC_COPY_VALUES, &is[i]));
131bd38d1d0SPierre Jolivet     PetscCall(ISDestroy(&is_new[i]));
132bd38d1d0SPierre Jolivet   }
1339566063dSJacob Faibussowitsch   PetscCall(PetscFree(is_new));
1343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
135632d0f97SHong Zhang }
136632d0f97SHong Zhang 
1379371c9d4SSatish Balay typedef enum {
1389371c9d4SSatish Balay   MINE,
1399371c9d4SSatish Balay   OTHER
1409371c9d4SSatish Balay } WhoseOwner;
1410472cc68SHong Zhang /*  data1, odata1 and odata2 are packed in the format (for communication):
142a2a9f22aSHong Zhang        data[0]          = is_max, no of is
143a2a9f22aSHong Zhang        data[1]          = size of is[0]
144a2a9f22aSHong Zhang         ...
145a2a9f22aSHong Zhang        data[is_max]     = size of is[is_max-1]
146a2a9f22aSHong Zhang        data[is_max + 1] = data(is[0])
147a2a9f22aSHong Zhang         ...
148a2a9f22aSHong Zhang        data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i])
149a2a9f22aSHong Zhang         ...
1500472cc68SHong Zhang    data2 is packed in the format (for creating output is[]):
1510472cc68SHong Zhang        data[0]          = is_max, no of is
1520472cc68SHong Zhang        data[1]          = size of is[0]
1530472cc68SHong Zhang         ...
1540472cc68SHong Zhang        data[is_max]     = size of is[is_max-1]
1550472cc68SHong Zhang        data[is_max + 1] = data(is[0])
1560472cc68SHong Zhang         ...
1570472cc68SHong Zhang        data[is_max + 1 + Mbs*i) = data(is[i])
1580472cc68SHong Zhang         ...
159a2a9f22aSHong Zhang */
160d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C, PetscInt is_max, IS is[])
161d71ae5a4SJacob Faibussowitsch {
162632d0f97SHong Zhang   Mat_MPISBAIJ   *c = (Mat_MPISBAIJ *)C->data;
163adcec1e5SJed Brown   PetscMPIInt     size, rank, tag1, tag2, *len_s, nrqr, nrqs, *id_r1, *len_r1, flag, len, *iwork;
1645d0c19d7SBarry Smith   const PetscInt *idx_i;
16526fbe8dcSKarl Rupp   PetscInt        idx, isz, col, *n, *data1, **data1_start, *data2, *data2_i, *data, *data_i;
16626fbe8dcSKarl Rupp   PetscInt        Mbs, i, j, k, *odata1, *odata2;
167f4259b30SLisandro Dalcin   PetscInt        proc_id, **odata2_ptr, *ctable = NULL, *btable, len_max, len_est;
168adcec1e5SJed Brown   PetscInt        proc_end = 0, len_unused, nodata2;
16913f74950SBarry Smith   PetscInt        ois_max; /* max no of is[] in each of processor */
170bfc6803cSHong Zhang   char           *t_p;
171632d0f97SHong Zhang   MPI_Comm        comm;
172e8527bf2SHong Zhang   MPI_Request    *s_waits1, *s_waits2, r_req;
173632d0f97SHong Zhang   MPI_Status     *s_status, r_status;
17445f43ab7SHong Zhang   PetscBT        *table; /* mark indices of this processor's is[] */
175fc70d14eSHong Zhang   PetscBT         table_i;
176bfc6803cSHong Zhang   PetscBT         otable; /* mark indices of other processors' is[] */
177d0f46423SBarry Smith   PetscInt        bs = C->rmap->bs, Bn = c->B->cmap->n, Bnbs = Bn / bs, *Bowners;
17810eea884SHong Zhang   IS              garray_local, garray_gl;
1795483b11dSHong Zhang 
180632d0f97SHong Zhang   PetscFunctionBegin;
1819566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
182632d0f97SHong Zhang   size = c->size;
183632d0f97SHong Zhang   rank = c->rank;
184632d0f97SHong Zhang   Mbs  = c->Mbs;
185632d0f97SHong Zhang 
1869566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1));
1879566063dSJacob Faibussowitsch   PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2));
188c910923dSHong Zhang 
189430a0127SHong Zhang   /* create tables used in
190430a0127SHong Zhang      step 1: table[i] - mark c->garray of proc [i]
19145f43ab7SHong Zhang      step 3: table[i] - mark indices of is[i] when whose=MINE
192430a0127SHong Zhang              table[0] - mark incideces of is[] when whose=OTHER */
1932f613bf5SBarry Smith   len = PetscMax(is_max, size);
1949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(len, &table, (Mbs / PETSC_BITS_PER_BYTE + 1) * len, &t_p));
195ad540459SPierre Jolivet   for (i = 0; i < len; i++) table[i] = t_p + (Mbs / PETSC_BITS_PER_BYTE + 1) * i;
196430a0127SHong Zhang 
1971c2dc1cbSBarry Smith   PetscCall(MPIU_Allreduce(&is_max, &ois_max, 1, MPIU_INT, MPI_MAX, comm));
19810eea884SHong Zhang 
1995483b11dSHong Zhang   /* 1. Send this processor's is[] to other processors */
200e8527bf2SHong Zhang   /* allocate spaces */
2019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(is_max, &n));
2025483b11dSHong Zhang   len = 0;
203c910923dSHong Zhang   for (i = 0; i < is_max; i++) {
2049566063dSJacob Faibussowitsch     PetscCall(ISGetLocalSize(is[i], &n[i]));
205632d0f97SHong Zhang     len += n[i];
206632d0f97SHong Zhang   }
207958c9bccSBarry Smith   if (!len) {
2085483b11dSHong Zhang     is_max = 0;
2095483b11dSHong Zhang   } else {
2105483b11dSHong Zhang     len += 1 + is_max; /* max length of data1 for one processor */
2115483b11dSHong Zhang   }
2125483b11dSHong Zhang 
2139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(size * len + 1, &data1));
2149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(size, &data1_start));
2155483b11dSHong Zhang   for (i = 0; i < size; i++) data1_start[i] = data1 + i * len;
2165483b11dSHong Zhang 
2179566063dSJacob Faibussowitsch   PetscCall(PetscMalloc4(size, &len_s, size, &btable, size, &iwork, size + 1, &Bowners));
218e8527bf2SHong Zhang 
21976f244e2SHong Zhang   /* gather c->garray from all processors */
2209566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(comm, Bnbs, c->garray, PETSC_COPY_VALUES, &garray_local));
2219566063dSJacob Faibussowitsch   PetscCall(ISAllGather(garray_local, &garray_gl));
2229566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&garray_local));
2239566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Allgather(&Bnbs, 1, MPIU_INT, Bowners + 1, 1, MPIU_INT, comm));
22426fbe8dcSKarl Rupp 
22576f244e2SHong Zhang   Bowners[0] = 0;
22676f244e2SHong Zhang   for (i = 0; i < size; i++) Bowners[i + 1] += Bowners[i];
22776f244e2SHong Zhang 
2285483b11dSHong Zhang   if (is_max) {
229430a0127SHong Zhang     /* hash table ctable which maps c->row to proc_id) */
2309566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(Mbs, &ctable));
2315483b11dSHong Zhang     for (proc_id = 0, j = 0; proc_id < size; proc_id++) {
23226fbe8dcSKarl Rupp       for (; j < C->rmap->range[proc_id + 1] / bs; j++) ctable[j] = proc_id;
2335483b11dSHong Zhang     }
2345483b11dSHong Zhang 
235430a0127SHong Zhang     /* hash tables marking c->garray */
2369566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(garray_gl, &idx_i));
2375483b11dSHong Zhang     for (i = 0; i < size; i++) {
238430a0127SHong Zhang       table_i = table[i];
2399566063dSJacob Faibussowitsch       PetscCall(PetscBTMemzero(Mbs, table_i));
240430a0127SHong Zhang       for (j = Bowners[i]; j < Bowners[i + 1]; j++) { /* go through B cols of proc[i]*/
2419566063dSJacob Faibussowitsch         PetscCall(PetscBTSet(table_i, idx_i[j]));
2425483b11dSHong Zhang       }
2435483b11dSHong Zhang     }
2449566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(garray_gl, &idx_i));
2455483b11dSHong Zhang   } /* if (is_max) */
2469566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&garray_gl));
2475483b11dSHong Zhang 
2485483b11dSHong Zhang   /* evaluate communication - mesg to who, length, and buffer space */
249e8527bf2SHong Zhang   for (i = 0; i < size; i++) len_s[i] = 0;
2505483b11dSHong Zhang 
2515483b11dSHong Zhang   /* header of data1 */
2525483b11dSHong Zhang   for (proc_id = 0; proc_id < size; proc_id++) {
2535483b11dSHong Zhang     iwork[proc_id]        = 0;
2545483b11dSHong Zhang     *data1_start[proc_id] = is_max;
2555483b11dSHong Zhang     data1_start[proc_id]++;
2565483b11dSHong Zhang     for (j = 0; j < is_max; j++) {
2575483b11dSHong Zhang       if (proc_id == rank) {
2585483b11dSHong Zhang         *data1_start[proc_id] = n[j];
2595483b11dSHong Zhang       } else {
2605483b11dSHong Zhang         *data1_start[proc_id] = 0;
2615483b11dSHong Zhang       }
2625483b11dSHong Zhang       data1_start[proc_id]++;
2635483b11dSHong Zhang     }
2645483b11dSHong Zhang   }
2655483b11dSHong Zhang 
2665483b11dSHong Zhang   for (i = 0; i < is_max; i++) {
2679566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(is[i], &idx_i));
2685483b11dSHong Zhang     for (j = 0; j < n[i]; j++) {
2695483b11dSHong Zhang       idx                = idx_i[j];
2709371c9d4SSatish Balay       *data1_start[rank] = idx;
27135cb6cd3SPierre Jolivet       data1_start[rank]++; /* for local processing */
2725483b11dSHong Zhang       proc_end = ctable[idx];
2735483b11dSHong Zhang       for (proc_id = 0; proc_id <= proc_end; proc_id++) {                        /* for others to process */
274e8527bf2SHong Zhang         if (proc_id == rank) continue;                                           /* done before this loop */
27526fbe8dcSKarl Rupp         if (proc_id < proc_end && !PetscBTLookup(table[proc_id], idx)) continue; /* no need for sending idx to [proc_id] */
2769371c9d4SSatish Balay         *data1_start[proc_id] = idx;
2779371c9d4SSatish Balay         data1_start[proc_id]++;
2785483b11dSHong Zhang         len_s[proc_id]++;
2795483b11dSHong Zhang       }
2805483b11dSHong Zhang     }
2815483b11dSHong Zhang     /* update header data */
2822cfbe0a4SHong Zhang     for (proc_id = 0; proc_id < size; proc_id++) {
2835483b11dSHong Zhang       if (proc_id == rank) continue;
2845483b11dSHong Zhang       *(data1 + proc_id * len + 1 + i) = len_s[proc_id] - iwork[proc_id];
2855483b11dSHong Zhang       iwork[proc_id]                   = len_s[proc_id];
2865483b11dSHong Zhang     }
2879566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(is[i], &idx_i));
288e8527bf2SHong Zhang   }
2895483b11dSHong Zhang 
2909371c9d4SSatish Balay   nrqs = 0;
2919371c9d4SSatish Balay   nrqr = 0;
2925483b11dSHong Zhang   for (i = 0; i < size; i++) {
2935483b11dSHong Zhang     data1_start[i] = data1 + i * len;
2945483b11dSHong Zhang     if (len_s[i]) {
2955483b11dSHong Zhang       nrqs++;
2965483b11dSHong Zhang       len_s[i] += 1 + is_max; /* add no. of header msg */
2975483b11dSHong Zhang     }
2985483b11dSHong Zhang   }
2995483b11dSHong Zhang 
30048a46eb9SPierre Jolivet   for (i = 0; i < is_max; i++) PetscCall(ISDestroy(&is[i]));
3019566063dSJacob Faibussowitsch   PetscCall(PetscFree(n));
3029566063dSJacob Faibussowitsch   PetscCall(PetscFree(ctable));
3035483b11dSHong Zhang 
3045483b11dSHong Zhang   /* Determine the number of messages to expect, their lengths, from from-ids */
3059566063dSJacob Faibussowitsch   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrqr));
3069566063dSJacob Faibussowitsch   PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, len_s, &id_r1, &len_r1));
307632d0f97SHong Zhang 
308632d0f97SHong Zhang   /*  Now  post the sends */
3099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(size, &s_waits1, size, &s_waits2));
310632d0f97SHong Zhang   k = 0;
3115483b11dSHong Zhang   for (proc_id = 0; proc_id < size; proc_id++) { /* send data1 to processor [proc_id] */
3125483b11dSHong Zhang     if (len_s[proc_id]) {
3139566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Isend(data1_start[proc_id], len_s[proc_id], MPIU_INT, proc_id, tag1, comm, s_waits1 + k));
314632d0f97SHong Zhang       k++;
315632d0f97SHong Zhang     }
316632d0f97SHong Zhang   }
317632d0f97SHong Zhang 
31845f43ab7SHong Zhang   /* 2. Receive other's is[] and process. Then send back */
3195483b11dSHong Zhang   len = 0;
3205483b11dSHong Zhang   for (i = 0; i < nrqr; i++) {
3215483b11dSHong Zhang     if (len_r1[i] > len) len = len_r1[i];
3225483b11dSHong Zhang   }
3239566063dSJacob Faibussowitsch   PetscCall(PetscFree(len_r1));
3249566063dSJacob Faibussowitsch   PetscCall(PetscFree(id_r1));
32545f43ab7SHong Zhang 
32626fbe8dcSKarl Rupp   for (proc_id = 0; proc_id < size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0;
32745f43ab7SHong Zhang 
3289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len + 1, &odata1));
3299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(size, &odata2_ptr));
3309566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(Mbs, &otable));
33110eea884SHong Zhang 
33210eea884SHong Zhang   len_max = ois_max * (Mbs + 1); /* max space storing all is[] for each receive */
333240e5896SHong Zhang   len_est = 2 * len_max;         /* estimated space of storing is[] for all receiving messages */
3349566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(len_est + 1, &odata2));
33510eea884SHong Zhang   nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */
33626fbe8dcSKarl Rupp 
337240e5896SHong Zhang   odata2_ptr[nodata2] = odata2;
33826fbe8dcSKarl Rupp 
33910eea884SHong Zhang   len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max  */
34010eea884SHong Zhang 
341632d0f97SHong Zhang   k = 0;
3425483b11dSHong Zhang   while (k < nrqr) {
343632d0f97SHong Zhang     /* Receive messages */
3449566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag1, comm, &flag, &r_status));
345632d0f97SHong Zhang     if (flag) {
3469566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &len));
347632d0f97SHong Zhang       proc_id = r_status.MPI_SOURCE;
3489566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Irecv(odata1, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
3499566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Wait(&r_req, &r_status));
350632d0f97SHong Zhang 
351fc70d14eSHong Zhang       /*  Process messages */
352240e5896SHong Zhang       /*  make sure there is enough unused space in odata2 array */
35310eea884SHong Zhang       if (len_unused < len_max) { /* allocate more space for odata2 */
3549566063dSJacob Faibussowitsch         PetscCall(PetscMalloc1(len_est + 1, &odata2));
35526fbe8dcSKarl Rupp 
356240e5896SHong Zhang         odata2_ptr[++nodata2] = odata2;
35726fbe8dcSKarl Rupp 
35810eea884SHong Zhang         len_unused = len_est;
35910eea884SHong Zhang       }
36010eea884SHong Zhang 
3619566063dSJacob Faibussowitsch       PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, odata1, OTHER, odata2, &otable));
362a2a9f22aSHong Zhang       len = 1 + odata2[0];
36326fbe8dcSKarl Rupp       for (i = 0; i < odata2[0]; i++) len += odata2[1 + i];
364632d0f97SHong Zhang 
365632d0f97SHong Zhang       /* Send messages back */
3669566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Isend(odata2, len, MPIU_INT, proc_id, tag2, comm, s_waits2 + k));
367fc70d14eSHong Zhang       k++;
36810eea884SHong Zhang       odata2 += len;
36910eea884SHong Zhang       len_unused -= len;
37045f43ab7SHong Zhang       len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */
371632d0f97SHong Zhang     }
3725483b11dSHong Zhang   }
3739566063dSJacob Faibussowitsch   PetscCall(PetscFree(odata1));
3749566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&otable));
375632d0f97SHong Zhang 
37645f43ab7SHong Zhang   /* 3. Do local work on this processor's is[] */
37745f43ab7SHong Zhang   /* make sure there is enough unused space in odata2(=data) array */
37845f43ab7SHong Zhang   len_max = is_max * (Mbs + 1); /* max space storing all is[] for this processor */
37910eea884SHong Zhang   if (len_unused < len_max) {   /* allocate more space for odata2 */
3809566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(len_est + 1, &odata2));
38126fbe8dcSKarl Rupp 
382240e5896SHong Zhang     odata2_ptr[++nodata2] = odata2;
38310eea884SHong Zhang   }
384bfc6803cSHong Zhang 
385240e5896SHong Zhang   data = odata2;
3869566063dSJacob Faibussowitsch   PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, data1_start[rank], MINE, data, table));
3879566063dSJacob Faibussowitsch   PetscCall(PetscFree(data1_start));
38845f43ab7SHong Zhang 
38945f43ab7SHong Zhang   /* 4. Receive work done on other processors, then merge */
39045f43ab7SHong Zhang   /* get max number of messages that this processor expects to recv */
3911c2dc1cbSBarry Smith   PetscCall(MPIU_Allreduce(len_s, iwork, size, MPI_INT, MPI_MAX, comm));
3929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(iwork[rank] + 1, &data2));
3939566063dSJacob Faibussowitsch   PetscCall(PetscFree4(len_s, btable, iwork, Bowners));
39445f43ab7SHong Zhang 
395632d0f97SHong Zhang   k = 0;
3965483b11dSHong Zhang   while (k < nrqs) {
397632d0f97SHong Zhang     /* Receive messages */
3989566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag2, comm, &flag, &r_status));
399632d0f97SHong Zhang     if (flag) {
4009566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &len));
40126fbe8dcSKarl Rupp 
402632d0f97SHong Zhang       proc_id = r_status.MPI_SOURCE;
40326fbe8dcSKarl Rupp 
4049566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Irecv(data2, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req));
4059566063dSJacob Faibussowitsch       PetscCallMPI(MPI_Wait(&r_req, &r_status));
4065483b11dSHong Zhang       if (len > 1 + is_max) { /* Add data2 into data */
4070472cc68SHong Zhang         data2_i = data2 + 1 + is_max;
408fc70d14eSHong Zhang         for (i = 0; i < is_max; i++) {
409fc70d14eSHong Zhang           table_i = table[i];
410bfc6803cSHong Zhang           data_i  = data + 1 + is_max + Mbs * i;
411bfc6803cSHong Zhang           isz     = data[1 + i];
4120472cc68SHong Zhang           for (j = 0; j < data2[1 + i]; j++) {
4130472cc68SHong Zhang             col = data2_i[j];
41426fbe8dcSKarl Rupp             if (!PetscBTLookupSet(table_i, col)) data_i[isz++] = col;
415fc70d14eSHong Zhang           }
416bfc6803cSHong Zhang           data[1 + i] = isz;
4170472cc68SHong Zhang           if (i < is_max - 1) data2_i += data2[1 + i];
418fc70d14eSHong Zhang         }
4195483b11dSHong Zhang       }
420632d0f97SHong Zhang       k++;
421632d0f97SHong Zhang     }
4225483b11dSHong Zhang   }
4239566063dSJacob Faibussowitsch   PetscCall(PetscFree(data2));
4249566063dSJacob Faibussowitsch   PetscCall(PetscFree2(table, t_p));
4255483b11dSHong Zhang 
4265483b11dSHong Zhang   /* phase 1 sends are complete */
4279566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(size, &s_status));
4289566063dSJacob Faibussowitsch   if (nrqs) PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status));
4299566063dSJacob Faibussowitsch   PetscCall(PetscFree(data1));
4305483b11dSHong Zhang 
431240e5896SHong Zhang   /* phase 2 sends are complete */
4329566063dSJacob Faibussowitsch   if (nrqr) PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status));
4339566063dSJacob Faibussowitsch   PetscCall(PetscFree2(s_waits1, s_waits2));
4349566063dSJacob Faibussowitsch   PetscCall(PetscFree(s_status));
435632d0f97SHong Zhang 
436c910923dSHong Zhang   /* 5. Create new is[] */
437c910923dSHong Zhang   for (i = 0; i < is_max; i++) {
438bfc6803cSHong Zhang     data_i = data + 1 + is_max + Mbs * i;
4399566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, data[1 + i], data_i, PETSC_COPY_VALUES, is + i));
440632d0f97SHong Zhang   }
44148a46eb9SPierre Jolivet   for (k = 0; k <= nodata2; k++) PetscCall(PetscFree(odata2_ptr[k]));
4429566063dSJacob Faibussowitsch   PetscCall(PetscFree(odata2_ptr));
4433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
444632d0f97SHong Zhang }
445632d0f97SHong Zhang 
446632d0f97SHong Zhang /*
447dc008846SHong Zhang    MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do
448632d0f97SHong Zhang        the work on the local processor.
449632d0f97SHong Zhang 
450632d0f97SHong Zhang      Inputs:
451632d0f97SHong Zhang       C      - MAT_MPISBAIJ;
452bfc6803cSHong Zhang       data   - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format.
453bfc6803cSHong Zhang       whose  - whose is[] to be processed,
454bfc6803cSHong Zhang                MINE:  this processor's is[]
455bfc6803cSHong Zhang                OTHER: other processor's is[]
456632d0f97SHong Zhang      Output:
45710eea884SHong Zhang        nidx  - whose = MINE:
4580472cc68SHong Zhang                      holds input and newly found indices in the same format as data
4590472cc68SHong Zhang                whose = OTHER:
4600472cc68SHong Zhang                      only holds the newly found indices
4610472cc68SHong Zhang        table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'.
462632d0f97SHong Zhang */
46376f244e2SHong Zhang /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */
464d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C, PetscInt *data, PetscInt whose, PetscInt *nidx, PetscBT *table)
465d71ae5a4SJacob Faibussowitsch {
466632d0f97SHong Zhang   Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data;
467dc008846SHong Zhang   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)(c->A)->data;
468dc008846SHong Zhang   Mat_SeqBAIJ  *b = (Mat_SeqBAIJ *)(c->B)->data;
46913f74950SBarry Smith   PetscInt      row, mbs, Mbs, *nidx_i, col, col_max, isz, isz0, *ai, *aj, *bi, *bj, *garray, rstart, l;
47013f74950SBarry Smith   PetscInt      a_start, a_end, b_start, b_end, i, j, k, is_max, *idx_i, n;
471bfc6803cSHong Zhang   PetscBT       table0;  /* mark the indices of input is[] for look up */
472da81f932SPierre Jolivet   PetscBT       table_i; /* points to i-th table. When whose=OTHER, a single table is used for all is[] */
473632d0f97SHong Zhang 
474632d0f97SHong Zhang   PetscFunctionBegin;
4759371c9d4SSatish Balay   Mbs    = c->Mbs;
4769371c9d4SSatish Balay   mbs    = a->mbs;
4779371c9d4SSatish Balay   ai     = a->i;
4789371c9d4SSatish Balay   aj     = a->j;
4799371c9d4SSatish Balay   bi     = b->i;
4809371c9d4SSatish Balay   bj     = b->j;
481632d0f97SHong Zhang   garray = c->garray;
482899cda47SBarry Smith   rstart = c->rstartbs;
483dc008846SHong Zhang   is_max = data[0];
484632d0f97SHong Zhang 
4859566063dSJacob Faibussowitsch   PetscCall(PetscBTCreate(Mbs, &table0));
486fc70d14eSHong Zhang 
487fc70d14eSHong Zhang   nidx[0] = is_max;
488fc70d14eSHong Zhang   idx_i   = data + is_max + 1;   /* ptr to input is[0] array */
489bfc6803cSHong Zhang   nidx_i  = nidx + is_max + 1;   /* ptr to output is[0] array */
490dc008846SHong Zhang   for (i = 0; i < is_max; i++) { /* for each is */
491dc008846SHong Zhang     isz = 0;
492fc70d14eSHong Zhang     n   = data[1 + i]; /* size of input is[i] */
493dc008846SHong Zhang 
49476f244e2SHong Zhang     /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */
495bfc6803cSHong Zhang     if (whose == MINE) { /* process this processor's is[] */
496bfc6803cSHong Zhang       table_i = table[i];
4970472cc68SHong Zhang       nidx_i  = nidx + 1 + is_max + Mbs * i;
498bfc6803cSHong Zhang     } else { /* process other processor's is[] - only use one temp table */
499430a0127SHong Zhang       table_i = table[0];
500bfc6803cSHong Zhang     }
5019566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(Mbs, table_i));
5029566063dSJacob Faibussowitsch     PetscCall(PetscBTMemzero(Mbs, table0));
50376f244e2SHong Zhang     if (n == 0) {
50476f244e2SHong Zhang       nidx[1 + i] = 0; /* size of new is[i] */
50576f244e2SHong Zhang       continue;
50676f244e2SHong Zhang     }
50776f244e2SHong Zhang 
5089371c9d4SSatish Balay     isz0    = 0;
5099371c9d4SSatish Balay     col_max = 0;
510dc008846SHong Zhang     for (j = 0; j < n; j++) {
511dc008846SHong Zhang       col = idx_i[j];
51208401ef6SPierre Jolivet       PetscCheck(col < Mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index col %" PetscInt_FMT " >= Mbs %" PetscInt_FMT, col, Mbs);
513bfc6803cSHong Zhang       if (!PetscBTLookupSet(table_i, col)) {
5149566063dSJacob Faibussowitsch         PetscCall(PetscBTSet(table0, col));
51526fbe8dcSKarl Rupp         if (whose == MINE) nidx_i[isz0] = col;
516dbe03f88SHong Zhang         if (col_max < col) col_max = col;
517bfc6803cSHong Zhang         isz0++;
518bfc6803cSHong Zhang       }
519632d0f97SHong Zhang     }
520dc008846SHong Zhang 
52126fbe8dcSKarl Rupp     if (whose == MINE) isz = isz0;
522fc70d14eSHong Zhang     k = 0; /* no. of indices from input is[i] that have been examined */
523dc008846SHong Zhang     for (row = 0; row < mbs; row++) {
5249371c9d4SSatish Balay       a_start = ai[row];
5259371c9d4SSatish Balay       a_end   = ai[row + 1];
5269371c9d4SSatish Balay       b_start = bi[row];
5279371c9d4SSatish Balay       b_end   = bi[row + 1];
5280472cc68SHong Zhang       if (PetscBTLookup(table0, row + rstart)) { /* row is on input is[i]:
5290472cc68SHong Zhang                                                 do row search: collect all col in this row */
530dc008846SHong Zhang         for (l = a_start; l < a_end; l++) {      /* Amat */
531dc008846SHong Zhang           col = aj[l] + rstart;
53226fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
533dc008846SHong Zhang         }
534dc008846SHong Zhang         for (l = b_start; l < b_end; l++) { /* Bmat */
535dc008846SHong Zhang           col = garray[bj[l]];
53626fbe8dcSKarl Rupp           if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col;
537dc008846SHong Zhang         }
538dc008846SHong Zhang         k++;
539dc008846SHong Zhang         if (k >= isz0) break;               /* for (row=0; row<mbs; row++) */
5400472cc68SHong Zhang       } else {                              /* row is not on input is[i]:
541da81f932SPierre Jolivet                   do col search: add row onto nidx_i if there is a col in nidx_i */
542dc008846SHong Zhang         for (l = a_start; l < a_end; l++) { /* Amat */
543dc008846SHong Zhang           col = aj[l] + rstart;
54476f244e2SHong Zhang           if (col > col_max) break;
545dc008846SHong Zhang           if (PetscBTLookup(table0, col)) {
54626fbe8dcSKarl Rupp             if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
547dc008846SHong Zhang             break; /* for l = start; l<end ; l++) */
548632d0f97SHong Zhang           }
549632d0f97SHong Zhang         }
550dc008846SHong Zhang         for (l = b_start; l < b_end; l++) { /* Bmat */
551dc008846SHong Zhang           col = garray[bj[l]];
55276f244e2SHong Zhang           if (col > col_max) break;
553dc008846SHong Zhang           if (PetscBTLookup(table0, col)) {
55426fbe8dcSKarl Rupp             if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart;
555dc008846SHong Zhang             break; /* for l = start; l<end ; l++) */
556632d0f97SHong Zhang           }
557dc008846SHong Zhang         }
558dc008846SHong Zhang       }
559bfc6803cSHong Zhang     }
560dc008846SHong Zhang 
561dc008846SHong Zhang     if (i < is_max - 1) {
562fc70d14eSHong Zhang       idx_i += n;    /* ptr to input is[i+1] array */
563bfc6803cSHong Zhang       nidx_i += isz; /* ptr to output is[i+1] array */
564dc008846SHong Zhang     }
565fc70d14eSHong Zhang     nidx[1 + i] = isz; /* size of new is[i] */
5661ab97a2aSSatish Balay   }                    /* for each is */
5679566063dSJacob Faibussowitsch   PetscCall(PetscBTDestroy(&table0));
5683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
569632d0f97SHong Zhang }
570