1632d0f97SHong Zhang 2632d0f97SHong Zhang /* 3632d0f97SHong Zhang Routines to compute overlapping regions of a parallel MPI matrix. 4632d0f97SHong Zhang Used for finding submatrices that were shared across processors. 5632d0f97SHong Zhang */ 6c6db04a5SJed Brown #include <../src/mat/impls/sbaij/mpi/mpisbaij.h> 7c6db04a5SJed Brown #include <petscbt.h> 8632d0f97SHong Zhang 913f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat, PetscInt, IS *); 1013f74950SBarry Smith static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat, PetscInt *, PetscInt, PetscInt *, PetscBT *); 11632d0f97SHong Zhang 12d71ae5a4SJacob Faibussowitsch PetscErrorCode MatIncreaseOverlap_MPISBAIJ(Mat C, PetscInt is_max, IS is[], PetscInt ov) 13d71ae5a4SJacob Faibussowitsch { 14b11de2a9SHong Zhang PetscInt i, N = C->cmap->N, bs = C->rmap->bs, M = C->rmap->N, Mbs = M / bs, *nidx, isz, iov; 15b11de2a9SHong Zhang IS *is_new, *is_row; 16b11de2a9SHong Zhang Mat *submats; 17b11de2a9SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data; 18b11de2a9SHong Zhang Mat_SeqSBAIJ *asub_i; 19b11de2a9SHong Zhang PetscBT table; 20b11de2a9SHong Zhang PetscInt *ai, brow, nz, nis, l, nmax, nstages_local, nstages, max_no, pos; 21b11de2a9SHong Zhang const PetscInt *idx; 22b829d874SHong Zhang PetscBool flg; 23632d0f97SHong Zhang 24632d0f97SHong Zhang PetscFunctionBegin; 259566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max, &is_new)); 26632d0f97SHong Zhang /* Convert the indices into block format */ 279566063dSJacob Faibussowitsch PetscCall(ISCompressIndicesGeneral(N, C->rmap->n, bs, is_max, is, is_new)); 2808401ef6SPierre Jolivet PetscCheck(ov >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative overlap specified"); 29db41cccfSHong Zhang 302ef1f0ffSBarry Smith /* previous non-scalable implementation */ 31a0769a71SSatish Balay flg = PETSC_FALSE; 329566063dSJacob Faibussowitsch PetscCall(PetscOptionsHasName(NULL, NULL, "-IncreaseOverlap_old", &flg)); 337a868f3eSHong Zhang if (flg) { /* previous non-scalable implementation */ 347a868f3eSHong Zhang printf("use previous non-scalable implementation...\n"); 3548a46eb9SPierre Jolivet for (i = 0; i < ov; ++i) PetscCall(MatIncreaseOverlap_MPISBAIJ_Once(C, is_max, is_new)); 361c4982ebSHong Zhang } else { /* implementation using modified BAIJ routines */ 371c4982ebSHong Zhang 389566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Mbs + 1, &nidx)); 399566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(Mbs, &table)); /* for column search */ 40db41cccfSHong Zhang 41db41cccfSHong Zhang /* Create is_row */ 429566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max, &is_row)); 439566063dSJacob Faibussowitsch PetscCall(ISCreateStride(PETSC_COMM_SELF, Mbs, 0, 1, &is_row[0])); 4426fbe8dcSKarl Rupp 459371c9d4SSatish Balay for (i = 1; i < is_max; i++) { is_row[i] = is_row[0]; /* reuse is_row[0] */ } 46db41cccfSHong Zhang 477dae84e0SHong Zhang /* Allocate memory to hold all the submatrices - Modified from MatCreateSubMatrices_MPIBAIJ() */ 489566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max + 1, &submats)); 49b11de2a9SHong Zhang 50db41cccfSHong Zhang /* Determine the number of stages through which submatrices are done */ 51db41cccfSHong Zhang nmax = 20 * 1000000 / (c->Nbs * sizeof(PetscInt)); 52db41cccfSHong Zhang if (!nmax) nmax = 1; 53db41cccfSHong Zhang nstages_local = is_max / nmax + ((is_max % nmax) ? 1 : 0); 54db41cccfSHong Zhang 55db41cccfSHong Zhang /* Make sure every processor loops through the nstages */ 561c2dc1cbSBarry Smith PetscCall(MPIU_Allreduce(&nstages_local, &nstages, 1, MPIU_INT, MPI_MAX, PetscObjectComm((PetscObject)C))); 57b11de2a9SHong Zhang 58c6a7a370SJeremy L Thompson { 59c6a7a370SJeremy L Thompson const PetscObject obj = (PetscObject)c->A; 60c6a7a370SJeremy L Thompson size_t new_len, cur_len, max_len; 61c6a7a370SJeremy L Thompson 62c6a7a370SJeremy L Thompson PetscCall(PetscStrlen(MATSEQBAIJ, &new_len)); 63c6a7a370SJeremy L Thompson PetscCall(PetscStrlen(MATSEQSBAIJ, &cur_len)); 64c6a7a370SJeremy L Thompson max_len = PetscMax(cur_len, new_len) + 1; 65c6a7a370SJeremy L Thompson PetscCall(PetscRealloc(max_len * sizeof(*(obj->type_name)), &obj->type_name)); 66c6a7a370SJeremy L Thompson /* The resulting submatrices should be BAIJ, not SBAIJ, hence we change this value to 67c6a7a370SJeremy L Thompson trigger that */ 68b11de2a9SHong Zhang for (iov = 0; iov < ov; ++iov) { 69b11de2a9SHong Zhang /* 1) Get submats for column search */ 70c6a7a370SJeremy L Thompson PetscCall(PetscStrncpy(obj->type_name, MATSEQBAIJ, max_len)); 71db41cccfSHong Zhang for (i = 0, pos = 0; i < nstages; i++) { 72db41cccfSHong Zhang if (pos + nmax <= is_max) max_no = nmax; 73db41cccfSHong Zhang else if (pos == is_max) max_no = 0; 74db41cccfSHong Zhang else max_no = is_max - pos; 75b829d874SHong Zhang c->ijonly = PETSC_TRUE; /* only matrix data structures are requested */ 76c6a7a370SJeremy L Thompson 77*fdfbdca6SPierre Jolivet PetscCall(MatCreateSubMatrices_MPIBAIJ_local(C, max_no, is_row + pos, is_new + pos, MAT_INITIAL_MATRIX, submats + pos, PETSC_TRUE)); 78db41cccfSHong Zhang pos += max_no; 79db41cccfSHong Zhang } 80c6a7a370SJeremy L Thompson PetscCall(PetscStrncpy(obj->type_name, MATSEQSBAIJ, max_len)); 81db41cccfSHong Zhang 82b11de2a9SHong Zhang /* 2) Row search */ 839566063dSJacob Faibussowitsch PetscCall(MatIncreaseOverlap_MPIBAIJ_Once(C, is_max, is_new)); 84db41cccfSHong Zhang 85b11de2a9SHong Zhang /* 3) Column search */ 86db41cccfSHong Zhang for (i = 0; i < is_max; i++) { 87db41cccfSHong Zhang asub_i = (Mat_SeqSBAIJ *)submats[i]->data; 88feb237baSPierre Jolivet ai = asub_i->i; 89db41cccfSHong Zhang 90db41cccfSHong Zhang /* put is_new obtained from MatIncreaseOverlap_MPIBAIJ() to table */ 919566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table)); 92db41cccfSHong Zhang 939566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is_new[i], &idx)); 949566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is_new[i], &nis)); 95db41cccfSHong Zhang for (l = 0; l < nis; l++) { 969566063dSJacob Faibussowitsch PetscCall(PetscBTSet(table, idx[l])); 97db41cccfSHong Zhang nidx[l] = idx[l]; 98db41cccfSHong Zhang } 99db41cccfSHong Zhang isz = nis; 100db41cccfSHong Zhang 101b11de2a9SHong Zhang /* add column entries to table */ 102db41cccfSHong Zhang for (brow = 0; brow < Mbs; brow++) { 103db41cccfSHong Zhang nz = ai[brow + 1] - ai[brow]; 104db41cccfSHong Zhang if (nz) { 105db41cccfSHong Zhang if (!PetscBTLookupSet(table, brow)) nidx[isz++] = brow; 106db41cccfSHong Zhang } 107db41cccfSHong Zhang } 1089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is_new[i], &idx)); 1099566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is_new[i])); 110db41cccfSHong Zhang 111db41cccfSHong Zhang /* create updated is_new */ 1129566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, isz, nidx, PETSC_COPY_VALUES, is_new + i)); 113db41cccfSHong Zhang } 114db41cccfSHong Zhang 115db41cccfSHong Zhang /* Free tmp spaces */ 11648a46eb9SPierre Jolivet for (i = 0; i < is_max; i++) PetscCall(MatDestroy(&submats[i])); 117db41cccfSHong Zhang } 118b829d874SHong Zhang 1199566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table)); 1209566063dSJacob Faibussowitsch PetscCall(PetscFree(submats)); 1219566063dSJacob Faibussowitsch PetscCall(ISDestroy(&is_row[0])); 1229566063dSJacob Faibussowitsch PetscCall(PetscFree(is_row)); 1239566063dSJacob Faibussowitsch PetscCall(PetscFree(nidx)); 124db41cccfSHong Zhang } 125c6a7a370SJeremy L Thompson } 126bd38d1d0SPierre Jolivet for (i = 0; i < is_max; i++) { 127bd38d1d0SPierre Jolivet PetscCall(ISDestroy(&is[i])); 128bd38d1d0SPierre Jolivet PetscCall(ISGetLocalSize(is_new[i], &nis)); 129bd38d1d0SPierre Jolivet PetscCall(ISGetIndices(is_new[i], &idx)); 130bd38d1d0SPierre Jolivet PetscCall(ISCreateBlock(PetscObjectComm((PetscObject)is_new[i]), bs, nis, idx, PETSC_COPY_VALUES, &is[i])); 131bd38d1d0SPierre Jolivet PetscCall(ISDestroy(&is_new[i])); 132bd38d1d0SPierre Jolivet } 1339566063dSJacob Faibussowitsch PetscCall(PetscFree(is_new)); 1343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 135632d0f97SHong Zhang } 136632d0f97SHong Zhang 1379371c9d4SSatish Balay typedef enum { 1389371c9d4SSatish Balay MINE, 1399371c9d4SSatish Balay OTHER 1409371c9d4SSatish Balay } WhoseOwner; 1410472cc68SHong Zhang /* data1, odata1 and odata2 are packed in the format (for communication): 142a2a9f22aSHong Zhang data[0] = is_max, no of is 143a2a9f22aSHong Zhang data[1] = size of is[0] 144a2a9f22aSHong Zhang ... 145a2a9f22aSHong Zhang data[is_max] = size of is[is_max-1] 146a2a9f22aSHong Zhang data[is_max + 1] = data(is[0]) 147a2a9f22aSHong Zhang ... 148a2a9f22aSHong Zhang data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i]) 149a2a9f22aSHong Zhang ... 1500472cc68SHong Zhang data2 is packed in the format (for creating output is[]): 1510472cc68SHong Zhang data[0] = is_max, no of is 1520472cc68SHong Zhang data[1] = size of is[0] 1530472cc68SHong Zhang ... 1540472cc68SHong Zhang data[is_max] = size of is[is_max-1] 1550472cc68SHong Zhang data[is_max + 1] = data(is[0]) 1560472cc68SHong Zhang ... 1570472cc68SHong Zhang data[is_max + 1 + Mbs*i) = data(is[i]) 1580472cc68SHong Zhang ... 159a2a9f22aSHong Zhang */ 160d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C, PetscInt is_max, IS is[]) 161d71ae5a4SJacob Faibussowitsch { 162632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data; 163adcec1e5SJed Brown PetscMPIInt size, rank, tag1, tag2, *len_s, nrqr, nrqs, *id_r1, *len_r1, flag, len, *iwork; 1645d0c19d7SBarry Smith const PetscInt *idx_i; 16526fbe8dcSKarl Rupp PetscInt idx, isz, col, *n, *data1, **data1_start, *data2, *data2_i, *data, *data_i; 16626fbe8dcSKarl Rupp PetscInt Mbs, i, j, k, *odata1, *odata2; 167f4259b30SLisandro Dalcin PetscInt proc_id, **odata2_ptr, *ctable = NULL, *btable, len_max, len_est; 168adcec1e5SJed Brown PetscInt proc_end = 0, len_unused, nodata2; 16913f74950SBarry Smith PetscInt ois_max; /* max no of is[] in each of processor */ 170bfc6803cSHong Zhang char *t_p; 171632d0f97SHong Zhang MPI_Comm comm; 172e8527bf2SHong Zhang MPI_Request *s_waits1, *s_waits2, r_req; 173632d0f97SHong Zhang MPI_Status *s_status, r_status; 17445f43ab7SHong Zhang PetscBT *table; /* mark indices of this processor's is[] */ 175fc70d14eSHong Zhang PetscBT table_i; 176bfc6803cSHong Zhang PetscBT otable; /* mark indices of other processors' is[] */ 177d0f46423SBarry Smith PetscInt bs = C->rmap->bs, Bn = c->B->cmap->n, Bnbs = Bn / bs, *Bowners; 17810eea884SHong Zhang IS garray_local, garray_gl; 1795483b11dSHong Zhang 180632d0f97SHong Zhang PetscFunctionBegin; 1819566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 182632d0f97SHong Zhang size = c->size; 183632d0f97SHong Zhang rank = c->rank; 184632d0f97SHong Zhang Mbs = c->Mbs; 185632d0f97SHong Zhang 1869566063dSJacob Faibussowitsch PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag1)); 1879566063dSJacob Faibussowitsch PetscCall(PetscObjectGetNewTag((PetscObject)C, &tag2)); 188c910923dSHong Zhang 189430a0127SHong Zhang /* create tables used in 190430a0127SHong Zhang step 1: table[i] - mark c->garray of proc [i] 19145f43ab7SHong Zhang step 3: table[i] - mark indices of is[i] when whose=MINE 192430a0127SHong Zhang table[0] - mark incideces of is[] when whose=OTHER */ 1932f613bf5SBarry Smith len = PetscMax(is_max, size); 1949566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(len, &table, (Mbs / PETSC_BITS_PER_BYTE + 1) * len, &t_p)); 195ad540459SPierre Jolivet for (i = 0; i < len; i++) table[i] = t_p + (Mbs / PETSC_BITS_PER_BYTE + 1) * i; 196430a0127SHong Zhang 1971c2dc1cbSBarry Smith PetscCall(MPIU_Allreduce(&is_max, &ois_max, 1, MPIU_INT, MPI_MAX, comm)); 19810eea884SHong Zhang 1995483b11dSHong Zhang /* 1. Send this processor's is[] to other processors */ 200e8527bf2SHong Zhang /* allocate spaces */ 2019566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(is_max, &n)); 2025483b11dSHong Zhang len = 0; 203c910923dSHong Zhang for (i = 0; i < is_max; i++) { 2049566063dSJacob Faibussowitsch PetscCall(ISGetLocalSize(is[i], &n[i])); 205632d0f97SHong Zhang len += n[i]; 206632d0f97SHong Zhang } 207958c9bccSBarry Smith if (!len) { 2085483b11dSHong Zhang is_max = 0; 2095483b11dSHong Zhang } else { 2105483b11dSHong Zhang len += 1 + is_max; /* max length of data1 for one processor */ 2115483b11dSHong Zhang } 2125483b11dSHong Zhang 2139566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size * len + 1, &data1)); 2149566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size, &data1_start)); 2155483b11dSHong Zhang for (i = 0; i < size; i++) data1_start[i] = data1 + i * len; 2165483b11dSHong Zhang 2179566063dSJacob Faibussowitsch PetscCall(PetscMalloc4(size, &len_s, size, &btable, size, &iwork, size + 1, &Bowners)); 218e8527bf2SHong Zhang 21976f244e2SHong Zhang /* gather c->garray from all processors */ 2209566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(comm, Bnbs, c->garray, PETSC_COPY_VALUES, &garray_local)); 2219566063dSJacob Faibussowitsch PetscCall(ISAllGather(garray_local, &garray_gl)); 2229566063dSJacob Faibussowitsch PetscCall(ISDestroy(&garray_local)); 2239566063dSJacob Faibussowitsch PetscCallMPI(MPI_Allgather(&Bnbs, 1, MPIU_INT, Bowners + 1, 1, MPIU_INT, comm)); 22426fbe8dcSKarl Rupp 22576f244e2SHong Zhang Bowners[0] = 0; 22676f244e2SHong Zhang for (i = 0; i < size; i++) Bowners[i + 1] += Bowners[i]; 22776f244e2SHong Zhang 2285483b11dSHong Zhang if (is_max) { 229430a0127SHong Zhang /* hash table ctable which maps c->row to proc_id) */ 2309566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(Mbs, &ctable)); 2315483b11dSHong Zhang for (proc_id = 0, j = 0; proc_id < size; proc_id++) { 23226fbe8dcSKarl Rupp for (; j < C->rmap->range[proc_id + 1] / bs; j++) ctable[j] = proc_id; 2335483b11dSHong Zhang } 2345483b11dSHong Zhang 235430a0127SHong Zhang /* hash tables marking c->garray */ 2369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(garray_gl, &idx_i)); 2375483b11dSHong Zhang for (i = 0; i < size; i++) { 238430a0127SHong Zhang table_i = table[i]; 2399566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table_i)); 240430a0127SHong Zhang for (j = Bowners[i]; j < Bowners[i + 1]; j++) { /* go through B cols of proc[i]*/ 2419566063dSJacob Faibussowitsch PetscCall(PetscBTSet(table_i, idx_i[j])); 2425483b11dSHong Zhang } 2435483b11dSHong Zhang } 2449566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(garray_gl, &idx_i)); 2455483b11dSHong Zhang } /* if (is_max) */ 2469566063dSJacob Faibussowitsch PetscCall(ISDestroy(&garray_gl)); 2475483b11dSHong Zhang 2485483b11dSHong Zhang /* evaluate communication - mesg to who, length, and buffer space */ 249e8527bf2SHong Zhang for (i = 0; i < size; i++) len_s[i] = 0; 2505483b11dSHong Zhang 2515483b11dSHong Zhang /* header of data1 */ 2525483b11dSHong Zhang for (proc_id = 0; proc_id < size; proc_id++) { 2535483b11dSHong Zhang iwork[proc_id] = 0; 2545483b11dSHong Zhang *data1_start[proc_id] = is_max; 2555483b11dSHong Zhang data1_start[proc_id]++; 2565483b11dSHong Zhang for (j = 0; j < is_max; j++) { 2575483b11dSHong Zhang if (proc_id == rank) { 2585483b11dSHong Zhang *data1_start[proc_id] = n[j]; 2595483b11dSHong Zhang } else { 2605483b11dSHong Zhang *data1_start[proc_id] = 0; 2615483b11dSHong Zhang } 2625483b11dSHong Zhang data1_start[proc_id]++; 2635483b11dSHong Zhang } 2645483b11dSHong Zhang } 2655483b11dSHong Zhang 2665483b11dSHong Zhang for (i = 0; i < is_max; i++) { 2679566063dSJacob Faibussowitsch PetscCall(ISGetIndices(is[i], &idx_i)); 2685483b11dSHong Zhang for (j = 0; j < n[i]; j++) { 2695483b11dSHong Zhang idx = idx_i[j]; 2709371c9d4SSatish Balay *data1_start[rank] = idx; 27135cb6cd3SPierre Jolivet data1_start[rank]++; /* for local processing */ 2725483b11dSHong Zhang proc_end = ctable[idx]; 2735483b11dSHong Zhang for (proc_id = 0; proc_id <= proc_end; proc_id++) { /* for others to process */ 274e8527bf2SHong Zhang if (proc_id == rank) continue; /* done before this loop */ 27526fbe8dcSKarl Rupp if (proc_id < proc_end && !PetscBTLookup(table[proc_id], idx)) continue; /* no need for sending idx to [proc_id] */ 2769371c9d4SSatish Balay *data1_start[proc_id] = idx; 2779371c9d4SSatish Balay data1_start[proc_id]++; 2785483b11dSHong Zhang len_s[proc_id]++; 2795483b11dSHong Zhang } 2805483b11dSHong Zhang } 2815483b11dSHong Zhang /* update header data */ 2822cfbe0a4SHong Zhang for (proc_id = 0; proc_id < size; proc_id++) { 2835483b11dSHong Zhang if (proc_id == rank) continue; 2845483b11dSHong Zhang *(data1 + proc_id * len + 1 + i) = len_s[proc_id] - iwork[proc_id]; 2855483b11dSHong Zhang iwork[proc_id] = len_s[proc_id]; 2865483b11dSHong Zhang } 2879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(is[i], &idx_i)); 288e8527bf2SHong Zhang } 2895483b11dSHong Zhang 2909371c9d4SSatish Balay nrqs = 0; 2919371c9d4SSatish Balay nrqr = 0; 2925483b11dSHong Zhang for (i = 0; i < size; i++) { 2935483b11dSHong Zhang data1_start[i] = data1 + i * len; 2945483b11dSHong Zhang if (len_s[i]) { 2955483b11dSHong Zhang nrqs++; 2965483b11dSHong Zhang len_s[i] += 1 + is_max; /* add no. of header msg */ 2975483b11dSHong Zhang } 2985483b11dSHong Zhang } 2995483b11dSHong Zhang 30048a46eb9SPierre Jolivet for (i = 0; i < is_max; i++) PetscCall(ISDestroy(&is[i])); 3019566063dSJacob Faibussowitsch PetscCall(PetscFree(n)); 3029566063dSJacob Faibussowitsch PetscCall(PetscFree(ctable)); 3035483b11dSHong Zhang 3045483b11dSHong Zhang /* Determine the number of messages to expect, their lengths, from from-ids */ 3059566063dSJacob Faibussowitsch PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrqr)); 3069566063dSJacob Faibussowitsch PetscCall(PetscGatherMessageLengths(comm, nrqs, nrqr, len_s, &id_r1, &len_r1)); 307632d0f97SHong Zhang 308632d0f97SHong Zhang /* Now post the sends */ 3099566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(size, &s_waits1, size, &s_waits2)); 310632d0f97SHong Zhang k = 0; 3115483b11dSHong Zhang for (proc_id = 0; proc_id < size; proc_id++) { /* send data1 to processor [proc_id] */ 3125483b11dSHong Zhang if (len_s[proc_id]) { 3139566063dSJacob Faibussowitsch PetscCallMPI(MPI_Isend(data1_start[proc_id], len_s[proc_id], MPIU_INT, proc_id, tag1, comm, s_waits1 + k)); 314632d0f97SHong Zhang k++; 315632d0f97SHong Zhang } 316632d0f97SHong Zhang } 317632d0f97SHong Zhang 31845f43ab7SHong Zhang /* 2. Receive other's is[] and process. Then send back */ 3195483b11dSHong Zhang len = 0; 3205483b11dSHong Zhang for (i = 0; i < nrqr; i++) { 3215483b11dSHong Zhang if (len_r1[i] > len) len = len_r1[i]; 3225483b11dSHong Zhang } 3239566063dSJacob Faibussowitsch PetscCall(PetscFree(len_r1)); 3249566063dSJacob Faibussowitsch PetscCall(PetscFree(id_r1)); 32545f43ab7SHong Zhang 32626fbe8dcSKarl Rupp for (proc_id = 0; proc_id < size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0; 32745f43ab7SHong Zhang 3289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len + 1, &odata1)); 3299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size, &odata2_ptr)); 3309566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(Mbs, &otable)); 33110eea884SHong Zhang 33210eea884SHong Zhang len_max = ois_max * (Mbs + 1); /* max space storing all is[] for each receive */ 333240e5896SHong Zhang len_est = 2 * len_max; /* estimated space of storing is[] for all receiving messages */ 3349566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len_est + 1, &odata2)); 33510eea884SHong Zhang nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */ 33626fbe8dcSKarl Rupp 337240e5896SHong Zhang odata2_ptr[nodata2] = odata2; 33826fbe8dcSKarl Rupp 33910eea884SHong Zhang len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */ 34010eea884SHong Zhang 341632d0f97SHong Zhang k = 0; 3425483b11dSHong Zhang while (k < nrqr) { 343632d0f97SHong Zhang /* Receive messages */ 3449566063dSJacob Faibussowitsch PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag1, comm, &flag, &r_status)); 345632d0f97SHong Zhang if (flag) { 3469566063dSJacob Faibussowitsch PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &len)); 347632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 3489566063dSJacob Faibussowitsch PetscCallMPI(MPI_Irecv(odata1, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req)); 3499566063dSJacob Faibussowitsch PetscCallMPI(MPI_Wait(&r_req, &r_status)); 350632d0f97SHong Zhang 351fc70d14eSHong Zhang /* Process messages */ 352240e5896SHong Zhang /* make sure there is enough unused space in odata2 array */ 35310eea884SHong Zhang if (len_unused < len_max) { /* allocate more space for odata2 */ 3549566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len_est + 1, &odata2)); 35526fbe8dcSKarl Rupp 356240e5896SHong Zhang odata2_ptr[++nodata2] = odata2; 35726fbe8dcSKarl Rupp 35810eea884SHong Zhang len_unused = len_est; 35910eea884SHong Zhang } 36010eea884SHong Zhang 3619566063dSJacob Faibussowitsch PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, odata1, OTHER, odata2, &otable)); 362a2a9f22aSHong Zhang len = 1 + odata2[0]; 36326fbe8dcSKarl Rupp for (i = 0; i < odata2[0]; i++) len += odata2[1 + i]; 364632d0f97SHong Zhang 365632d0f97SHong Zhang /* Send messages back */ 3669566063dSJacob Faibussowitsch PetscCallMPI(MPI_Isend(odata2, len, MPIU_INT, proc_id, tag2, comm, s_waits2 + k)); 367fc70d14eSHong Zhang k++; 36810eea884SHong Zhang odata2 += len; 36910eea884SHong Zhang len_unused -= len; 37045f43ab7SHong Zhang len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */ 371632d0f97SHong Zhang } 3725483b11dSHong Zhang } 3739566063dSJacob Faibussowitsch PetscCall(PetscFree(odata1)); 3749566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&otable)); 375632d0f97SHong Zhang 37645f43ab7SHong Zhang /* 3. Do local work on this processor's is[] */ 37745f43ab7SHong Zhang /* make sure there is enough unused space in odata2(=data) array */ 37845f43ab7SHong Zhang len_max = is_max * (Mbs + 1); /* max space storing all is[] for this processor */ 37910eea884SHong Zhang if (len_unused < len_max) { /* allocate more space for odata2 */ 3809566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(len_est + 1, &odata2)); 38126fbe8dcSKarl Rupp 382240e5896SHong Zhang odata2_ptr[++nodata2] = odata2; 38310eea884SHong Zhang } 384bfc6803cSHong Zhang 385240e5896SHong Zhang data = odata2; 3869566063dSJacob Faibussowitsch PetscCall(MatIncreaseOverlap_MPISBAIJ_Local(C, data1_start[rank], MINE, data, table)); 3879566063dSJacob Faibussowitsch PetscCall(PetscFree(data1_start)); 38845f43ab7SHong Zhang 38945f43ab7SHong Zhang /* 4. Receive work done on other processors, then merge */ 39045f43ab7SHong Zhang /* get max number of messages that this processor expects to recv */ 3911c2dc1cbSBarry Smith PetscCall(MPIU_Allreduce(len_s, iwork, size, MPI_INT, MPI_MAX, comm)); 3929566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(iwork[rank] + 1, &data2)); 3939566063dSJacob Faibussowitsch PetscCall(PetscFree4(len_s, btable, iwork, Bowners)); 39445f43ab7SHong Zhang 395632d0f97SHong Zhang k = 0; 3965483b11dSHong Zhang while (k < nrqs) { 397632d0f97SHong Zhang /* Receive messages */ 3989566063dSJacob Faibussowitsch PetscCallMPI(MPI_Iprobe(MPI_ANY_SOURCE, tag2, comm, &flag, &r_status)); 399632d0f97SHong Zhang if (flag) { 4009566063dSJacob Faibussowitsch PetscCallMPI(MPI_Get_count(&r_status, MPIU_INT, &len)); 40126fbe8dcSKarl Rupp 402632d0f97SHong Zhang proc_id = r_status.MPI_SOURCE; 40326fbe8dcSKarl Rupp 4049566063dSJacob Faibussowitsch PetscCallMPI(MPI_Irecv(data2, len, MPIU_INT, proc_id, r_status.MPI_TAG, comm, &r_req)); 4059566063dSJacob Faibussowitsch PetscCallMPI(MPI_Wait(&r_req, &r_status)); 4065483b11dSHong Zhang if (len > 1 + is_max) { /* Add data2 into data */ 4070472cc68SHong Zhang data2_i = data2 + 1 + is_max; 408fc70d14eSHong Zhang for (i = 0; i < is_max; i++) { 409fc70d14eSHong Zhang table_i = table[i]; 410bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs * i; 411bfc6803cSHong Zhang isz = data[1 + i]; 4120472cc68SHong Zhang for (j = 0; j < data2[1 + i]; j++) { 4130472cc68SHong Zhang col = data2_i[j]; 41426fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, col)) data_i[isz++] = col; 415fc70d14eSHong Zhang } 416bfc6803cSHong Zhang data[1 + i] = isz; 4170472cc68SHong Zhang if (i < is_max - 1) data2_i += data2[1 + i]; 418fc70d14eSHong Zhang } 4195483b11dSHong Zhang } 420632d0f97SHong Zhang k++; 421632d0f97SHong Zhang } 4225483b11dSHong Zhang } 4239566063dSJacob Faibussowitsch PetscCall(PetscFree(data2)); 4249566063dSJacob Faibussowitsch PetscCall(PetscFree2(table, t_p)); 4255483b11dSHong Zhang 4265483b11dSHong Zhang /* phase 1 sends are complete */ 4279566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(size, &s_status)); 4289566063dSJacob Faibussowitsch if (nrqs) PetscCallMPI(MPI_Waitall(nrqs, s_waits1, s_status)); 4299566063dSJacob Faibussowitsch PetscCall(PetscFree(data1)); 4305483b11dSHong Zhang 431240e5896SHong Zhang /* phase 2 sends are complete */ 4329566063dSJacob Faibussowitsch if (nrqr) PetscCallMPI(MPI_Waitall(nrqr, s_waits2, s_status)); 4339566063dSJacob Faibussowitsch PetscCall(PetscFree2(s_waits1, s_waits2)); 4349566063dSJacob Faibussowitsch PetscCall(PetscFree(s_status)); 435632d0f97SHong Zhang 436c910923dSHong Zhang /* 5. Create new is[] */ 437c910923dSHong Zhang for (i = 0; i < is_max; i++) { 438bfc6803cSHong Zhang data_i = data + 1 + is_max + Mbs * i; 4399566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, data[1 + i], data_i, PETSC_COPY_VALUES, is + i)); 440632d0f97SHong Zhang } 44148a46eb9SPierre Jolivet for (k = 0; k <= nodata2; k++) PetscCall(PetscFree(odata2_ptr[k])); 4429566063dSJacob Faibussowitsch PetscCall(PetscFree(odata2_ptr)); 4433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 444632d0f97SHong Zhang } 445632d0f97SHong Zhang 446632d0f97SHong Zhang /* 447dc008846SHong Zhang MatIncreaseOverlap_MPISBAIJ_Local - Called by MatIncreaseOverlap, to do 448632d0f97SHong Zhang the work on the local processor. 449632d0f97SHong Zhang 450632d0f97SHong Zhang Inputs: 451632d0f97SHong Zhang C - MAT_MPISBAIJ; 452bfc6803cSHong Zhang data - holds is[]. See MatIncreaseOverlap_MPISBAIJ_Once() for the format. 453bfc6803cSHong Zhang whose - whose is[] to be processed, 454bfc6803cSHong Zhang MINE: this processor's is[] 455bfc6803cSHong Zhang OTHER: other processor's is[] 456632d0f97SHong Zhang Output: 45710eea884SHong Zhang nidx - whose = MINE: 4580472cc68SHong Zhang holds input and newly found indices in the same format as data 4590472cc68SHong Zhang whose = OTHER: 4600472cc68SHong Zhang only holds the newly found indices 4610472cc68SHong Zhang table - table[i]: mark the indices of is[i], i=0,...,is_max. Used only in the case 'whose=MINE'. 462632d0f97SHong Zhang */ 46376f244e2SHong Zhang /* Would computation be reduced by swapping the loop 'for each is' and 'for each row'? */ 464d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Local(Mat C, PetscInt *data, PetscInt whose, PetscInt *nidx, PetscBT *table) 465d71ae5a4SJacob Faibussowitsch { 466632d0f97SHong Zhang Mat_MPISBAIJ *c = (Mat_MPISBAIJ *)C->data; 467dc008846SHong Zhang Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)(c->A)->data; 468dc008846SHong Zhang Mat_SeqBAIJ *b = (Mat_SeqBAIJ *)(c->B)->data; 46913f74950SBarry Smith PetscInt row, mbs, Mbs, *nidx_i, col, col_max, isz, isz0, *ai, *aj, *bi, *bj, *garray, rstart, l; 47013f74950SBarry Smith PetscInt a_start, a_end, b_start, b_end, i, j, k, is_max, *idx_i, n; 471bfc6803cSHong Zhang PetscBT table0; /* mark the indices of input is[] for look up */ 472da81f932SPierre Jolivet PetscBT table_i; /* points to i-th table. When whose=OTHER, a single table is used for all is[] */ 473632d0f97SHong Zhang 474632d0f97SHong Zhang PetscFunctionBegin; 4759371c9d4SSatish Balay Mbs = c->Mbs; 4769371c9d4SSatish Balay mbs = a->mbs; 4779371c9d4SSatish Balay ai = a->i; 4789371c9d4SSatish Balay aj = a->j; 4799371c9d4SSatish Balay bi = b->i; 4809371c9d4SSatish Balay bj = b->j; 481632d0f97SHong Zhang garray = c->garray; 482899cda47SBarry Smith rstart = c->rstartbs; 483dc008846SHong Zhang is_max = data[0]; 484632d0f97SHong Zhang 4859566063dSJacob Faibussowitsch PetscCall(PetscBTCreate(Mbs, &table0)); 486fc70d14eSHong Zhang 487fc70d14eSHong Zhang nidx[0] = is_max; 488fc70d14eSHong Zhang idx_i = data + is_max + 1; /* ptr to input is[0] array */ 489bfc6803cSHong Zhang nidx_i = nidx + is_max + 1; /* ptr to output is[0] array */ 490dc008846SHong Zhang for (i = 0; i < is_max; i++) { /* for each is */ 491dc008846SHong Zhang isz = 0; 492fc70d14eSHong Zhang n = data[1 + i]; /* size of input is[i] */ 493dc008846SHong Zhang 49476f244e2SHong Zhang /* initialize and set table_i(mark idx and nidx) and table0(only mark idx) */ 495bfc6803cSHong Zhang if (whose == MINE) { /* process this processor's is[] */ 496bfc6803cSHong Zhang table_i = table[i]; 4970472cc68SHong Zhang nidx_i = nidx + 1 + is_max + Mbs * i; 498bfc6803cSHong Zhang } else { /* process other processor's is[] - only use one temp table */ 499430a0127SHong Zhang table_i = table[0]; 500bfc6803cSHong Zhang } 5019566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table_i)); 5029566063dSJacob Faibussowitsch PetscCall(PetscBTMemzero(Mbs, table0)); 50376f244e2SHong Zhang if (n == 0) { 50476f244e2SHong Zhang nidx[1 + i] = 0; /* size of new is[i] */ 50576f244e2SHong Zhang continue; 50676f244e2SHong Zhang } 50776f244e2SHong Zhang 5089371c9d4SSatish Balay isz0 = 0; 5099371c9d4SSatish Balay col_max = 0; 510dc008846SHong Zhang for (j = 0; j < n; j++) { 511dc008846SHong Zhang col = idx_i[j]; 51208401ef6SPierre Jolivet PetscCheck(col < Mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "index col %" PetscInt_FMT " >= Mbs %" PetscInt_FMT, col, Mbs); 513bfc6803cSHong Zhang if (!PetscBTLookupSet(table_i, col)) { 5149566063dSJacob Faibussowitsch PetscCall(PetscBTSet(table0, col)); 51526fbe8dcSKarl Rupp if (whose == MINE) nidx_i[isz0] = col; 516dbe03f88SHong Zhang if (col_max < col) col_max = col; 517bfc6803cSHong Zhang isz0++; 518bfc6803cSHong Zhang } 519632d0f97SHong Zhang } 520dc008846SHong Zhang 52126fbe8dcSKarl Rupp if (whose == MINE) isz = isz0; 522fc70d14eSHong Zhang k = 0; /* no. of indices from input is[i] that have been examined */ 523dc008846SHong Zhang for (row = 0; row < mbs; row++) { 5249371c9d4SSatish Balay a_start = ai[row]; 5259371c9d4SSatish Balay a_end = ai[row + 1]; 5269371c9d4SSatish Balay b_start = bi[row]; 5279371c9d4SSatish Balay b_end = bi[row + 1]; 5280472cc68SHong Zhang if (PetscBTLookup(table0, row + rstart)) { /* row is on input is[i]: 5290472cc68SHong Zhang do row search: collect all col in this row */ 530dc008846SHong Zhang for (l = a_start; l < a_end; l++) { /* Amat */ 531dc008846SHong Zhang col = aj[l] + rstart; 53226fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col; 533dc008846SHong Zhang } 534dc008846SHong Zhang for (l = b_start; l < b_end; l++) { /* Bmat */ 535dc008846SHong Zhang col = garray[bj[l]]; 53626fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, col)) nidx_i[isz++] = col; 537dc008846SHong Zhang } 538dc008846SHong Zhang k++; 539dc008846SHong Zhang if (k >= isz0) break; /* for (row=0; row<mbs; row++) */ 5400472cc68SHong Zhang } else { /* row is not on input is[i]: 541da81f932SPierre Jolivet do col search: add row onto nidx_i if there is a col in nidx_i */ 542dc008846SHong Zhang for (l = a_start; l < a_end; l++) { /* Amat */ 543dc008846SHong Zhang col = aj[l] + rstart; 54476f244e2SHong Zhang if (col > col_max) break; 545dc008846SHong Zhang if (PetscBTLookup(table0, col)) { 54626fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart; 547dc008846SHong Zhang break; /* for l = start; l<end ; l++) */ 548632d0f97SHong Zhang } 549632d0f97SHong Zhang } 550dc008846SHong Zhang for (l = b_start; l < b_end; l++) { /* Bmat */ 551dc008846SHong Zhang col = garray[bj[l]]; 55276f244e2SHong Zhang if (col > col_max) break; 553dc008846SHong Zhang if (PetscBTLookup(table0, col)) { 55426fbe8dcSKarl Rupp if (!PetscBTLookupSet(table_i, row + rstart)) nidx_i[isz++] = row + rstart; 555dc008846SHong Zhang break; /* for l = start; l<end ; l++) */ 556632d0f97SHong Zhang } 557dc008846SHong Zhang } 558dc008846SHong Zhang } 559bfc6803cSHong Zhang } 560dc008846SHong Zhang 561dc008846SHong Zhang if (i < is_max - 1) { 562fc70d14eSHong Zhang idx_i += n; /* ptr to input is[i+1] array */ 563bfc6803cSHong Zhang nidx_i += isz; /* ptr to output is[i+1] array */ 564dc008846SHong Zhang } 565fc70d14eSHong Zhang nidx[1 + i] = isz; /* size of new is[i] */ 5661ab97a2aSSatish Balay } /* for each is */ 5679566063dSJacob Faibussowitsch PetscCall(PetscBTDestroy(&table0)); 5683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 569632d0f97SHong Zhang } 570