/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt*,PetscInt*,PetscInt*,PetscInt**,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt*);

PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
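/*
   Usage sketch (hypothetical caller, not part of this file): these drivers are
   normally reached through the public MatIncreaseOverlap() interface, e.g.

     Mat C;      // assembled MPIAIJ matrix
     IS  is[2];  // index sets of rows, e.g. subdomains of an additive Schwarz method
     ierr = MatIncreaseOverlap(C,2,is,1);CHKERRQ(ierr);  // grow each is[] by one layer

   Both drivers simply apply the corresponding one-layer expansion ov times; the
   scalable variant below avoids work arrays proportional to the global matrix size.
*/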
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get the row map to determine where rows should go */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and gather it together so that we can optimize communication */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr     = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr     = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr   = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr   = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages;
   * generally speaking, we do not need to exchange data when the overlap is 1
   */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages; this usually corresponds to overlap 1 */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has known bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send back the rows belonging to the remote processors so that we can retrieve the overlapping data */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has known bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
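/*
   Illustration of the payload built above (hypothetical data): todata is a flat
   array of <is id, row> pairs grouped by destination rank. If this rank must ask
   rank 2 about rows 10 and 17 of is[0] and row 40 of is[1], the slice shipped to
   rank 2 is

     todata = { 0,10, 0,17, 1,40 }     with tosizes[2*i] = 6 for that rank,

   and the receiver walks the buffer two entries at a time to recover the pairs.
*/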
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx,IS is[],PetscInt nrecvs,PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id,data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp,*indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr      = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp,isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get the row id and its overlap */
  for (i=0; i<nrecvs; ) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j<data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
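/*
   Illustration of the decoding above (hypothetical data): recvdata is a sequence
   of triples <is id, number of rows, rows...>. For example

     recvdata = { 1,3, 7,8,12,  0,1, 5 }

   appends rows {7,8,12} to is[1] and row {5} to is[0]; PetscSortRemoveDupsInt()
   then removes any rows already present before the new ISs are created.
*/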
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx,PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes,PetscInt *fromrows,PetscInt **sbrowsizes,PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* even if the matrix is symmetric, we still treat it as nonsymmetric here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* the total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* is there a better way to estimate the number of nonzeros in the matrix? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate the space needed so as to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i=0; i<nfrom; i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
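/*
   The column harvest used above (and again in MatIncreaseOverlap_MPIAIJ_Local_Scalable
   below) follows the standard MPIAIJ storage split: for a locally owned row r, the
   global column indices are the diagonal block columns shifted by cstart plus the
   off-diagonal block columns mapped through the gcols/garray table. A minimal sketch:

     for (k=ai[r]; k<ai[r+1]; k++) cols[n++] = aj[k] + cstart;  // A (diagonal) part
     for (k=bi[r]; k<bi[r+1]; k++) cols[n++] = gcols[bj[k]];    // B (off-diagonal) part
*/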
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj,*indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* this is a better way to estimate memory than the old implementation,
   * which used the global size of the matrix
   */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr      = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr      = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
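/*
   Worked example (hypothetical): for the tridiagonal matrix of a 1D Laplacian,
   one sweep of the routine above maps the index set {2,3,4} to {1,2,3,4,5},
   since rows 2 and 4 reference columns 1 and 5 respectively; repeating the sweep
   ov times (see the drivers at the top of this file) grows the set by ov layers.
*/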
/*
  Sample message format:
  If processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5]:

  mesg [0] = 2    (number of index sets in the message)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5 ... n]   data for is[1]
  -----------
  mesg [n+1 ... m] data for is[5]
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be or have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - message to whom, length of message, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialize work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* number of outgoing messages */
  msz      = 0; /* total message length (for all procs) */
  w1[rank] = 0; /* no message sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(number of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  /* TODO: use PetscCalloc2() */
  ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax,&Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Could optimize this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(iscomms);CHKERRQ(ierr);
  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
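/*
   Protocol summary for the routine above (a restatement, not new behavior):
   Phase 1 (tag1): each rank sends the rows it does not own to their owners.
   Phase 2 (tag2): each owner expands the received rows by their column adjacency
   (MatIncreaseOverlap_MPIAIJ_Receive) and ships the result back; the replies are
   then merged into the local bit tables before the new index sets are created.
*/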
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
    C      - MAT_MPIAIJ;
    imax   - total number of index sets processed at a time;
    table  - an array of char - size = m bits.

   Output:
    isz    - array containing the count of the solution elements corresponding
             to each index set;
    data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
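/*
   A note on the membership test used above: PetscBTLookupSet() returns the
   previous state of the bit and then sets it, so each global column enters the
   output list at most once. A minimal user-side sketch of the same pattern
   (hypothetical bitmap bt of global width M, hypothetical list/nlist):

     PetscBT bt;
     ierr = PetscBTCreate(M,&bt);CHKERRQ(ierr);
     if (!PetscBTLookupSet(bt,col)) { list[nlist++] = col; }  // first occurrence only
     ierr = PetscBTDestroy(&bt);CHKERRQ(ierr);
*/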
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
     C    - the matrix
     nrqr - number of messages being processed.
     rbuf - an array of pointers to the received requests

   Output:
     xdata - array of messages to be sent back
     isz1  - size of each message

   For better efficiency perhaps we should malloc each xdata[i] separately;
   then if a realloc is required we need only copy the data for that one row
   rather than for all previous rows, as is done now where a single large chunk
   of memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
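/*
   The reply layout produced above mirrors the request header: for message i,
     xdata[i] = { nIS, <is no, count> per IS, then the expanded rows }.
   E.g. (hypothetical data) a reply covering one IS, is[0], whose requested rows
   expand to {3,4,9}, is { 1, 0, 3, 3,4,9 } with isz1[i] = 6. All replies share
   one allocation that is grown by a factor of 1.5 when the estimate is exceeded.
*/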
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note that the numerical values are not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
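/*
   Usage sketch (hypothetical, under the MAT_INITIAL_MATRIX/MAT_REUSE_MATRIX
   convention used throughout PETSc): a caller that wants every process to own a
   sequential copy of the whole matrix might do

     Mat *Bseq;
     ierr = MatCreateSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_INITIAL_MATRIX,&Bseq);CHKERRQ(ierr);
     ...                          // Bseq[0] holds all rows and columns of A
     ierr = MatCreateSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_REUSE_MATRIX,&Bseq);CHKERRQ(ierr);

   where the reuse call refreshes only the numerical values, skipping the
   structural gathers.
*/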
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai = a->i,*aj = a->j,*bi = b->i,*bj = b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable     cmap,rmap;
  PetscInt       *cmap_loc,*rmap_loc;
#else
  PetscInt       *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals = NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt    *onodes1,*olengths1,idex,end;
  Mat_SubSppt    *smatis1;
  PetscBool      isrowsorted,iscolsorted;

  PetscFunctionBegin;
  if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");

  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
  ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
  ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
  ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
  if (allcolumns) {
    icol = NULL;
    ncol = C->cmap->N;
  } else {
    ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - message to whom, length of message, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);

    /* w1[proc] = num of rows owned by proc -- to be requested */
    proc = 0;
    nrqs = 0; /* num of outgoing messages */
    for (j=0; j<nrow; j++) {
      row = irow[j];
      if (!isrowsorted) proc = 0;
      while (row >= C->rmap->range[proc+1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */

      if (proc != rank && !w2[proc]) {
        w2[proc] = 1; nrqs++;
      }
    }
    w1[rank] = 0; /* rows owned by self will not be requested */

    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc-array) */
    for (proc=0,j=0; proc<size; proc++) {
      if (w1[proc]) { pa[j++] = proc;}
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
    msz = 0; /* total message length (for all procs) */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      w1[proc] += 3;
      msz      += w1[proc];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine nrqr, the number of messages to expect, their lengths, and from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);

    /* Input:  nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
    ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

    /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i=0; i<nrqs; i++) {
      proc        = pa[i];
      sbuf1[proc] = iptr;
      iptr       += w1[proc];
    }

    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      ierr      = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
      ptr[proc] = sbuf1[proc] + 3;
    }

    /* Parse the isrow and copy data into outbuf */
    ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
    for (j=0; j<nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf */
        *ptr[proc] = irow[j];
        ctr[proc]++; ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Could optimize this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = 0;
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);

    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree2(w1,w2);CHKERRQ(ierr);

    /* Send to other procs the buf size they should allocate */
    /* Receive messages */
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
    for (i=0; i<nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i     = rbuf1[i];
      start       = 2*rbuf1_i[0] + 1;
      ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
      sbuf2_i     = sbuf2[i];
      for (j=start; j<end; j++) {
        k            = rbuf1_i[j] - rstart;
        ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
        sbuf2_i[j]   = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);

    /* rbuf2 is received, Post recv column indices a->j */
    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);

    ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_status1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
    ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    for (i=0; i<nrqr; i++) { /* for each requested message */
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k=0; k<kmax; k++,ct1++) { /* for each row */
        row    = rbuf1_i[ct1] - rstart;
        nzA    = ai[row+1] - ai[row];
        nzB    = bi[row+1] - bi[row];
        ncols  = nzA + nzB;
        cworkA = aj + ai[row]; cworkB = bj + bi[row];

        /* load the column indices for this row into cols */
        cols = sbuf_aj_i + ct2;

        lwrite = 0;
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
    }

    /* create column map (cmap): global col of C -> local col of submat */
#if defined(PETSC_USE_CTABLE)
    if (!allcolumns) {
      ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
      ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
        if (icol[j] >= cstart && icol[j] < cend) {
          cmap_loc[icol[j] - cstart] = j+1;
        } else { /* use PetscTable for non-local col indices */
          ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      }
    } else {
      cmap     = NULL;
      cmap_loc = NULL;
    }
    ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
#else
    if (!allcolumns) {
      ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
    } else {
      cmap = NULL;
    }
#endif

    /* Create lens for MatSeqAIJSetPreallocation() */
    ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);

    /* Compute lens from local part of C */
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) {
        /* diagonal part A = c->A */
        ncols = ai[row-rstart+1] - ai[row-rstart];
        cols  = aj + ai[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            tcol = cmap_loc[cols[k]];
#else
            tcol = cmap[cols[k]+cstart];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] = ncols;
        }

        /* off-diagonal part B = c->B */
        ncols = bi[row-rstart+1] - bi[row-rstart];
        cols  = bj + bi[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap[bmap[cols[k]]];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] += ncols;
        }
      }
    }

    /* Create row map (rmap): global row of C -> local row of submat */
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) { /* a local row */
        rmap_loc[row - rstart] = j;
      } else {
        ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
    for (j=0; j<nrow; j++) {
      rmap[irow[j]] = j;
    }
#endif

    /* Update lens from offproc data */
    /* recv a->j is done */
    ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
    for (i=0; i<nrqs; i++) {
      proc    = pa[i];
      sbuf1_i = sbuf1[proc];
      /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
      ct1     = 2 + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i]; /* received length of C->j */
      rbuf3_i = rbuf3[i]; /* received C->j */

      /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
      max1 = sbuf1_i[2];
      for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
        row--;
        if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
        row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
#endif
        /* Now, store the row index of submat in sbuf1_i[ct1] */
        sbuf1_i[ct1] = row;

        nnz = rbuf2_i[ct1];
        if (!allcolumns) {
          for (l=0; l<nnz; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
              tcol = cmap_loc[rbuf3_i[ct2] - cstart];
            } else {
              ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            }
#else
            tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
#endif
            if (tcol) lens[row]++;
          }
        } else { /* allcolumns */
          lens[row] += nnz;
        }
      }
    }
    ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);

    /* Create the submatrices */
    ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
    ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

    ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
    ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
    ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
    ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);

    /* create struct Mat_SubSppt and attach it to submat */
    ierr = PetscNew(&smatis1);CHKERRQ(ierr);
    subc = (Mat_SeqAIJ*)submat->data;
    subc->submatis1 = smatis1;

    smatis1->id          = 0;
    smatis1->nrqs        = nrqs;
    smatis1->nrqr        = nrqr;
    smatis1->rbuf1       = rbuf1;
    smatis1->rbuf2       = rbuf2;
    smatis1->rbuf3       = rbuf3;
    smatis1->sbuf2       = sbuf2;
    smatis1->req_source2 = req_source2;

    smatis1->sbuf1       = sbuf1;
    smatis1->ptr         = ptr;
    smatis1->tmp         = tmp;
    smatis1->ctr         = ctr;

    smatis1->pa          = pa;
    smatis1->req_size    = req_size;
    smatis1->req_source1 = req_source1;

    smatis1->allcolumns  = allcolumns;
    smatis1->singleis    = PETSC_TRUE;
    smatis1->row2proc    = row2proc;
    smatis1->rmap        = rmap;
    smatis1->cmap        = cmap;
#if defined(PETSC_USE_CTABLE)
    smatis1->rmap_loc    = rmap_loc;
    smatis1->cmap_loc    = cmap_loc;
#endif

    smatis1->destroy     = submat->ops->destroy;
    submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
    submat->factortype   = C->factortype;

    /* compute rmax */
    rmax = 0;
    for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
  } else { /* scall == MAT_REUSE_MATRIX */
    submat = submats[0];
    if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong size");

    subc    = (Mat_SeqAIJ*)submat->data;
    rmax    = subc->rmax;
    smatis1 = subc->submatis1;
    nrqs    = smatis1->nrqs;
    nrqr    = smatis1->nrqr;
    rbuf1   = smatis1->rbuf1;
    rbuf2   = smatis1->rbuf2;
    rbuf3   = smatis1->rbuf3;
    req_source2 = smatis1->req_source2;

    sbuf1 = smatis1->sbuf1;
    sbuf2 = smatis1->sbuf2;
    ptr   = smatis1->ptr;
    tmp   = smatis1->tmp;
    ctr   = smatis1->ctr;

    pa          = smatis1->pa;
    req_size    = smatis1->req_size;
    req_source1 = smatis1->req_source1;

    allcolumns = smatis1->allcolumns;
    row2proc   = smatis1->row2proc;
    rmap       = smatis1->rmap;
    cmap       = smatis1->cmap;
#if defined(PETSC_USE_CTABLE)
    rmap_loc = smatis1->rmap_loc;
    cmap_loc = smatis1->cmap_loc;
#endif
  }

  /* Post recv matrix values */
  ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr);
  ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  for (i=0; i<nrqr; i++) {
    rbuf1_i   = rbuf1[i];
    sbuf_aa_i = sbuf_aa[i];
    ct1       = 2*rbuf1_i[0]+1;
    ct2       = 0;
    /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

    kmax = rbuf1_i[2];
    for (k=0; k<kmax; k++,ct1++) {
      row    = rbuf1_i[ct1] - rstart;
      nzA    = ai[row+1] - ai[row];
      nzB    = bi[row+1] - bi[row];
      ncols  = nzA + nzB;
      cworkB = bj + bi[row];
      vworkA = a_a + ai[row];
      vworkB = b_a + bi[row];

      /* load the column values for this row into vals */
      vals = sbuf_aa_i + ct2;

      lwrite = 0;
      for (l=0; l<nzB; l++) {
        if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
      }
      for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
      for (l=0; l<nzB; l++) {
        if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
      }

      ct2 += ncols;
    }
    ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
  }

  /* Assemble submat */
  /* First assemble the local rows */
  for (j=0; j<nrow; j++) {
    row  = irow[j];
    proc = row2proc[j];
    if (proc == rank) {
      Crow = row - rstart; /* local row index of C */
#if defined(PETSC_USE_CTABLE)
      row = rmap_loc[Crow]; /* row index of submat */
#else
      row = rmap[row];
#endif

      if (allcolumns) {
        /* diagonal part A = c->A */
        ncols = ai[Crow+1] - ai[Crow];
        cols  = aj + ai[Crow];
        vals  = a->a + ai[Crow];
        i     = 0;
        for (k=0; k<ncols; k++) {
          subcols[i]   = cols[k] + cstart;
          subvals[i++] = vals[k];
        }

        /* off-diagonal part B = c->B */
        ncols = bi[Crow+1] - bi[Crow];
        cols  = bj + bi[Crow];
        vals  = b->a + bi[Crow];
        for (k=0; k<ncols; k++) {
          subcols[i]   = bmap[cols[k]];
          subvals[i++] = vals[k];
        }

        ierr =
MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1795 1796 } else { /* !allcolumns */ 1797 #if defined(PETSC_USE_CTABLE) 1798 /* diagonal part A = c->A */ 1799 ncols = ai[Crow+1] - ai[Crow]; 1800 cols = aj + ai[Crow]; 1801 vals = a->a + ai[Crow]; 1802 i = 0; 1803 for (k=0; k<ncols; k++) { 1804 tcol = cmap_loc[cols[k]]; 1805 if (tcol) { 1806 subcols[i] = --tcol; 1807 subvals[i++] = vals[k]; 1808 } 1809 } 1810 1811 /* off-diagonal part B = c->B */ 1812 ncols = bi[Crow+1] - bi[Crow]; 1813 cols = bj + bi[Crow]; 1814 vals = b->a + bi[Crow]; 1815 for (k=0; k<ncols; k++) { 1816 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1817 if (tcol) { 1818 subcols[i] = --tcol; 1819 subvals[i++] = vals[k]; 1820 } 1821 } 1822 #else 1823 /* diagonal part A = c->A */ 1824 ncols = ai[Crow+1] - ai[Crow]; 1825 cols = aj + ai[Crow]; 1826 vals = a->a + ai[Crow]; 1827 i = 0; 1828 for (k=0; k<ncols; k++) { 1829 tcol = cmap[cols[k]+cstart]; 1830 if (tcol) { 1831 subcols[i] = --tcol; 1832 subvals[i++] = vals[k]; 1833 } 1834 } 1835 1836 /* off-diagonal part B = c->B */ 1837 ncols = bi[Crow+1] - bi[Crow]; 1838 cols = bj + bi[Crow]; 1839 vals = b->a + bi[Crow]; 1840 for (k=0; k<ncols; k++) { 1841 tcol = cmap[bmap[cols[k]]]; 1842 if (tcol) { 1843 subcols[i] = --tcol; 1844 subvals[i++] = vals[k]; 1845 } 1846 } 1847 #endif 1848 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1849 } 1850 } 1851 } 1852 1853 /* Now assemble the off-proc rows */ 1854 for (i=0; i<nrqs; i++) { /* for each requested message */ 1855 /* recv values from other processes */ 1856 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1857 proc = pa[idex]; 1858 sbuf1_i = sbuf1[proc]; 1859 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1860 ct1 = 2 + 1; 1861 ct2 = 0; /* count of received C->j */ 1862 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1863 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1864 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1865 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1866 1867 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1868 max1 = sbuf1_i[2]; /* num of rows */ 1869 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1870 row = sbuf1_i[ct1]; /* row index of submat */ 1871 if (!allcolumns) { 1872 idex = 0; 1873 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1874 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1875 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1876 #if defined(PETSC_USE_CTABLE) 1877 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1878 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1879 } else { 1880 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1881 } 1882 #else 1883 tcol = cmap[rbuf3_i[ct2]]; 1884 #endif 1885 if (tcol) { 1886 subcols[idex] = --tcol; /* may not be sorted */ 1887 subvals[idex++] = rbuf4_i[ct2]; 1888 1889 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
For reuse, we overwrite the received C->j entries with the positions (in the receive
               buffers) of the values that must be inserted into submat */
            if (iscolsorted) rbuf3_i[ct3++] = ct2;
          }
        }
        ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
      } else { /* scall == MAT_REUSE_MATRIX */
        submat = submats[0];
        subc   = (Mat_SeqAIJ*)submat->data;

        nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
        for (l=0; l<nnz; l++) {
          ct2 = rbuf3_i[ct3++]; /* position of the rbuf4_i[] entry which needs to be inserted into submat */
          subvals[idex++] = rbuf4_i[ct2];
        }

        bj = subc->j + subc->i[row]; /* sorted column indices */
        ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
      }
    } else { /* allcolumns */
      nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
      ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
      ct2 += nnz;
    }
  }
}

/* sends of a->a are complete */
ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);

ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
submats[0] = submat;

/* Restore the indices */
ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
if (!allcolumns) {
  ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
}

/* Destroy allocated memory */
for (i=0; i<nrqs; ++i) {
  ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); /* each rbuf4[i] was obtained with its own PetscMalloc1() */
}
ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

if (scall == MAT_INITIAL_MATRIX) {
  ierr = PetscFree(lens);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
}
PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       ncol;
  PetscBool      colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
  ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
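    /*
       A minimal caller-side sketch of how this fast path is reached (illustrative only;
       PCSetUp_ASM() does the equivalent internally, and is/ov/submats below are placeholders):

           IS  is;        -- one local index set per process, e.g. an ASM subdomain
           Mat *submats;
           ierr = MatIncreaseOverlap(C,1,&is,ov);CHKERRQ(ierr);
           C->submat_singleis = PETSC_TRUE;   -- skip the MPIU_Allreduce() below
           ierr = MatCreateSubMatrices(C,1,&is,&is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
    */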
    C->submat_singleis = PETSC_FALSE; /* restore the default value in case C is later used with multiple index sets */
    PetscFunctionReturn(0);
  }

  /* Determine nmax, the maximum number of submatrices extracted per stage, so that one stage
     needs roughly 20*1000000 bytes of PetscInt workspace */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done.
       Each stage will extract nmax submatrices; nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
       For example, with 8-byte PetscInt and C->cmap->N = 1000000, nmax = 20*1000000/(1000000*8) = 2,
       so ismax = 5 requires nstages = 3.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* The MPI_MAX of the negated flag implements a logical AND: wantallmatrix survives only if
       every process wants the whole matrix, while nstages becomes the global maximum */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...)
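         -- i.e., the initial extraction took the wantallmatrix fast path, which attaches no
         Mat_SubSppt context, so the reuse call must take that same path again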
*/ 2024 wantallmatrix = PETSC_TRUE; 2025 } else if (smat->singleis) { 2026 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2027 PetscFunctionReturn(0); 2028 } else { 2029 nstages = smat->nstages; 2030 } 2031 } 2032 2033 if (wantallmatrix) { 2034 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2035 PetscFunctionReturn(0); 2036 } 2037 2038 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2039 if (scall == MAT_INITIAL_MATRIX) { 2040 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2041 } 2042 2043 for (i=0,pos=0; i<nstages; i++) { 2044 if (pos+nmax <= ismax) max_no = nmax; 2045 else if (pos >= ismax) max_no = 0; 2046 else max_no = ismax-pos; 2047 2048 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2049 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2050 smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++; 2051 smat->nstages = nstages; 2052 } 2053 pos += max_no; 2054 } 2055 2056 if (ismax && scall == MAT_INITIAL_MATRIX) { 2057 /* save nstages for reuse */ 2058 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2059 smat = subc->submatis1; 2060 smat->nstages = nstages; 2061 } 2062 PetscFunctionReturn(0); 2063 } 2064 2065 /* -------------------------------------------------------------------------*/ 2066 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2067 { 2068 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2069 Mat A = c->A; 2070 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2071 const PetscInt **icol,**irow; 2072 PetscInt *nrow,*ncol,start; 2073 PetscErrorCode ierr; 2074 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2075 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2076 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2077 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2078 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2079 #if defined(PETSC_USE_CTABLE) 2080 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2081 #else 2082 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2083 #endif 2084 const PetscInt *irow_i; 2085 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2086 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2087 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2088 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2089 MPI_Status *r_status3,*r_status4,*s_status4; 2090 MPI_Comm comm; 2091 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2092 PetscMPIInt *onodes1,*olengths1,end; 2093 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2094 Mat_SubSppt *smat_i; 2095 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2096 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2097 2098 PetscFunctionBegin; 2099 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2100 size = c->size; 2101 rank = c->rank; 2102 2103 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2104 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2105 2106 for (i=0; i<ismax; i++) { 2107 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2108 if (!issorted[i]) iscsorted = issorted[i]; 2109 2110 ierr = 
ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2111 2112 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2113 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2114 2115 /* Check for special case: allcolumn */ 2116 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2117 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2118 if (colflag && ncol[i] == C->cmap->N) { 2119 allcolumns[i] = PETSC_TRUE; 2120 icol[i] = NULL; 2121 } else { 2122 allcolumns[i] = PETSC_FALSE; 2123 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2124 } 2125 } 2126 2127 if (scall == MAT_REUSE_MATRIX) { 2128 /* Assumes new rows are same length as the old rows */ 2129 for (i=0; i<ismax; i++) { 2130 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2131 subc = (Mat_SeqAIJ*)submats[i]->data; 2132 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2133 2134 /* Initial matrix as if empty */ 2135 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2136 2137 smat_i = subc->submatis1; 2138 2139 nrqs = smat_i->nrqs; 2140 nrqr = smat_i->nrqr; 2141 rbuf1 = smat_i->rbuf1; 2142 rbuf2 = smat_i->rbuf2; 2143 rbuf3 = smat_i->rbuf3; 2144 req_source2 = smat_i->req_source2; 2145 2146 sbuf1 = smat_i->sbuf1; 2147 sbuf2 = smat_i->sbuf2; 2148 ptr = smat_i->ptr; 2149 tmp = smat_i->tmp; 2150 ctr = smat_i->ctr; 2151 2152 pa = smat_i->pa; 2153 req_size = smat_i->req_size; 2154 req_source1 = smat_i->req_source1; 2155 2156 allcolumns[i] = smat_i->allcolumns; 2157 row2proc[i] = smat_i->row2proc; 2158 rmap[i] = smat_i->rmap; 2159 cmap[i] = smat_i->cmap; 2160 } 2161 2162 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2163 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2164 smat_i = (Mat_SubSppt*)submats[0]->data; 2165 2166 nrqs = smat_i->nrqs; 2167 nrqr = smat_i->nrqr; 2168 rbuf1 = smat_i->rbuf1; 2169 rbuf2 = smat_i->rbuf2; 2170 rbuf3 = smat_i->rbuf3; 2171 req_source2 = smat_i->req_source2; 2172 2173 sbuf1 = smat_i->sbuf1; 2174 sbuf2 = smat_i->sbuf2; 2175 ptr = smat_i->ptr; 2176 tmp = smat_i->tmp; 2177 ctr = smat_i->ctr; 2178 2179 pa = smat_i->pa; 2180 req_size = smat_i->req_size; 2181 req_source1 = smat_i->req_source1; 2182 2183 allcolumns[0] = PETSC_FALSE; 2184 } 2185 } else { /* scall == MAT_INITIAL_MATRIX */ 2186 /* Get some new tags to keep the communication clean */ 2187 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2188 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2189 2190 /* evaluate communication - mesg to who, length of mesg, and buffer space 2191 required. 
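       Here w1[proc] will accumulate the number of rows this rank requests from proc, w3[proc] the
       number of index sets that request at least one row from proc, and w2[proc] is set to 1 iff
       any message goes to proc at all; each outgoing message then carries a header of
       1 + 2*w3[proc] integers ahead of the row indices.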
Based on this, buffers are allocated, and data copied into them*/ 2192 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2193 2194 for (i=0; i<ismax; i++) { 2195 jmax = nrow[i]; 2196 irow_i = irow[i]; 2197 2198 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2199 row2proc[i] = row2proc_i; 2200 2201 if (issorted[i]) proc = 0; 2202 for (j=0; j<jmax; j++) { 2203 if (!issorted[i]) proc = 0; 2204 row = irow_i[j]; 2205 while (row >= C->rmap->range[proc+1]) proc++; 2206 w4[proc]++; 2207 row2proc_i[j] = proc; /* map row index to proc */ 2208 } 2209 for (j=0; j<size; j++) { 2210 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2211 } 2212 } 2213 2214 nrqs = 0; /* no of outgoing messages */ 2215 msz = 0; /* total mesg length (for all procs) */ 2216 w1[rank] = 0; /* no mesg sent to self */ 2217 w3[rank] = 0; 2218 for (i=0; i<size; i++) { 2219 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2220 } 2221 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2222 for (i=0,j=0; i<size; i++) { 2223 if (w1[i]) { pa[j] = i; j++; } 2224 } 2225 2226 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2227 for (i=0; i<nrqs; i++) { 2228 j = pa[i]; 2229 w1[j] += w2[j] + 2* w3[j]; 2230 msz += w1[j]; 2231 } 2232 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2233 2234 /* Determine the number of messages to expect, their lengths, from from-ids */ 2235 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2236 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2237 2238 /* Now post the Irecvs corresponding to these messages */ 2239 tag0 = ((PetscObject)C)->tag; 2240 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2241 2242 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2243 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2244 2245 /* Allocate Memory for outgoing messages */ 2246 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2247 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2248 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2249 2250 { 2251 PetscInt *iptr = tmp; 2252 k = 0; 2253 for (i=0; i<nrqs; i++) { 2254 j = pa[i]; 2255 iptr += k; 2256 sbuf1[j] = iptr; 2257 k = w1[j]; 2258 } 2259 } 2260 2261 /* Form the outgoing messages. 
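       Each message to proc j is laid out as: sbuf1[j][0] = number of contributing index sets,
       followed by an (IS id, row count) pair for each of the w3[j] sets, followed by the requested
       row indices grouped by IS.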
Initialize the header space */ 2262 for (i=0; i<nrqs; i++) { 2263 j = pa[i]; 2264 sbuf1[j][0] = 0; 2265 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2266 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2267 } 2268 2269 /* Parse the isrow and copy data into outbuf */ 2270 for (i=0; i<ismax; i++) { 2271 row2proc_i = row2proc[i]; 2272 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2273 irow_i = irow[i]; 2274 jmax = nrow[i]; 2275 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2276 proc = row2proc_i[j]; 2277 if (proc != rank) { /* copy to the outgoing buf*/ 2278 ctr[proc]++; 2279 *ptr[proc] = irow_i[j]; 2280 ptr[proc]++; 2281 } 2282 } 2283 /* Update the headers for the current IS */ 2284 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2285 if ((ctr_j = ctr[j])) { 2286 sbuf1_j = sbuf1[j]; 2287 k = ++sbuf1_j[0]; 2288 sbuf1_j[2*k] = ctr_j; 2289 sbuf1_j[2*k-1] = i; 2290 } 2291 } 2292 } 2293 2294 /* Now post the sends */ 2295 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2296 for (i=0; i<nrqs; ++i) { 2297 j = pa[i]; 2298 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2299 } 2300 2301 /* Post Receives to capture the buffer size */ 2302 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2303 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2304 rbuf2[0] = tmp + msz; 2305 for (i=1; i<nrqs; ++i) { 2306 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2307 } 2308 for (i=0; i<nrqs; ++i) { 2309 j = pa[i]; 2310 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 2311 } 2312 2313 /* Send to other procs the buf size they should allocate */ 2314 /* Receive messages*/ 2315 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2316 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2317 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2318 { 2319 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2320 PetscInt *sbuf2_i; 2321 2322 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2323 for (i=0; i<nrqr; ++i) { 2324 req_size[i] = 0; 2325 rbuf1_i = rbuf1[i]; 2326 start = 2*rbuf1_i[0] + 1; 2327 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2328 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2329 sbuf2_i = sbuf2[i]; 2330 for (j=start; j<end; j++) { 2331 id = rbuf1_i[j] - rstart; 2332 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2333 sbuf2_i[j] = ncols; 2334 req_size[i] += ncols; 2335 } 2336 req_source1[i] = r_status1[i].MPI_SOURCE; 2337 /* form the header */ 2338 sbuf2_i[0] = req_size[i]; 2339 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2340 2341 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 2342 } 2343 } 2344 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2345 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2346 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2347 2348 /* Receive messages*/ 2349 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2350 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2351 2352 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2353 for (i=0; i<nrqs; ++i) { 2354 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2355 req_source2[i] = r_status2[i].MPI_SOURCE; 2356 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 2357 } 2358 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2359 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2360 2361 /* Wait on sends1 and sends2 */ 2362 ierr = 
PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2363 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2364 2365 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2366 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2367 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2368 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2369 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2370 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2371 2372 /* Now allocate sending buffers for a->j, and send them off */ 2373 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2374 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2375 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2376 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2377 2378 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2379 { 2380 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2381 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2382 PetscInt cend = C->cmap->rend; 2383 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2384 2385 for (i=0; i<nrqr; i++) { 2386 rbuf1_i = rbuf1[i]; 2387 sbuf_aj_i = sbuf_aj[i]; 2388 ct1 = 2*rbuf1_i[0] + 1; 2389 ct2 = 0; 2390 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2391 kmax = rbuf1[i][2*j]; 2392 for (k=0; k<kmax; k++,ct1++) { 2393 row = rbuf1_i[ct1] - rstart; 2394 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2395 ncols = nzA + nzB; 2396 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2397 2398 /* load the column indices for this row into cols */ 2399 cols = sbuf_aj_i + ct2; 2400 2401 lwrite = 0; 2402 for (l=0; l<nzB; l++) { 2403 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2404 } 2405 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2406 for (l=0; l<nzB; l++) { 2407 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2408 } 2409 2410 ct2 += ncols; 2411 } 2412 } 2413 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 2414 } 2415 } 2416 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2417 2418 /* create col map: global col of C -> local col of submatrices */ 2419 { 2420 const PetscInt *icol_i; 2421 #if defined(PETSC_USE_CTABLE) 2422 for (i=0; i<ismax; i++) { 2423 if (!allcolumns[i]) { 2424 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2425 2426 jmax = ncol[i]; 2427 icol_i = icol[i]; 2428 cmap_i = cmap[i]; 2429 for (j=0; j<jmax; j++) { 2430 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2431 } 2432 } else cmap[i] = NULL; 2433 } 2434 #else 2435 for (i=0; i<ismax; i++) { 2436 if (!allcolumns[i]) { 2437 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2438 jmax = ncol[i]; 2439 icol_i = icol[i]; 2440 cmap_i = cmap[i]; 2441 for (j=0; j<jmax; j++) { 2442 cmap_i[icol_i[j]] = j+1; 2443 } 2444 } else cmap[i] = NULL; 2445 } 2446 #endif 2447 } 2448 2449 /* Create lens which is required for MatCreate... 
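       lens[i][j] will hold the number of nonzeros in local row j of submatrix i; it is handed to
       MatSeqAIJSetPreallocation() below, which is what makes the later insertion loops
       allocation-free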
*/ 2450 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2451 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2452 2453 if (ismax) { 2454 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2455 } 2456 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2457 2458 /* Update lens from local data */ 2459 for (i=0; i<ismax; i++) { 2460 row2proc_i = row2proc[i]; 2461 jmax = nrow[i]; 2462 if (!allcolumns[i]) cmap_i = cmap[i]; 2463 irow_i = irow[i]; 2464 lens_i = lens[i]; 2465 for (j=0; j<jmax; j++) { 2466 row = irow_i[j]; 2467 proc = row2proc_i[j]; 2468 if (proc == rank) { 2469 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2470 if (!allcolumns[i]) { 2471 for (k=0; k<ncols; k++) { 2472 #if defined(PETSC_USE_CTABLE) 2473 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2474 #else 2475 tcol = cmap_i[cols[k]]; 2476 #endif 2477 if (tcol) lens_i[j]++; 2478 } 2479 } else { /* allcolumns */ 2480 lens_i[j] = ncols; 2481 } 2482 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2483 } 2484 } 2485 } 2486 2487 /* Create row map: global row of C -> local row of submatrices */ 2488 #if defined(PETSC_USE_CTABLE) 2489 for (i=0; i<ismax; i++) { 2490 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2491 irow_i = irow[i]; 2492 jmax = nrow[i]; 2493 for (j=0; j<jmax; j++) { 2494 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2495 } 2496 } 2497 #else 2498 for (i=0; i<ismax; i++) { 2499 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2500 rmap_i = rmap[i]; 2501 irow_i = irow[i]; 2502 jmax = nrow[i]; 2503 for (j=0; j<jmax; j++) { 2504 rmap_i[irow_i[j]] = j; 2505 } 2506 } 2507 #endif 2508 2509 /* Update lens from offproc data */ 2510 { 2511 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2512 2513 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2514 for (tmp2=0; tmp2<nrqs; tmp2++) { 2515 sbuf1_i = sbuf1[pa[tmp2]]; 2516 jmax = sbuf1_i[0]; 2517 ct1 = 2*jmax+1; 2518 ct2 = 0; 2519 rbuf2_i = rbuf2[tmp2]; 2520 rbuf3_i = rbuf3[tmp2]; 2521 for (j=1; j<=jmax; j++) { 2522 is_no = sbuf1_i[2*j-1]; 2523 max1 = sbuf1_i[2*j]; 2524 lens_i = lens[is_no]; 2525 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2526 rmap_i = rmap[is_no]; 2527 for (k=0; k<max1; k++,ct1++) { 2528 #if defined(PETSC_USE_CTABLE) 2529 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2530 row--; 2531 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2532 #else 2533 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2534 #endif 2535 max2 = rbuf2_i[ct1]; 2536 for (l=0; l<max2; l++,ct2++) { 2537 if (!allcolumns[is_no]) { 2538 #if defined(PETSC_USE_CTABLE) 2539 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2540 #else 2541 tcol = cmap_i[rbuf3_i[ct2]]; 2542 #endif 2543 if (tcol) lens_i[row]++; 2544 } else { /* allcolumns */ 2545 lens_i[row]++; /* lens_i[row] += max2 ? 
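 -- equivalent: allcolumns[is_no] is loop-invariant, so this branch could hoist lens_i[row] += max2 and ct2 += max2 out of the l-loop; the per-entry form is kept parallel to the mapped branch above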
*/ 2546 } 2547 } 2548 } 2549 } 2550 } 2551 } 2552 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2553 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2554 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2555 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2556 2557 /* Create the submatrices */ 2558 for (i=0; i<ismax; i++) { 2559 PetscInt rbs,cbs; 2560 2561 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2562 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2563 2564 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2565 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2566 2567 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2568 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2569 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2570 2571 /* create struct Mat_SubSppt and attached it to submat */ 2572 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2573 subc = (Mat_SeqAIJ*)submats[i]->data; 2574 subc->submatis1 = smat_i; 2575 2576 smat_i->destroy = submats[i]->ops->destroy; 2577 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2578 submats[i]->factortype = C->factortype; 2579 2580 smat_i->id = i; 2581 smat_i->nrqs = nrqs; 2582 smat_i->nrqr = nrqr; 2583 smat_i->rbuf1 = rbuf1; 2584 smat_i->rbuf2 = rbuf2; 2585 smat_i->rbuf3 = rbuf3; 2586 smat_i->sbuf2 = sbuf2; 2587 smat_i->req_source2 = req_source2; 2588 2589 smat_i->sbuf1 = sbuf1; 2590 smat_i->ptr = ptr; 2591 smat_i->tmp = tmp; 2592 smat_i->ctr = ctr; 2593 2594 smat_i->pa = pa; 2595 smat_i->req_size = req_size; 2596 smat_i->req_source1 = req_source1; 2597 2598 smat_i->allcolumns = allcolumns[i]; 2599 smat_i->singleis = PETSC_FALSE; 2600 smat_i->row2proc = row2proc[i]; 2601 smat_i->rmap = rmap[i]; 2602 smat_i->cmap = cmap[i]; 2603 } 2604 2605 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2606 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2607 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2608 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2609 2610 /* create struct Mat_SubSppt and attached it to submat */ 2611 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2612 submats[0]->data = (void*)smat_i; 2613 2614 smat_i->destroy = submats[0]->ops->destroy; 2615 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2616 submats[0]->factortype = C->factortype; 2617 2618 smat_i->id = 0; 2619 smat_i->nrqs = nrqs; 2620 smat_i->nrqr = nrqr; 2621 smat_i->rbuf1 = rbuf1; 2622 smat_i->rbuf2 = rbuf2; 2623 smat_i->rbuf3 = rbuf3; 2624 smat_i->sbuf2 = sbuf2; 2625 smat_i->req_source2 = req_source2; 2626 2627 smat_i->sbuf1 = sbuf1; 2628 smat_i->ptr = ptr; 2629 smat_i->tmp = tmp; 2630 smat_i->ctr = ctr; 2631 2632 smat_i->pa = pa; 2633 smat_i->req_size = req_size; 2634 smat_i->req_source1 = req_source1; 2635 2636 smat_i->allcolumns = PETSC_FALSE; 2637 smat_i->singleis = PETSC_FALSE; 2638 smat_i->row2proc = NULL; 2639 smat_i->rmap = NULL; 2640 smat_i->cmap = NULL; 2641 } 2642 2643 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2644 ierr = PetscFree(lens);CHKERRQ(ierr); 2645 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2646 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2647 2648 } /* endof scall == MAT_INITIAL_MATRIX */ 2649 2650 /* Post recv matrix values */ 2651 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2652 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2653 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2654 ierr = 
PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2655 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2656 for (i=0; i<nrqs; ++i) { 2657 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2658 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 2659 } 2660 2661 /* Allocate sending buffers for a->a, and send them off */ 2662 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2663 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2664 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2665 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2666 2667 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2668 { 2669 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2670 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2671 PetscInt cend = C->cmap->rend; 2672 PetscInt *b_j = b->j; 2673 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2674 2675 for (i=0; i<nrqr; i++) { 2676 rbuf1_i = rbuf1[i]; 2677 sbuf_aa_i = sbuf_aa[i]; 2678 ct1 = 2*rbuf1_i[0]+1; 2679 ct2 = 0; 2680 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2681 kmax = rbuf1_i[2*j]; 2682 for (k=0; k<kmax; k++,ct1++) { 2683 row = rbuf1_i[ct1] - rstart; 2684 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2685 ncols = nzA + nzB; 2686 cworkB = b_j + b_i[row]; 2687 vworkA = a_a + a_i[row]; 2688 vworkB = b_a + b_i[row]; 2689 2690 /* load the column values for this row into vals*/ 2691 vals = sbuf_aa_i+ct2; 2692 2693 lwrite = 0; 2694 for (l=0; l<nzB; l++) { 2695 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2696 } 2697 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2698 for (l=0; l<nzB; l++) { 2699 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2700 } 2701 2702 ct2 += ncols; 2703 } 2704 } 2705 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 2706 } 2707 } 2708 2709 /* Assemble the matrices */ 2710 /* First assemble the local rows */ 2711 for (i=0; i<ismax; i++) { 2712 row2proc_i = row2proc[i]; 2713 subc = (Mat_SeqAIJ*)submats[i]->data; 2714 imat_ilen = subc->ilen; 2715 imat_j = subc->j; 2716 imat_i = subc->i; 2717 imat_a = subc->a; 2718 2719 if (!allcolumns[i]) cmap_i = cmap[i]; 2720 rmap_i = rmap[i]; 2721 irow_i = irow[i]; 2722 jmax = nrow[i]; 2723 for (j=0; j<jmax; j++) { 2724 row = irow_i[j]; 2725 proc = row2proc_i[j]; 2726 if (proc == rank) { 2727 old_row = row; 2728 #if defined(PETSC_USE_CTABLE) 2729 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2730 row--; 2731 #else 2732 row = rmap_i[row]; 2733 #endif 2734 ilen_row = imat_ilen[row]; 2735 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2736 mat_i = imat_i[row]; 2737 mat_a = imat_a + mat_i; 2738 mat_j = imat_j + mat_i; 2739 if (!allcolumns[i]) { 2740 for (k=0; k<ncols; k++) { 2741 #if defined(PETSC_USE_CTABLE) 2742 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2743 #else 2744 tcol = cmap_i[cols[k]]; 2745 #endif 2746 if (tcol) { 2747 *mat_j++ = tcol - 1; 2748 *mat_a++ = vals[k]; 2749 ilen_row++; 2750 } 2751 } 2752 } else { /* allcolumns */ 2753 for (k=0; k<ncols; k++) { 2754 *mat_j++ = cols[k]; /* global col index! 
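 (valid as-is: an allcolumns submatrix keeps C's full global column space, so no cmap translation is needed)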
*/ 2755 *mat_a++ = vals[k]; 2756 ilen_row++; 2757 } 2758 } 2759 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2760 2761 imat_ilen[row] = ilen_row; 2762 } 2763 } 2764 } 2765 2766 /* Now assemble the off proc rows */ 2767 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2768 for (tmp2=0; tmp2<nrqs; tmp2++) { 2769 sbuf1_i = sbuf1[pa[tmp2]]; 2770 jmax = sbuf1_i[0]; 2771 ct1 = 2*jmax + 1; 2772 ct2 = 0; 2773 rbuf2_i = rbuf2[tmp2]; 2774 rbuf3_i = rbuf3[tmp2]; 2775 rbuf4_i = rbuf4[tmp2]; 2776 for (j=1; j<=jmax; j++) { 2777 is_no = sbuf1_i[2*j-1]; 2778 rmap_i = rmap[is_no]; 2779 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2780 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2781 imat_ilen = subc->ilen; 2782 imat_j = subc->j; 2783 imat_i = subc->i; 2784 imat_a = subc->a; 2785 max1 = sbuf1_i[2*j]; 2786 for (k=0; k<max1; k++,ct1++) { 2787 row = sbuf1_i[ct1]; 2788 #if defined(PETSC_USE_CTABLE) 2789 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2790 row--; 2791 #else 2792 row = rmap_i[row]; 2793 #endif 2794 ilen = imat_ilen[row]; 2795 mat_i = imat_i[row]; 2796 mat_a = imat_a + mat_i; 2797 mat_j = imat_j + mat_i; 2798 max2 = rbuf2_i[ct1]; 2799 if (!allcolumns[is_no]) { 2800 for (l=0; l<max2; l++,ct2++) { 2801 #if defined(PETSC_USE_CTABLE) 2802 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2803 #else 2804 tcol = cmap_i[rbuf3_i[ct2]]; 2805 #endif 2806 if (tcol) { 2807 *mat_j++ = tcol - 1; 2808 *mat_a++ = rbuf4_i[ct2]; 2809 ilen++; 2810 } 2811 } 2812 } else { /* allcolumns */ 2813 for (l=0; l<max2; l++,ct2++) { 2814 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2815 *mat_a++ = rbuf4_i[ct2]; 2816 ilen++; 2817 } 2818 } 2819 imat_ilen[row] = ilen; 2820 } 2821 } 2822 } 2823 2824 if (!iscsorted) { /* sort column indices of the rows */ 2825 for (i=0; i<ismax; i++) { 2826 subc = (Mat_SeqAIJ*)submats[i]->data; 2827 imat_j = subc->j; 2828 imat_i = subc->i; 2829 imat_a = subc->a; 2830 imat_ilen = subc->ilen; 2831 2832 if (allcolumns[i]) continue; 2833 jmax = nrow[i]; 2834 for (j=0; j<jmax; j++) { 2835 mat_i = imat_i[j]; 2836 mat_a = imat_a + mat_i; 2837 mat_j = imat_j + mat_i; 2838 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2839 } 2840 } 2841 } 2842 2843 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2844 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2845 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2846 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2847 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2848 2849 /* Restore the indices */ 2850 for (i=0; i<ismax; i++) { 2851 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2852 if (!allcolumns[i]) { 2853 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2854 } 2855 } 2856 2857 for (i=0; i<ismax; i++) { 2858 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2859 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2860 } 2861 2862 /* Destroy allocated memory */ 2863 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2864 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2865 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2866 2867 for (i=0; i<nrqs; ++i) { 2868 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2869 } 2870 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2871 2872 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2873 PetscFunctionReturn(0); 2874 } 2875 2876 /* 2877 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 
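   (rowemb[i] is the local row of C that receives row i of A and of B; dcolemb and ocolemb play the
   same role for the columns of A and B, respectively.)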
   Embeddings are supposed to be injections, and the above implies that the range of rowemb is a
   subset of [0,m), that of dcolemb of [0,n), and that of ocolemb of [0,N-n).
   If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
   After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
   state, and needs to be "assembled" later by compressing B's column space.

   This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
   Following this call, C->A & C->B have been created, even if empty.
 */
PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  Mat_SeqAIJ     *Baij;
  PetscBool      seqaij,Bdisassembled;
  PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
  PetscScalar    v;
  const PetscInt *rowindices,*colindices;

  PetscFunctionBegin;
  /* Check to make sure the component matrices (and embeddings) are compatible with C. */
  if (A) {
    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
    } else {
      if (C->rmap->n != A->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (dcolemb) {
      ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
      if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
    } else {
      if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }
  if (B) {
    ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
    } else {
      if (C->rmap->n != B->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (ocolemb) {
      ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
      if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
    } else {
      if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }

  aij = (Mat_MPIAIJ*)C->data;
  if (!aij->A) {
    /* Mimic parts of MatMPIAIJSetPreallocation() */
    ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
    ierr =
MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2944 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2945 } 2946 if (A) { 2947 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2948 } else { 2949 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2950 } 2951 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2952 /* 2953 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2954 need to "disassemble" B -- convert it to using C's global indices. 2955 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2956 2957 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2958 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2959 2960 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2961 At least avoid calling MatSetValues() and the implied searches? 2962 */ 2963 2964 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2965 #if defined(PETSC_USE_CTABLE) 2966 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2967 #else 2968 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2969 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2970 if (aij->B) { 2971 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2972 } 2973 #endif 2974 ngcol = 0; 2975 if (aij->lvec) { 2976 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2977 } 2978 if (aij->garray) { 2979 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2980 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2981 } 2982 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2983 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2984 } 2985 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2986 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2987 } 2988 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2989 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2990 } 2991 } 2992 Bdisassembled = PETSC_FALSE; 2993 if (!aij->B) { 2994 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2995 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2996 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2997 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2998 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2999 Bdisassembled = PETSC_TRUE; 3000 } 3001 if (B) { 3002 Baij = (Mat_SeqAIJ*)B->data; 3003 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3004 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3005 for (i=0; i<B->rmap->n; i++) { 3006 nz[i] = Baij->i[i+1] - Baij->i[i]; 3007 } 3008 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3009 ierr = PetscFree(nz);CHKERRQ(ierr); 3010 } 3011 3012 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3013 shift = rend-rstart; 3014 count = 0; 3015 rowindices = NULL; 3016 colindices = NULL; 3017 if (rowemb) { 3018 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3019 } 3020 if (ocolemb) { 3021 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3022 } 3023 for (i=0; i<B->rmap->n; i++) { 3024 PetscInt row; 3025 row = i; 3026 if (rowindices) row = rowindices[i]; 3027 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3028 col = Baij->j[count]; 3029 if (colindices) col = colindices[col]; 3030 if (Bdisassembled && col>=rstart) col += shift; 3031 v = Baij->a[count]; 3032 ierr = 
MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3033 ++count; 3034 } 3035 } 3036 /* No assembly for aij->B is necessary. */ 3037 /* FIXME: set aij->B's nonzerostate correctly. */ 3038 } else { 3039 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3040 } 3041 C->preallocated = PETSC_TRUE; 3042 C->was_assembled = PETSC_FALSE; 3043 C->assembled = PETSC_FALSE; 3044 /* 3045 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3046 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3047 */ 3048 PetscFunctionReturn(0); 3049 } 3050 3051 /* 3052 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3053 */ 3054 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3055 { 3056 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3057 3058 PetscFunctionBegin; 3059 PetscValidPointer(A,2); 3060 PetscValidPointer(B,3); 3061 /* FIXME: make sure C is assembled */ 3062 *A = aij->A; 3063 *B = aij->B; 3064 /* Note that we don't incref *A and *B, so be careful! */ 3065 PetscFunctionReturn(0); 3066 } 3067 3068 /* 3069 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3070 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3071 */ 3072 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3073 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3074 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3075 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3076 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3077 { 3078 PetscErrorCode ierr; 3079 PetscMPIInt isize,flag; 3080 PetscInt i,ii,cismax,ispar; 3081 Mat *A,*B; 3082 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3083 3084 PetscFunctionBegin; 3085 if (!ismax) PetscFunctionReturn(0); 3086 3087 for (i = 0, cismax = 0; i < ismax; ++i) { 3088 PetscMPIInt isize; 3089 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3090 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3091 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3092 if (isize > 1) ++cismax; 3093 } 3094 3095 /* 3096 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3097 ispar counts the number of parallel ISs across C's comm. 3098 */ 3099 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3100 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3101 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3102 PetscFunctionReturn(0); 3103 } 3104 3105 /* if (ispar) */ 3106 /* 3107 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3108 These are used to extract the off-diag portion of the resulting parallel matrix. 3109 The row IS for the off-diag portion is the same as for the diag portion, 3110 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 
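     (ISGetNonlocalIS() hands back, on each rank, the entries of the parallel iscol[i] that are
     stored on the other ranks of its communicator -- exactly the candidate off-diagonal columns.)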
3111 */ 3112 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3113 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3114 for (i = 0, ii = 0; i < ismax; ++i) { 3115 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3116 if (isize > 1) { 3117 /* 3118 TODO: This is the part that's ***NOT SCALABLE***. 3119 To fix this we need to extract just the indices of C's nonzero columns 3120 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3121 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3122 to be done without serializing on the IS list, so, most likely, it is best 3123 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3124 */ 3125 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3126 /* Now we have to 3127 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3128 were sorted on each rank, concatenated they might no longer be sorted; 3129 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3130 indices in the nondecreasing order to the original index positions. 3131 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3132 */ 3133 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3134 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3135 ++ii; 3136 } 3137 } 3138 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3139 for (i = 0, ii = 0; i < ismax; ++i) { 3140 PetscInt j,issize; 3141 const PetscInt *indices; 3142 3143 /* 3144 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3145 */ 3146 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3147 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3148 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3149 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3150 for (j = 1; j < issize; ++j) { 3151 if (indices[j] == indices[j-1]) { 3152 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3153 } 3154 } 3155 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3156 3157 3158 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3159 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3160 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3161 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3162 for (j = 1; j < issize; ++j) { 3163 if (indices[j-1] == indices[j]) { 3164 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3165 } 3166 } 3167 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3168 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3169 if (isize > 1) { 3170 cisrow[ii] = isrow[i]; 3171 ++ii; 3172 } 3173 } 3174 /* 3175 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3176 array of sequential matrices underlying the resulting parallel matrices. 3177 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3178 contain duplicates. 3179 3180 There are as many diag matrices as there are original index sets. There are only as many parallel 3181 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate depends on the value of MatReuse scall and on whether the ISs are sorted
     and/or contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have an order different from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (or retrieve the previously composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then
       used to permute their values into the A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall == MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical to the corresponding A (if isize == 1).
     Note that in that case colmap does not need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
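    /*
       In brief (a descriptive summary of the two branches below):
       - isize > 1: (*submat)[i] is an MPIAIJ matrix, and setseqmats() embeds the sequential pieces
         A[i] and B[ii] into it, using isrow_p[i], iscol_p[i] and ciscol_p[ii] to undo the sorting
         performed above; A[i] is destroyed here, while B[ii] is destroyed collectively by
         MatDestroySubMatrices() at the end.
       - isize == 1: (*submat)[i] is simply A[i], unless a nontrivial permutation exists, in which
         case the values of A[i] are permuted into a duplicate by setseqmat().
    */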
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /* For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        /* Only A[i] is destroyed here; B[ii] is destroyed by MatDestroySubMatrices() below. */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i]; /* Otherwise A[i] is extracted straight into (*submat)[i]. */
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
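/*
   Example usage (an illustrative sketch, not part of this file): extracting a single parallel
   submatrix from an assembled MATMPIAIJ matrix through the public interface, where isrow and
   iscol are parallel ISs living on the matrix's communicator. The variable names are hypothetical.

     Mat mat;          (assembled MATMPIAIJ matrix)
     IS  isrow,iscol;  (parallel index sets on mat's communicator)
     Mat *submat;

     ierr = MatCreateSubMatricesMPI(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     (use submat[0] here)
     ierr = MatDestroySubMatrices(1,&submat);CHKERRQ(ierr);
*/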