/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt*,PetscInt*,PetscInt*,PetscInt**,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt*);

PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
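    /* For each index in this IS, find its owning process; indices owned by other
     * processes are recorded (row, owner, IS id) so the requests can be batched
     * per destination below */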
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * it usually corresponds to overlap 1
   */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to remote processors so that we can get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
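  /* Second exchange: tell each requester (fromranks) how large the reply we send back
   * will be, and learn the sizes of the replies that will arrive here */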
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx,IS is[],PetscInt nrecvs,PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id,data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp,*indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
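    /* track the largest IS: each IS gets a workspace slice of size max_lsize+nrecvs,
     * big enough to hold its current indices plus anything received for it */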
    max_lsize = lsize>max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp,sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j<data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx,PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes,PetscInt *fromrows,PetscInt **sbrowsizes,PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the matrix is symmetric, we still treat it as non-symmetric here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i] : max_fszs;
  /* is there a better way to estimate the number of nonzeros in the mat??? */
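  /* workspace layout: rows_data[] gives each IS its own slice of rows_data_ptr, sized by
   * the longest incoming message; indices_tmp (length tnz, the local nonzero count) is
   * used as an estimated bound on the column indices gathered for any single request */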
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* repeat the same grouping pass, this time copying the data */
  for (i=0; i<nfrom; i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows)     *sbrows     = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

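/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each IS with the locally owned part of
  the overlap: for every row of the IS owned by this process, add the column indices of its
  diagonal (A) and off-diagonal (B) blocks, then sort and remove duplicates.
*/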
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj,*indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* this is a better way to estimate memory than the old implementation,
   * which used the global size of the matrix
   */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2    (no of index sets in the mesg)
  -----------
  mesg [1] = 1    => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5    => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]        data(is[1])
  -----------
  mesg[n+1]
  mesg[m]         data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

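  /* outdat[j] now holds a zeroed header (message count followed by (IS id,count) pairs);
     ptr[j] points just past that header, where the requested row indices get appended */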
  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax,&Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table,imax,&data,imax,&isz,Mimax,&d_p,M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
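  /* xdata[i] now holds the reply for the i-th received request (isz1[i] entries);
     the buffers that held the incoming requests are no longer needed */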
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to determine the buffer space required for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

     Input:
       C    - the matrix
       nrqr - no of messages being processed.
       rbuf - an array of pointers to the received requests

     Output:
       xdata - array of messages to be sent back
       isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a realloc is required we need only copy the data for that one row
  rather than all previous rows, as is done now where a single large chunk of
  memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable     cmap,rmap;
  PetscInt       *cmap_loc,*rmap_loc;
#else
  PetscInt       *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt    *onodes1,*olengths1,idex,end;
  Mat_SubMat     *smatis1;
  PetscBool      isrowsorted;

  PetscFunctionBegin;
  if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");

  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

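  /* The single row IS must be sorted so that each requested row can be mapped to its
     owner in one sweep over the ownership ranges (see the row2proc loop below) */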
  ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
  if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");

  ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
  ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
  if (allcolumns) {
    icol = NULL;
    ncol = C->cmap->N;
  } else {
    ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);

    /* w1[proc] = num of rows owned by proc -- to be requested */
    proc = 0;
    nrqs = 0; /* num of outgoing messages */
    for (j=0; j<nrow; j++) {
      row = irow[j]; /* sorted! */
      while (row >= C->rmap->range[proc+1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */

      if (proc != rank && !w2[proc]) {
        w2[proc] = 1; nrqs++;
      }
    }
    w1[rank] = 0; /* rows owned by self will not be requested */

    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc-array) */
    for (proc=0,j=0; proc<size; proc++) {
      if (w1[proc]) { pa[j++] = proc;}
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
    msz = 0; /* total mesg length (for all procs) */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      w1[proc] += 3;
      msz      += w1[proc];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);

    /* Input:  nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

    /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i=0; i<nrqs; i++) {
      proc        = pa[i];
      sbuf1[proc] = iptr;
      iptr       += w1[proc];
    }

    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
      ptr[proc] = sbuf1[proc] + 3;
    }

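    /* Each outgoing request will have the layout
       [number of IS (=1) | IS id (=0) | row count | requested row indices] */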
    /* Parse the isrow and copy data into outbuf */
    ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    for (j=0; j<nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf */
        *ptr[proc] = irow[j];
        ctr[proc]++; ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = 0;
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);

    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree2(w1,w2);CHKERRQ(ierr);

    /* Send to other procs the buf size they should allocate */
    /* Receive messages */
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
    for (i=0; i<nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i     = rbuf1[i];
      start       = 2*rbuf1_i[0] + 1;
      ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
      sbuf2_i     = sbuf2[i];
      for (j=start; j<end; j++) {
        k            = rbuf1_i[j] - rstart;
        ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
        sbuf2_i[j]   = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);

    /* rbuf2 is received, Post recv column indices a->j */
    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);

    ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_status1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
    ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
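    /* sbuf_aj[0] is one contiguous buffer holding all replies; the sbuf_aj[i] point into
       it at per-request offsets computed from req_size[] */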
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    for (i=0; i<nrqr; i++) { /* for each requested message */
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k=0; k<kmax; k++,ct1++) { /* for each row */
        row    = rbuf1_i[ct1] - rstart;
        nzA    = ai[row+1] - ai[row];
        nzB    = bi[row+1] - bi[row];
        ncols  = nzA + nzB;
        cworkA = aj + ai[row]; cworkB = bj + bi[row];

        /* load the column indices for this row into cols */
        cols = sbuf_aj_i + ct2;

        lwrite = 0;
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
    }

    /* create column map (cmap): global col of C -> local col of submat */
#if defined(PETSC_USE_CTABLE)
    if (!allcolumns) {
      ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
      ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
        if (icol[j] >= cstart && icol[j] < cend) {
          cmap_loc[icol[j] - cstart] = j+1;
        } else { /* use PetscTable for non-local col indices */
          ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      }
    } else {
      cmap     = NULL;
      cmap_loc = NULL;
    }
    ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
#else
    if (!allcolumns) {
      ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
    } else {
      cmap = NULL;
    }
#endif

    /* Create lens for MatSeqAIJSetPreallocation() */
    ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);

    /* Compute lens from local part of C */
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) {
        /* diagonal part A = c->A */
        ncols = ai[row-rstart+1] - ai[row-rstart];
        cols  = aj + ai[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            tcol = cmap_loc[cols[k]];
#else
            tcol = cmap[cols[k]+cstart];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] = ncols;
        }

        /* off-diagonal part B = c->B */
        ncols = bi[row-rstart+1] - bi[row-rstart];
        cols  = bj + bi[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap[bmap[cols[k]]];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] += ncols;
        }
      }
    }

    /* Create row map (rmap): global row of C -> local row of submat */
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) { /* a local row */
        rmap_loc[row - rstart] = j;
      } else {
ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1562 } 1563 } 1564 #else 1565 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1566 for (j=0; j<nrow; j++) { 1567 rmap[irow[j]] = j; 1568 } 1569 #endif 1570 1571 /* Update lens from offproc data */ 1572 /* recv a->j is done */ 1573 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1574 for (i=0; i<nrqs; i++) { 1575 proc = pa[i]; 1576 sbuf1_i = sbuf1[proc]; 1577 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1578 ct1 = 2 + 1; 1579 ct2 = 0; 1580 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1581 rbuf3_i = rbuf3[i]; /* received C->j */ 1582 1583 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1584 max1 = sbuf1_i[2]; 1585 for (k=0; k<max1; k++,ct1++) { 1586 #if defined(PETSC_USE_CTABLE) 1587 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1588 row--; 1589 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1590 #else 1591 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1592 #endif 1593 /* Now, store row index of submat in sbuf1_i[ct1] */ 1594 sbuf1_i[ct1] = row; 1595 1596 nnz = rbuf2_i[ct1]; 1597 if (!allcolumns) { 1598 for (l=0; l<nnz; l++,ct2++) { 1599 #if defined(PETSC_USE_CTABLE) 1600 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1601 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1602 } else { 1603 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1604 } 1605 #else 1606 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1607 #endif 1608 if (tcol) lens[row]++; 1609 } 1610 } else { /* allcolumns */ 1611 lens[row] += nnz; 1612 } 1613 } 1614 } 1615 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1616 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1617 1618 /* Create the submatrices */ 1619 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1620 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1621 1622 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1623 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1624 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1625 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1626 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1627 1628 /* create struct Mat_SubMat and attached it to submat */ 1629 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1630 subc = (Mat_SeqAIJ*)submat->data; 1631 subc->submatis1 = smatis1; 1632 1633 smatis1->id = 0; 1634 smatis1->nrqs = nrqs; 1635 smatis1->nrqr = nrqr; 1636 smatis1->rbuf1 = rbuf1; 1637 smatis1->rbuf2 = rbuf2; 1638 smatis1->rbuf3 = rbuf3; 1639 smatis1->sbuf2 = sbuf2; 1640 smatis1->req_source2 = req_source2; 1641 1642 smatis1->sbuf1 = sbuf1; 1643 smatis1->ptr = ptr; 1644 smatis1->tmp = tmp; 1645 smatis1->ctr = ctr; 1646 1647 smatis1->pa = pa; 1648 smatis1->req_size = req_size; 1649 smatis1->req_source1 = req_source1; 1650 1651 smatis1->allcolumns = allcolumns; 1652 smatis1->singleis = PETSC_TRUE; 1653 smatis1->row2proc = row2proc; 1654 smatis1->rmap = rmap; 1655 smatis1->cmap = cmap; 1656 #if defined(PETSC_USE_CTABLE) 1657 smatis1->rmap_loc = rmap_loc; 1658 smatis1->cmap_loc = cmap_loc; 1659 #endif 1660 1661 smatis1->destroy = submat->ops->destroy; 1662 submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices; 1663 submat->factortype = C->factortype; 1664 1665 /* compute rmax */ 1666 rmax = 0; 1667 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1668 1669 } else { 
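/* Reuse path: nothing about the communication pattern is recomputed. The message counts,
   buffers, sources and row/column maps built during the MAT_INITIAL_MATRIX call are recovered
   from the Mat_SubMat stash attached to submats[0], so only numerical values are exchanged
   and re-inserted below. */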
/* scall == MAT_REUSE_MATRIX */ 1670 submat = submats[0]; 1671 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1672 1673 subc = (Mat_SeqAIJ*)submat->data; 1674 rmax = subc->rmax; 1675 smatis1 = subc->submatis1; 1676 nrqs = smatis1->nrqs; 1677 nrqr = smatis1->nrqr; 1678 rbuf1 = smatis1->rbuf1; 1679 rbuf2 = smatis1->rbuf2; 1680 rbuf3 = smatis1->rbuf3; 1681 req_source2 = smatis1->req_source2; 1682 1683 sbuf1 = smatis1->sbuf1; 1684 sbuf2 = smatis1->sbuf2; 1685 ptr = smatis1->ptr; 1686 tmp = smatis1->tmp; 1687 ctr = smatis1->ctr; 1688 1689 pa = smatis1->pa; 1690 req_size = smatis1->req_size; 1691 req_source1 = smatis1->req_source1; 1692 1693 allcolumns = smatis1->allcolumns; 1694 row2proc = smatis1->row2proc; 1695 rmap = smatis1->rmap; 1696 cmap = smatis1->cmap; 1697 #if defined(PETSC_USE_CTABLE) 1698 rmap_loc = smatis1->rmap_loc; 1699 cmap_loc = smatis1->cmap_loc; 1700 #endif 1701 } 1702 1703 /* Post recv matrix values */ 1704 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1705 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1706 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1707 for (i=0; i<nrqs; ++i) { 1708 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1709 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1710 } 1711 1712 /* Allocate sending buffers for a->a, and send them off */ 1713 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1714 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1715 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1716 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1717 1718 for (i=0; i<nrqr; i++) { 1719 rbuf1_i = rbuf1[i]; 1720 sbuf_aa_i = sbuf_aa[i]; 1721 ct1 = 2*rbuf1_i[0]+1; 1722 ct2 = 0; 1723 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1724 1725 kmax = rbuf1_i[2]; 1726 for (k=0; k<kmax; k++,ct1++) { 1727 row = rbuf1_i[ct1] - rstart; 1728 nzA = ai[row+1] - ai[row]; 1729 nzB = bi[row+1] - bi[row]; 1730 ncols = nzA + nzB; 1731 cworkB = bj + bi[row]; 1732 vworkA = a_a + ai[row]; 1733 vworkB = b_a + bi[row]; 1734 1735 /* load the column values for this row into vals*/ 1736 vals = sbuf_aa_i + ct2; 1737 1738 lwrite = 0; 1739 for (l=0; l<nzB; l++) { 1740 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1741 } 1742 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1743 for (l=0; l<nzB; l++) { 1744 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1745 } 1746 1747 ct2 += ncols; 1748 } 1749 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1750 } 1751 1752 /* Assemble submat */ 1753 /* First assemble the local rows */ 1754 for (j=0; j<nrow; j++) { 1755 row = irow[j]; 1756 proc = row2proc[j]; 1757 if (proc == rank) { 1758 Crow = row - rstart; /* local row index of C */ 1759 #if defined(PETSC_USE_CTABLE) 1760 row = rmap_loc[Crow]; /* row index of submat */ 1761 #else 1762 row = rmap[row]; 1763 #endif 1764 1765 if (allcolumns) { 1766 /* diagonal part A = c->A */ 1767 ncols = ai[Crow+1] - ai[Crow]; 1768 cols = aj + ai[Crow]; 1769 vals = a->a + ai[Crow]; 1770 i = 0; 1771 for (k=0; k<ncols; k++) { 1772 subcols[i] = cols[k] + cstart; 1773 subvals[i++] = vals[k]; 1774 } 1775 1776 /* off-diagonal part B = c->B */ 1777 ncols = bi[Crow+1] - bi[Crow]; 1778 cols = bj + bi[Crow]; 1779 vals = 
b->a + bi[Crow]; 1780 for (k=0; k<ncols; k++) { 1781 subcols[i] = bmap[cols[k]]; 1782 subvals[i++] = vals[k]; 1783 } 1784 1785 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1786 1787 } else { /* !allcolumns */ 1788 #if defined(PETSC_USE_CTABLE) 1789 /* diagonal part A = c->A */ 1790 ncols = ai[Crow+1] - ai[Crow]; 1791 cols = aj + ai[Crow]; 1792 vals = a->a + ai[Crow]; 1793 i = 0; 1794 for (k=0; k<ncols; k++) { 1795 tcol = cmap_loc[cols[k]]; 1796 if (tcol) { 1797 subcols[i] = --tcol; 1798 subvals[i++] = vals[k]; 1799 } 1800 } 1801 1802 /* off-diagonal part B = c->B */ 1803 ncols = bi[Crow+1] - bi[Crow]; 1804 cols = bj + bi[Crow]; 1805 vals = b->a + bi[Crow]; 1806 for (k=0; k<ncols; k++) { 1807 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1808 if (tcol) { 1809 subcols[i] = --tcol; 1810 subvals[i++] = vals[k]; 1811 } 1812 } 1813 #else 1814 /* diagonal part A = c->A */ 1815 ncols = ai[Crow+1] - ai[Crow]; 1816 cols = aj + ai[Crow]; 1817 vals = a->a + ai[Crow]; 1818 i = 0; 1819 for (k=0; k<ncols; k++) { 1820 tcol = cmap[cols[k]+cstart]; 1821 if (tcol) { 1822 subcols[i] = --tcol; 1823 subvals[i++] = vals[k]; 1824 } 1825 } 1826 1827 /* off-diagonal part B = c->B */ 1828 ncols = bi[Crow+1] - bi[Crow]; 1829 cols = bj + bi[Crow]; 1830 vals = b->a + bi[Crow]; 1831 for (k=0; k<ncols; k++) { 1832 tcol = cmap[bmap[cols[k]]]; 1833 if (tcol) { 1834 subcols[i] = --tcol; 1835 subvals[i++] = vals[k]; 1836 } 1837 } 1838 #endif 1839 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1840 } 1841 } 1842 } 1843 1844 /* Now assemble the off-proc rows */ 1845 for (i=0; i<nrqs; i++) { /* for each requested message */ 1846 /* recv values from other processes */ 1847 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1848 proc = pa[idex]; 1849 sbuf1_i = sbuf1[proc]; 1850 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1851 ct1 = 2 + 1; 1852 ct2 = 0; /* count of received C->j */ 1853 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1854 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1855 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1856 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1857 1858 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1859 max1 = sbuf1_i[2]; /* num of rows */ 1860 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1861 row = sbuf1_i[ct1]; /* row index of submat */ 1862 if (!allcolumns) { 1863 idex = 0; 1864 if (scall == MAT_INITIAL_MATRIX) { 1865 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1866 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1867 #if defined(PETSC_USE_CTABLE) 1868 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1869 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1870 } else { 1871 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1872 } 1873 #else 1874 tcol = cmap[rbuf3_i[ct2]]; 1875 #endif 1876 if (tcol) { 1877 subcols[idex] = --tcol; 1878 subvals[idex++] = rbuf4_i[ct2]; 1879 1880 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
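      Only the received entries whose global column has a nonzero cmap entry (i.e. appears in iscol)
      are kept; rbuf3_i is overwritten in place with the positions of the surviving entries so that a
      later MAT_REUSE_MATRIX pass can skip the cmap lookups.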
1881 For reuse, we replace received C->j with index that should be inserted to submat */ 1882 rbuf3_i[ct3++] = ct2; 1883 } 1884 } 1885 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1886 1887 } else { /* scall == MAT_REUSE_MATRIX */ 1888 submat = submats[0]; 1889 subc = (Mat_SeqAIJ*)submat->data; 1890 1891 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1892 for (l=0; l<nnz; l++) { 1893 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1894 subvals[idex++] = rbuf4_i[ct2]; 1895 } 1896 1897 bj = subc->j + subc->i[row]; 1898 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1899 } 1900 } else { /* allcolumns */ 1901 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1902 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1903 ct2 += nnz; 1904 } 1905 } 1906 } 1907 1908 /* sending a->a are done */ 1909 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1910 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1911 1912 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1913 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1914 submats[0] = submat; 1915 1916 /* Restore the indices */ 1917 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1918 if (!allcolumns) { 1919 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1920 } 1921 1922 /* Destroy allocated memory */ 1923 for (i=0; i<nrqs; ++i) { 1924 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1925 } 1926 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1927 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1928 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1929 1930 if (scall == MAT_INITIAL_MATRIX) { 1931 ierr = PetscFree(lens);CHKERRQ(ierr); 1932 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1933 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1934 } 1935 PetscFunctionReturn(0); 1936 } 1937 1938 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1939 { 1940 PetscErrorCode ierr; 1941 PetscInt ncol; 1942 PetscBool colflag,allcolumns=PETSC_FALSE; 1943 1944 PetscFunctionBegin; 1945 /* Allocate memory to hold all the submatrices */ 1946 if (scall == MAT_INITIAL_MATRIX) { 1947 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1948 } 1949 1950 /* Check for special case: each processor gets entire matrix columns */ 1951 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1952 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1953 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1954 1955 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1956 PetscFunctionReturn(0); 1957 } 1958 1959 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1960 { 1961 PetscErrorCode ierr; 1962 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2]; 1963 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1964 Mat_SeqAIJ *subc; 1965 Mat_SubMat *smat; 1966 1967 PetscFunctionBegin; 1968 /* Check for special case: each processor has a single IS */ 1969 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1970 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1971 C->submat_singleis = PETSC_FALSE; /* resume its default value in 
case C will be used for non-singlis */ 1972 PetscFunctionReturn(0); 1973 } 1974 1975 /* Collect global wantallmatrix and nstages */ 1976 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1977 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1978 if (!nmax) nmax = 1; 1979 1980 if (scall == MAT_INITIAL_MATRIX) { 1981 /* Collect global wantallmatrix and nstages */ 1982 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1983 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1984 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1985 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1986 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1987 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1988 wantallmatrix = PETSC_TRUE; 1989 1990 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1991 } 1992 } 1993 1994 /* Determine the number of stages through which submatrices are done 1995 Each stage will extract nmax submatrices. 1996 nmax is determined by the matrix column dimension. 1997 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1998 */ 1999 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 2000 2001 in[0] = -1*(PetscInt)wantallmatrix; 2002 in[1] = nstages; 2003 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2004 wantallmatrix = (PetscBool)(-out[0]); 2005 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2006 2007 } else { /* MAT_REUSE_MATRIX */ 2008 if (ismax) { 2009 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2010 smat = subc->submatis1; 2011 } else { /* (*submat)[0] is a dummy matrix */ 2012 smat = (Mat_SubMat*)(*submat)[0]->data; 2013 } 2014 if (!smat) { 2015 /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) 
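         so the original extraction must have gone through the whole-matrix fast path; route this
         MAT_REUSE_MATRIX call through MatGetSubMatrix_MPIAIJ_All() below as well.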
*/ 2016 wantallmatrix = PETSC_TRUE; 2017 } else if (smat->singleis) { 2018 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2019 PetscFunctionReturn(0); 2020 } else { 2021 nstages = smat->nstages; 2022 } 2023 } 2024 2025 if (wantallmatrix) { 2026 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2027 PetscFunctionReturn(0); 2028 } 2029 2030 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2031 if (scall == MAT_INITIAL_MATRIX) { 2032 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2033 } 2034 2035 #if 0 2036 PetscMPIInt rank; 2037 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)C),&rank);CHKERRQ(ierr); 2038 printf("[%d] reuse %d, ismax %d, CN %d, nstages %d, nmax %d\n",rank,scall,ismax,C->cmap->N,nstages,nmax); 2039 #endif 2040 for (i=0,pos=0; i<nstages; i++) { 2041 if (pos+nmax <= ismax) max_no = nmax; 2042 else if (pos == ismax) max_no = 0; 2043 else max_no = ismax-pos; 2044 /* if (!max_no) printf("[%d] max_no=0, %d-stage\n",rank,i); */ 2045 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2046 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2047 smat = (Mat_SubMat*)(*submat)[pos]->data; 2048 smat->nstages = nstages; 2049 } 2050 pos += max_no; 2051 } 2052 2053 if (ismax && scall == MAT_INITIAL_MATRIX) { 2054 /* save nstages for reuse */ 2055 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2056 smat = subc->submatis1; 2057 smat->nstages = nstages; 2058 } 2059 PetscFunctionReturn(0); 2060 } 2061 2062 /* -------------------------------------------------------------------------*/ 2063 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2064 { 2065 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2066 Mat A = c->A; 2067 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2068 const PetscInt **icol,**irow; 2069 PetscInt *nrow,*ncol,start; 2070 PetscErrorCode ierr; 2071 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2072 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2073 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2074 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2075 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2076 #if defined(PETSC_USE_CTABLE) 2077 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2078 #else 2079 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2080 #endif 2081 const PetscInt *irow_i; 2082 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2083 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2084 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2085 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2086 MPI_Status *r_status3,*r_status4,*s_status4; 2087 MPI_Comm comm; 2088 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2089 PetscMPIInt *onodes1,*olengths1,end; 2090 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2091 Mat_SubMat *smat_i; 2092 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2093 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2094 2095 PetscFunctionBegin; 2096 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2097 size = c->size; 2098 rank = c->rank; 2099 2100 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2101 ierr = 
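/* Per-IS work arrays: irow/icol borrow the indices of each row/column IS, nrow/ncol hold their
   local sizes, and issorted[] records whether each row IS is sorted; an unsorted column IS clears
   iscsorted, which forces a final sort of the assembled column indices. */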
PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2102 2103 for (i=0; i<ismax; i++) { 2104 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2105 if (!issorted[i]) iscsorted = issorted[i]; 2106 2107 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2108 2109 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2110 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2111 2112 /* Check for special case: allcolumn */ 2113 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2114 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2115 if (colflag && ncol[i] == C->cmap->N) { 2116 allcolumns[i] = PETSC_TRUE; 2117 icol[i] = NULL; 2118 } else { 2119 allcolumns[i] = PETSC_FALSE; 2120 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2121 } 2122 } 2123 2124 if (scall == MAT_REUSE_MATRIX) { 2125 /* Assumes new rows are same length as the old rows */ 2126 for (i=0; i<ismax; i++) { 2127 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2128 subc = (Mat_SeqAIJ*)submats[i]->data; 2129 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2130 2131 /* Initial matrix as if empty */ 2132 ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2133 2134 smat_i = subc->submatis1; 2135 2136 nrqs = smat_i->nrqs; 2137 nrqr = smat_i->nrqr; 2138 rbuf1 = smat_i->rbuf1; 2139 rbuf2 = smat_i->rbuf2; 2140 rbuf3 = smat_i->rbuf3; 2141 req_source2 = smat_i->req_source2; 2142 2143 sbuf1 = smat_i->sbuf1; 2144 sbuf2 = smat_i->sbuf2; 2145 ptr = smat_i->ptr; 2146 tmp = smat_i->tmp; 2147 ctr = smat_i->ctr; 2148 2149 pa = smat_i->pa; 2150 req_size = smat_i->req_size; 2151 req_source1 = smat_i->req_source1; 2152 2153 allcolumns[i] = smat_i->allcolumns; 2154 row2proc[i] = smat_i->row2proc; 2155 rmap[i] = smat_i->rmap; 2156 cmap[i] = smat_i->cmap; 2157 } 2158 2159 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2160 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2161 smat_i = (Mat_SubMat*)submats[0]->data; 2162 2163 nrqs = smat_i->nrqs; 2164 nrqr = smat_i->nrqr; 2165 rbuf1 = smat_i->rbuf1; 2166 rbuf2 = smat_i->rbuf2; 2167 rbuf3 = smat_i->rbuf3; 2168 req_source2 = smat_i->req_source2; 2169 2170 sbuf1 = smat_i->sbuf1; 2171 sbuf2 = smat_i->sbuf2; 2172 ptr = smat_i->ptr; 2173 tmp = smat_i->tmp; 2174 ctr = smat_i->ctr; 2175 2176 pa = smat_i->pa; 2177 req_size = smat_i->req_size; 2178 req_source1 = smat_i->req_source1; 2179 2180 allcolumns[0] = PETSC_TRUE; 2181 } 2182 } else { /* scall == MAT_INITIAL_MATRIX */ 2183 /* Get some new tags to keep the communication clean */ 2184 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2185 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2186 2187 /* evaluate communication - mesg to who, length of mesg, and buffer space 2188 required. 
Based on this, buffers are allocated, and data copied into them*/ 2189 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2190 2191 for (i=0; i<ismax; i++) { 2192 jmax = nrow[i]; 2193 irow_i = irow[i]; 2194 2195 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2196 row2proc[i] = row2proc_i; 2197 2198 if (issorted[i]) proc = 0; 2199 for (j=0; j<jmax; j++) { 2200 if (!issorted[i]) proc = 0; 2201 row = irow_i[j]; 2202 while (row >= C->rmap->range[proc+1]) proc++; 2203 w4[proc]++; 2204 row2proc_i[j] = proc; /* map row index to proc */ 2205 } 2206 for (j=0; j<size; j++) { 2207 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2208 } 2209 } 2210 2211 nrqs = 0; /* no of outgoing messages */ 2212 msz = 0; /* total mesg length (for all procs) */ 2213 w1[rank] = 0; /* no mesg sent to self */ 2214 w3[rank] = 0; 2215 for (i=0; i<size; i++) { 2216 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2217 } 2218 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2219 for (i=0,j=0; i<size; i++) { 2220 if (w1[i]) { pa[j] = i; j++; } 2221 } 2222 2223 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2224 for (i=0; i<nrqs; i++) { 2225 j = pa[i]; 2226 w1[j] += w2[j] + 2* w3[j]; 2227 msz += w1[j]; 2228 } 2229 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2230 2231 /* Determine the number of messages to expect, their lengths, from from-ids */ 2232 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2233 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2234 2235 /* Now post the Irecvs corresponding to these messages */ 2236 tag0 = ((PetscObject)C)->tag; 2237 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2238 2239 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2240 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2241 2242 /* Allocate Memory for outgoing messages */ 2243 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2244 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2245 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2246 2247 { 2248 PetscInt *iptr = tmp; 2249 k = 0; 2250 for (i=0; i<nrqs; i++) { 2251 j = pa[i]; 2252 iptr += k; 2253 sbuf1[j] = iptr; 2254 k = w1[j]; 2255 } 2256 } 2257 2258 /* Form the outgoing messages. 
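     Each sbuf1[j] is laid out as: sbuf1[j][0] = number of index sets that request rows from process j,
     followed by that many (is#, nrows) pairs, followed by the requested global row indices themselves,
     appended through ptr[j], which starts right after the 2*w3[j]+1 header entries.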
Initialize the header space */ 2259 for (i=0; i<nrqs; i++) { 2260 j = pa[i]; 2261 sbuf1[j][0] = 0; 2262 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2263 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2264 } 2265 2266 /* Parse the isrow and copy data into outbuf */ 2267 for (i=0; i<ismax; i++) { 2268 row2proc_i = row2proc[i]; 2269 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2270 irow_i = irow[i]; 2271 jmax = nrow[i]; 2272 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2273 proc = row2proc_i[j]; 2274 if (proc != rank) { /* copy to the outgoing buf*/ 2275 ctr[proc]++; 2276 *ptr[proc] = irow_i[j]; 2277 ptr[proc]++; 2278 } 2279 } 2280 /* Update the headers for the current IS */ 2281 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2282 if ((ctr_j = ctr[j])) { 2283 sbuf1_j = sbuf1[j]; 2284 k = ++sbuf1_j[0]; 2285 sbuf1_j[2*k] = ctr_j; 2286 sbuf1_j[2*k-1] = i; 2287 } 2288 } 2289 } 2290 2291 /* Now post the sends */ 2292 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2293 for (i=0; i<nrqs; ++i) { 2294 j = pa[i]; 2295 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2296 } 2297 2298 /* Post Receives to capture the buffer size */ 2299 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2300 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2301 rbuf2[0] = tmp + msz; 2302 for (i=1; i<nrqs; ++i) { 2303 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2304 } 2305 for (i=0; i<nrqs; ++i) { 2306 j = pa[i]; 2307 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 2308 } 2309 2310 /* Send to other procs the buf size they should allocate */ 2311 /* Receive messages*/ 2312 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2313 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2314 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2315 { 2316 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2317 PetscInt *sbuf2_i; 2318 2319 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2320 for (i=0; i<nrqr; ++i) { 2321 req_size[i] = 0; 2322 rbuf1_i = rbuf1[i]; 2323 start = 2*rbuf1_i[0] + 1; 2324 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2325 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2326 sbuf2_i = sbuf2[i]; 2327 for (j=start; j<end; j++) { 2328 id = rbuf1_i[j] - rstart; 2329 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2330 sbuf2_i[j] = ncols; 2331 req_size[i] += ncols; 2332 } 2333 req_source1[i] = r_status1[i].MPI_SOURCE; 2334 /* form the header */ 2335 sbuf2_i[0] = req_size[i]; 2336 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2337 2338 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 2339 } 2340 } 2341 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2342 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2343 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2344 2345 /* Receive messages*/ 2346 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2347 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2348 2349 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2350 for (i=0; i<nrqs; ++i) { 2351 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2352 req_source2[i] = r_status2[i].MPI_SOURCE; 2353 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 2354 } 2355 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2356 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2357 2358 /* Wait on sends1 and sends2 */ 2359 
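/* MPI_Isend() only guarantees that a send buffer may be reused or freed once the matching wait has
   completed, so sbuf1/sbuf2 must not be released (or stashed for reuse) before these waits.
   The #if 0 block below is an optional, illustrative debugging aid that is not part of the original
   algorithm; it only uses variables already in scope at this point. */
#if 0
  {
    PetscInt dbg;
    for (dbg=0; dbg<nrqs; dbg++) {
      ierr = PetscPrintf(PETSC_COMM_SELF,"[%d] request to %D carries %D index-set header(s)\n",rank,pa[dbg],sbuf1[pa[dbg]][0]);CHKERRQ(ierr);
    }
  }
#endif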
ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2360 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2361 2362 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2363 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2364 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2365 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2366 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2367 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2368 2369 /* Now allocate sending buffers for a->j, and send them off */ 2370 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2371 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2372 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2373 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2374 2375 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2376 { 2377 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2378 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2379 PetscInt cend = C->cmap->rend; 2380 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2381 2382 for (i=0; i<nrqr; i++) { 2383 rbuf1_i = rbuf1[i]; 2384 sbuf_aj_i = sbuf_aj[i]; 2385 ct1 = 2*rbuf1_i[0] + 1; 2386 ct2 = 0; 2387 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2388 kmax = rbuf1[i][2*j]; 2389 for (k=0; k<kmax; k++,ct1++) { 2390 row = rbuf1_i[ct1] - rstart; 2391 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2392 ncols = nzA + nzB; 2393 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2394 2395 /* load the column indices for this row into cols */ 2396 cols = sbuf_aj_i + ct2; 2397 2398 lwrite = 0; 2399 for (l=0; l<nzB; l++) { 2400 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2401 } 2402 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2403 for (l=0; l<nzB; l++) { 2404 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2405 } 2406 2407 ct2 += ncols; 2408 } 2409 } 2410 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 2411 } 2412 } 2413 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2414 2415 /* create col map: global col of C -> local col of submatrices */ 2416 { 2417 const PetscInt *icol_i; 2418 #if defined(PETSC_USE_CTABLE) 2419 for (i=0; i<ismax; i++) { 2420 if (!allcolumns[i]) { 2421 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2422 2423 jmax = ncol[i]; 2424 icol_i = icol[i]; 2425 cmap_i = cmap[i]; 2426 for (j=0; j<jmax; j++) { 2427 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2428 } 2429 } else cmap[i] = NULL; 2430 } 2431 #else 2432 for (i=0; i<ismax; i++) { 2433 if (!allcolumns[i]) { 2434 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2435 jmax = ncol[i]; 2436 icol_i = icol[i]; 2437 cmap_i = cmap[i]; 2438 for (j=0; j<jmax; j++) { 2439 cmap_i[icol_i[j]] = j+1; 2440 } 2441 } else cmap[i] = NULL; 2442 } 2443 #endif 2444 } 2445 2446 /* Create lens which is required for MatCreate... 
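     (lens[i][j] will receive the number of nonzeros of local row j of submatrix i and is handed to
     MatSeqAIJSetPreallocation() when the submatrices are created; lens[0] is one contiguous block
     covering all index sets, with each lens[i] pointing into it)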
*/ 2447 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2448 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2449 2450 if (ismax) { 2451 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2452 } 2453 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2454 2455 /* Update lens from local data */ 2456 for (i=0; i<ismax; i++) { 2457 row2proc_i = row2proc[i]; 2458 jmax = nrow[i]; 2459 if (!allcolumns[i]) cmap_i = cmap[i]; 2460 irow_i = irow[i]; 2461 lens_i = lens[i]; 2462 for (j=0; j<jmax; j++) { 2463 row = irow_i[j]; 2464 proc = row2proc_i[j]; 2465 if (proc == rank) { 2466 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2467 if (!allcolumns[i]) { 2468 for (k=0; k<ncols; k++) { 2469 #if defined(PETSC_USE_CTABLE) 2470 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2471 #else 2472 tcol = cmap_i[cols[k]]; 2473 #endif 2474 if (tcol) lens_i[j]++; 2475 } 2476 } else { /* allcolumns */ 2477 lens_i[j] = ncols; 2478 } 2479 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2480 } 2481 } 2482 } 2483 2484 /* Create row map: global row of C -> local row of submatrices */ 2485 #if defined(PETSC_USE_CTABLE) 2486 for (i=0; i<ismax; i++) { 2487 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2488 irow_i = irow[i]; 2489 jmax = nrow[i]; 2490 for (j=0; j<jmax; j++) { 2491 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2492 } 2493 } 2494 #else 2495 for (i=0; i<ismax; i++) { 2496 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2497 rmap_i = rmap[i]; 2498 irow_i = irow[i]; 2499 jmax = nrow[i]; 2500 for (j=0; j<jmax; j++) { 2501 rmap_i[irow_i[j]] = j; 2502 } 2503 } 2504 #endif 2505 2506 /* Update lens from offproc data */ 2507 { 2508 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2509 2510 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2511 for (tmp2=0; tmp2<nrqs; tmp2++) { 2512 sbuf1_i = sbuf1[pa[tmp2]]; 2513 jmax = sbuf1_i[0]; 2514 ct1 = 2*jmax+1; 2515 ct2 = 0; 2516 rbuf2_i = rbuf2[tmp2]; 2517 rbuf3_i = rbuf3[tmp2]; 2518 for (j=1; j<=jmax; j++) { 2519 is_no = sbuf1_i[2*j-1]; 2520 max1 = sbuf1_i[2*j]; 2521 lens_i = lens[is_no]; 2522 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2523 rmap_i = rmap[is_no]; 2524 for (k=0; k<max1; k++,ct1++) { 2525 #if defined(PETSC_USE_CTABLE) 2526 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2527 row--; 2528 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2529 #else 2530 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2531 #endif 2532 max2 = rbuf2_i[ct1]; 2533 for (l=0; l<max2; l++,ct2++) { 2534 if (!allcolumns[is_no]) { 2535 #if defined(PETSC_USE_CTABLE) 2536 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2537 #else 2538 tcol = cmap_i[rbuf3_i[ct2]]; 2539 #endif 2540 if (tcol) lens_i[row]++; 2541 } else { /* allcolumns */ 2542 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2543 } 2544 } 2545 } 2546 } 2547 } 2548 } 2549 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2550 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2551 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2552 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2553 2554 /* Create the submatrices */ 2555 for (i=0; i<ismax; i++) { 2556 PetscInt rbs,cbs; 2557 2558 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2559 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2560 2561 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2562 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2563 2564 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2565 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2566 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2567 2568 /* create struct Mat_SubMat and attached it to submat */ 2569 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2570 subc = (Mat_SeqAIJ*)submats[i]->data; 2571 subc->submatis1 = smat_i; 2572 2573 smat_i->destroy = submats[i]->ops->destroy; 2574 submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices; 2575 submats[i]->factortype = C->factortype; 2576 2577 smat_i->id = i; 2578 smat_i->nrqs = nrqs; 2579 smat_i->nrqr = nrqr; 2580 smat_i->rbuf1 = rbuf1; 2581 smat_i->rbuf2 = rbuf2; 2582 smat_i->rbuf3 = rbuf3; 2583 smat_i->sbuf2 = sbuf2; 2584 smat_i->req_source2 = req_source2; 2585 2586 smat_i->sbuf1 = sbuf1; 2587 smat_i->ptr = ptr; 2588 smat_i->tmp = tmp; 2589 smat_i->ctr = ctr; 2590 2591 smat_i->pa = pa; 2592 smat_i->req_size = req_size; 2593 smat_i->req_source1 = req_source1; 2594 2595 smat_i->allcolumns = allcolumns[i]; 2596 smat_i->singleis = PETSC_FALSE; 2597 smat_i->row2proc = row2proc[i]; 2598 smat_i->rmap = rmap[i]; 2599 smat_i->cmap = cmap[i]; 2600 } 2601 2602 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2603 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2604 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2605 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2606 2607 /* create struct Mat_SubMat and attached it to submat */ 2608 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2609 submats[0]->data = (void*)smat_i; 2610 2611 smat_i->destroy = submats[0]->ops->destroy; 2612 submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices; 2613 submats[0]->factortype = C->factortype; 2614 2615 smat_i->id = i; 2616 smat_i->nrqs = nrqs; 2617 smat_i->nrqr = nrqr; 2618 smat_i->rbuf1 = rbuf1; 2619 smat_i->rbuf2 = rbuf2; 2620 smat_i->rbuf3 = rbuf3; 2621 smat_i->sbuf2 = sbuf2; 2622 smat_i->req_source2 = req_source2; 2623 2624 smat_i->sbuf1 = sbuf1; 2625 smat_i->ptr = ptr; 2626 smat_i->tmp = tmp; 2627 smat_i->ctr = ctr; 2628 2629 smat_i->pa = pa; 2630 smat_i->req_size = req_size; 2631 smat_i->req_source1 = req_source1; 2632 2633 smat_i->allcolumns = PETSC_TRUE; 2634 smat_i->singleis = PETSC_FALSE; 2635 smat_i->row2proc = NULL; 2636 smat_i->rmap = NULL; 2637 smat_i->cmap = NULL; 2638 } 2639 2640 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2641 ierr = PetscFree(lens);CHKERRQ(ierr); 2642 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2643 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2644 2645 } /* endof scall == MAT_INITIAL_MATRIX */ 2646 2647 /* Post recv matrix values */ 2648 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2649 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2650 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2651 ierr = 
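/* Fourth round of the protocol: the numerical values travel as MPIU_SCALAR under tag4, reusing the
   per-message lengths rbuf2[i][0] and the sources req_source2[] established during the column-index
   round. */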
PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2652 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2653 for (i=0; i<nrqs; ++i) { 2654 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2655 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 2656 } 2657 2658 /* Allocate sending buffers for a->a, and send them off */ 2659 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2660 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2661 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2662 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2663 2664 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2665 { 2666 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2667 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2668 PetscInt cend = C->cmap->rend; 2669 PetscInt *b_j = b->j; 2670 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2671 2672 for (i=0; i<nrqr; i++) { 2673 rbuf1_i = rbuf1[i]; 2674 sbuf_aa_i = sbuf_aa[i]; 2675 ct1 = 2*rbuf1_i[0]+1; 2676 ct2 = 0; 2677 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2678 kmax = rbuf1_i[2*j]; 2679 for (k=0; k<kmax; k++,ct1++) { 2680 row = rbuf1_i[ct1] - rstart; 2681 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2682 ncols = nzA + nzB; 2683 cworkB = b_j + b_i[row]; 2684 vworkA = a_a + a_i[row]; 2685 vworkB = b_a + b_i[row]; 2686 2687 /* load the column values for this row into vals*/ 2688 vals = sbuf_aa_i+ct2; 2689 2690 lwrite = 0; 2691 for (l=0; l<nzB; l++) { 2692 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2693 } 2694 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2695 for (l=0; l<nzB; l++) { 2696 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2697 } 2698 2699 ct2 += ncols; 2700 } 2701 } 2702 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 2703 } 2704 } 2705 2706 /* Assemble the matrices */ 2707 /* First assemble the local rows */ 2708 for (i=0; i<ismax; i++) { 2709 row2proc_i = row2proc[i]; 2710 subc = (Mat_SeqAIJ*)submats[i]->data; 2711 imat_ilen = subc->ilen; 2712 imat_j = subc->j; 2713 imat_i = subc->i; 2714 imat_a = subc->a; 2715 2716 if (!allcolumns[i]) cmap_i = cmap[i]; 2717 rmap_i = rmap[i]; 2718 irow_i = irow[i]; 2719 jmax = nrow[i]; 2720 for (j=0; j<jmax; j++) { 2721 row = irow_i[j]; 2722 proc = row2proc_i[j]; 2723 if (proc == rank) { 2724 old_row = row; 2725 #if defined(PETSC_USE_CTABLE) 2726 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2727 row--; 2728 #else 2729 row = rmap_i[row]; 2730 #endif 2731 ilen_row = imat_ilen[row]; 2732 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2733 mat_i = imat_i[row]; 2734 mat_a = imat_a + mat_i; 2735 mat_j = imat_j + mat_i; 2736 if (!allcolumns[i]) { 2737 for (k=0; k<ncols; k++) { 2738 #if defined(PETSC_USE_CTABLE) 2739 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2740 #else 2741 tcol = cmap_i[cols[k]]; 2742 #endif 2743 if (tcol) { 2744 *mat_j++ = tcol - 1; 2745 *mat_a++ = vals[k]; 2746 ilen_row++; 2747 } 2748 } 2749 } else { /* allcolumns */ 2750 for (k=0; k<ncols; k++) { 2751 *mat_j++ = cols[k]; /* global col index! 
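        (legitimate here because allcolumns[i] means iscol[i] is the identity over all of C's columns,
        so the submatrix column numbering coincides with C's global column numbering)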
*/ 2752 *mat_a++ = vals[k]; 2753 ilen_row++; 2754 } 2755 } 2756 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2757 2758 imat_ilen[row] = ilen_row; 2759 } 2760 } 2761 } 2762 2763 /* Now assemble the off proc rows */ 2764 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2765 for (tmp2=0; tmp2<nrqs; tmp2++) { 2766 sbuf1_i = sbuf1[pa[tmp2]]; 2767 jmax = sbuf1_i[0]; 2768 ct1 = 2*jmax + 1; 2769 ct2 = 0; 2770 rbuf2_i = rbuf2[tmp2]; 2771 rbuf3_i = rbuf3[tmp2]; 2772 rbuf4_i = rbuf4[tmp2]; 2773 for (j=1; j<=jmax; j++) { 2774 is_no = sbuf1_i[2*j-1]; 2775 rmap_i = rmap[is_no]; 2776 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2777 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2778 imat_ilen = subc->ilen; 2779 imat_j = subc->j; 2780 imat_i = subc->i; 2781 imat_a = subc->a; 2782 max1 = sbuf1_i[2*j]; 2783 for (k=0; k<max1; k++,ct1++) { 2784 row = sbuf1_i[ct1]; 2785 #if defined(PETSC_USE_CTABLE) 2786 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2787 row--; 2788 #else 2789 row = rmap_i[row]; 2790 #endif 2791 ilen = imat_ilen[row]; 2792 mat_i = imat_i[row]; 2793 mat_a = imat_a + mat_i; 2794 mat_j = imat_j + mat_i; 2795 max2 = rbuf2_i[ct1]; 2796 if (!allcolumns[is_no]) { 2797 for (l=0; l<max2; l++,ct2++) { 2798 #if defined(PETSC_USE_CTABLE) 2799 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2800 #else 2801 tcol = cmap_i[rbuf3_i[ct2]]; 2802 #endif 2803 if (tcol) { 2804 *mat_j++ = tcol - 1; 2805 *mat_a++ = rbuf4_i[ct2]; 2806 ilen++; 2807 } 2808 } 2809 } else { /* allcolumns */ 2810 for (l=0; l<max2; l++,ct2++) { 2811 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2812 *mat_a++ = rbuf4_i[ct2]; 2813 ilen++; 2814 } 2815 } 2816 imat_ilen[row] = ilen; 2817 } 2818 } 2819 } 2820 2821 if (!iscsorted) { /* sort column indices of the rows */ 2822 for (i=0; i<ismax; i++) { 2823 subc = (Mat_SeqAIJ*)submats[i]->data; 2824 imat_j = subc->j; 2825 imat_i = subc->i; 2826 imat_a = subc->a; 2827 imat_ilen = subc->ilen; 2828 2829 if (allcolumns[i]) continue; 2830 jmax = nrow[i]; 2831 for (j=0; j<jmax; j++) { 2832 mat_i = imat_i[j]; 2833 mat_a = imat_a + mat_i; 2834 mat_j = imat_j + mat_i; 2835 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2836 } 2837 } 2838 } 2839 2840 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2841 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2842 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2843 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2844 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2845 2846 /* Restore the indices */ 2847 for (i=0; i<ismax; i++) { 2848 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2849 if (!allcolumns[i]) { 2850 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2851 } 2852 } 2853 2854 for (i=0; i<ismax; i++) { 2855 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2856 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2857 } 2858 2859 /* Destroy allocated memory */ 2860 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2861 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2862 ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr); 2863 2864 for (i=0; i<nrqs; ++i) { 2865 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2866 } 2867 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2868 2869 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2870 PetscFunctionReturn(0); 2871 } 2872 2873 /* 2874 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 
2875 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2876 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2877 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2878 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2879 state, and needs to be "assembled" later by compressing B's column space. 2880 2881 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2882 Following this call, C->A & C->B have been created, even if empty. 2883 */ 2884 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2885 { 2886 /* If making this function public, change the error returned in this function away from _PLIB. */ 2887 PetscErrorCode ierr; 2888 Mat_MPIAIJ *aij; 2889 Mat_SeqAIJ *Baij; 2890 PetscBool seqaij,Bdisassembled; 2891 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2892 PetscScalar v; 2893 const PetscInt *rowindices,*colindices; 2894 2895 PetscFunctionBegin; 2896 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2897 if (A) { 2898 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2899 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2900 if (rowemb) { 2901 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2902 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2903 } else { 2904 if (C->rmap->n != A->rmap->n) { 2905 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2906 } 2907 } 2908 if (dcolemb) { 2909 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2910 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2911 } else { 2912 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2913 } 2914 } 2915 if (B) { 2916 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2917 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2918 if (rowemb) { 2919 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2920 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2921 } else { 2922 if (C->rmap->n != B->rmap->n) { 2923 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2924 } 2925 } 2926 if (ocolemb) { 2927 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2928 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2929 } else { 2930 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2931 } 2932 } 2933 2934 aij = (Mat_MPIAIJ*)C->data; 2935 if (!aij->A) { 2936 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2937 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2938 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2939 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2940 ierr = 
MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2941 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2942 } 2943 if (A) { 2944 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2945 } else { 2946 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2947 } 2948 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2949 /* 2950 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2951 need to "disassemble" B -- convert it to using C's global indices. 2952 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2953 2954 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2955 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2956 2957 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2958 At least avoid calling MatSetValues() and the implied searches? 2959 */ 2960 2961 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2962 #if defined(PETSC_USE_CTABLE) 2963 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2964 #else 2965 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2966 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2967 if (aij->B) { 2968 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2969 } 2970 #endif 2971 ngcol = 0; 2972 if (aij->lvec) { 2973 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2974 } 2975 if (aij->garray) { 2976 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2977 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2978 } 2979 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2980 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2981 } 2982 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2983 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2984 } 2985 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2986 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2987 } 2988 } 2989 Bdisassembled = PETSC_FALSE; 2990 if (!aij->B) { 2991 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2992 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2993 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2994 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2995 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2996 Bdisassembled = PETSC_TRUE; 2997 } 2998 if (B) { 2999 Baij = (Mat_SeqAIJ*)B->data; 3000 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3001 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3002 for (i=0; i<B->rmap->n; i++) { 3003 nz[i] = Baij->i[i+1] - Baij->i[i]; 3004 } 3005 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3006 ierr = PetscFree(nz);CHKERRQ(ierr); 3007 } 3008 3009 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3010 shift = rend-rstart; 3011 count = 0; 3012 rowindices = NULL; 3013 colindices = NULL; 3014 if (rowemb) { 3015 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3016 } 3017 if (ocolemb) { 3018 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3019 } 3020 for (i=0; i<B->rmap->n; i++) { 3021 PetscInt row; 3022 row = i; 3023 if (rowindices) row = rowindices[i]; 3024 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3025 col = Baij->j[count]; 3026 if (colindices) col = colindices[col]; 3027 if (Bdisassembled && col>=rstart) col += shift; 3028 v = Baij->a[count]; 3029 ierr = 
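/* Insert one entry of B at a time: the column has been remapped through ocolemb (when given) and,
   if aij->B was created above in the "disassembled" state, shifted by the local block size so that
   it lands in C's global column numbering. */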
MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3030 ++count; 3031 } 3032 } 3033 /* No assembly for aij->B is necessary. */ 3034 /* FIXME: set aij->B's nonzerostate correctly. */ 3035 } else { 3036 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3037 } 3038 C->preallocated = PETSC_TRUE; 3039 C->was_assembled = PETSC_FALSE; 3040 C->assembled = PETSC_FALSE; 3041 /* 3042 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3043 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3044 */ 3045 PetscFunctionReturn(0); 3046 } 3047 3048 /* 3049 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3050 */ 3051 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3052 { 3053 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3054 3055 PetscFunctionBegin; 3056 PetscValidPointer(A,2); 3057 PetscValidPointer(B,3); 3058 /* FIXME: make sure C is assembled */ 3059 *A = aij->A; 3060 *B = aij->B; 3061 /* Note that we don't incref *A and *B, so be careful! */ 3062 PetscFunctionReturn(0); 3063 } 3064 3065 /* 3066 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3067 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3068 */ 3069 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3070 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3071 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3072 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3073 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3074 { 3075 PetscErrorCode ierr; 3076 PetscMPIInt isize,flag; 3077 PetscInt i,ii,cismax,ispar; 3078 Mat *A,*B; 3079 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3080 3081 PetscFunctionBegin; 3082 if (!ismax) PetscFunctionReturn(0); 3083 3084 for (i = 0, cismax = 0; i < ismax; ++i) { 3085 PetscMPIInt isize; 3086 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3087 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3088 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3089 if (isize > 1) ++cismax; 3090 } 3091 3092 /* 3093 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3094 ispar counts the number of parallel ISs across C's comm. 3095 */ 3096 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3097 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3098 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3099 PetscFunctionReturn(0); 3100 } 3101 3102 /* if (ispar) */ 3103 /* 3104 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3105 These are used to extract the off-diag portion of the resulting parallel matrix. 3106 The row IS for the off-diag portion is the same as for the diag portion, 3107 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 
3108 */ 3109 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3110 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3111 for (i = 0, ii = 0; i < ismax; ++i) { 3112 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3113 if (isize > 1) { 3114 /* 3115 TODO: This is the part that's ***NOT SCALABLE***. 3116 To fix this we need to extract just the indices of C's nonzero columns 3117 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3118 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3119 to be done without serializing on the IS list, so, most likely, it is best 3120 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3121 */ 3122 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3123 /* Now we have to 3124 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3125 were sorted on each rank, concatenated they might no longer be sorted; 3126 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3127 indices in the nondecreasing order to the original index positions. 3128 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3129 */ 3130 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3131 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3132 ++ii; 3133 } 3134 } 3135 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3136 for (i = 0, ii = 0; i < ismax; ++i) { 3137 PetscInt j,issize; 3138 const PetscInt *indices; 3139 3140 /* 3141 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3142 */ 3143 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3144 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3145 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3146 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3147 for (j = 1; j < issize; ++j) { 3148 if (indices[j] == indices[j-1]) { 3149 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3150 } 3151 } 3152 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3153 3154 3155 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3156 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3157 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3158 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3159 for (j = 1; j < issize; ++j) { 3160 if (indices[j-1] == indices[j]) { 3161 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3162 } 3163 } 3164 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3165 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3166 if (isize > 1) { 3167 cisrow[ii] = isrow[i]; 3168 ++ii; 3169 } 3170 } 3171 /* 3172 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3173 array of sequential matrices underlying the resulting parallel matrices. 3174 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3175 contain duplicates. 3176 3177 There are as many diag matrices as there are original index sets. There are only as many parallel 3178 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether the ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
       values into the A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if isize == 1).
     Note that in that case colmap does not need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
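
/*
  Usage sketch (not part of the library): how an application might reach MatGetSubMatricesMPI_MPIAIJ().
  This assumes the usual public entry point MatGetSubMatricesMPI() dispatches here for MPIAIJ matrices;
  C, isrow[] and iscol[] below are hypothetical, and the index sets may live on subcommunicators of C's comm.

    Mat C;                  // assembled MPIAIJ matrix
    IS  isrow[1],iscol[1];  // possibly parallel index sets
    Mat *submat;

    // ... create C and the index sets ...
    ierr = MatGetSubMatricesMPI(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
    // submat[0] is MPIAIJ if isrow[0]/iscol[0] are parallel, and SeqAIJ otherwise
    ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
*/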