/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
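/*
   Illustrative usage sketch (not part of this file's logic).  A caller typically builds
   an index set of rows it already owns, enlarges it with MatIncreaseOverlap(), and then
   extracts the corresponding submatrices; nlocal, rstart and ov below are placeholders:

     IS  is;
     Mat *submat;
     ierr = ISCreateStride(PETSC_COMM_SELF,nlocal,rstart,1,&is);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,&is,ov);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,1,&is,&is,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   For MPIAIJ matrices MatIncreaseOverlap() ends up in MatIncreaseOverlap_MPIAIJ() above
   (or in the _Scalable variant), which simply repeats the one-level expansion ov times.
*/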

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put it all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr     = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr     = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages;
   * in general, no data needs to be exchanged
   * when the overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* there are no messages to exchange;
   * this usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for the one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows owned by the remote processors so that we can get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for the one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get the row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the matrix is symmetric, we still treat it as if it were not */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* the total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* is there a better way to estimate the number of nonzeros in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* run the same grouping pass again, this time filling the send buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
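/*
   Illustrative sketch (not called anywhere): the reply packed into sbdata/sbsizes above and
   unpacked by MatIncreaseOverlap_MPIAIJ_Receive_Scalable() is, per destination rank, a
   sequence of triples <is id, number of columns, columns...>.  A receiver could walk one
   such buffer of length len as follows; buf and len are placeholders:

     PetscInt pos = 0;
     while (pos < len) {
       PetscInt is_id = buf[pos++];
       PetscInt ncols = buf[pos++];
       PetscInt k;
       for (k=0; k<ncols; k++) {
         PetscInt col = buf[pos++];
         ... append col to the candidate list of is[is_id] ...
       }
     }

   This mirrors the loop structure used in MatIncreaseOverlap_MPIAIJ_Receive_Scalable() above.
*/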

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* this is a better way to estimate memory than the old implementation,
   * where the global size of the matrix was used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]   data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
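/*
   Illustrative decoding of the message format above (a sketch only; buf is a placeholder
   for one received request buffer such as rbuf[i] below):

     PetscInt nis  = buf[0];          number of index sets in the message
     PetscInt data = 2*nis + 1;       the row data starts right after the header
     PetscInt p,q;
     for (p=1; p<=nis; p++) {
       PetscInt is_no = buf[2*p-1];   which index set these rows belong to
       PetscInt nrows = buf[2*p];     how many rows follow for that index set
       for (q=0; q<nrows; q++) {
         PetscInt row = buf[data++];
         ... process row for index set is_no ...
       }
     }

   MatIncreaseOverlap_MPIAIJ_Receive() below walks rbuf[] using exactly this layout.
*/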
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt     *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt  **idx,*idx_i;
  PetscInt        *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable      *table_data,table_data_i;
  PetscInt        *tdata,tcount,tcount_max;
#else
  PetscInt        *data_i,*d_p;
#endif
  PetscErrorCode  ierr;
  PetscMPIInt     size,rank,tag1,tag2;
  PetscInt        M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt        *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT         *table;
  MPI_Comm        comm;
  MPI_Request     *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status      *s_status,*recv_status;
  char            *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and their from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing the local proc's work */
  {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0,M_BPB_imax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and their from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc each xdata[i] separately;
  then if a reallocation is required we would need only copy the data for that
  one row rather than all previous rows, as is done now with a single large
  chunk of memory.

*/
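/*
   Reply layout produced below (a sketch for orientation only): each xdata[i] mirrors the
   header of the corresponding request, i.e.

     xdata[i][0]     = number of index sets
     xdata[i][2*j-1] = index set number (copied from the request)
     xdata[i][2*j]   = number of rows found for that index set

   followed by the row indices themselves, with isz1[i] giving the total length sent back.
*/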
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat        B;
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ *b,*ad =
(Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1065 PetscErrorCode ierr; 1066 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1067 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1068 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1069 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1070 1071 PetscFunctionBegin; 1072 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1073 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1074 1075 if (scall == MAT_INITIAL_MATRIX) { 1076 /* ---------------------------------------------------------------- 1077 Tell every processor the number of nonzeros per row 1078 */ 1079 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1080 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1081 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1082 } 1083 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1084 for (i=0; i<size; i++) { 1085 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1086 displs[i] = A->rmap->range[i]; 1087 } 1088 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1089 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1090 #else 1091 sendcount = A->rmap->rend - A->rmap->rstart; 1092 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1093 #endif 1094 /* --------------------------------------------------------------- 1095 Create the sequential matrix of the same type as the local block diagonal 1096 */ 1097 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1098 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1099 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1100 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1101 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1102 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1103 **Bin = B; 1104 b = (Mat_SeqAIJ*)B->data; 1105 1106 /*-------------------------------------------------------------------- 1107 Copy my part of matrix column indices over 1108 */ 1109 sendcount = ad->nz + bd->nz; 1110 jsendbuf = b->j + b->i[rstarts[rank]]; 1111 a_jsendbuf = ad->j; 1112 b_jsendbuf = bd->j; 1113 n = A->rmap->rend - A->rmap->rstart; 1114 cnt = 0; 1115 for (i=0; i<n; i++) { 1116 1117 /* put in lower diagonal portion */ 1118 m = bd->i[i+1] - bd->i[i]; 1119 while (m > 0) { 1120 /* is it above diagonal (in bd (compressed) numbering) */ 1121 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1122 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1123 m--; 1124 } 1125 1126 /* put in diagonal portion */ 1127 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1128 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1129 } 1130 1131 /* put in upper diagonal portion */ 1132 while (m-- > 0) { 1133 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1134 } 1135 } 1136 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1137 1138 /*-------------------------------------------------------------------- 1139 Gather all column indices to all processors 1140 */ 1141 for (i=0; i<size; i++) { 1142 recvcounts[i] = 0; 1143 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1144 recvcounts[i] += lens[j]; 1145 } 1146 } 1147 displs[0] = 0; 1148 for (i=1; i<size; i++) { 
1149 displs[i] = displs[i-1] + recvcounts[i-1]; 1150 } 1151 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1152 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1153 #else 1154 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1155 #endif 1156 /*-------------------------------------------------------------------- 1157 Assemble the matrix into useable form (note numerical values not yet set) 1158 */ 1159 /* set the b->ilen (length of each row) values */ 1160 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1161 /* set the b->i indices */ 1162 b->i[0] = 0; 1163 for (i=1; i<=A->rmap->N; i++) { 1164 b->i[i] = b->i[i-1] + lens[i-1]; 1165 } 1166 ierr = PetscFree(lens);CHKERRQ(ierr); 1167 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1168 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1169 1170 } else { 1171 B = **Bin; 1172 b = (Mat_SeqAIJ*)B->data; 1173 } 1174 1175 /*-------------------------------------------------------------------- 1176 Copy my part of matrix numerical values into the values location 1177 */ 1178 if (flag == MAT_GET_VALUES) { 1179 sendcount = ad->nz + bd->nz; 1180 sendbuf = b->a + b->i[rstarts[rank]]; 1181 a_sendbuf = ad->a; 1182 b_sendbuf = bd->a; 1183 b_sendj = bd->j; 1184 n = A->rmap->rend - A->rmap->rstart; 1185 cnt = 0; 1186 for (i=0; i<n; i++) { 1187 1188 /* put in lower diagonal portion */ 1189 m = bd->i[i+1] - bd->i[i]; 1190 while (m > 0) { 1191 /* is it above diagonal (in bd (compressed) numbering) */ 1192 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1193 sendbuf[cnt++] = *b_sendbuf++; 1194 m--; 1195 b_sendj++; 1196 } 1197 1198 /* put in diagonal portion */ 1199 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1200 sendbuf[cnt++] = *a_sendbuf++; 1201 } 1202 1203 /* put in upper diagonal portion */ 1204 while (m-- > 0) { 1205 sendbuf[cnt++] = *b_sendbuf++; 1206 b_sendj++; 1207 } 1208 } 1209 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1210 1211 /* ----------------------------------------------------------------- 1212 Gather all numerical values to all processors 1213 */ 1214 if (!recvcounts) { 1215 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1216 } 1217 for (i=0; i<size; i++) { 1218 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1219 } 1220 displs[0] = 0; 1221 for (i=1; i<size; i++) { 1222 displs[i] = displs[i-1] + recvcounts[i-1]; 1223 } 1224 recvbuf = b->a; 1225 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1226 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1227 #else 1228 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1229 #endif 1230 } /* endof (flag == MAT_GET_VALUES) */ 1231 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1232 1233 if (A->symmetric) { 1234 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1235 } else if (A->hermitian) { 1236 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1237 } else if (A->structurally_symmetric) { 1238 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1239 } 1240 PetscFunctionReturn(0); 1241 } 1242 #undef __FUNCT__ 1243 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices" 1244 
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1245 { 1246 PetscErrorCode ierr; 1247 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1248 Mat_SubMat *submatj = c->submatis1; 1249 PetscInt i; 1250 1251 PetscFunctionBegin; 1252 ierr = submatj->destroy(C);CHKERRQ(ierr); 1253 1254 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1255 1256 for (i=0; i<submatj->nrqr; ++i) { 1257 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1258 } 1259 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1260 1261 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1262 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1263 1264 for (i=0; i<submatj->nrqs; ++i) { 1265 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1266 } 1267 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1268 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1269 1270 #if defined(PETSC_USE_CTABLE) 1271 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1272 ierr = PetscFree2(submatj->cmap_loc,submatj->rmap_loc);CHKERRQ(ierr); 1273 #else 1274 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1275 #endif 1276 1277 if (!submatj->allcolumns) { 1278 #if defined(PETSC_USE_CTABLE) 1279 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1280 #else 1281 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1282 #endif 1283 } 1284 1285 ierr = PetscFree(submatj);CHKERRQ(ierr); 1286 PetscFunctionReturn(0); 1287 } 1288 1289 #undef __FUNCT__ 1290 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local" 1291 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1292 { 1293 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1294 Mat submat,A = c->A,B = c->B; 1295 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1296 PetscInt *ai = a->i,*bi = b->i,nzA,nzB; 1297 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1298 const PetscInt *icol,*irow; 1299 PetscInt nrow,ncol,start; 1300 PetscErrorCode ierr; 1301 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 1302 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1303 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1304 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1305 PetscInt *lens,ncols,*cols,Crow; 1306 #if defined(PETSC_USE_CTABLE) 1307 PetscTable cmap,rmap; 1308 PetscInt *cmap_loc,*rmap_loc; 1309 #else 1310 PetscInt *cmap,*rmap; 1311 #endif 1312 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1313 PetscInt *cworkB,lwrite,*bj = b->j,*subcols; 1314 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1315 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1316 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1317 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1318 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1319 MPI_Comm comm; 1320 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1321 PetscMPIInt *onodes1,*olengths1,idex,end; 1322 Mat_SubMat *smatis1; 1323 #if defined(PETSC_USE_LOG) 1324 PetscLogStage stage[5]; 1325 #endif 1326 1327 PetscFunctionBegin; 1328 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1329 1330 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1331 size = 
c->size; 1332 rank = c->rank; 1333 1334 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1335 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1336 if (allcolumns) { 1337 icol = NULL; 1338 ncol = C->cmap->N; 1339 } else { 1340 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1341 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1342 } 1343 1344 if (scall == MAT_INITIAL_MATRIX) { 1345 PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp; 1346 1347 ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr); 1348 ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr); 1349 ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr); 1350 ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr); 1351 ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr); 1352 1353 ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr); 1354 1355 /* Get some new tags to keep the communication clean */ 1356 tag1 = ((PetscObject)C)->tag; 1357 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1358 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1359 1360 /* evaluate communication - mesg to who, length of mesg, and buffer space 1361 required. Based on this, buffers are allocated, and data copied into them */ 1362 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 1363 1364 for (j=0; j<nrow; j++) { 1365 proc = 0; 1366 row = irow[j]; 1367 while (row >= C->rmap->range[proc+1]) proc++; 1368 w4[proc]++; 1369 } 1370 for (j=0; j<size; j++) { 1371 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 1372 } 1373 1374 nrqs = 0; /* no of outgoing messages */ 1375 msz = 0; /* total mesg length (for all procs) */ 1376 w1[rank] = 0; /* no mesg sent to self */ 1377 w3[rank] = 0; 1378 for (i=0; i<size; i++) { 1379 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 1380 } 1381 1382 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1383 for (i=0,j=0; i<size; i++) { 1384 if (w1[i]) { pa[j] = i; j++; } 1385 } 1386 1387 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1388 for (i=0; i<nrqs; i++) { 1389 proc = pa[i]; 1390 w1[proc] += w2[proc] + 2* w3[proc]; 1391 msz += w1[proc]; 1392 } 1393 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1394 1395 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1396 /* if w2[i]=1, a message of length w1[i] will be sent to proc[i]; */ 1397 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1398 1399 /* nrqs: nsend; nrqr: nrecv; w1: send message length; onodes1: recv id; olengths1: recv message length */ 1400 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1401 1402 /* Now post the Irecvs corresponding to these messages */ 1403 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1404 1405 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1406 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1407 1408 /* Allocate Memory for outgoing messages */ 1409 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1410 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1411 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1412 1413 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1414 iptr = tmp; 1415 k = 0; 1416 for (i=0; i<nrqs; i++) { 1417 proc = pa[i]; 1418 iptr += k; 1419 sbuf1[proc] = iptr; 1420 k = w1[proc]; 1421 } 1422 1423 /* Form the 
outgoing messages */ 1424 /* Initialize the header space */ 1425 for (i=0; i<nrqs; i++) { 1426 proc = pa[i]; 1427 sbuf1[proc][0] = 0; 1428 ierr = PetscMemzero(sbuf1[proc]+1,2*w3[proc]*sizeof(PetscInt));CHKERRQ(ierr); 1429 ptr[proc] = sbuf1[proc] + 2*w3[proc] + 1; 1430 } 1431 1432 /* Parse the isrow and copy data into outbuf */ 1433 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1434 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1435 proc = 0; 1436 row = irow[j]; 1437 while (row >= C->rmap->range[proc+1]) proc++; 1438 if (proc != rank) { /* copy to the outgoing buf*/ 1439 ctr[proc]++; 1440 *ptr[proc] = row; 1441 ptr[proc]++; 1442 } 1443 } 1444 1445 /* Update the headers for the current IS */ 1446 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1447 if ((ctr_j = ctr[j])) { 1448 sbuf1_j = sbuf1[j]; 1449 k = ++sbuf1_j[0]; 1450 sbuf1_j[2*k] = ctr_j; 1451 sbuf1_j[2*k-1] = 0; 1452 } 1453 } 1454 1455 /* Now post the sends */ 1456 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1457 for (i=0; i<nrqs; ++i) { 1458 proc = pa[i]; 1459 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1460 } 1461 1462 /* Post Receives to capture the buffer size */ 1463 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1464 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1465 1466 rbuf2[0] = tmp + msz; 1467 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1468 1469 for (i=0; i<nrqs; ++i) { 1470 proc = pa[i]; 1471 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1472 } 1473 1474 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1475 1476 /* Send to other procs the buf size they should allocate */ 1477 /* Receive messages*/ 1478 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1479 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1480 1481 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1482 for (i=0; i<nrqr; ++i) { 1483 req_size[i] = 0; 1484 rbuf1_i = rbuf1[i]; 1485 start = 2*rbuf1_i[0] + 1; 1486 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1487 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1488 sbuf2_i = sbuf2[i]; 1489 for (j=start; j<end; j++) { 1490 k = rbuf1_i[j] - rstart; 1491 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1492 sbuf2_i[j] = ncols; 1493 req_size[i] += ncols; 1494 } 1495 req_source1[i] = r_status1[i].MPI_SOURCE; 1496 1497 /* form the header */ 1498 sbuf2_i[0] = req_size[i]; 1499 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1500 1501 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1502 } 1503 1504 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1505 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1506 1507 /* rbuf2 is received, Post recv column indices a->j */ 1508 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1509 1510 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1511 for (i=0; i<nrqs; ++i) { 1512 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1513 req_source2[i] = r_status2[i].MPI_SOURCE; 1514 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1515 } 1516 1517 /* Wait on sends1 and sends2 */ 1518 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1519 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1520 ierr = 
PetscFree(s_waits1);CHKERRQ(ierr); 1521 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1522 1523 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1524 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1525 1526 ierr = PetscLogStagePop();CHKERRQ(ierr); 1527 ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr); 1528 1529 /* Now allocate sending buffers for a->j, and send them off */ 1530 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1531 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1532 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1533 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1534 1535 for (i=0; i<nrqr; i++) { /* for each requested message */ 1536 rbuf1_i = rbuf1[i]; 1537 sbuf_aj_i = sbuf_aj[i]; 1538 ct1 = 2*rbuf1_i[0] + 1; 1539 ct2 = 0; 1540 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1541 1542 kmax = rbuf1[i][2]; 1543 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1544 row = rbuf1_i[ct1] - rstart; 1545 nzA = ai[row+1] - ai[row]; nzB = bi[row+1] - bi[row]; 1546 ncols = nzA + nzB; 1547 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1548 1549 /* load the column indices for this row into cols*/ 1550 cols = sbuf_aj_i + ct2; 1551 1552 lwrite = 0; 1553 for (l=0; l<nzB; l++) { 1554 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1555 } 1556 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1557 for (l=0; l<nzB; l++) { 1558 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1559 } 1560 1561 ct2 += ncols; 1562 } 1563 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1564 } 1565 1566 /* create column map (cmap): global col of C -> local col of submat */ 1567 #if defined(PETSC_USE_CTABLE) 1568 if (!allcolumns) { 1569 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1570 ierr = PetscCalloc2(C->cmap->n,&cmap_loc,C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1571 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1572 if (icol[j] >= cstart && icol[j] <cend) { 1573 cmap_loc[icol[j] - cstart] = j+1; 1574 } else { /* use PetscTable for non-local col indices */ 1575 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1576 } 1577 } 1578 } else { 1579 cmap = NULL; 1580 } 1581 #else 1582 if (!allcolumns) { 1583 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1584 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1585 } else { 1586 cmap = NULL; 1587 } 1588 #endif 1589 1590 /* Create lens for MatSeqAIJSetPreallocation() */ 1591 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1592 1593 /* Compute lens from local part of C */ 1594 for (j=0; j<nrow; j++) { 1595 proc = 0; 1596 row = irow[j]; 1597 while (row >= C->rmap->range[proc+1]) proc++; 1598 if (proc == rank) { 1599 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1600 if (!allcolumns) { 1601 for (k=0; k<ncols; k++) { 1602 #if defined(PETSC_USE_CTABLE) 1603 if (cols[k] >= cstart && cols[k] <cend) { 1604 tcol = cmap_loc[cols[k] - cstart]; 1605 } else { 1606 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1607 } 1608 #else 1609 tcol = cmap[cols[k]]; 1610 #endif 1611 if (tcol) lens[j]++; 1612 } 1613 } else { /* allcolumns */ 1614 lens[j] = ncols; 1615 } 1616 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1617 } 1618 } 1619 1620 /* Create row map (rmap): global row of C -> local row of submat */ 1621 #if defined(PETSC_USE_CTABLE) 1622 ierr = 
PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1623 for (j=0; j<nrow; j++) { 1624 proc = 0; 1625 row = irow[j]; 1626 while (row >= C->rmap->range[proc+1]) proc++; 1627 if (proc == rank) { /* a local row */ 1628 rmap_loc[row - rstart] = j; 1629 } else { 1630 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1631 } 1632 } 1633 #else 1634 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1635 for (j=0; j<nrow; j++) { 1636 rmap[irow[j]] = j; 1637 } 1638 #endif 1639 1640 /* Update lens from offproc data */ 1641 /* recv a->j is done */ 1642 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1643 for (i=0; i<nrqs; i++) { 1644 proc = pa[i]; 1645 sbuf1_i = sbuf1[proc]; 1646 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1647 ct1 = 2 + 1; 1648 ct2 = 0; 1649 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1650 rbuf3_i = rbuf3[i]; /* received C->j */ 1651 1652 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1653 max1 = sbuf1_i[2]; 1654 for (k=0; k<max1; k++,ct1++) { 1655 #if defined(PETSC_USE_CTABLE) 1656 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1657 row--; 1658 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1659 #else 1660 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1661 #endif 1662 /* Now, store row index of submat in sbuf1_i[ct1] */ 1663 sbuf1_i[ct1] = row; 1664 1665 nnz = rbuf2_i[ct1]; 1666 if (!allcolumns) { 1667 for (l=0; l<nnz; l++,ct2++) { 1668 #if defined(PETSC_USE_CTABLE) 1669 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1670 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1671 } else { 1672 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1673 } 1674 #else 1675 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1676 #endif 1677 if (tcol) { 1678 lens[row]++; 1679 1680 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1681 For reuse, we replace received C->j with index that should be inserted to submat */ 1682 /* rbuf3_i[ct3++] = ct2; ??? 
*/ 1683 } 1684 } 1685 } else { /* allcolumns */ 1686 lens[row] += nnz; 1687 } 1688 } 1689 } 1690 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1691 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1692 1693 /* Create the submatrices */ 1694 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1695 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1696 1697 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1698 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1699 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1700 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1701 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1702 1703 /* create struct Mat_SubMat and attached it to submat */ 1704 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1705 subc = (Mat_SeqAIJ*)submat->data; 1706 subc->submatis1 = smatis1; 1707 1708 smatis1->nrqs = nrqs; 1709 smatis1->nrqr = nrqr; 1710 smatis1->rbuf1 = rbuf1; 1711 smatis1->rbuf2 = rbuf2; 1712 smatis1->rbuf3 = rbuf3; 1713 smatis1->sbuf2 = sbuf2; 1714 smatis1->req_source2 = req_source2; 1715 1716 smatis1->sbuf1 = sbuf1; 1717 smatis1->ptr = ptr; 1718 smatis1->tmp = tmp; 1719 smatis1->ctr = ctr; 1720 1721 smatis1->pa = pa; 1722 smatis1->req_size = req_size; 1723 smatis1->req_source1 = req_source1; 1724 1725 smatis1->allcolumns = allcolumns; 1726 smatis1->rmap = rmap; 1727 smatis1->cmap = cmap; 1728 #if defined(PETSC_USE_CTABLE) 1729 smatis1->rmap_loc = rmap_loc; 1730 smatis1->cmap_loc = cmap_loc; 1731 #endif 1732 1733 smatis1->destroy = submat->ops->destroy; 1734 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1735 submat->factortype = C->factortype; 1736 1737 ierr = PetscLogStagePop();CHKERRQ(ierr); 1738 ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr); 1739 } else { /* scall == MAT_REUSE_MATRIX */ 1740 submat = submats[0]; 1741 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1742 1743 subc = (Mat_SeqAIJ*)submat->data; 1744 smatis1 = subc->submatis1; 1745 nrqs = smatis1->nrqs; 1746 nrqr = smatis1->nrqr; 1747 rbuf1 = smatis1->rbuf1; 1748 rbuf2 = smatis1->rbuf2; 1749 rbuf3 = smatis1->rbuf3; 1750 req_source2 = smatis1->req_source2; 1751 1752 sbuf1 = smatis1->sbuf1; 1753 sbuf2 = smatis1->sbuf2; 1754 ptr = smatis1->ptr; 1755 tmp = smatis1->tmp; 1756 ctr = smatis1->ctr; 1757 1758 pa = smatis1->pa; 1759 req_size = smatis1->req_size; 1760 req_source1 = smatis1->req_source1; 1761 1762 allcolumns = smatis1->allcolumns; 1763 rmap = smatis1->rmap; 1764 cmap = smatis1->cmap; 1765 #if defined(PETSC_USE_CTABLE) 1766 rmap_loc = smatis1->rmap_loc; 1767 cmap_loc = smatis1->cmap_loc; 1768 #endif 1769 } 1770 1771 /* Post recv matrix values */ 1772 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1773 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1774 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1775 for (i=0; i<nrqs; ++i) { 1776 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1777 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1778 } 1779 1780 /* Allocate sending buffers for a->a, and send them off */ 1781 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1782 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1783 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1784 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1785 1786 for (i=0; i<nrqr; i++) { 1787 rbuf1_i = rbuf1[i]; 1788 sbuf_aa_i = sbuf_aa[i]; 1789 ct1 = 2*rbuf1_i[0]+1; 1790 ct2 = 0; 1791 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1792 1793 kmax = rbuf1_i[2]; 1794 for (k=0; k<kmax; k++,ct1++) { 1795 row = rbuf1_i[ct1] - rstart; 1796 nzA = ai[row+1] - ai[row]; 1797 nzB = bi[row+1] - bi[row]; 1798 ncols = nzA + nzB; 1799 cworkB = bj + bi[row]; 1800 vworkA = a_a + ai[row]; 1801 vworkB = b_a + bi[row]; 1802 1803 /* load the column values for this row into vals*/ 1804 vals = sbuf_aa_i + ct2; 1805 1806 lwrite = 0; 1807 for (l=0; l<nzB; l++) { 1808 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1809 } 1810 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1811 for (l=0; l<nzB; l++) { 1812 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1813 } 1814 1815 ct2 += ncols; 1816 } 1817 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1818 } 1819 1820 if (scall == MAT_INITIAL_MATRIX) { 1821 ierr = PetscLogStagePop();CHKERRQ(ierr); 1822 ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr); 1823 } 1824 1825 /* Assemble submat */ 1826 if (!allcolumns) { 1827 ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr); 1828 } 1829 /* First assemble the local rows */ 1830 for (j=0; j<nrow; j++) { 1831 proc = 0; 1832 row = irow[j]; 1833 while (row >= C->rmap->range[proc+1]) proc++; 1834 if (proc == rank) { 1835 Crow = row; 1836 #if defined(PETSC_USE_CTABLE) 1837 row = rmap_loc[Crow - rstart]; /* row index of submat */ 1838 #else 1839 row = rmap[row]; 1840 #endif 1841 ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1842 1843 if (!allcolumns) { 1844 #if defined(PETSC_USE_CTABLE) 1845 i = 0; 1846 k = 0; 1847 while (cols[k] < cstart) { 1848 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1849 if (tcol) { 1850 subcols[i] = --tcol; 1851 subvals[i++] = vals[k]; 1852 } 1853 k++; 1854 } 1855 while (cols[k] < cend && k<ncols) { 
1856 tcol = cmap_loc[cols[k]-cstart]; 1857 if (tcol) { 1858 subcols[i] = --tcol; 1859 subvals[i++] = vals[k]; 1860 } 1861 k++; 1862 } 1863 while (k<ncols) { 1864 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1865 if (tcol) { 1866 subcols[i] = --tcol; 1867 subvals[i++] = vals[k]; 1868 } 1869 k++; 1870 } 1871 #else 1872 i = 0; 1873 for (k=0; k<ncols; k++) { 1874 tcol = cmap[cols[k]]; 1875 if (tcol) { 1876 subcols[i] = --tcol; 1877 subvals[i++] = vals[k]; 1878 } 1879 } 1880 #endif 1881 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1882 } else { /* allcolumns */ 1883 ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 1884 } 1885 ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1886 } 1887 } 1888 1889 if (scall == MAT_INITIAL_MATRIX) { 1890 ierr = PetscLogStagePop();CHKERRQ(ierr); 1891 ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr); 1892 } 1893 1894 /* Now assemble the off-proc rows */ 1895 for (i=0; i<nrqs; i++) { /* for each requested message */ 1896 /* recv values from other processes */ 1897 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1898 proc = pa[idex]; 1899 sbuf1_i = sbuf1[proc]; 1900 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1901 ct1 = 2 + 1; 1902 ct2 = 0; /* count of received C->j */ 1903 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1904 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1905 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1906 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1907 1908 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1909 max1 = sbuf1_i[2]; /* num of rows */ 1910 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1911 row = sbuf1_i[ct1]; /* row index of submat */ 1912 if (!allcolumns) { 1913 idex = 0; 1914 if (scall == MAT_INITIAL_MATRIX) { 1915 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1916 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1917 #if defined(PETSC_USE_CTABLE) 1918 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1919 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1920 } else { 1921 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1922 } 1923 #else 1924 tcol = cmap[rbuf3_i[ct2]]; 1925 #endif 1926 if (tcol) { 1927 subcols[idex] = --tcol; 1928 subvals[idex++] = rbuf4_i[ct2]; 1929 1930 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
1931 For reuse, we replace received C->j with index that should be inserted to submat */ 1932 rbuf3_i[ct3++] = ct2; 1933 } 1934 } 1935 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1936 1937 } else { /* scall == MAT_REUSE_MATRIX */ 1938 submat = submats[0]; 1939 subc = (Mat_SeqAIJ*)submat->data; 1940 1941 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1942 for (l=0; l<nnz; l++) { 1943 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1944 subvals[idex++] = rbuf4_i[ct2]; 1945 } 1946 1947 bj = subc->j + subc->i[row]; 1948 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1949 } 1950 } else { /* allcolumns */ 1951 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1952 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1953 ct2 += nnz; 1954 } 1955 } 1956 } 1957 1958 /* sending a->a are done */ 1959 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1960 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1961 1962 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1963 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1964 submats[0] = submat; 1965 1966 /* Restore the indices */ 1967 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1968 if (!allcolumns) { 1969 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1970 } 1971 1972 /* Destroy allocated memory */ 1973 for (i=0; i<nrqs; ++i) { 1974 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1975 } 1976 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1977 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1978 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1979 1980 if (!allcolumns) { 1981 ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr); 1982 } 1983 1984 if (scall == MAT_INITIAL_MATRIX) { 1985 ierr = PetscLogStagePop();CHKERRQ(ierr); 1986 1987 ierr = PetscFree(lens);CHKERRQ(ierr); 1988 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1989 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1990 } 1991 PetscFunctionReturn(0); 1992 } 1993 1994 #undef __FUNCT__ 1995 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS" 1996 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1997 { 1998 PetscErrorCode ierr; 1999 PetscInt ncol; 2000 PetscBool colflag,allcolumns=PETSC_FALSE; 2001 2002 PetscFunctionBegin; 2003 /* Allocate memory to hold all the submatrices */ 2004 if (scall != MAT_REUSE_MATRIX) { 2005 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2006 } 2007 2008 /* Check for special case: each processor gets entire matrix columns */ 2009 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2010 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2011 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2012 2013 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2014 PetscFunctionReturn(0); 2015 } 2016 2017 #undef __FUNCT__ 2018 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 2019 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2020 { 2021 PetscErrorCode ierr; 2022 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2023 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2024 2025 PetscFunctionBegin; 2026 /* Check for special case: each processor gets entire matrix */ 2027 if 
(C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2028 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2029 PetscFunctionReturn(0); 2030 } 2031 2032 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2033 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2034 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2035 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2036 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2037 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2038 wantallmatrix = PETSC_TRUE; 2039 2040 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2041 } 2042 } 2043 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2044 if (twantallmatrix) { 2045 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2046 PetscFunctionReturn(0); 2047 } 2048 2049 /* Allocate memory to hold all the submatrices */ 2050 if (scall != MAT_REUSE_MATRIX) { 2051 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2052 } 2053 2054 /* Check for special case: each processor gets entire matrix columns */ 2055 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2056 for (i=0; i<ismax; i++) { 2057 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2058 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2059 if (colflag && ncol == C->cmap->N) { 2060 allcolumns[i] = PETSC_TRUE; 2061 } else { 2062 allcolumns[i] = PETSC_FALSE; 2063 } 2064 } 2065 2066 /* Determine the number of stages through which submatrices are done */ 2067 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2068 2069 /* 2070 Each stage will extract nmax submatrices. 2071 nmax is determined by the matrix column dimension. 2072 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2073 */ 2074 if (!nmax) nmax = 1; 2075 nstages_local = ismax/nmax + ((ismax % nmax) ? 
1 : 0); 2076 2077 /* Make sure every processor loops through the nstages */ 2078 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2079 2080 for (i=0,pos=0; i<nstages; i++) { 2081 if (pos+nmax <= ismax) max_no = nmax; 2082 else if (pos == ismax) max_no = 0; 2083 else max_no = ismax-pos; 2084 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2085 pos += max_no; 2086 } 2087 2088 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2089 PetscFunctionReturn(0); 2090 } 2091 2092 /* -------------------------------------------------------------------------*/ 2093 #undef __FUNCT__ 2094 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 2095 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2096 { 2097 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2098 Mat A = c->A; 2099 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2100 const PetscInt **icol,**irow; 2101 PetscInt *nrow,*ncol,start; 2102 PetscErrorCode ierr; 2103 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2104 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2105 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2106 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2107 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2108 #if defined(PETSC_USE_CTABLE) 2109 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2110 #else 2111 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2112 #endif 2113 const PetscInt *irow_i; 2114 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2115 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2116 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2117 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2118 MPI_Status *r_status3,*r_status4,*s_status4; 2119 MPI_Comm comm; 2120 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2121 PetscMPIInt *onodes1,*olengths1; 2122 PetscMPIInt idex,idex2,end; 2123 2124 PetscFunctionBegin; 2125 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2126 tag0 = ((PetscObject)C)->tag; 2127 size = c->size; 2128 rank = c->rank; 2129 2130 /* Get some new tags to keep the communication clean */ 2131 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2132 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2133 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2134 2135 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2136 2137 for (i=0; i<ismax; i++) { 2138 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2139 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2140 if (allcolumns[i]) { 2141 icol[i] = NULL; 2142 ncol[i] = C->cmap->N; 2143 } else { 2144 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2145 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2146 } 2147 } 2148 2149 /* evaluate communication - mesg to who, length of mesg, and buffer space 2150 required. 
Based on this, buffers are allocated, and data copied into them*/ 2151 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2152 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2153 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2154 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2155 for (i=0; i<ismax; i++) { 2156 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2157 jmax = nrow[i]; 2158 irow_i = irow[i]; 2159 for (j=0; j<jmax; j++) { 2160 l = 0; 2161 row = irow_i[j]; 2162 while (row >= C->rmap->range[l+1]) l++; 2163 proc = l; 2164 w4[proc]++; 2165 } 2166 for (j=0; j<size; j++) { 2167 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2168 } 2169 } 2170 2171 nrqs = 0; /* no of outgoing messages */ 2172 msz = 0; /* total mesg length (for all procs) */ 2173 w1[rank] = 0; /* no mesg sent to self */ 2174 w3[rank] = 0; 2175 for (i=0; i<size; i++) { 2176 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2177 } 2178 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2179 for (i=0,j=0; i<size; i++) { 2180 if (w1[i]) { pa[j] = i; j++; } 2181 } 2182 2183 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2184 for (i=0; i<nrqs; i++) { 2185 j = pa[i]; 2186 w1[j] += w2[j] + 2* w3[j]; 2187 msz += w1[j]; 2188 } 2189 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2190 2191 /* Determine the number of messages to expect, their lengths, from from-ids */ 2192 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2193 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2194 2195 /* Now post the Irecvs corresponding to these messages */ 2196 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2197 2198 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2199 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2200 2201 /* Allocate Memory for outgoing messages */ 2202 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2203 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2204 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2205 2206 { 2207 PetscInt *iptr = tmp; 2208 k = 0; 2209 for (i=0; i<nrqs; i++) { 2210 j = pa[i]; 2211 iptr += k; 2212 sbuf1[j] = iptr; 2213 k = w1[j]; 2214 } 2215 } 2216 2217 /* Form the outgoing messages */ 2218 /* Initialize the header space */ 2219 for (i=0; i<nrqs; i++) { 2220 j = pa[i]; 2221 sbuf1[j][0] = 0; 2222 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2223 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2224 } 2225 2226 /* Parse the isrow and copy data into outbuf */ 2227 for (i=0; i<ismax; i++) { 2228 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2229 irow_i = irow[i]; 2230 jmax = nrow[i]; 2231 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2232 l = 0; 2233 row = irow_i[j]; 2234 while (row >= C->rmap->range[l+1]) l++; 2235 proc = l; 2236 if (proc != rank) { /* copy to the outgoing buf*/ 2237 ctr[proc]++; 2238 *ptr[proc] = row; 2239 ptr[proc]++; 2240 } 2241 } 2242 /* Update the headers for the current IS */ 2243 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2244 if ((ctr_j = ctr[j])) { 2245 sbuf1_j = sbuf1[j]; 2246 k = ++sbuf1_j[0]; 2247 sbuf1_j[2*k] = ctr_j; 2248 sbuf1_j[2*k-1] = i; 2249 } 2250 } 2251 } 
2252 2253 /* Now post the sends */ 2254 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2255 for (i=0; i<nrqs; ++i) { 2256 j = pa[i]; 2257 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2258 } 2259 2260 /* Post Receives to capture the buffer size */ 2261 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2262 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2263 rbuf2[0] = tmp + msz; 2264 for (i=1; i<nrqs; ++i) { 2265 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2266 } 2267 for (i=0; i<nrqs; ++i) { 2268 j = pa[i]; 2269 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2270 } 2271 2272 /* Send to other procs the buf size they should allocate */ 2273 2274 2275 /* Receive messages*/ 2276 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2277 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2278 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 2279 { 2280 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2281 PetscInt *sbuf2_i; 2282 2283 for (i=0; i<nrqr; ++i) { 2284 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2285 2286 req_size[idex] = 0; 2287 rbuf1_i = rbuf1[idex]; 2288 start = 2*rbuf1_i[0] + 1; 2289 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2290 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2291 sbuf2_i = sbuf2[idex]; 2292 for (j=start; j<end; j++) { 2293 id = rbuf1_i[j] - rstart; 2294 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2295 sbuf2_i[j] = ncols; 2296 req_size[idex] += ncols; 2297 } 2298 req_source[idex] = r_status1[i].MPI_SOURCE; 2299 /* form the header */ 2300 sbuf2_i[0] = req_size[idex]; 2301 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2302 2303 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2304 } 2305 } 2306 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2307 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2308 2309 /* recv buffer sizes */ 2310 /* Receive messages*/ 2311 2312 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 2313 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2314 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2315 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2316 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2317 2318 for (i=0; i<nrqs; ++i) { 2319 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2320 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2321 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2322 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2323 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2324 } 2325 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2326 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2327 2328 /* Wait on sends1 and sends2 */ 2329 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2330 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2331 2332 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2333 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2334 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2335 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2336 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2337 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2338 2339 /* Now allocate buffers for a->j, and send them off */ 2340 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2341 for 
(i=0,j=0; i<nrqr; i++) j += req_size[i]; 2342 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2343 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2344 2345 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2346 { 2347 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2348 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2349 PetscInt cend = C->cmap->rend; 2350 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2351 2352 for (i=0; i<nrqr; i++) { 2353 rbuf1_i = rbuf1[i]; 2354 sbuf_aj_i = sbuf_aj[i]; 2355 ct1 = 2*rbuf1_i[0] + 1; 2356 ct2 = 0; 2357 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2358 kmax = rbuf1[i][2*j]; 2359 for (k=0; k<kmax; k++,ct1++) { 2360 row = rbuf1_i[ct1] - rstart; 2361 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2362 ncols = nzA + nzB; 2363 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2364 2365 /* load the column indices for this row into cols*/ 2366 cols = sbuf_aj_i + ct2; 2367 2368 lwrite = 0; 2369 for (l=0; l<nzB; l++) { 2370 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2371 } 2372 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2373 for (l=0; l<nzB; l++) { 2374 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2375 } 2376 2377 ct2 += ncols; 2378 } 2379 } 2380 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2381 } 2382 } 2383 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2384 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2385 2386 /* Allocate buffers for a->a, and send them off */ 2387 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2388 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2389 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2390 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2391 2392 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2393 { 2394 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2395 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2396 PetscInt cend = C->cmap->rend; 2397 PetscInt *b_j = b->j; 2398 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2399 2400 for (i=0; i<nrqr; i++) { 2401 rbuf1_i = rbuf1[i]; 2402 sbuf_aa_i = sbuf_aa[i]; 2403 ct1 = 2*rbuf1_i[0]+1; 2404 ct2 = 0; 2405 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2406 kmax = rbuf1_i[2*j]; 2407 for (k=0; k<kmax; k++,ct1++) { 2408 row = rbuf1_i[ct1] - rstart; 2409 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2410 ncols = nzA + nzB; 2411 cworkB = b_j + b_i[row]; 2412 vworkA = a_a + a_i[row]; 2413 vworkB = b_a + b_i[row]; 2414 2415 /* load the column values for this row into vals*/ 2416 vals = sbuf_aa_i+ct2; 2417 2418 lwrite = 0; 2419 for (l=0; l<nzB; l++) { 2420 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2421 } 2422 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2423 for (l=0; l<nzB; l++) { 2424 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2425 } 2426 2427 ct2 += ncols; 2428 } 2429 } 2430 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2431 } 2432 } 2433 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2434 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2435 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2436 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2437 2438 /* Form the matrix */ 2439 /* create col map: global col of C -> local col of submatrices */ 2440 { 2441 const PetscInt *icol_i; 2442 #if defined(PETSC_USE_CTABLE) 2443 ierr = 
PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 2444 for (i=0; i<ismax; i++) { 2445 if (!allcolumns[i]) { 2446 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2447 2448 jmax = ncol[i]; 2449 icol_i = icol[i]; 2450 cmap_i = cmap[i]; 2451 for (j=0; j<jmax; j++) { 2452 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2453 } 2454 } else { 2455 cmap[i] = NULL; 2456 } 2457 } 2458 #else 2459 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2460 for (i=0; i<ismax; i++) { 2461 if (!allcolumns[i]) { 2462 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2463 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2464 jmax = ncol[i]; 2465 icol_i = icol[i]; 2466 cmap_i = cmap[i]; 2467 for (j=0; j<jmax; j++) { 2468 cmap_i[icol_i[j]] = j+1; 2469 } 2470 } else { 2471 cmap[i] = NULL; 2472 } 2473 } 2474 #endif 2475 } 2476 2477 /* Create lens which is required for MatCreate... */ 2478 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2479 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2480 if (ismax) { 2481 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2482 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2483 } 2484 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2485 2486 /* Update lens from local data */ 2487 for (i=0; i<ismax; i++) { 2488 jmax = nrow[i]; 2489 if (!allcolumns[i]) cmap_i = cmap[i]; 2490 irow_i = irow[i]; 2491 lens_i = lens[i]; 2492 for (j=0; j<jmax; j++) { 2493 l = 0; 2494 row = irow_i[j]; 2495 while (row >= C->rmap->range[l+1]) l++; 2496 proc = l; 2497 if (proc == rank) { 2498 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2499 if (!allcolumns[i]) { 2500 for (k=0; k<ncols; k++) { 2501 #if defined(PETSC_USE_CTABLE) 2502 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2503 #else 2504 tcol = cmap_i[cols[k]]; 2505 #endif 2506 if (tcol) lens_i[j]++; 2507 } 2508 } else { /* allcolumns */ 2509 lens_i[j] = ncols; 2510 } 2511 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2512 } 2513 } 2514 } 2515 2516 /* Create row map: global row of C -> local row of submatrices */ 2517 #if defined(PETSC_USE_CTABLE) 2518 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 2519 for (i=0; i<ismax; i++) { 2520 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2521 irow_i = irow[i]; 2522 jmax = nrow[i]; 2523 for (j=0; j<jmax; j++) { 2524 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2525 } 2526 } 2527 #else 2528 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2529 if (ismax) { 2530 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2531 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2532 } 2533 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2534 for (i=0; i<ismax; i++) { 2535 rmap_i = rmap[i]; 2536 irow_i = irow[i]; 2537 jmax = nrow[i]; 2538 for (j=0; j<jmax; j++) { 2539 rmap_i[irow_i[j]] = j; 2540 } 2541 } 2542 #endif 2543 2544 /* Update lens from offproc data */ 2545 { 2546 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2547 2548 for (tmp2=0; tmp2<nrqs; tmp2++) { 2549 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2550 idex = pa[idex2]; 2551 sbuf1_i = sbuf1[idex]; 2552 jmax = sbuf1_i[0]; 2553 ct1 = 2*jmax+1; 2554 ct2 = 0; 2555 rbuf2_i = rbuf2[idex2]; 2556 rbuf3_i = rbuf3[idex2]; 2557 for (j=1; j<=jmax; j++) { 2558 is_no = sbuf1_i[2*j-1]; 2559 max1 = sbuf1_i[2*j]; 2560 lens_i = lens[is_no]; 2561 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2562 rmap_i = rmap[is_no]; 2563 
for (k=0; k<max1; k++,ct1++) { 2564 #if defined(PETSC_USE_CTABLE) 2565 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2566 row--; 2567 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2568 #else 2569 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2570 #endif 2571 max2 = rbuf2_i[ct1]; 2572 for (l=0; l<max2; l++,ct2++) { 2573 if (!allcolumns[is_no]) { 2574 #if defined(PETSC_USE_CTABLE) 2575 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2576 #else 2577 tcol = cmap_i[rbuf3_i[ct2]]; 2578 #endif 2579 if (tcol) lens_i[row]++; 2580 } else { /* allcolumns */ 2581 lens_i[row]++; /* lens_i[row] += max2 ? */ 2582 } 2583 } 2584 } 2585 } 2586 } 2587 } 2588 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2589 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2590 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2591 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2592 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2593 2594 /* Create the submatrices */ 2595 if (scall == MAT_REUSE_MATRIX) { 2596 PetscBool flag; 2597 2598 /* 2599 Assumes new rows are same length as the old rows,hence bug! 2600 */ 2601 for (i=0; i<ismax; i++) { 2602 mat = (Mat_SeqAIJ*)(submats[i]->data); 2603 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2604 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2605 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 2606 /* Initial matrix as if empty */ 2607 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2608 2609 submats[i]->factortype = C->factortype; 2610 } 2611 } else { 2612 for (i=0; i<ismax; i++) { 2613 PetscInt rbs,cbs; 2614 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2615 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2616 2617 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2618 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2619 2620 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2621 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2622 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2623 } 2624 } 2625 2626 /* Assemble the matrices */ 2627 /* First assemble the local rows */ 2628 { 2629 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2630 PetscScalar *imat_a; 2631 2632 for (i=0; i<ismax; i++) { 2633 mat = (Mat_SeqAIJ*)submats[i]->data; 2634 imat_ilen = mat->ilen; 2635 imat_j = mat->j; 2636 imat_i = mat->i; 2637 imat_a = mat->a; 2638 2639 if (!allcolumns[i]) cmap_i = cmap[i]; 2640 rmap_i = rmap[i]; 2641 irow_i = irow[i]; 2642 jmax = nrow[i]; 2643 for (j=0; j<jmax; j++) { 2644 l = 0; 2645 row = irow_i[j]; 2646 while (row >= C->rmap->range[l+1]) l++; 2647 proc = l; 2648 if (proc == rank) { 2649 old_row = row; 2650 #if defined(PETSC_USE_CTABLE) 2651 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2652 row--; 2653 #else 2654 row = rmap_i[row]; 2655 #endif 2656 ilen_row = imat_ilen[row]; 2657 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2658 mat_i = imat_i[row]; 2659 mat_a = imat_a + mat_i; 2660 mat_j = imat_j + mat_i; 2661 if (!allcolumns[i]) { 2662 for (k=0; k<ncols; k++) { 2663 #if defined(PETSC_USE_CTABLE) 2664 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2665 #else 2666 tcol = 
cmap_i[cols[k]]; 2667 #endif 2668 if (tcol) { 2669 *mat_j++ = tcol - 1; 2670 *mat_a++ = vals[k]; 2671 ilen_row++; 2672 } 2673 } 2674 } else { /* allcolumns */ 2675 for (k=0; k<ncols; k++) { 2676 *mat_j++ = cols[k]; /* global col index! */ 2677 *mat_a++ = vals[k]; 2678 ilen_row++; 2679 } 2680 } 2681 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2682 2683 imat_ilen[row] = ilen_row; 2684 } 2685 } 2686 } 2687 } 2688 2689 /* Now assemble the off proc rows*/ 2690 { 2691 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2692 PetscInt *imat_j,*imat_i; 2693 PetscScalar *imat_a,*rbuf4_i; 2694 2695 for (tmp2=0; tmp2<nrqs; tmp2++) { 2696 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2697 idex = pa[idex2]; 2698 sbuf1_i = sbuf1[idex]; 2699 jmax = sbuf1_i[0]; 2700 ct1 = 2*jmax + 1; 2701 ct2 = 0; 2702 rbuf2_i = rbuf2[idex2]; 2703 rbuf3_i = rbuf3[idex2]; 2704 rbuf4_i = rbuf4[idex2]; 2705 for (j=1; j<=jmax; j++) { 2706 is_no = sbuf1_i[2*j-1]; 2707 rmap_i = rmap[is_no]; 2708 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2709 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2710 imat_ilen = mat->ilen; 2711 imat_j = mat->j; 2712 imat_i = mat->i; 2713 imat_a = mat->a; 2714 max1 = sbuf1_i[2*j]; 2715 for (k=0; k<max1; k++,ct1++) { 2716 row = sbuf1_i[ct1]; 2717 #if defined(PETSC_USE_CTABLE) 2718 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2719 row--; 2720 #else 2721 row = rmap_i[row]; 2722 #endif 2723 ilen = imat_ilen[row]; 2724 mat_i = imat_i[row]; 2725 mat_a = imat_a + mat_i; 2726 mat_j = imat_j + mat_i; 2727 max2 = rbuf2_i[ct1]; 2728 if (!allcolumns[is_no]) { 2729 for (l=0; l<max2; l++,ct2++) { 2730 2731 #if defined(PETSC_USE_CTABLE) 2732 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2733 #else 2734 tcol = cmap_i[rbuf3_i[ct2]]; 2735 #endif 2736 if (tcol) { 2737 *mat_j++ = tcol - 1; 2738 *mat_a++ = rbuf4_i[ct2]; 2739 ilen++; 2740 } 2741 } 2742 } else { /* allcolumns */ 2743 for (l=0; l<max2; l++,ct2++) { 2744 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2745 *mat_a++ = rbuf4_i[ct2]; 2746 ilen++; 2747 } 2748 } 2749 imat_ilen[row] = ilen; 2750 } 2751 } 2752 } 2753 } 2754 2755 /* sort the rows */ 2756 { 2757 PetscInt *imat_ilen,*imat_j,*imat_i; 2758 PetscScalar *imat_a; 2759 2760 for (i=0; i<ismax; i++) { 2761 mat = (Mat_SeqAIJ*)submats[i]->data; 2762 imat_j = mat->j; 2763 imat_i = mat->i; 2764 imat_a = mat->a; 2765 imat_ilen = mat->ilen; 2766 2767 if (allcolumns[i]) continue; 2768 jmax = nrow[i]; 2769 for (j=0; j<jmax; j++) { 2770 PetscInt ilen; 2771 2772 mat_i = imat_i[j]; 2773 mat_a = imat_a + mat_i; 2774 mat_j = imat_j + mat_i; 2775 ilen = imat_ilen[j]; 2776 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2777 } 2778 } 2779 } 2780 2781 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2782 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2783 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2784 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2785 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2786 2787 /* Restore the indices */ 2788 for (i=0; i<ismax; i++) { 2789 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2790 if (!allcolumns[i]) { 2791 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2792 } 2793 } 2794 2795 /* Destroy allocated memory */ 2796 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2797 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2798 ierr = PetscFree(pa);CHKERRQ(ierr); 2799 2800 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2801 ierr = 
PetscFree(rbuf2);CHKERRQ(ierr); 2802 for (i=0; i<nrqr; ++i) { 2803 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2804 } 2805 for (i=0; i<nrqs; ++i) { 2806 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2807 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2808 } 2809 2810 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 2811 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 2812 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2813 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2814 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2815 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2816 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2817 2818 #if defined(PETSC_USE_CTABLE) 2819 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 2820 #else 2821 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 2822 #endif 2823 ierr = PetscFree(rmap);CHKERRQ(ierr); 2824 2825 for (i=0; i<ismax; i++) { 2826 if (!allcolumns[i]) { 2827 #if defined(PETSC_USE_CTABLE) 2828 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 2829 #else 2830 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 2831 #endif 2832 } 2833 } 2834 ierr = PetscFree(cmap);CHKERRQ(ierr); 2835 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2836 ierr = PetscFree(lens);CHKERRQ(ierr); 2837 2838 for (i=0; i<ismax; i++) { 2839 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2840 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2841 } 2842 PetscFunctionReturn(0); 2843 } 2844 2845 /* 2846 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2847 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2848 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2849 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2850 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2851 state, and needs to be "assembled" later by compressing B's column space. 2852 2853 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2854 Following this call, C->A & C->B have been created, even if empty. 2855 */ 2856 #undef __FUNCT__ 2857 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 2858 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2859 { 2860 /* If making this function public, change the error returned in this function away from _PLIB. */ 2861 PetscErrorCode ierr; 2862 Mat_MPIAIJ *aij; 2863 Mat_SeqAIJ *Baij; 2864 PetscBool seqaij,Bdisassembled; 2865 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2866 PetscScalar v; 2867 const PetscInt *rowindices,*colindices; 2868 2869 PetscFunctionBegin; 2870 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
*/ 2871 if (A) { 2872 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2873 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2874 if (rowemb) { 2875 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2876 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2877 } else { 2878 if (C->rmap->n != A->rmap->n) { 2879 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2880 } 2881 } 2882 if (dcolemb) { 2883 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2884 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2885 } else { 2886 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2887 } 2888 } 2889 if (B) { 2890 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2891 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2892 if (rowemb) { 2893 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2894 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2895 } else { 2896 if (C->rmap->n != B->rmap->n) { 2897 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2898 } 2899 } 2900 if (ocolemb) { 2901 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2902 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2903 } else { 2904 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2905 } 2906 } 2907 2908 aij = (Mat_MPIAIJ*)(C->data); 2909 if (!aij->A) { 2910 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2911 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2912 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2913 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2914 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2915 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2916 } 2917 if (A) { 2918 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2919 } else { 2920 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2921 } 2922 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2923 /* 2924 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2925 need to "disassemble" B -- convert it to using C's global indices. 2926 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2927 2928 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2929 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2930 2931 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2932 At least avoid calling MatSetValues() and the implied searches? 
2933 */ 2934 2935 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2936 #if defined(PETSC_USE_CTABLE) 2937 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2938 #else 2939 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2940 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2941 if (aij->B) { 2942 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2943 } 2944 #endif 2945 ngcol = 0; 2946 if (aij->lvec) { 2947 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2948 } 2949 if (aij->garray) { 2950 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2951 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2952 } 2953 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2954 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2955 } 2956 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2957 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2958 } 2959 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2960 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2961 } 2962 } 2963 Bdisassembled = PETSC_FALSE; 2964 if (!aij->B) { 2965 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2966 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2967 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2968 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2969 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2970 Bdisassembled = PETSC_TRUE; 2971 } 2972 if (B) { 2973 Baij = (Mat_SeqAIJ*)(B->data); 2974 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2975 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2976 for (i=0; i<B->rmap->n; i++) { 2977 nz[i] = Baij->i[i+1] - Baij->i[i]; 2978 } 2979 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2980 ierr = PetscFree(nz);CHKERRQ(ierr); 2981 } 2982 2983 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2984 shift = rend-rstart; 2985 count = 0; 2986 rowindices = NULL; 2987 colindices = NULL; 2988 if (rowemb) { 2989 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 2990 } 2991 if (ocolemb) { 2992 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 2993 } 2994 for (i=0; i<B->rmap->n; i++) { 2995 PetscInt row; 2996 row = i; 2997 if (rowindices) row = rowindices[i]; 2998 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 2999 col = Baij->j[count]; 3000 if (colindices) col = colindices[col]; 3001 if (Bdisassembled && col>=rstart) col += shift; 3002 v = Baij->a[count]; 3003 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3004 ++count; 3005 } 3006 } 3007 /* No assembly for aij->B is necessary. */ 3008 /* FIXME: set aij->B's nonzerostate correctly. */ 3009 } else { 3010 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3011 } 3012 C->preallocated = PETSC_TRUE; 3013 C->was_assembled = PETSC_FALSE; 3014 C->assembled = PETSC_FALSE; 3015 /* 3016 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3017 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3018 */ 3019 PetscFunctionReturn(0); 3020 } 3021 3022 #undef __FUNCT__ 3023 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 3024 /* 3025 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
3026 */ 3027 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3028 { 3029 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3030 3031 PetscFunctionBegin; 3032 PetscValidPointer(A,2); 3033 PetscValidPointer(B,3); 3034 /* FIXME: make sure C is assembled */ 3035 *A = aij->A; 3036 *B = aij->B; 3037 /* Note that we don't incref *A and *B, so be careful! */ 3038 PetscFunctionReturn(0); 3039 } 3040 3041 /* 3042 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3043 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3044 */ 3045 #undef __FUNCT__ 3046 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 3047 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3048 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3049 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3050 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3051 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3052 { 3053 PetscErrorCode ierr; 3054 PetscMPIInt isize,flag; 3055 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3056 Mat *A,*B; 3057 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3058 3059 PetscFunctionBegin; 3060 if (!ismax) PetscFunctionReturn(0); 3061 3062 for (i = 0, cismax = 0; i < ismax; ++i) { 3063 PetscMPIInt isize; 3064 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3065 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3066 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3067 if (isize > 1) ++cismax; 3068 } 3069 /* 3070 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3071 ispar counts the number of parallel ISs across C's comm. 3072 */ 3073 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3074 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3075 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3076 PetscFunctionReturn(0); 3077 } 3078 3079 /* if (ispar) */ 3080 /* 3081 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3082 These are used to extract the off-diag portion of the resulting parallel matrix. 3083 The row IS for the off-diag portion is the same as for the diag portion, 3084 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3085 */ 3086 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3087 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3088 for (i = 0, ii = 0; i < ismax; ++i) { 3089 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3090 if (isize > 1) { 3091 /* 3092 TODO: This is the part that's ***NOT SCALABLE***. 3093 To fix this we need to extract just the indices of C's nonzero columns 3094 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3095 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3096 to be done without serializing on the IS list, so, most likely, it is best 3097 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 
3098 */ 3099 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3100 /* Now we have to 3101 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3102 were sorted on each rank, concatenated they might no longer be sorted; 3103 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3104 indices in the nondecreasing order to the original index positions. 3105 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3106 */ 3107 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3108 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3109 ++ii; 3110 } 3111 } 3112 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3113 for (i = 0, ii = 0; i < ismax; ++i) { 3114 PetscInt j,issize; 3115 const PetscInt *indices; 3116 3117 /* 3118 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3119 */ 3120 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3121 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3122 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3123 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3124 for (j = 1; j < issize; ++j) { 3125 if (indices[j] == indices[j-1]) { 3126 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3127 } 3128 } 3129 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3130 3131 3132 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3133 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3134 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3135 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3136 for (j = 1; j < issize; ++j) { 3137 if (indices[j-1] == indices[j]) { 3138 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3139 } 3140 } 3141 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3142 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3143 if (isize > 1) { 3144 cisrow[ii] = isrow[i]; 3145 ++ii; 3146 } 3147 } 3148 /* 3149 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3150 array of sequential matrices underlying the resulting parallel matrices. 3151 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3152 contain duplicates. 3153 3154 There are as many diag matrices as there are original index sets. There are only as many parallel 3155 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3156 3157 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3158 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3159 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3160 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3161 with A[i] and B[ii] extracted from the corresponding MPI submat. 3162 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3163 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3164 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3165 (retrieve composed AA[i] or BB[ii]) and reuse them here. 
       AA[i] and BB[ii] are then used to permute their values into A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /* FIXME: no good fix for this yet.
   * The local column size of B[i] differs from the size of ciscol[i]:
   * B[i]'s column size is ultimately determined during matrix assembly, while
   * ciscol[i] is computed as the complement of iscol[i].
   * It would be better to keep only the nonzero column indices when computing
   * the complement ciscol[i].
   */
  if (scall == MAT_REUSE_MATRIX) {
    for (i = 0; i < cismax; i++) {
      ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
      B[i]->cmap->n = ciscol_localsize;
    }
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if isize == 1).
     Note that in that case the colmap does not need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX;
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        BB = NULL;
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX).
        */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
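
/*
  Illustrative sketch (not part of the implementation above), referenced from the note in
  MatGetSeqMats_MPIAIJ(): because that routine returns the diagonal and off-diagonal SEQAIJ blocks
  without incrementing their reference counts, a caller must treat them as borrowed references.
  Assuming an assembled MATMPIAIJ matrix C, a hypothetical caller might look like:

     Mat Ad,Ao;                                         (diag and off-diag blocks, borrowed from C)
     ierr = MatGetSeqMats_MPIAIJ(C,&Ad,&Ao);CHKERRQ(ierr);
     ... inspect Ad and Ao, e.g. their sizes or nonzero structure ...
     (do NOT call MatDestroy() on Ad or Ao: they are still owned by C)
*/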
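
/*
  Illustrative sketch, not a tested example: one way a caller might drive MatGetSubMatricesMPI_MPIAIJ()
  (typically reached through PETSc's generic MatGetSubMatricesMPI() interface) to extract one parallel
  submatrix per row/column IS pair. The matrix C and the index sets isrow[0]/iscol[0] (which must share
  a communicator, possibly a subcomm of C's comm) are assumed to have been created by the caller; all
  variable names below are hypothetical.

     Mat  C;                   (assembled MATMPIAIJ matrix)
     IS   isrow[1],iscol[1];   (row and column index sets living on a subcomm of C)
     Mat  *submat;

     (the first call allocates the array of parallel submatrices)
     ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     (after C's values change but its nonzero pattern stays the same, refresh the submatrix in place)
     ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,isrow,iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
     (the caller destroys submat[0] and frees the array when done)
*/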