1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 #undef __FUNCT__ 23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 25 { 26 PetscErrorCode ierr; 27 PetscInt i; 28 29 PetscFunctionBegin; 30 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 31 for (i=0; i<ov; ++i) { 32 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 33 } 34 PetscFunctionReturn(0); 35 } 36 37 #undef __FUNCT__ 38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable" 39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 40 { 41 PetscErrorCode ierr; 42 PetscInt i; 43 44 PetscFunctionBegin; 45 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 46 for (i=0; i<ov; ++i) { 47 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 48 } 49 PetscFunctionReturn(0); 50 } 51 52 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 56 { 57 PetscErrorCode ierr; 58 MPI_Comm comm; 59 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 60 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 61 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 62 const PetscInt *indices_i,**indices; 63 PetscLayout rmap; 64 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 65 PetscSF sf; 66 PetscSFNode *remote; 67 68 PetscFunctionBegin; 69 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 70 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 71 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 72 /* get row map to determine where rows should be going */ 73 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 74 /* retrieve IS data and put all together so that we 75 * can optimize communication 76 * */ 77 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 78 for (i=0,tlength=0; i<nidx; i++){ 79 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 80 tlength += length[i]; 81 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 82 } 83 /* find these rows on remote processors */ 84 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 85 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 86 nrrows = 0; 87 for (i=0; i<nidx; i++){ 88 length_i = length[i]; 89 indices_i = indices[i]; 90 for (j=0; j<length_i; j++){ 91 owner = -1; 92 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 93 /* remote processors */ 94 if (owner != rank){ 95 tosizes_temp[owner]++; /* number of rows to owner */ 96 rrow_ranks[nrrows] = owner; /* processor */ 97 rrow_isids[nrrows] = i; /* is id */ 98 remoterows[nrrows++] = indices_i[j]; /* row */ 99 } 100 } 101 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 102 } 103 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 104 /* test if we need to exchange messages 105 * generally speaking, we do not need to exchange 106 * data when overlap is 1 107 * */ 108 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 109 /* we do not have any messages 110 * It usually corresponds to overlap 1 111 * */ 112 if (!reducednrrows){ 113 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 114 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 115 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 nto = 0; 119 /* send sizes and ranks for building a two-sided communcation */ 120 for (i=0; i<size; i++){ 121 if (tosizes_temp[i]){ 122 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 123 tosizes_temp[i] = nto; /* a map from processor to index */ 124 toranks[nto++] = i; /* processor */ 125 } 126 } 127 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 128 for (i=0; i<nto; i++){ 129 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 130 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 131 } 132 /* send information to other processors */ 133 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 134 nrecvrows = 0; 135 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 136 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 137 nrecvrows = 0; 138 for (i=0; i<nfrom; i++){ 139 for (j=0; j<fromsizes[2*i]; j++){ 140 remote[nrecvrows].rank = fromranks[i]; 141 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 142 } 143 } 144 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 145 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 146 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 147 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 148 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 149 /* message pair <no of is, row> */ 150 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 151 for (i=0; i<nrrows; i++){ 152 owner = rrow_ranks[i]; /* processor */ 153 j = tosizes_temp[owner]; /* index */ 154 todata[toffsets[j]++] = rrow_isids[i]; 155 todata[toffsets[j]++] = remoterows[i]; 156 } 157 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 158 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 159 ierr = PetscFree(toffsets);CHKERRQ(ierr); 160 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 161 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 162 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 163 /* send rows belonging to the remote so that then we could get the overlapping data back */ 164 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 165 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 166 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 167 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 168 ierr = PetscFree(fromranks);CHKERRQ(ierr); 169 nrecvrows = 0; 170 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 171 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 172 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 173 nrecvrows = 0; 174 for (i=0; i<nto; i++){ 175 for (j=0; j<tosizes[2*i]; j++){ 176 remote[nrecvrows].rank = toranks[i]; 177 remote[nrecvrows++].index = tosizes[2*i+1]+j; 178 } 179 } 180 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 181 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 182 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 183 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 184 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 185 /* overlap communication and computation */ 186 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 187 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 188 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 189 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 190 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 191 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 192 ierr = PetscFree(toranks);CHKERRQ(ierr); 193 ierr = PetscFree(tosizes);CHKERRQ(ierr); 194 ierr = PetscFree(todata);CHKERRQ(ierr); 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable" 200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 201 { 202 PetscInt *isz,isz_i,i,j,is_id, data_size; 203 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 204 const PetscInt *indices_i_temp; 205 PetscErrorCode ierr; 206 207 PetscFunctionBegin; 208 max_lsize = 0; 209 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 210 for (i=0; i<nidx; i++){ 211 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 212 max_lsize = lsize>max_lsize ? lsize:max_lsize; 213 isz[i] = lsize; 214 } 215 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 216 for (i=0; i<nidx; i++){ 217 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 218 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 219 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 220 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 221 } 222 /* retrieve information to get row id and its overlap */ 223 for (i=0; i<nrecvs; ){ 224 is_id = recvdata[i++]; 225 data_size = recvdata[i++]; 226 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 227 isz_i = isz[is_id]; 228 for (j=0; j< data_size; j++){ 229 col = recvdata[i++]; 230 indices_i[isz_i++] = col; 231 } 232 isz[is_id] = isz_i; 233 } 234 /* remove duplicate entities */ 235 for (i=0; i<nidx; i++){ 236 indices_i = indices_temp+(max_lsize+nrecvs)*i; 237 isz_i = isz[i]; 238 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 239 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 240 } 241 ierr = PetscFree(isz);CHKERRQ(ierr); 242 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 243 PetscFunctionReturn(0); 244 } 245 246 #undef __FUNCT__ 247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable" 248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 249 { 250 PetscLayout rmap,cmap; 251 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 252 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 253 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 254 const PetscInt *gcols,*ai,*aj,*bi,*bj; 255 Mat amat,bmat; 256 PetscMPIInt rank; 257 PetscBool done; 258 MPI_Comm comm; 259 PetscErrorCode ierr; 260 261 PetscFunctionBegin; 262 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 263 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 264 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 265 /* Even if the mat is symmetric, we still assume it is not symmetric */ 266 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 267 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 268 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 269 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 270 /* total number of nonzero values is used to estimate the memory usage in the next step */ 271 tnz = ai[an]+bi[bn]; 272 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 273 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 274 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 275 /* to find the longest message */ 276 max_fszs = 0; 277 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 278 /* better way to estimate number of nonzero in the mat??? */ 279 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 280 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 281 rows_pos = 0; 282 totalrows = 0; 283 for (i=0; i<nfrom; i++){ 284 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 285 /* group data together */ 286 for (j=0; j<fromsizes[2*i]; j+=2){ 287 is_id = fromrows[rows_pos++];/* no of is */ 288 rows_i = rows_data[is_id]; 289 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 290 } 291 /* estimate a space to avoid multiple allocations */ 292 for (j=0; j<nidx; j++){ 293 indvc_ij = 0; 294 rows_i = rows_data[j]; 295 for (l=0; l<rows_pos_i[j]; l++){ 296 row = rows_i[l]-rstart; 297 start = ai[row]; 298 end = ai[row+1]; 299 for (k=start; k<end; k++){ /* Amat */ 300 col = aj[k] + cstart; 301 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 302 } 303 start = bi[row]; 304 end = bi[row+1]; 305 for (k=start; k<end; k++) { /* Bmat */ 306 col = gcols[bj[k]]; 307 indices_tmp[indvc_ij++] = col; 308 } 309 } 310 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 311 indv_counts[i*nidx+j] = indvc_ij; 312 totalrows += indvc_ij; 313 } 314 } 315 /* message triple <no of is, number of rows, rows> */ 316 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 317 totalrows = 0; 318 rows_pos = 0; 319 /* use this code again */ 320 for (i=0;i<nfrom;i++){ 321 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 322 for (j=0; j<fromsizes[2*i]; j+=2){ 323 is_id = fromrows[rows_pos++]; 324 rows_i = rows_data[is_id]; 325 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 326 } 327 /* add data */ 328 for (j=0; j<nidx; j++){ 329 if (!indv_counts[i*nidx+j]) continue; 330 indvc_ij = 0; 331 sbdata[totalrows++] = j; 332 sbdata[totalrows++] = indv_counts[i*nidx+j]; 333 sbsizes[2*i] += 2; 334 rows_i = rows_data[j]; 335 for (l=0; l<rows_pos_i[j]; l++){ 336 row = rows_i[l]-rstart; 337 start = ai[row]; 338 end = ai[row+1]; 339 for (k=start; k<end; k++){ /* Amat */ 340 col = aj[k] + cstart; 341 indices_tmp[indvc_ij++] = col; 342 } 343 start = bi[row]; 344 end = bi[row+1]; 345 for (k=start; k<end; k++) { /* Bmat */ 346 col = gcols[bj[k]]; 347 indices_tmp[indvc_ij++] = col; 348 } 349 } 350 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 351 sbsizes[2*i] += indvc_ij; 352 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 353 totalrows += indvc_ij; 354 } 355 } 356 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 357 for (i=0; i<nfrom; i++){ 358 offsets[i+1] = offsets[i] + sbsizes[2*i]; 359 sbsizes[2*i+1] = offsets[i]; 360 } 361 ierr = PetscFree(offsets);CHKERRQ(ierr); 362 if (sbrowsizes) *sbrowsizes = sbsizes; 363 if (sbrows) *sbrows = sbdata; 364 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 365 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 366 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 373 { 374 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 375 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 376 PetscInt lsize,lsize_tmp,owner; 377 PetscMPIInt rank; 378 Mat amat,bmat; 379 PetscBool done; 380 PetscLayout cmap,rmap; 381 MPI_Comm comm; 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 386 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 387 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 388 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 389 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 390 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 391 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 392 /* is it a safe way to compute number of nonzero values ? */ 393 tnz = ai[an]+bi[bn]; 394 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 395 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 396 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 397 /* it is a better way to estimate memory than the old implementation 398 * where global size of matrix is used 399 * */ 400 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 401 for (i=0; i<nidx; i++) { 402 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 403 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 404 lsize_tmp = 0; 405 for (j=0; j<lsize; j++) { 406 owner = -1; 407 row = indices[j]; 408 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 409 if (owner != rank) continue; 410 /* local number */ 411 row -= rstart; 412 start = ai[row]; 413 end = ai[row+1]; 414 for (k=start; k<end; k++) { /* Amat */ 415 col = aj[k] + cstart; 416 indices_temp[lsize_tmp++] = col; 417 } 418 start = bi[row]; 419 end = bi[row+1]; 420 for (k=start; k<end; k++) { /* Bmat */ 421 col = gcols[bj[k]]; 422 indices_temp[lsize_tmp++] = col; 423 } 424 } 425 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 426 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 427 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 428 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 429 } 430 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 431 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 432 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 433 PetscFunctionReturn(0); 434 } 435 436 437 /* 438 Sample message format: 439 If a processor A wants processor B to process some elements corresponding 440 to index sets is[1],is[5] 441 mesg [0] = 2 (no of index sets in the mesg) 442 ----------- 443 mesg [1] = 1 => is[1] 444 mesg [2] = sizeof(is[1]); 445 ----------- 446 mesg [3] = 5 => is[5] 447 mesg [4] = sizeof(is[5]); 448 ----------- 449 mesg [5] 450 mesg [n] datas[1] 451 ----------- 452 mesg[n+1] 453 mesg[m] data(is[5]) 454 ----------- 455 456 Notes: 457 nrqs - no of requests sent (or to be sent out) 458 nrqr - no of requests recieved (which have to be or which have been processed 459 */ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 463 { 464 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 465 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 466 const PetscInt **idx,*idx_i; 467 PetscInt *n,**data,len; 468 #if defined(PETSC_USE_CTABLE) 469 PetscTable *table_data,table_data_i; 470 PetscInt *tdata,tcount,tcount_max; 471 #else 472 PetscInt *data_i,*d_p; 473 #endif 474 PetscErrorCode ierr; 475 PetscMPIInt size,rank,tag1,tag2; 476 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 477 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 478 PetscBT *table; 479 MPI_Comm comm; 480 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 481 MPI_Status *s_status,*recv_status; 482 char *t_p; 483 484 PetscFunctionBegin; 485 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 486 size = c->size; 487 rank = c->rank; 488 M = C->rmap->N; 489 490 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 491 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 492 493 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 494 495 for (i=0; i<imax; i++) { 496 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 497 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 498 } 499 500 /* evaluate communication - mesg to who,length of mesg, and buffer space 501 required. Based on this, buffers are allocated, and data copied into them */ 502 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 503 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 504 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 505 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 506 for (i=0; i<imax; i++) { 507 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 508 idx_i = idx[i]; 509 len = n[i]; 510 for (j=0; j<len; j++) { 511 row = idx_i[j]; 512 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 513 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 514 w4[proc]++; 515 } 516 for (j=0; j<size; j++) { 517 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 518 } 519 } 520 521 nrqs = 0; /* no of outgoing messages */ 522 msz = 0; /* total mesg length (for all proc */ 523 w1[rank] = 0; /* no mesg sent to intself */ 524 w3[rank] = 0; 525 for (i=0; i<size; i++) { 526 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 527 } 528 /* pa - is list of processors to communicate with */ 529 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 530 for (i=0,j=0; i<size; i++) { 531 if (w1[i]) {pa[j] = i; j++;} 532 } 533 534 /* Each message would have a header = 1 + 2*(no of IS) + data */ 535 for (i=0; i<nrqs; i++) { 536 j = pa[i]; 537 w1[j] += w2[j] + 2*w3[j]; 538 msz += w1[j]; 539 } 540 541 /* Determine the number of messages to expect, their lengths, from from-ids */ 542 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 543 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 544 545 /* Now post the Irecvs corresponding to these messages */ 546 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 547 548 /* Allocate Memory for outgoing messages */ 549 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 550 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 551 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 552 553 { 554 PetscInt *iptr = tmp,ict = 0; 555 for (i=0; i<nrqs; i++) { 556 j = pa[i]; 557 iptr += ict; 558 outdat[j] = iptr; 559 ict = w1[j]; 560 } 561 } 562 563 /* Form the outgoing messages */ 564 /* plug in the headers */ 565 for (i=0; i<nrqs; i++) { 566 j = pa[i]; 567 outdat[j][0] = 0; 568 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 569 ptr[j] = outdat[j] + 2*w3[j] + 1; 570 } 571 572 /* Memory for doing local proc's work */ 573 { 574 PetscInt M_BPB_imax = 0; 575 #if defined(PETSC_USE_CTABLE) 576 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 577 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 578 for (i=0; i<imax; i++) { 579 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 580 } 581 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 582 for (i=0; i<imax; i++) { 583 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 584 } 585 #else 586 PetscInt Mimax = 0; 587 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 588 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 589 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 590 for (i=0; i<imax; i++) { 591 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 592 data[i] = d_p + M*i; 593 } 594 #endif 595 } 596 597 /* Parse the IS and update local tables and the outgoing buf with the data */ 598 { 599 PetscInt n_i,isz_i,*outdat_j,ctr_j; 600 PetscBT table_i; 601 602 for (i=0; i<imax; i++) { 603 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 604 n_i = n[i]; 605 table_i = table[i]; 606 idx_i = idx[i]; 607 #if defined(PETSC_USE_CTABLE) 608 table_data_i = table_data[i]; 609 #else 610 data_i = data[i]; 611 #endif 612 isz_i = isz[i]; 613 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 614 row = idx_i[j]; 615 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 616 if (proc != rank) { /* copy to the outgoing buffer */ 617 ctr[proc]++; 618 *ptr[proc] = row; 619 ptr[proc]++; 620 } else if (!PetscBTLookupSet(table_i,row)) { 621 #if defined(PETSC_USE_CTABLE) 622 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 623 #else 624 data_i[isz_i] = row; /* Update the local table */ 625 #endif 626 isz_i++; 627 } 628 } 629 /* Update the headers for the current IS */ 630 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 631 if ((ctr_j = ctr[j])) { 632 outdat_j = outdat[j]; 633 k = ++outdat_j[0]; 634 outdat_j[2*k] = ctr_j; 635 outdat_j[2*k-1] = i; 636 } 637 } 638 isz[i] = isz_i; 639 } 640 } 641 642 /* Now post the sends */ 643 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 644 for (i=0; i<nrqs; ++i) { 645 j = pa[i]; 646 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 647 } 648 649 /* No longer need the original indices */ 650 for (i=0; i<imax; ++i) { 651 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 652 } 653 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 654 655 for (i=0; i<imax; ++i) { 656 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 657 } 658 659 /* Do Local work */ 660 #if defined(PETSC_USE_CTABLE) 661 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 662 #else 663 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 664 #endif 665 666 /* Receive messages */ 667 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 668 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 669 670 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 671 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 672 673 /* Phase 1 sends are complete - deallocate buffers */ 674 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 675 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 676 677 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 678 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 679 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 680 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 681 ierr = PetscFree(rbuf);CHKERRQ(ierr); 682 683 684 /* Send the data back */ 685 /* Do a global reduction to know the buffer space req for incoming messages */ 686 { 687 PetscMPIInt *rw1; 688 689 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 690 691 for (i=0; i<nrqr; ++i) { 692 proc = recv_status[i].MPI_SOURCE; 693 694 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 695 rw1[proc] = isz1[i]; 696 } 697 ierr = PetscFree(onodes1);CHKERRQ(ierr); 698 ierr = PetscFree(olengths1);CHKERRQ(ierr); 699 700 /* Determine the number of messages to expect, their lengths, from from-ids */ 701 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 702 ierr = PetscFree(rw1);CHKERRQ(ierr); 703 } 704 /* Now post the Irecvs corresponding to these messages */ 705 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 706 707 /* Now post the sends */ 708 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 709 for (i=0; i<nrqr; ++i) { 710 j = recv_status[i].MPI_SOURCE; 711 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 712 } 713 714 /* receive work done on other processors */ 715 { 716 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 717 PetscMPIInt idex; 718 PetscBT table_i; 719 MPI_Status *status2; 720 721 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 722 for (i=0; i<nrqs; ++i) { 723 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 724 /* Process the message */ 725 rbuf2_i = rbuf2[idex]; 726 ct1 = 2*rbuf2_i[0]+1; 727 jmax = rbuf2[idex][0]; 728 for (j=1; j<=jmax; j++) { 729 max = rbuf2_i[2*j]; 730 is_no = rbuf2_i[2*j-1]; 731 isz_i = isz[is_no]; 732 table_i = table[is_no]; 733 #if defined(PETSC_USE_CTABLE) 734 table_data_i = table_data[is_no]; 735 #else 736 data_i = data[is_no]; 737 #endif 738 for (k=0; k<max; k++,ct1++) { 739 row = rbuf2_i[ct1]; 740 if (!PetscBTLookupSet(table_i,row)) { 741 #if defined(PETSC_USE_CTABLE) 742 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 743 #else 744 data_i[isz_i] = row; 745 #endif 746 isz_i++; 747 } 748 } 749 isz[is_no] = isz_i; 750 } 751 } 752 753 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 754 ierr = PetscFree(status2);CHKERRQ(ierr); 755 } 756 757 #if defined(PETSC_USE_CTABLE) 758 tcount_max = 0; 759 for (i=0; i<imax; ++i) { 760 table_data_i = table_data[i]; 761 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 762 if (tcount_max < tcount) tcount_max = tcount; 763 } 764 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 765 #endif 766 767 for (i=0; i<imax; ++i) { 768 #if defined(PETSC_USE_CTABLE) 769 PetscTablePosition tpos; 770 table_data_i = table_data[i]; 771 772 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 773 while (tpos) { 774 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 775 tdata[--j] = --k; 776 } 777 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 778 #else 779 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 780 #endif 781 } 782 783 ierr = PetscFree(onodes2);CHKERRQ(ierr); 784 ierr = PetscFree(olengths2);CHKERRQ(ierr); 785 786 ierr = PetscFree(pa);CHKERRQ(ierr); 787 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 788 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 789 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 790 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 791 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 792 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 793 ierr = PetscFree(s_status);CHKERRQ(ierr); 794 ierr = PetscFree(recv_status);CHKERRQ(ierr); 795 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 796 ierr = PetscFree(xdata);CHKERRQ(ierr); 797 ierr = PetscFree(isz1);CHKERRQ(ierr); 798 #if defined(PETSC_USE_CTABLE) 799 for (i=0; i<imax; i++) { 800 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 801 } 802 ierr = PetscFree(table_data);CHKERRQ(ierr); 803 ierr = PetscFree(tdata);CHKERRQ(ierr); 804 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 805 #else 806 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 807 #endif 808 PetscFunctionReturn(0); 809 } 810 811 #undef __FUNCT__ 812 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 813 /* 814 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 815 the work on the local processor. 816 817 Inputs: 818 C - MAT_MPIAIJ; 819 imax - total no of index sets processed at a time; 820 table - an array of char - size = m bits. 821 822 Output: 823 isz - array containing the count of the solution elements corresponding 824 to each index set; 825 data or table_data - pointer to the solutions 826 */ 827 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 828 { 829 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 830 Mat A = c->A,B = c->B; 831 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 832 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 833 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 834 PetscBT table_i; 835 #if defined(PETSC_USE_CTABLE) 836 PetscTable table_data_i; 837 PetscErrorCode ierr; 838 PetscTablePosition tpos; 839 PetscInt tcount,*tdata; 840 #else 841 PetscInt *data_i; 842 #endif 843 844 PetscFunctionBegin; 845 rstart = C->rmap->rstart; 846 cstart = C->cmap->rstart; 847 ai = a->i; 848 aj = a->j; 849 bi = b->i; 850 bj = b->j; 851 garray = c->garray; 852 853 for (i=0; i<imax; i++) { 854 #if defined(PETSC_USE_CTABLE) 855 /* copy existing entries of table_data_i into tdata[] */ 856 table_data_i = table_data[i]; 857 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 858 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 859 860 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 861 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 862 while (tpos) { 863 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 864 tdata[--j] = --row; 865 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 866 } 867 #else 868 data_i = data[i]; 869 #endif 870 table_i = table[i]; 871 isz_i = isz[i]; 872 max = isz[i]; 873 874 for (j=0; j<max; j++) { 875 #if defined(PETSC_USE_CTABLE) 876 row = tdata[j] - rstart; 877 #else 878 row = data_i[j] - rstart; 879 #endif 880 start = ai[row]; 881 end = ai[row+1]; 882 for (k=start; k<end; k++) { /* Amat */ 883 val = aj[k] + cstart; 884 if (!PetscBTLookupSet(table_i,val)) { 885 #if defined(PETSC_USE_CTABLE) 886 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 887 #else 888 data_i[isz_i] = val; 889 #endif 890 isz_i++; 891 } 892 } 893 start = bi[row]; 894 end = bi[row+1]; 895 for (k=start; k<end; k++) { /* Bmat */ 896 val = garray[bj[k]]; 897 if (!PetscBTLookupSet(table_i,val)) { 898 #if defined(PETSC_USE_CTABLE) 899 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 900 #else 901 data_i[isz_i] = val; 902 #endif 903 isz_i++; 904 } 905 } 906 } 907 isz[i] = isz_i; 908 909 #if defined(PETSC_USE_CTABLE) 910 ierr = PetscFree(tdata);CHKERRQ(ierr); 911 #endif 912 } 913 PetscFunctionReturn(0); 914 } 915 916 #undef __FUNCT__ 917 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 918 /* 919 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 920 and return the output 921 922 Input: 923 C - the matrix 924 nrqr - no of messages being processed. 925 rbuf - an array of pointers to the recieved requests 926 927 Output: 928 xdata - array of messages to be sent back 929 isz1 - size of each message 930 931 For better efficiency perhaps we should malloc separately each xdata[i], 932 then if a remalloc is required we need only copy the data for that one row 933 rather then all previous rows as it is now where a single large chunck of 934 memory is used. 935 936 */ 937 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 938 { 939 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 940 Mat A = c->A,B = c->B; 941 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 942 PetscErrorCode ierr; 943 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 944 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 945 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 946 PetscInt *rbuf_i,kmax,rbuf_0; 947 PetscBT xtable; 948 949 PetscFunctionBegin; 950 m = C->rmap->N; 951 rstart = C->rmap->rstart; 952 cstart = C->cmap->rstart; 953 ai = a->i; 954 aj = a->j; 955 bi = b->i; 956 bj = b->j; 957 garray = c->garray; 958 959 960 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 961 rbuf_i = rbuf[i]; 962 rbuf_0 = rbuf_i[0]; 963 ct += rbuf_0; 964 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 965 } 966 967 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 968 else max1 = 1; 969 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 970 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 971 ++no_malloc; 972 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 973 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 974 975 ct3 = 0; 976 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 977 rbuf_i = rbuf[i]; 978 rbuf_0 = rbuf_i[0]; 979 ct1 = 2*rbuf_0+1; 980 ct2 = ct1; 981 ct3 += ct1; 982 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 983 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 984 oct2 = ct2; 985 kmax = rbuf_i[2*j]; 986 for (k=0; k<kmax; k++,ct1++) { 987 row = rbuf_i[ct1]; 988 if (!PetscBTLookupSet(xtable,row)) { 989 if (!(ct3 < mem_estimate)) { 990 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 991 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 992 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 993 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 994 xdata[0] = tmp; 995 mem_estimate = new_estimate; ++no_malloc; 996 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 997 } 998 xdata[i][ct2++] = row; 999 ct3++; 1000 } 1001 } 1002 for (k=oct2,max2=ct2; k<max2; k++) { 1003 row = xdata[i][k] - rstart; 1004 start = ai[row]; 1005 end = ai[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = aj[l] + cstart; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 start = bi[row]; 1023 end = bi[row+1]; 1024 for (l=start; l<end; l++) { 1025 val = garray[bj[l]]; 1026 if (!PetscBTLookupSet(xtable,val)) { 1027 if (!(ct3 < mem_estimate)) { 1028 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1029 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1030 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1031 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1032 xdata[0] = tmp; 1033 mem_estimate = new_estimate; ++no_malloc; 1034 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1035 } 1036 xdata[i][ct2++] = val; 1037 ct3++; 1038 } 1039 } 1040 } 1041 /* Update the header*/ 1042 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1043 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1044 } 1045 xdata[i][0] = rbuf_0; 1046 xdata[i+1] = xdata[i] + ct2; 1047 isz1[i] = ct2; /* size of each message */ 1048 } 1049 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1050 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1051 PetscFunctionReturn(0); 1052 } 1053 /* -------------------------------------------------------------------------*/ 1054 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 1055 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1056 /* 1057 Every processor gets the entire matrix 1058 */ 1059 #undef __FUNCT__ 1060 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 1061 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1062 { 1063 Mat B; 1064 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1065 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1066 PetscErrorCode ierr; 1067 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1068 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1069 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1070 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1071 1072 PetscFunctionBegin; 1073 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1074 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1075 1076 if (scall == MAT_INITIAL_MATRIX) { 1077 /* ---------------------------------------------------------------- 1078 Tell every processor the number of nonzeros per row 1079 */ 1080 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1081 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1082 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1083 } 1084 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1085 for (i=0; i<size; i++) { 1086 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1087 displs[i] = A->rmap->range[i]; 1088 } 1089 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1090 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1091 #else 1092 sendcount = A->rmap->rend - A->rmap->rstart; 1093 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1094 #endif 1095 /* --------------------------------------------------------------- 1096 Create the sequential matrix of the same type as the local block diagonal 1097 */ 1098 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1099 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1100 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1101 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1102 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1103 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1104 **Bin = B; 1105 b = (Mat_SeqAIJ*)B->data; 1106 1107 /*-------------------------------------------------------------------- 1108 Copy my part of matrix column indices over 1109 */ 1110 sendcount = ad->nz + bd->nz; 1111 jsendbuf = b->j + b->i[rstarts[rank]]; 1112 a_jsendbuf = ad->j; 1113 b_jsendbuf = bd->j; 1114 n = A->rmap->rend - A->rmap->rstart; 1115 cnt = 0; 1116 for (i=0; i<n; i++) { 1117 1118 /* put in lower diagonal portion */ 1119 m = bd->i[i+1] - bd->i[i]; 1120 while (m > 0) { 1121 /* is it above diagonal (in bd (compressed) numbering) */ 1122 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1123 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1124 m--; 1125 } 1126 1127 /* put in diagonal portion */ 1128 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1129 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1130 } 1131 1132 /* put in upper diagonal portion */ 1133 while (m-- > 0) { 1134 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1135 } 1136 } 1137 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1138 1139 /*-------------------------------------------------------------------- 1140 Gather all column indices to all processors 1141 */ 1142 for (i=0; i<size; i++) { 1143 recvcounts[i] = 0; 1144 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1145 recvcounts[i] += lens[j]; 1146 } 1147 } 1148 displs[0] = 0; 1149 for (i=1; i<size; i++) { 1150 displs[i] = displs[i-1] + recvcounts[i-1]; 1151 } 1152 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1153 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1154 #else 1155 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1156 #endif 1157 /*-------------------------------------------------------------------- 1158 Assemble the matrix into useable form (note numerical values not yet set) 1159 */ 1160 /* set the b->ilen (length of each row) values */ 1161 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1162 /* set the b->i indices */ 1163 b->i[0] = 0; 1164 for (i=1; i<=A->rmap->N; i++) { 1165 b->i[i] = b->i[i-1] + lens[i-1]; 1166 } 1167 ierr = PetscFree(lens);CHKERRQ(ierr); 1168 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1169 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1170 1171 } else { 1172 B = **Bin; 1173 b = (Mat_SeqAIJ*)B->data; 1174 } 1175 1176 /*-------------------------------------------------------------------- 1177 Copy my part of matrix numerical values into the values location 1178 */ 1179 if (flag == MAT_GET_VALUES) { 1180 sendcount = ad->nz + bd->nz; 1181 sendbuf = b->a + b->i[rstarts[rank]]; 1182 a_sendbuf = ad->a; 1183 b_sendbuf = bd->a; 1184 b_sendj = bd->j; 1185 n = A->rmap->rend - A->rmap->rstart; 1186 cnt = 0; 1187 for (i=0; i<n; i++) { 1188 1189 /* put in lower diagonal portion */ 1190 m = bd->i[i+1] - bd->i[i]; 1191 while (m > 0) { 1192 /* is it above diagonal (in bd (compressed) numbering) */ 1193 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1194 sendbuf[cnt++] = *b_sendbuf++; 1195 m--; 1196 b_sendj++; 1197 } 1198 1199 /* put in diagonal portion */ 1200 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1201 sendbuf[cnt++] = *a_sendbuf++; 1202 } 1203 1204 /* put in upper diagonal portion */ 1205 while (m-- > 0) { 1206 sendbuf[cnt++] = *b_sendbuf++; 1207 b_sendj++; 1208 } 1209 } 1210 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1211 1212 /* ----------------------------------------------------------------- 1213 Gather all numerical values to all processors 1214 */ 1215 if (!recvcounts) { 1216 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1217 } 1218 for (i=0; i<size; i++) { 1219 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1220 } 1221 displs[0] = 0; 1222 for (i=1; i<size; i++) { 1223 displs[i] = displs[i-1] + recvcounts[i-1]; 1224 } 1225 recvbuf = b->a; 1226 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1227 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1228 #else 1229 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1230 #endif 1231 } /* endof (flag == MAT_GET_VALUES) */ 1232 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1233 1234 if (A->symmetric) { 1235 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1236 } else if (A->hermitian) { 1237 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1238 } else if (A->structurally_symmetric) { 1239 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1240 } 1241 PetscFunctionReturn(0); 1242 } 1243 #undef __FUNCT__ 1244 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices" 1245 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1246 { 1247 PetscErrorCode ierr; 1248 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1249 Mat_SubMat *submatj = c->submatis1; 1250 PetscInt i; 1251 1252 PetscFunctionBegin; 1253 ierr = submatj->destroy(C);CHKERRQ(ierr); 1254 1255 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1256 1257 for (i=0; i<submatj->nrqr; ++i) { 1258 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1259 } 1260 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1261 1262 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1263 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1264 1265 for (i=0; i<submatj->nrqs; ++i) { 1266 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1267 } 1268 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1269 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1270 1271 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 1272 1273 #if defined(PETSC_USE_CTABLE) 1274 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1275 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1276 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1277 #else 1278 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1279 #endif 1280 1281 if (!submatj->allcolumns) { 1282 #if defined(PETSC_USE_CTABLE) 1283 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1284 #else 1285 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1286 #endif 1287 } 1288 1289 ierr = PetscFree(submatj);CHKERRQ(ierr); 1290 PetscFunctionReturn(0); 1291 } 1292 1293 #undef __FUNCT__ 1294 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local" 1295 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1296 { 1297 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1298 Mat submat,A = c->A,B = c->B; 1299 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1300 PetscInt *ai = a->i,*bi = b->i,nzA,nzB; 1301 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1302 const PetscInt *icol,*irow; 1303 PetscInt nrow,ncol,start; 1304 PetscErrorCode ierr; 1305 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1306 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1307 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1308 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1309 PetscInt *lens,ncols,*cols,Crow; 1310 #if defined(PETSC_USE_CTABLE) 1311 PetscTable cmap,rmap; 1312 PetscInt *cmap_loc,*rmap_loc; 1313 #else 1314 PetscInt *cmap,*rmap; 1315 #endif 1316 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1317 PetscInt *cworkB,lwrite,*bj = b->j,*subcols,*row2proc; 1318 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1319 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1320 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1321 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1322 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1323 MPI_Comm comm; 1324 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1325 PetscMPIInt *onodes1,*olengths1,idex,end; 1326 Mat_SubMat *smatis1; 1327 #if defined(PETSC_USE_LOG) 1328 PetscLogStage stage[5]; 1329 #endif 1330 PetscBool isrowsorted; 1331 1332 PetscFunctionBegin; 1333 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1334 1335 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1336 size = c->size; 1337 rank = c->rank; 1338 1339 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1340 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1341 1342 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1343 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1344 if (allcolumns) { 1345 icol = NULL; 1346 ncol = C->cmap->N; 1347 } else { 1348 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1349 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1350 } 1351 1352 if (scall == MAT_INITIAL_MATRIX) { 1353 PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp; 1354 1355 ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr); 1356 ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr); 1357 ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr); 1358 ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr); 1359 ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr); 1360 1361 ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr); 1362 1363 /* Get some new tags to keep the communication clean */ 1364 tag1 = ((PetscObject)C)->tag; 1365 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1366 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1367 1368 /* evaluate communication - mesg to who, length of mesg, and buffer space 1369 required. Based on this, buffers are allocated, and data copied into them */ 1370 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1371 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1372 1373 /* w1[proc] = num of rows owned by proc */ 1374 proc = 0; 1375 for (j=0; j<nrow; j++) { 1376 row = irow[j]; /* sorted! */ 1377 while (row >= C->rmap->range[proc+1]) proc++; 1378 w1[proc]++; 1379 row2proc[j] = proc; /* map row index to proc */ 1380 } 1381 1382 nrqs = 0; /* num of outgoing messages */ 1383 msz = 0; /* total mesg length (for all procs) */ 1384 w1[rank] = 0; /* num mesg sent to self */ 1385 for (proc=0; proc<size; proc++) { 1386 if (w1[proc]) { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */ 1387 } 1388 1389 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1390 for (proc=0,j=0; proc<size; proc++) { 1391 if (w1[proc]) { pa[j++] = proc;} 1392 } 1393 1394 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1395 for (i=0; i<nrqs; i++) { 1396 proc = pa[i]; 1397 w1[proc] += 3; 1398 msz += w1[proc]; 1399 } 1400 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1401 1402 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1403 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1404 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1405 1406 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1407 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1408 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1409 1410 /* Now post the Irecvs corresponding to these messages */ 1411 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1412 1413 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1414 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1415 1416 /* Allocate Memory for outgoing messages */ 1417 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1418 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1419 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1420 1421 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1422 iptr = tmp; 1423 for (i=0; i<nrqs; i++) { 1424 proc = pa[i]; 1425 sbuf1[proc] = iptr; 1426 iptr += w1[proc]; 1427 } 1428 1429 /* Form the outgoing messages */ 1430 /* Initialize the header space */ 1431 for (i=0; i<nrqs; i++) { 1432 proc = pa[i]; 1433 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1434 ptr[proc] = sbuf1[proc] + 3; 1435 } 1436 1437 /* Parse the isrow and copy data into outbuf */ 1438 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1439 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1440 proc = row2proc[j]; 1441 if (proc != rank) { /* copy to the outgoing buf*/ 1442 *ptr[proc] = irow[j]; 1443 ctr[proc]++; ptr[proc]++; 1444 } 1445 } 1446 1447 /* Update the headers for the current IS */ 1448 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1449 if ((ctr_j = ctr[j])) { 1450 sbuf1_j = sbuf1[j]; 1451 k = ++sbuf1_j[0]; 1452 sbuf1_j[2*k] = ctr_j; 1453 sbuf1_j[2*k-1] = 0; 1454 } 1455 } 1456 1457 /* Now post the sends */ 1458 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1459 for (i=0; i<nrqs; ++i) { 1460 proc = pa[i]; 1461 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1462 } 1463 1464 /* Post Receives to capture the buffer size */ 1465 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1466 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1467 1468 rbuf2[0] = tmp + msz; 1469 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1470 1471 for (i=0; i<nrqs; ++i) { 1472 proc = pa[i]; 1473 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1474 } 1475 1476 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1477 1478 /* Send to other procs the buf size they should allocate */ 1479 /* Receive messages*/ 1480 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1481 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1482 1483 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1484 for (i=0; i<nrqr; ++i) { 1485 req_size[i] = 0; 1486 rbuf1_i = rbuf1[i]; 1487 start = 2*rbuf1_i[0] + 1; 1488 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1489 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1490 sbuf2_i = sbuf2[i]; 1491 for (j=start; j<end; j++) { 1492 k = rbuf1_i[j] - rstart; 1493 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1494 sbuf2_i[j] = ncols; 1495 req_size[i] += ncols; 1496 } 1497 req_source1[i] = r_status1[i].MPI_SOURCE; 1498 1499 /* form the header */ 1500 sbuf2_i[0] = req_size[i]; 1501 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1502 1503 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1504 } 1505 1506 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1507 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1508 1509 /* rbuf2 is received, Post recv column indices a->j */ 1510 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1511 1512 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1513 for (i=0; i<nrqs; ++i) { 1514 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1515 req_source2[i] = r_status2[i].MPI_SOURCE; 1516 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1517 } 1518 1519 /* Wait on sends1 and sends2 */ 1520 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1521 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1522 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1523 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1524 1525 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1526 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1527 1528 ierr = PetscLogStagePop();CHKERRQ(ierr); 1529 ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr); 1530 1531 /* Now allocate sending buffers for a->j, and send them off */ 1532 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1533 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1534 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1535 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1536 1537 for (i=0; i<nrqr; i++) { /* for each requested message */ 1538 rbuf1_i = rbuf1[i]; 1539 sbuf_aj_i = sbuf_aj[i]; 1540 ct1 = 2*rbuf1_i[0] + 1; 1541 ct2 = 0; 1542 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1543 1544 kmax = rbuf1[i][2]; 1545 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1546 row = rbuf1_i[ct1] - rstart; 1547 nzA = ai[row+1] - ai[row]; nzB = bi[row+1] - bi[row]; 1548 ncols = nzA + nzB; 1549 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1550 1551 /* load the column indices for this row into cols*/ 1552 cols = sbuf_aj_i + ct2; 1553 1554 lwrite = 0; 1555 for (l=0; l<nzB; l++) { 1556 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1557 } 1558 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1559 for (l=0; l<nzB; l++) { 1560 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1561 } 1562 1563 ct2 += ncols; 1564 } 1565 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1566 } 1567 1568 /* create column map (cmap): global col of C -> local col of submat */ 1569 #if defined(PETSC_USE_CTABLE) 1570 if (!allcolumns) { 1571 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1572 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1573 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1574 if (icol[j] >= cstart && icol[j] <cend) { 1575 cmap_loc[icol[j] - cstart] = j+1; 1576 } else { /* use PetscTable for non-local col indices */ 1577 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1578 } 1579 } 1580 } else { 1581 cmap = NULL; 1582 cmap_loc = NULL; 1583 } 1584 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1585 #else 1586 if (!allcolumns) { 1587 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1588 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1589 } else { 1590 cmap = NULL; 1591 } 1592 #endif 1593 1594 /* Create lens for MatSeqAIJSetPreallocation() */ 1595 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1596 1597 /* Compute lens from local part of C */ 1598 for (j=0; j<nrow; j++) { 1599 row = irow[j]; 1600 proc = row2proc[j]; 1601 if (proc == rank) { 1602 /* diagonal part A = c->A */ 1603 ncols = ai[row-rstart+1] - ai[row-rstart]; 1604 cols = aj + ai[row-rstart]; 1605 if (!allcolumns) { 1606 for (k=0; k<ncols; k++) { 1607 #if defined(PETSC_USE_CTABLE) 1608 tcol = cmap_loc[cols[k]]; 1609 #else 1610 tcol = cmap[cols[k]+cstart]; 1611 #endif 1612 if (tcol) lens[j]++; 1613 } 1614 } else { /* allcolumns */ 1615 lens[j] = ncols; 1616 } 1617 1618 /* off-diagonal part B = c->B */ 1619 ncols = bi[row-rstart+1] - bi[row-rstart]; 1620 cols = bj + bi[row-rstart]; 1621 if (!allcolumns) { 1622 for (k=0; k<ncols; k++) { 1623 #if defined(PETSC_USE_CTABLE) 1624 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1625 #else 1626 tcol = cmap[bmap[cols[k]]]; 1627 #endif 1628 if (tcol) lens[j]++; 1629 } 1630 } else { /* allcolumns */ 1631 lens[j] += ncols; 1632 } 1633 } 1634 } 1635 1636 /* Create row map (rmap): global row of C -> local row of submat */ 1637 #if defined(PETSC_USE_CTABLE) 1638 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1639 for (j=0; j<nrow; j++) { 1640 row = irow[j]; 1641 proc = row2proc[j]; 1642 if (proc == rank) { /* a local row */ 1643 rmap_loc[row - rstart] = j; 1644 } else { 1645 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1646 } 1647 } 1648 #else 1649 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1650 for (j=0; j<nrow; j++) { 1651 rmap[irow[j]] = j; 1652 } 1653 #endif 1654 1655 /* Update lens from offproc data */ 1656 /* recv a->j is done */ 1657 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1658 for (i=0; i<nrqs; i++) { 1659 proc = pa[i]; 1660 sbuf1_i = sbuf1[proc]; 1661 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1662 ct1 = 2 + 1; 1663 ct2 = 0; 1664 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1665 rbuf3_i = rbuf3[i]; /* received C->j */ 1666 1667 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1668 max1 = sbuf1_i[2]; 1669 for (k=0; k<max1; k++,ct1++) { 1670 #if defined(PETSC_USE_CTABLE) 1671 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1672 row--; 1673 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1674 #else 1675 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1676 #endif 1677 /* Now, store row index of submat in sbuf1_i[ct1] */ 1678 sbuf1_i[ct1] = row; 1679 1680 nnz = rbuf2_i[ct1]; 1681 if (!allcolumns) { 1682 for (l=0; l<nnz; l++,ct2++) { 1683 #if defined(PETSC_USE_CTABLE) 1684 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1685 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1686 } else { 1687 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1688 } 1689 #else 1690 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1691 #endif 1692 if (tcol) { 1693 lens[row]++; 1694 1695 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1696 For reuse, we replace received C->j with index that should be inserted to submat */ 1697 /* rbuf3_i[ct3++] = ct2; ??? */ 1698 } 1699 } 1700 } else { /* allcolumns */ 1701 lens[row] += nnz; 1702 } 1703 } 1704 } 1705 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1706 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1707 1708 /* Create the submatrices */ 1709 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1710 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1711 1712 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1713 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1714 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1715 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1716 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1717 1718 /* create struct Mat_SubMat and attached it to submat */ 1719 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1720 subc = (Mat_SeqAIJ*)submat->data; 1721 subc->submatis1 = smatis1; 1722 1723 smatis1->nrqs = nrqs; 1724 smatis1->nrqr = nrqr; 1725 smatis1->rbuf1 = rbuf1; 1726 smatis1->rbuf2 = rbuf2; 1727 smatis1->rbuf3 = rbuf3; 1728 smatis1->sbuf2 = sbuf2; 1729 smatis1->req_source2 = req_source2; 1730 1731 smatis1->sbuf1 = sbuf1; 1732 smatis1->ptr = ptr; 1733 smatis1->tmp = tmp; 1734 smatis1->ctr = ctr; 1735 1736 smatis1->pa = pa; 1737 smatis1->req_size = req_size; 1738 smatis1->req_source1 = req_source1; 1739 1740 smatis1->allcolumns = allcolumns; 1741 smatis1->row2proc = row2proc; 1742 smatis1->rmap = rmap; 1743 smatis1->cmap = cmap; 1744 #if defined(PETSC_USE_CTABLE) 1745 smatis1->rmap_loc = rmap_loc; 1746 smatis1->cmap_loc = cmap_loc; 1747 #endif 1748 1749 smatis1->destroy = submat->ops->destroy; 1750 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1751 submat->factortype = C->factortype; 1752 1753 ierr = PetscLogStagePop();CHKERRQ(ierr); 1754 ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr); 1755 } else { /* scall == MAT_REUSE_MATRIX */ 1756 submat = submats[0]; 1757 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1758 1759 subc = (Mat_SeqAIJ*)submat->data; 1760 smatis1 = subc->submatis1; 1761 nrqs = smatis1->nrqs; 1762 nrqr = smatis1->nrqr; 1763 rbuf1 = smatis1->rbuf1; 1764 rbuf2 = smatis1->rbuf2; 1765 rbuf3 = smatis1->rbuf3; 1766 req_source2 = smatis1->req_source2; 1767 1768 sbuf1 = smatis1->sbuf1; 1769 sbuf2 = smatis1->sbuf2; 1770 ptr = smatis1->ptr; 1771 tmp = smatis1->tmp; 1772 ctr = smatis1->ctr; 1773 1774 pa = smatis1->pa; 1775 req_size = smatis1->req_size; 1776 req_source1 = smatis1->req_source1; 1777 1778 allcolumns = smatis1->allcolumns; 1779 row2proc = smatis1->row2proc; 1780 rmap = smatis1->rmap; 1781 cmap = smatis1->cmap; 1782 #if defined(PETSC_USE_CTABLE) 1783 rmap_loc = smatis1->rmap_loc; 1784 cmap_loc = smatis1->cmap_loc; 1785 #endif 1786 } 1787 1788 /* Post recv matrix values */ 1789 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1790 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1791 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1792 for (i=0; i<nrqs; ++i) { 1793 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1794 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1795 } 1796 1797 /* Allocate sending buffers for a->a, and send them off */ 1798 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1799 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1800 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1801 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1802 1803 for (i=0; i<nrqr; i++) { 1804 rbuf1_i = rbuf1[i]; 1805 sbuf_aa_i = sbuf_aa[i]; 1806 ct1 = 2*rbuf1_i[0]+1; 1807 ct2 = 0; 1808 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1809 1810 kmax = rbuf1_i[2]; 1811 for (k=0; k<kmax; k++,ct1++) { 1812 row = rbuf1_i[ct1] - rstart; 1813 nzA = ai[row+1] - ai[row]; 1814 nzB = bi[row+1] - bi[row]; 1815 ncols = nzA + nzB; 1816 cworkB = bj + bi[row]; 1817 vworkA = a_a + ai[row]; 1818 vworkB = b_a + bi[row]; 1819 1820 /* load the column values for this row into vals*/ 1821 vals = sbuf_aa_i + ct2; 1822 1823 lwrite = 0; 1824 for (l=0; l<nzB; l++) { 1825 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1826 } 1827 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1828 for (l=0; l<nzB; l++) { 1829 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1830 } 1831 1832 ct2 += ncols; 1833 } 1834 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1835 } 1836 1837 if (scall == MAT_INITIAL_MATRIX) { 1838 ierr = PetscLogStagePop();CHKERRQ(ierr); 1839 ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr); 1840 } 1841 1842 /* Assemble submat */ 1843 if (!allcolumns) { 1844 ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr); 1845 } 1846 /* First assemble the local rows */ 1847 for (j=0; j<nrow; j++) { 1848 row = irow[j]; 1849 proc = row2proc[j]; 1850 if (proc == rank) { 1851 Crow = row; 1852 #if defined(PETSC_USE_CTABLE) 1853 row = rmap_loc[Crow - rstart]; /* row index of submat */ 1854 #else 1855 row = rmap[row]; 1856 #endif 1857 ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1858 1859 if (!allcolumns) { 1860 #if defined(PETSC_USE_CTABLE) 1861 i = 0; 1862 k = 0; 1863 while (cols[k] < cstart) { 1864 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1865 if (tcol) { 1866 subcols[i] = --tcol; 1867 subvals[i++] = vals[k]; 1868 } 1869 k++; 1870 } 1871 while (cols[k] < cend && k<ncols) { 1872 tcol = cmap_loc[cols[k]-cstart]; 1873 if (tcol) { 1874 subcols[i] = --tcol; 1875 subvals[i++] = vals[k]; 1876 } 1877 k++; 1878 } 1879 while (k<ncols) { 1880 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1881 if (tcol) { 1882 subcols[i] = --tcol; 1883 subvals[i++] = vals[k]; 1884 } 1885 k++; 1886 } 1887 #else 1888 i = 0; 1889 for (k=0; k<ncols; k++) { 1890 tcol = cmap[cols[k]]; 1891 if (tcol) { 1892 subcols[i] = --tcol; 1893 subvals[i++] = vals[k]; 1894 } 1895 } 1896 #endif 1897 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1898 } else { /* allcolumns */ 1899 ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 1900 } 1901 ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1902 } 1903 } 1904 1905 if (scall == MAT_INITIAL_MATRIX) { 1906 ierr = PetscLogStagePop();CHKERRQ(ierr); 1907 ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr); 1908 } 1909 1910 /* Now assemble the off-proc rows */ 1911 for (i=0; i<nrqs; i++) { /* for each requested message */ 1912 /* recv values from other processes */ 1913 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1914 proc = pa[idex]; 1915 sbuf1_i = sbuf1[proc]; 1916 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1917 ct1 = 2 + 1; 1918 ct2 = 0; /* count of received C->j */ 1919 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1920 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1921 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1922 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1923 1924 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1925 max1 = sbuf1_i[2]; /* num of rows */ 1926 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1927 row = sbuf1_i[ct1]; /* row index of submat */ 1928 if (!allcolumns) { 1929 idex = 0; 1930 if (scall == MAT_INITIAL_MATRIX) { 1931 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1932 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1933 #if defined(PETSC_USE_CTABLE) 1934 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1935 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1936 } else { 1937 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1938 } 1939 #else 1940 tcol = cmap[rbuf3_i[ct2]]; 1941 #endif 1942 if (tcol) { 1943 subcols[idex] = --tcol; 1944 subvals[idex++] = rbuf4_i[ct2]; 1945 1946 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1947 For reuse, we replace received C->j with index that should be inserted to submat */ 1948 rbuf3_i[ct3++] = ct2; 1949 } 1950 } 1951 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1952 1953 } else { /* scall == MAT_REUSE_MATRIX */ 1954 submat = submats[0]; 1955 subc = (Mat_SeqAIJ*)submat->data; 1956 1957 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1958 for (l=0; l<nnz; l++) { 1959 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1960 subvals[idex++] = rbuf4_i[ct2]; 1961 } 1962 1963 bj = subc->j + subc->i[row]; 1964 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1965 } 1966 } else { /* allcolumns */ 1967 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1968 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1969 ct2 += nnz; 1970 } 1971 } 1972 } 1973 1974 /* sending a->a are done */ 1975 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1976 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1977 1978 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1979 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1980 submats[0] = submat; 1981 1982 /* Restore the indices */ 1983 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1984 if (!allcolumns) { 1985 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1986 } 1987 1988 /* Destroy allocated memory */ 1989 for (i=0; i<nrqs; ++i) { 1990 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1991 } 1992 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1993 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1994 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1995 1996 if (!allcolumns) { 1997 ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr); 1998 } 1999 2000 if (scall == MAT_INITIAL_MATRIX) { 2001 ierr = PetscLogStagePop();CHKERRQ(ierr); 2002 2003 ierr = PetscFree(lens);CHKERRQ(ierr); 2004 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2005 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2006 } 2007 PetscFunctionReturn(0); 2008 } 2009 2010 #undef __FUNCT__ 2011 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS" 2012 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2013 { 2014 PetscErrorCode ierr; 2015 PetscInt ncol; 2016 PetscBool colflag,allcolumns=PETSC_FALSE; 2017 2018 PetscFunctionBegin; 2019 /* Allocate memory to hold all the submatrices */ 2020 if (scall != MAT_REUSE_MATRIX) { 2021 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2022 } 2023 2024 /* Check for special case: each processor gets entire matrix columns */ 2025 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2026 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2027 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2028 2029 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2030 PetscFunctionReturn(0); 2031 } 2032 2033 #undef __FUNCT__ 2034 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 2035 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2036 { 2037 PetscErrorCode ierr; 2038 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2039 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2040 2041 PetscFunctionBegin; 2042 /* Check for special case: each processor gets entire matrix */ 2043 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2044 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2045 PetscFunctionReturn(0); 2046 } 2047 2048 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2049 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2050 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2051 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2052 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2053 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2054 wantallmatrix = PETSC_TRUE; 2055 2056 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2057 } 2058 } 2059 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2060 if (twantallmatrix) { 2061 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2062 PetscFunctionReturn(0); 2063 } 2064 2065 /* Allocate memory to hold all the submatrices */ 2066 if (scall != MAT_REUSE_MATRIX) { 2067 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2068 } 2069 2070 /* Check for special case: each processor gets entire matrix columns */ 2071 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2072 for (i=0; i<ismax; i++) { 2073 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2074 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2075 if (colflag && ncol == C->cmap->N) { 2076 allcolumns[i] = PETSC_TRUE; 2077 } else { 2078 allcolumns[i] = PETSC_FALSE; 2079 } 2080 } 2081 2082 /* Determine the number of stages through which submatrices are done */ 2083 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2084 2085 /* 2086 Each stage will extract nmax submatrices. 2087 nmax is determined by the matrix column dimension. 2088 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2089 */ 2090 if (!nmax) nmax = 1; 2091 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 2092 2093 /* Make sure every processor loops through the nstages */ 2094 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2095 2096 for (i=0,pos=0; i<nstages; i++) { 2097 if (pos+nmax <= ismax) max_no = nmax; 2098 else if (pos == ismax) max_no = 0; 2099 else max_no = ismax-pos; 2100 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2101 pos += max_no; 2102 } 2103 2104 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2105 PetscFunctionReturn(0); 2106 } 2107 2108 /* -------------------------------------------------------------------------*/ 2109 #undef __FUNCT__ 2110 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 2111 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2112 { 2113 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2114 Mat A = c->A; 2115 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2116 const PetscInt **icol,**irow; 2117 PetscInt *nrow,*ncol,start; 2118 PetscErrorCode ierr; 2119 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2120 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2121 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2122 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2123 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2124 #if defined(PETSC_USE_CTABLE) 2125 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2126 #else 2127 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2128 #endif 2129 const PetscInt *irow_i; 2130 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2131 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2132 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2133 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2134 MPI_Status *r_status3,*r_status4,*s_status4; 2135 MPI_Comm comm; 2136 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2137 PetscMPIInt *onodes1,*olengths1; 2138 PetscMPIInt idex,idex2,end; 2139 2140 PetscFunctionBegin; 2141 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2142 tag0 = ((PetscObject)C)->tag; 2143 size = c->size; 2144 rank = c->rank; 2145 2146 /* Get some new tags to keep the communication clean */ 2147 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2148 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2149 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2150 2151 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2152 2153 for (i=0; i<ismax; i++) { 2154 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2155 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2156 if (allcolumns[i]) { 2157 icol[i] = NULL; 2158 ncol[i] = C->cmap->N; 2159 } else { 2160 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2161 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2162 } 2163 } 2164 2165 /* evaluate communication - mesg to who, length of mesg, and buffer space 2166 required. Based on this, buffers are allocated, and data copied into them*/ 2167 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2168 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2169 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2170 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2171 for (i=0; i<ismax; i++) { 2172 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2173 jmax = nrow[i]; 2174 irow_i = irow[i]; 2175 for (j=0; j<jmax; j++) { 2176 l = 0; 2177 row = irow_i[j]; 2178 while (row >= C->rmap->range[l+1]) l++; 2179 proc = l; 2180 w4[proc]++; 2181 } 2182 for (j=0; j<size; j++) { 2183 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2184 } 2185 } 2186 2187 nrqs = 0; /* no of outgoing messages */ 2188 msz = 0; /* total mesg length (for all procs) */ 2189 w1[rank] = 0; /* no mesg sent to self */ 2190 w3[rank] = 0; 2191 for (i=0; i<size; i++) { 2192 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2193 } 2194 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2195 for (i=0,j=0; i<size; i++) { 2196 if (w1[i]) { pa[j] = i; j++; } 2197 } 2198 2199 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2200 for (i=0; i<nrqs; i++) { 2201 j = pa[i]; 2202 w1[j] += w2[j] + 2* w3[j]; 2203 msz += w1[j]; 2204 } 2205 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2206 2207 /* Determine the number of messages to expect, their lengths, from from-ids */ 2208 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2209 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2210 2211 /* Now post the Irecvs corresponding to these messages */ 2212 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2213 2214 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2215 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2216 2217 /* Allocate Memory for outgoing messages */ 2218 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2219 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2220 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2221 2222 { 2223 PetscInt *iptr = tmp; 2224 k = 0; 2225 for (i=0; i<nrqs; i++) { 2226 j = pa[i]; 2227 iptr += k; 2228 sbuf1[j] = iptr; 2229 k = w1[j]; 2230 } 2231 } 2232 2233 /* Form the outgoing messages */ 2234 /* Initialize the header space */ 2235 for (i=0; i<nrqs; i++) { 2236 j = pa[i]; 2237 sbuf1[j][0] = 0; 2238 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2239 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2240 } 2241 2242 /* Parse the isrow and copy data into outbuf */ 2243 for (i=0; i<ismax; i++) { 2244 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2245 irow_i = irow[i]; 2246 jmax = nrow[i]; 2247 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2248 l = 0; 2249 row = irow_i[j]; 2250 while (row >= C->rmap->range[l+1]) l++; 2251 proc = l; 2252 if (proc != rank) { /* copy to the outgoing buf*/ 2253 ctr[proc]++; 2254 *ptr[proc] = row; 2255 ptr[proc]++; 2256 } 2257 } 2258 /* Update the headers for the current IS */ 2259 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2260 if ((ctr_j = ctr[j])) { 2261 sbuf1_j = sbuf1[j]; 2262 k = ++sbuf1_j[0]; 2263 sbuf1_j[2*k] = ctr_j; 2264 sbuf1_j[2*k-1] = i; 2265 } 2266 } 2267 } 2268 2269 /* Now post the sends */ 2270 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2271 for (i=0; i<nrqs; ++i) { 2272 j = pa[i]; 2273 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2274 } 2275 2276 /* Post Receives to capture the buffer size */ 2277 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2278 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2279 rbuf2[0] = tmp + msz; 2280 for (i=1; i<nrqs; ++i) { 2281 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2282 } 2283 for (i=0; i<nrqs; ++i) { 2284 j = pa[i]; 2285 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2286 } 2287 2288 /* Send to other procs the buf size they should allocate */ 2289 2290 2291 /* Receive messages*/ 2292 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2293 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2294 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 2295 { 2296 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2297 PetscInt *sbuf2_i; 2298 2299 for (i=0; i<nrqr; ++i) { 2300 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2301 2302 req_size[idex] = 0; 2303 rbuf1_i = rbuf1[idex]; 2304 start = 2*rbuf1_i[0] + 1; 2305 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2306 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2307 sbuf2_i = sbuf2[idex]; 2308 for (j=start; j<end; j++) { 2309 id = rbuf1_i[j] - rstart; 2310 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2311 sbuf2_i[j] = ncols; 2312 req_size[idex] += ncols; 2313 } 2314 req_source[idex] = r_status1[i].MPI_SOURCE; 2315 /* form the header */ 2316 sbuf2_i[0] = req_size[idex]; 2317 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2318 2319 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2320 } 2321 } 2322 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2323 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2324 2325 /* recv buffer sizes */ 2326 /* Receive messages*/ 2327 2328 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 2329 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2330 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2331 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2332 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2333 2334 for (i=0; i<nrqs; ++i) { 2335 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2336 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2337 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2338 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2339 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2340 } 2341 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2342 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2343 2344 /* Wait on sends1 and sends2 */ 2345 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2346 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2347 2348 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2349 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2350 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2351 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2352 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2353 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2354 2355 /* Now allocate buffers for a->j, and send them off */ 2356 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2357 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2358 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2359 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2360 2361 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2362 { 2363 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2364 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2365 PetscInt cend = C->cmap->rend; 2366 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2367 2368 for (i=0; i<nrqr; i++) { 2369 rbuf1_i = rbuf1[i]; 2370 sbuf_aj_i = sbuf_aj[i]; 2371 ct1 = 2*rbuf1_i[0] + 1; 2372 ct2 = 0; 2373 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2374 kmax = rbuf1[i][2*j]; 2375 for (k=0; k<kmax; k++,ct1++) { 2376 row = rbuf1_i[ct1] - rstart; 2377 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2378 ncols = nzA + nzB; 2379 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2380 2381 /* load the column indices for this row into cols*/ 2382 cols = sbuf_aj_i + ct2; 2383 2384 lwrite = 0; 2385 for (l=0; l<nzB; l++) { 2386 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2387 } 2388 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2389 for (l=0; l<nzB; l++) { 2390 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2391 } 2392 2393 ct2 += ncols; 2394 } 2395 } 2396 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2397 } 2398 } 2399 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2400 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2401 2402 /* Allocate buffers for a->a, and send them off */ 2403 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2404 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2405 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2406 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2407 2408 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2409 { 2410 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2411 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2412 PetscInt cend = C->cmap->rend; 2413 PetscInt *b_j = b->j; 2414 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2415 2416 for (i=0; i<nrqr; i++) { 2417 rbuf1_i = rbuf1[i]; 2418 sbuf_aa_i = sbuf_aa[i]; 2419 ct1 = 2*rbuf1_i[0]+1; 2420 ct2 = 0; 2421 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2422 kmax = rbuf1_i[2*j]; 2423 for (k=0; k<kmax; k++,ct1++) { 2424 row = rbuf1_i[ct1] - rstart; 2425 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2426 ncols = nzA + nzB; 2427 cworkB = b_j + b_i[row]; 2428 vworkA = a_a + a_i[row]; 2429 vworkB = b_a + b_i[row]; 2430 2431 /* load the column values for this row into vals*/ 2432 vals = sbuf_aa_i+ct2; 2433 2434 lwrite = 0; 2435 for (l=0; l<nzB; l++) { 2436 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2437 } 2438 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2439 for (l=0; l<nzB; l++) { 2440 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2441 } 2442 2443 ct2 += ncols; 2444 } 2445 } 2446 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2447 } 2448 } 2449 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2450 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2451 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2452 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2453 2454 /* Form the matrix */ 2455 /* create col map: global col of C -> local col of submatrices */ 2456 { 2457 const PetscInt *icol_i; 2458 #if defined(PETSC_USE_CTABLE) 2459 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 2460 for (i=0; i<ismax; i++) { 2461 if (!allcolumns[i]) { 2462 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2463 2464 jmax = ncol[i]; 2465 icol_i = icol[i]; 2466 cmap_i = cmap[i]; 2467 for (j=0; j<jmax; j++) { 2468 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2469 } 2470 } else { 2471 cmap[i] = NULL; 2472 } 2473 } 2474 #else 2475 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2476 for (i=0; i<ismax; i++) { 2477 if (!allcolumns[i]) { 2478 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2479 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2480 jmax = ncol[i]; 2481 icol_i = icol[i]; 2482 cmap_i = cmap[i]; 2483 for (j=0; j<jmax; j++) { 2484 cmap_i[icol_i[j]] = j+1; 2485 } 2486 } else { 2487 cmap[i] = NULL; 2488 } 2489 } 2490 #endif 2491 } 2492 2493 /* Create lens which is required for MatCreate... */ 2494 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2495 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2496 if (ismax) { 2497 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2498 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2499 } 2500 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2501 2502 /* Update lens from local data */ 2503 for (i=0; i<ismax; i++) { 2504 jmax = nrow[i]; 2505 if (!allcolumns[i]) cmap_i = cmap[i]; 2506 irow_i = irow[i]; 2507 lens_i = lens[i]; 2508 for (j=0; j<jmax; j++) { 2509 l = 0; 2510 row = irow_i[j]; 2511 while (row >= C->rmap->range[l+1]) l++; 2512 proc = l; 2513 if (proc == rank) { 2514 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2515 if (!allcolumns[i]) { 2516 for (k=0; k<ncols; k++) { 2517 #if defined(PETSC_USE_CTABLE) 2518 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2519 #else 2520 tcol = cmap_i[cols[k]]; 2521 #endif 2522 if (tcol) lens_i[j]++; 2523 } 2524 } else { /* allcolumns */ 2525 lens_i[j] = ncols; 2526 } 2527 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2528 } 2529 } 2530 } 2531 2532 /* Create row map: global row of C -> local row of submatrices */ 2533 #if defined(PETSC_USE_CTABLE) 2534 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 2535 for (i=0; i<ismax; i++) { 2536 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2537 irow_i = irow[i]; 2538 jmax = nrow[i]; 2539 for (j=0; j<jmax; j++) { 2540 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2541 } 2542 } 2543 #else 2544 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2545 if (ismax) { 2546 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2547 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2548 } 2549 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2550 for (i=0; i<ismax; i++) { 2551 rmap_i = rmap[i]; 2552 irow_i = irow[i]; 2553 jmax = nrow[i]; 2554 for (j=0; j<jmax; j++) { 2555 rmap_i[irow_i[j]] = j; 2556 } 2557 } 2558 #endif 2559 2560 /* Update lens from offproc data */ 2561 { 2562 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2563 2564 for (tmp2=0; tmp2<nrqs; tmp2++) { 2565 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2566 idex = pa[idex2]; 2567 sbuf1_i = sbuf1[idex]; 2568 jmax = sbuf1_i[0]; 2569 ct1 = 2*jmax+1; 2570 ct2 = 0; 2571 rbuf2_i = rbuf2[idex2]; 2572 rbuf3_i = rbuf3[idex2]; 2573 for (j=1; j<=jmax; j++) { 2574 is_no = sbuf1_i[2*j-1]; 2575 max1 = sbuf1_i[2*j]; 2576 lens_i = lens[is_no]; 2577 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2578 rmap_i = rmap[is_no]; 2579 for (k=0; k<max1; k++,ct1++) { 2580 #if defined(PETSC_USE_CTABLE) 2581 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2582 row--; 2583 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2584 #else 2585 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2586 #endif 2587 max2 = rbuf2_i[ct1]; 2588 for (l=0; l<max2; l++,ct2++) { 2589 if (!allcolumns[is_no]) { 2590 #if defined(PETSC_USE_CTABLE) 2591 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2592 #else 2593 tcol = cmap_i[rbuf3_i[ct2]]; 2594 #endif 2595 if (tcol) lens_i[row]++; 2596 } else { /* allcolumns */ 2597 lens_i[row]++; /* lens_i[row] += max2 ? */ 2598 } 2599 } 2600 } 2601 } 2602 } 2603 } 2604 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2605 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2606 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2607 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2608 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2609 2610 /* Create the submatrices */ 2611 if (scall == MAT_REUSE_MATRIX) { 2612 PetscBool flag; 2613 2614 /* 2615 Assumes new rows are same length as the old rows,hence bug! 2616 */ 2617 for (i=0; i<ismax; i++) { 2618 mat = (Mat_SeqAIJ*)(submats[i]->data); 2619 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2620 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2621 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 2622 /* Initial matrix as if empty */ 2623 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2624 2625 submats[i]->factortype = C->factortype; 2626 } 2627 } else { 2628 for (i=0; i<ismax; i++) { 2629 PetscInt rbs,cbs; 2630 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2631 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2632 2633 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2634 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2635 2636 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2637 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2638 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2639 } 2640 } 2641 2642 /* Assemble the matrices */ 2643 /* First assemble the local rows */ 2644 { 2645 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2646 PetscScalar *imat_a; 2647 2648 for (i=0; i<ismax; i++) { 2649 mat = (Mat_SeqAIJ*)submats[i]->data; 2650 imat_ilen = mat->ilen; 2651 imat_j = mat->j; 2652 imat_i = mat->i; 2653 imat_a = mat->a; 2654 2655 if (!allcolumns[i]) cmap_i = cmap[i]; 2656 rmap_i = rmap[i]; 2657 irow_i = irow[i]; 2658 jmax = nrow[i]; 2659 for (j=0; j<jmax; j++) { 2660 l = 0; 2661 row = irow_i[j]; 2662 while (row >= C->rmap->range[l+1]) l++; 2663 proc = l; 2664 if (proc == rank) { 2665 old_row = row; 2666 #if defined(PETSC_USE_CTABLE) 2667 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2668 row--; 2669 #else 2670 row = rmap_i[row]; 2671 #endif 2672 ilen_row = imat_ilen[row]; 2673 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2674 mat_i = imat_i[row]; 2675 mat_a = imat_a + mat_i; 2676 mat_j = imat_j + mat_i; 2677 if (!allcolumns[i]) { 2678 for (k=0; k<ncols; k++) { 2679 #if defined(PETSC_USE_CTABLE) 2680 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2681 #else 2682 tcol = cmap_i[cols[k]]; 2683 #endif 2684 if (tcol) { 2685 *mat_j++ = tcol - 1; 2686 *mat_a++ = vals[k]; 2687 ilen_row++; 2688 } 2689 } 2690 } else { /* allcolumns */ 2691 for (k=0; k<ncols; k++) { 2692 *mat_j++ = cols[k]; /* global col index! */ 2693 *mat_a++ = vals[k]; 2694 ilen_row++; 2695 } 2696 } 2697 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2698 2699 imat_ilen[row] = ilen_row; 2700 } 2701 } 2702 } 2703 } 2704 2705 /* Now assemble the off proc rows*/ 2706 { 2707 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2708 PetscInt *imat_j,*imat_i; 2709 PetscScalar *imat_a,*rbuf4_i; 2710 2711 for (tmp2=0; tmp2<nrqs; tmp2++) { 2712 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2713 idex = pa[idex2]; 2714 sbuf1_i = sbuf1[idex]; 2715 jmax = sbuf1_i[0]; 2716 ct1 = 2*jmax + 1; 2717 ct2 = 0; 2718 rbuf2_i = rbuf2[idex2]; 2719 rbuf3_i = rbuf3[idex2]; 2720 rbuf4_i = rbuf4[idex2]; 2721 for (j=1; j<=jmax; j++) { 2722 is_no = sbuf1_i[2*j-1]; 2723 rmap_i = rmap[is_no]; 2724 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2725 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2726 imat_ilen = mat->ilen; 2727 imat_j = mat->j; 2728 imat_i = mat->i; 2729 imat_a = mat->a; 2730 max1 = sbuf1_i[2*j]; 2731 for (k=0; k<max1; k++,ct1++) { 2732 row = sbuf1_i[ct1]; 2733 #if defined(PETSC_USE_CTABLE) 2734 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2735 row--; 2736 #else 2737 row = rmap_i[row]; 2738 #endif 2739 ilen = imat_ilen[row]; 2740 mat_i = imat_i[row]; 2741 mat_a = imat_a + mat_i; 2742 mat_j = imat_j + mat_i; 2743 max2 = rbuf2_i[ct1]; 2744 if (!allcolumns[is_no]) { 2745 for (l=0; l<max2; l++,ct2++) { 2746 2747 #if defined(PETSC_USE_CTABLE) 2748 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2749 #else 2750 tcol = cmap_i[rbuf3_i[ct2]]; 2751 #endif 2752 if (tcol) { 2753 *mat_j++ = tcol - 1; 2754 *mat_a++ = rbuf4_i[ct2]; 2755 ilen++; 2756 } 2757 } 2758 } else { /* allcolumns */ 2759 for (l=0; l<max2; l++,ct2++) { 2760 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2761 *mat_a++ = rbuf4_i[ct2]; 2762 ilen++; 2763 } 2764 } 2765 imat_ilen[row] = ilen; 2766 } 2767 } 2768 } 2769 } 2770 2771 /* sort the rows */ 2772 { 2773 PetscInt *imat_ilen,*imat_j,*imat_i; 2774 PetscScalar *imat_a; 2775 2776 for (i=0; i<ismax; i++) { 2777 mat = (Mat_SeqAIJ*)submats[i]->data; 2778 imat_j = mat->j; 2779 imat_i = mat->i; 2780 imat_a = mat->a; 2781 imat_ilen = mat->ilen; 2782 2783 if (allcolumns[i]) continue; 2784 jmax = nrow[i]; 2785 for (j=0; j<jmax; j++) { 2786 PetscInt ilen; 2787 2788 mat_i = imat_i[j]; 2789 mat_a = imat_a + mat_i; 2790 mat_j = imat_j + mat_i; 2791 ilen = imat_ilen[j]; 2792 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2793 } 2794 } 2795 } 2796 2797 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2798 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2799 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2800 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2801 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2802 2803 /* Restore the indices */ 2804 for (i=0; i<ismax; i++) { 2805 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2806 if (!allcolumns[i]) { 2807 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2808 } 2809 } 2810 2811 /* Destroy allocated memory */ 2812 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2813 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2814 ierr = PetscFree(pa);CHKERRQ(ierr); 2815 2816 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2817 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 2818 for (i=0; i<nrqr; ++i) { 2819 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2820 } 2821 for (i=0; i<nrqs; ++i) { 2822 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2823 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2824 } 2825 2826 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 2827 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 2828 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2829 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2830 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2831 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2832 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2833 2834 #if defined(PETSC_USE_CTABLE) 2835 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 2836 #else 2837 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 2838 #endif 2839 ierr = PetscFree(rmap);CHKERRQ(ierr); 2840 2841 for (i=0; i<ismax; i++) { 2842 if (!allcolumns[i]) { 2843 #if defined(PETSC_USE_CTABLE) 2844 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 2845 #else 2846 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 2847 #endif 2848 } 2849 } 2850 ierr = PetscFree(cmap);CHKERRQ(ierr); 2851 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2852 ierr = PetscFree(lens);CHKERRQ(ierr); 2853 2854 for (i=0; i<ismax; i++) { 2855 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2856 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2857 } 2858 PetscFunctionReturn(0); 2859 } 2860 2861 /* 2862 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2863 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2864 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2865 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2866 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2867 state, and needs to be "assembled" later by compressing B's column space. 2868 2869 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2870 Following this call, C->A & C->B have been created, even if empty. 2871 */ 2872 #undef __FUNCT__ 2873 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 2874 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2875 { 2876 /* If making this function public, change the error returned in this function away from _PLIB. */ 2877 PetscErrorCode ierr; 2878 Mat_MPIAIJ *aij; 2879 Mat_SeqAIJ *Baij; 2880 PetscBool seqaij,Bdisassembled; 2881 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2882 PetscScalar v; 2883 const PetscInt *rowindices,*colindices; 2884 2885 PetscFunctionBegin; 2886 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2887 if (A) { 2888 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2889 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2890 if (rowemb) { 2891 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2892 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2893 } else { 2894 if (C->rmap->n != A->rmap->n) { 2895 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2896 } 2897 } 2898 if (dcolemb) { 2899 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2900 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2901 } else { 2902 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2903 } 2904 } 2905 if (B) { 2906 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2907 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2908 if (rowemb) { 2909 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2910 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2911 } else { 2912 if (C->rmap->n != B->rmap->n) { 2913 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2914 } 2915 } 2916 if (ocolemb) { 2917 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2918 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2919 } else { 2920 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2921 } 2922 } 2923 2924 aij = (Mat_MPIAIJ*)(C->data); 2925 if (!aij->A) { 2926 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2927 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2928 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2929 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2930 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2931 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2932 } 2933 if (A) { 2934 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2935 } else { 2936 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2937 } 2938 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2939 /* 2940 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2941 need to "disassemble" B -- convert it to using C's global indices. 2942 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2943 2944 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2945 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2946 2947 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2948 At least avoid calling MatSetValues() and the implied searches? 2949 */ 2950 2951 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2952 #if defined(PETSC_USE_CTABLE) 2953 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2954 #else 2955 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2956 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2957 if (aij->B) { 2958 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2959 } 2960 #endif 2961 ngcol = 0; 2962 if (aij->lvec) { 2963 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2964 } 2965 if (aij->garray) { 2966 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2967 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2968 } 2969 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2970 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2971 } 2972 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2973 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2974 } 2975 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2976 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2977 } 2978 } 2979 Bdisassembled = PETSC_FALSE; 2980 if (!aij->B) { 2981 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2982 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2983 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2984 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2985 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2986 Bdisassembled = PETSC_TRUE; 2987 } 2988 if (B) { 2989 Baij = (Mat_SeqAIJ*)(B->data); 2990 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2991 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2992 for (i=0; i<B->rmap->n; i++) { 2993 nz[i] = Baij->i[i+1] - Baij->i[i]; 2994 } 2995 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2996 ierr = PetscFree(nz);CHKERRQ(ierr); 2997 } 2998 2999 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3000 shift = rend-rstart; 3001 count = 0; 3002 rowindices = NULL; 3003 colindices = NULL; 3004 if (rowemb) { 3005 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3006 } 3007 if (ocolemb) { 3008 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3009 } 3010 for (i=0; i<B->rmap->n; i++) { 3011 PetscInt row; 3012 row = i; 3013 if (rowindices) row = rowindices[i]; 3014 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3015 col = Baij->j[count]; 3016 if (colindices) col = colindices[col]; 3017 if (Bdisassembled && col>=rstart) col += shift; 3018 v = Baij->a[count]; 3019 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3020 ++count; 3021 } 3022 } 3023 /* No assembly for aij->B is necessary. */ 3024 /* FIXME: set aij->B's nonzerostate correctly. */ 3025 } else { 3026 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3027 } 3028 C->preallocated = PETSC_TRUE; 3029 C->was_assembled = PETSC_FALSE; 3030 C->assembled = PETSC_FALSE; 3031 /* 3032 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3033 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3034 */ 3035 PetscFunctionReturn(0); 3036 } 3037 3038 #undef __FUNCT__ 3039 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 3040 /* 3041 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3042 */ 3043 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3044 { 3045 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3046 3047 PetscFunctionBegin; 3048 PetscValidPointer(A,2); 3049 PetscValidPointer(B,3); 3050 /* FIXME: make sure C is assembled */ 3051 *A = aij->A; 3052 *B = aij->B; 3053 /* Note that we don't incref *A and *B, so be careful! */ 3054 PetscFunctionReturn(0); 3055 } 3056 3057 /* 3058 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3059 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3060 */ 3061 #undef __FUNCT__ 3062 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 3063 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3064 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3065 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3066 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3067 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3068 { 3069 PetscErrorCode ierr; 3070 PetscMPIInt isize,flag; 3071 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3072 Mat *A,*B; 3073 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3074 3075 PetscFunctionBegin; 3076 if (!ismax) PetscFunctionReturn(0); 3077 3078 for (i = 0, cismax = 0; i < ismax; ++i) { 3079 PetscMPIInt isize; 3080 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3081 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3082 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3083 if (isize > 1) ++cismax; 3084 } 3085 /* 3086 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3087 ispar counts the number of parallel ISs across C's comm. 3088 */ 3089 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3090 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3091 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3092 PetscFunctionReturn(0); 3093 } 3094 3095 /* if (ispar) */ 3096 /* 3097 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3098 These are used to extract the off-diag portion of the resulting parallel matrix. 3099 The row IS for the off-diag portion is the same as for the diag portion, 3100 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3101 */ 3102 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3103 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3104 for (i = 0, ii = 0; i < ismax; ++i) { 3105 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3106 if (isize > 1) { 3107 /* 3108 TODO: This is the part that's ***NOT SCALABLE***. 3109 To fix this we need to extract just the indices of C's nonzero columns 3110 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3111 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3112 to be done without serializing on the IS list, so, most likely, it is best 3113 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3114 */ 3115 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3116 /* Now we have to 3117 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3118 were sorted on each rank, concatenated they might no longer be sorted; 3119 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3120 indices in the nondecreasing order to the original index positions. 3121 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3122 */ 3123 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3124 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3125 ++ii; 3126 } 3127 } 3128 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3129 for (i = 0, ii = 0; i < ismax; ++i) { 3130 PetscInt j,issize; 3131 const PetscInt *indices; 3132 3133 /* 3134 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3135 */ 3136 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3137 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3138 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3139 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3140 for (j = 1; j < issize; ++j) { 3141 if (indices[j] == indices[j-1]) { 3142 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3143 } 3144 } 3145 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3146 3147 3148 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3149 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3150 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3151 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3152 for (j = 1; j < issize; ++j) { 3153 if (indices[j-1] == indices[j]) { 3154 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3155 } 3156 } 3157 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3158 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3159 if (isize > 1) { 3160 cisrow[ii] = isrow[i]; 3161 ++ii; 3162 } 3163 } 3164 /* 3165 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3166 array of sequential matrices underlying the resulting parallel matrices. 3167 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3168 contain duplicates. 3169 3170 There are as many diag matrices as there are original index sets. There are only as many parallel 3171 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3172 3173 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3174 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3175 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3176 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3177 with A[i] and B[ii] extracted from the corresponding MPI submat. 3178 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3179 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3180 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3181 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3182 values into A[i] and B[ii] sitting inside the corresponding submat. 3183 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3184 A[i], B[ii], AA[i] or BB[ii] matrices. 3185 */ 3186 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3187 if (scall == MAT_INITIAL_MATRIX) { 3188 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3189 /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */ 3190 } else { 3191 ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr); 3192 ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr); 3193 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */ 3194 for (i = 0, ii = 0; i < ismax; ++i) { 3195 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3196 if (isize > 1) { 3197 Mat AA,BB; 3198 ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr); 3199 if (!isrow_p[i] && !iscol_p[i]) { 3200 A[i] = AA; 3201 } else { 3202 /* TODO: extract A[i] composed on AA. */ 3203 ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3204 } 3205 if (!isrow_p[i] && !ciscol_p[ii]) { 3206 B[ii] = BB; 3207 } else { 3208 /* TODO: extract B[ii] composed on BB. */ 3209 ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr); 3210 } 3211 ++ii; 3212 } else { 3213 if (!isrow_p[i] && !iscol_p[i]) { 3214 A[i] = (*submat)[i]; 3215 } else { 3216 /* TODO: extract A[i] composed on (*submat)[i]. */ 3217 ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3218 } 3219 } 3220 } 3221 } 3222 /* Now obtain the sequential A and B submatrices separately. */ 3223 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr); 3224 /* I did not figure out a good way to fix it right now. 3225 * Local column size of B[i] is different from the size of ciscol[i]. 3226 * B[i]'s size is finally determined by MatAssembly, while 3227 * ciscol[i] is computed as the complement of iscol[i]. 3228 * It is better to keep only nonzero indices when computing 3229 * the complement ciscol[i]. 3230 * */ 3231 if(scall==MAT_REUSE_MATRIX){ 3232 for(i=0; i<cismax; i++){ 3233 ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr); 3234 B[i]->cmap->n = ciscol_localsize; 3235 } 3236 } 3237 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr); 3238 /* 3239 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3240 matrices A & B have been extracted directly into the parallel matrices containing them, or 3241 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3242 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3243 to have the same sparsity pattern. 3244 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3245 must be constructed for C. This is done by setseqmat(s). 3246 */ 3247 for (i = 0, ii = 0; i < ismax; ++i) { 3248 /* 3249 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3250 That way we can avoid sorting and computing permutations when reusing. 3251 To this end: 3252 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3253 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3254 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3255 */ 3256 MatStructure pattern; 3257 if (scall == MAT_INITIAL_MATRIX) { 3258 pattern = DIFFERENT_NONZERO_PATTERN; 3259 } else { 3260 pattern = SAME_NONZERO_PATTERN; 3261 } 3262 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3263 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3264 if (isize > 1) { 3265 if (scall == MAT_INITIAL_MATRIX) { 3266 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3267 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3268 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3269 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3270 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3271 } 3272 /* 3273 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3274 */ 3275 { 3276 Mat AA,BB; 3277 AA = NULL; 3278 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i]; 3279 BB = NULL; 3280 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii]; 3281 if (AA || BB) { 3282 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3283 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3284 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3285 } 3286 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) { 3287 /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */ 3288 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3289 } 3290 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) { 3291 /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */ 3292 ierr = MatDestroy(&BB);CHKERRQ(ierr); 3293 } 3294 } 3295 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3296 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3297 ++ii; 3298 } else { /* if (isize == 1) */ 3299 if (scall == MAT_INITIAL_MATRIX) { 3300 if (isrow_p[i] || iscol_p[i]) { 3301 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3302 } else (*submat)[i] = A[i]; 3303 } 3304 if (isrow_p[i] || iscol_p[i]) { 3305 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3306 /* Otherwise A is extracted straight into (*submats)[i]. */ 3307 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3308 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3309 } 3310 } 3311 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3312 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3313 } 3314 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3315 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3316 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3317 ierr = PetscFree(A);CHKERRQ(ierr); 3318 ierr = PetscFree(B);CHKERRQ(ierr); 3319 PetscFunctionReturn(0); 3320 } 3321 3322 3323 3324 #undef __FUNCT__ 3325 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ" 3326 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3327 { 3328 PetscErrorCode ierr; 3329 3330 PetscFunctionBegin; 3331 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3332 PetscFunctionReturn(0); 3333 } 3334