1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 #undef __FUNCT__ 23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 25 { 26 PetscErrorCode ierr; 27 PetscInt i; 28 29 PetscFunctionBegin; 30 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 31 for (i=0; i<ov; ++i) { 32 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 33 } 34 PetscFunctionReturn(0); 35 } 36 37 #undef __FUNCT__ 38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable" 39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 40 { 41 PetscErrorCode ierr; 42 PetscInt i; 43 44 PetscFunctionBegin; 45 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 46 for (i=0; i<ov; ++i) { 47 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 48 } 49 PetscFunctionReturn(0); 50 } 51 52 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 56 { 57 PetscErrorCode ierr; 58 MPI_Comm comm; 59 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 60 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 61 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 62 const PetscInt *indices_i,**indices; 63 PetscLayout rmap; 64 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 65 PetscSF sf; 66 PetscSFNode *remote; 67 68 PetscFunctionBegin; 69 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 70 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 71 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 72 /* get row map to determine where rows should be going */ 73 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 74 /* retrieve IS data and put all together so that we 75 * can optimize communication 76 * */ 77 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 78 for (i=0,tlength=0; i<nidx; i++){ 79 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 80 tlength += length[i]; 81 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 82 } 83 /* find these rows on remote processors */ 84 ierr = 
PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 85 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 86 nrrows = 0; 87 for (i=0; i<nidx; i++){ 88 length_i = length[i]; 89 indices_i = indices[i]; 90 for (j=0; j<length_i; j++){ 91 owner = -1; 92 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 93 /* remote processors */ 94 if (owner != rank){ 95 tosizes_temp[owner]++; /* number of rows to owner */ 96 rrow_ranks[nrrows] = owner; /* processor */ 97 rrow_isids[nrrows] = i; /* is id */ 98 remoterows[nrrows++] = indices_i[j]; /* row */ 99 } 100 } 101 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 102 } 103 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 104 /* test if we need to exchange messages 105 * generally speaking, we do not need to exchange 106 * data when overlap is 1 107 * */ 108 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 109 /* we do not have any messages 110 * It usually corresponds to overlap 1 111 * */ 112 if (!reducednrrows){ 113 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 114 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 115 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 nto = 0; 119 /* send sizes and ranks for building a two-sided communcation */ 120 for (i=0; i<size; i++){ 121 if (tosizes_temp[i]){ 122 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 123 tosizes_temp[i] = nto; /* a map from processor to index */ 124 toranks[nto++] = i; /* processor */ 125 } 126 } 127 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 128 for (i=0; i<nto; i++){ 129 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 130 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 131 } 132 /* send information to other processors */ 133 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 134 nrecvrows = 0; 135 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 136 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 137 nrecvrows = 0; 138 for (i=0; i<nfrom; i++){ 139 for (j=0; j<fromsizes[2*i]; j++){ 140 remote[nrecvrows].rank = fromranks[i]; 141 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 142 } 143 } 144 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 145 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 146 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 147 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 148 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 149 /* message pair <no of is, row> */ 150 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 151 for (i=0; i<nrrows; i++){ 152 owner = rrow_ranks[i]; /* processor */ 153 j = tosizes_temp[owner]; /* index */ 154 todata[toffsets[j]++] = rrow_isids[i]; 155 todata[toffsets[j]++] = remoterows[i]; 156 } 157 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 158 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 159 ierr = PetscFree(toffsets);CHKERRQ(ierr); 160 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 161 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 162 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 163 /* send rows belonging to the remote so that then we could get the overlapping data back */ 164 ierr = 
MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 165 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 166 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 167 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 168 ierr = PetscFree(fromranks);CHKERRQ(ierr); 169 nrecvrows = 0; 170 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 171 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 172 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 173 nrecvrows = 0; 174 for (i=0; i<nto; i++){ 175 for (j=0; j<tosizes[2*i]; j++){ 176 remote[nrecvrows].rank = toranks[i]; 177 remote[nrecvrows++].index = tosizes[2*i+1]+j; 178 } 179 } 180 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 181 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 182 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 183 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 184 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 185 /* overlap communication and computation */ 186 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 187 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 188 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 189 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 190 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 191 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 192 ierr = PetscFree(toranks);CHKERRQ(ierr); 193 ierr = PetscFree(tosizes);CHKERRQ(ierr); 194 ierr = PetscFree(todata);CHKERRQ(ierr); 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable" 200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 201 { 202 PetscInt *isz,isz_i,i,j,is_id, data_size; 203 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 204 const PetscInt *indices_i_temp; 205 PetscErrorCode ierr; 206 207 PetscFunctionBegin; 208 max_lsize = 0; 209 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 210 for (i=0; i<nidx; i++){ 211 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 212 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 213 isz[i] = lsize; 214 } 215 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 216 for (i=0; i<nidx; i++){ 217 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 218 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 219 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 220 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 221 } 222 /* retrieve information to get row id and its overlap */ 223 for (i=0; i<nrecvs; ){ 224 is_id = recvdata[i++]; 225 data_size = recvdata[i++]; 226 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 227 isz_i = isz[is_id]; 228 for (j=0; j< data_size; j++){ 229 col = recvdata[i++]; 230 indices_i[isz_i++] = col; 231 } 232 isz[is_id] = isz_i; 233 } 234 /* remove duplicate entities */ 235 for (i=0; i<nidx; i++){ 236 indices_i = indices_temp+(max_lsize+nrecvs)*i; 237 isz_i = isz[i]; 238 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 239 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 240 } 241 ierr = PetscFree(isz);CHKERRQ(ierr); 242 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 243 PetscFunctionReturn(0); 244 } 245 246 #undef __FUNCT__ 247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable" 248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 249 { 250 PetscLayout rmap,cmap; 251 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 252 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 253 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 254 const PetscInt *gcols,*ai,*aj,*bi,*bj; 255 Mat amat,bmat; 256 PetscMPIInt rank; 257 PetscBool done; 258 MPI_Comm comm; 259 PetscErrorCode ierr; 260 261 PetscFunctionBegin; 262 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 263 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 264 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 265 /* Even if the mat is symmetric, we still assume it is not symmetric */ 266 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 267 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 268 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 269 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 270 /* total number of nonzero values is used to estimate the memory usage in the next step */ 271 tnz = ai[an]+bi[bn]; 272 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 273 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 274 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 275 /* to find the longest message */ 276 max_fszs = 0; 277 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 278 /* better way to estimate number of nonzero in the mat??? 
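 * Note: tnz = ai[an] + bi[bn] is the total number of nonzeros in the local diagonal and
 * off-diagonal blocks, so it should be a safe upper bound for the scratch array indices_tmp,
 * provided each incoming index set lists a given local row at most once.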
*/ 279 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 280 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 281 rows_pos = 0; 282 totalrows = 0; 283 for (i=0; i<nfrom; i++){ 284 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 285 /* group data together */ 286 for (j=0; j<fromsizes[2*i]; j+=2){ 287 is_id = fromrows[rows_pos++];/* no of is */ 288 rows_i = rows_data[is_id]; 289 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 290 } 291 /* estimate a space to avoid multiple allocations */ 292 for (j=0; j<nidx; j++){ 293 indvc_ij = 0; 294 rows_i = rows_data[j]; 295 for (l=0; l<rows_pos_i[j]; l++){ 296 row = rows_i[l]-rstart; 297 start = ai[row]; 298 end = ai[row+1]; 299 for (k=start; k<end; k++){ /* Amat */ 300 col = aj[k] + cstart; 301 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 302 } 303 start = bi[row]; 304 end = bi[row+1]; 305 for (k=start; k<end; k++) { /* Bmat */ 306 col = gcols[bj[k]]; 307 indices_tmp[indvc_ij++] = col; 308 } 309 } 310 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 311 indv_counts[i*nidx+j] = indvc_ij; 312 totalrows += indvc_ij; 313 } 314 } 315 /* message triple <no of is, number of rows, rows> */ 316 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 317 totalrows = 0; 318 rows_pos = 0; 319 /* use this code again */ 320 for (i=0;i<nfrom;i++){ 321 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 322 for (j=0; j<fromsizes[2*i]; j+=2){ 323 is_id = fromrows[rows_pos++]; 324 rows_i = rows_data[is_id]; 325 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 326 } 327 /* add data */ 328 for (j=0; j<nidx; j++){ 329 if (!indv_counts[i*nidx+j]) continue; 330 indvc_ij = 0; 331 sbdata[totalrows++] = j; 332 sbdata[totalrows++] = indv_counts[i*nidx+j]; 333 sbsizes[2*i] += 2; 334 rows_i = rows_data[j]; 335 for (l=0; l<rows_pos_i[j]; l++){ 336 row = rows_i[l]-rstart; 337 start = ai[row]; 338 end = ai[row+1]; 339 for (k=start; k<end; k++){ /* Amat */ 340 col = aj[k] + cstart; 341 indices_tmp[indvc_ij++] = col; 342 } 343 start = bi[row]; 344 end = bi[row+1]; 345 for (k=start; k<end; k++) { /* Bmat */ 346 col = gcols[bj[k]]; 347 indices_tmp[indvc_ij++] = col; 348 } 349 } 350 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 351 sbsizes[2*i] += indvc_ij; 352 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 353 totalrows += indvc_ij; 354 } 355 } 356 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 357 for (i=0; i<nfrom; i++){ 358 offsets[i+1] = offsets[i] + sbsizes[2*i]; 359 sbsizes[2*i+1] = offsets[i]; 360 } 361 ierr = PetscFree(offsets);CHKERRQ(ierr); 362 if (sbrowsizes) *sbrowsizes = sbsizes; 363 if (sbrows) *sbrows = sbdata; 364 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 365 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 366 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 373 { 374 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 375 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 376 PetscInt 
lsize,lsize_tmp,owner; 377 PetscMPIInt rank; 378 Mat amat,bmat; 379 PetscBool done; 380 PetscLayout cmap,rmap; 381 MPI_Comm comm; 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 386 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 387 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 388 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 389 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 390 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 391 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 392 /* is this a safe way to compute the number of nonzero values? */ 393 tnz = ai[an]+bi[bn]; 394 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 395 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 396 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 397 /* this is a better way to estimate memory than the old implementation, 398 * which used the global size of the matrix 399 * */ 400 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 401 for (i=0; i<nidx; i++) { 402 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 403 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 404 lsize_tmp = 0; 405 for (j=0; j<lsize; j++) { 406 owner = -1; 407 row = indices[j]; 408 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 409 if (owner != rank) continue; 410 /* local number */ 411 row -= rstart; 412 start = ai[row]; 413 end = ai[row+1]; 414 for (k=start; k<end; k++) { /* Amat */ 415 col = aj[k] + cstart; 416 indices_temp[lsize_tmp++] = col; 417 } 418 start = bi[row]; 419 end = bi[row+1]; 420 for (k=start; k<end; k++) { /* Bmat */ 421 col = gcols[bj[k]]; 422 indices_temp[lsize_tmp++] = col; 423 } 424 } 425 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 426 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 427 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 428 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 429 } 430 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 431 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 432 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 433 PetscFunctionReturn(0); 434 } 435 436 437 /* 438 Sample message format: 439 If a processor A wants processor B to process some elements corresponding 440 to index sets is[1],is[5] 441 mesg [0] = 2 (number of index sets in the mesg) 442 ----------- 443 mesg [1] = 1 => is[1] 444 mesg [2] = size of is[1]; 445 ----------- 446 mesg [3] = 5 => is[5] 447 mesg [4] = size of is[5]; 448 ----------- 449 mesg [5] 450 mesg [n] data(is[1]) 451 ----------- 452 mesg[n+1] 453 mesg[m] data(is[5]) 454 ----------- 455 456 Notes: 457 nrqs - no of requests sent (or to be sent out) 458 nrqr - no of requests received (which have to be, or which have been, processed) 459 */ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 463 { 464 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 465 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 466 const PetscInt **idx,*idx_i; 467 PetscInt *n,**data,len; 468 #if defined(PETSC_USE_CTABLE) 469 PetscTable *table_data,table_data_i; 470 PetscInt *tdata,tcount,tcount_max; 471 #else 472 PetscInt *data_i,*d_p;
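/* without PetscTable support: data[i] = d_p + M*i gives each index set a length-M scratch block in which the accumulated global row indices are stored, with isz[i] tracking how many entries are filled */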
473 #endif 474 PetscErrorCode ierr; 475 PetscMPIInt size,rank,tag1,tag2; 476 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 477 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 478 PetscBT *table; 479 MPI_Comm comm; 480 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 481 MPI_Status *s_status,*recv_status; 482 char *t_p; 483 484 PetscFunctionBegin; 485 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 486 size = c->size; 487 rank = c->rank; 488 M = C->rmap->N; 489 490 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 491 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 492 493 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 494 495 for (i=0; i<imax; i++) { 496 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 497 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 498 } 499 500 /* evaluate communication - mesg to who,length of mesg, and buffer space 501 required. Based on this, buffers are allocated, and data copied into them */ 502 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 503 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 504 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 505 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 506 for (i=0; i<imax; i++) { 507 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 508 idx_i = idx[i]; 509 len = n[i]; 510 for (j=0; j<len; j++) { 511 row = idx_i[j]; 512 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 513 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 514 w4[proc]++; 515 } 516 for (j=0; j<size; j++) { 517 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 518 } 519 } 520 521 nrqs = 0; /* no of outgoing messages */ 522 msz = 0; /* total mesg length (for all proc */ 523 w1[rank] = 0; /* no mesg sent to intself */ 524 w3[rank] = 0; 525 for (i=0; i<size; i++) { 526 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 527 } 528 /* pa - is list of processors to communicate with */ 529 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 530 for (i=0,j=0; i<size; i++) { 531 if (w1[i]) {pa[j] = i; j++;} 532 } 533 534 /* Each message would have a header = 1 + 2*(no of IS) + data */ 535 for (i=0; i<nrqs; i++) { 536 j = pa[i]; 537 w1[j] += w2[j] + 2*w3[j]; 538 msz += w1[j]; 539 } 540 541 /* Determine the number of messages to expect, their lengths, from from-ids */ 542 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 543 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 544 545 /* Now post the Irecvs corresponding to these messages */ 546 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 547 548 /* Allocate Memory for outgoing messages */ 549 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 550 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 551 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 552 553 { 554 PetscInt *iptr = tmp,ict = 0; 555 for (i=0; i<nrqs; i++) { 556 j = pa[i]; 557 iptr += ict; 558 outdat[j] = iptr; 559 ict = w1[j]; 560 } 561 } 562 563 /* Form the outgoing messages */ 564 /* plug in the headers */ 565 for (i=0; i<nrqs; i++) { 566 j = pa[i]; 567 outdat[j][0] = 0; 568 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 569 ptr[j] 
= outdat[j] + 2*w3[j] + 1; 570 } 571 572 /* Memory for doing local proc's work */ 573 { 574 #if defined(PETSC_USE_CTABLE) 575 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 576 for (i=0; i<imax; i++) { 577 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 578 } 579 ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 580 for (i=0; i<imax; i++) { 581 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 582 } 583 #else 584 PetscInt Mimax = 0,M_BPB_imax = 0; 585 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 586 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 587 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 588 /* Mimax and M_BPB_imax are the overflow-checked products M*imax and (M/PETSC_BITS_PER_BYTE+1)*imax */ 589 for (i=0; i<imax; i++) { 590 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 591 data[i] = d_p + M*i; 592 } 593 #endif 594 } 595 596 /* Parse the IS and update local tables and the outgoing buf with the data */ 597 { 598 PetscInt n_i,isz_i,*outdat_j,ctr_j; 599 PetscBT table_i; 600 601 for (i=0; i<imax; i++) { 602 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 603 n_i = n[i]; 604 table_i = table[i]; 605 idx_i = idx[i]; 606 #if defined(PETSC_USE_CTABLE) 607 table_data_i = table_data[i]; 608 #else 609 data_i = data[i]; 610 #endif 611 isz_i = isz[i]; 612 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 613 row = idx_i[j]; 614 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 615 if (proc != rank) { /* copy to the outgoing buffer */ 616 ctr[proc]++; 617 *ptr[proc] = row; 618 ptr[proc]++; 619 } else if (!PetscBTLookupSet(table_i,row)) { 620 #if defined(PETSC_USE_CTABLE) 621 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 622 #else 623 data_i[isz_i] = row; /* Update the local table */ 624 #endif 625 isz_i++; 626 } 627 } 628 /* Update the headers for the current IS */ 629 for (j=0; j<size; j++) { /* Could optimise this loop by using pa[] */ 630 if ((ctr_j = ctr[j])) { 631 outdat_j = outdat[j]; 632 k = ++outdat_j[0]; 633 outdat_j[2*k] = ctr_j; 634 outdat_j[2*k-1] = i; 635 } 636 } 637 isz[i] = isz_i; 638 } 639 } 640 641 /* Now post the sends */ 642 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 643 for (i=0; i<nrqs; ++i) { 644 j = pa[i]; 645 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 646 } 647 648 /* No longer need the original indices */ 649 for (i=0; i<imax; ++i) { 650 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 651 } 652 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 653 654 for (i=0; i<imax; ++i) { 655 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 656 } 657 658 /* Do Local work */ 659 #if defined(PETSC_USE_CTABLE) 660 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 661 #else 662 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 663 #endif 664 665 /* Receive messages */ 666 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 667 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 668 669 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 670 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 671 672 /* Phase 1 sends are complete - deallocate buffers */ 673 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 674 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 675 676 ierr =
PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 677 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 678 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 679 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 680 ierr = PetscFree(rbuf);CHKERRQ(ierr); 681 682 683 /* Send the data back */ 684 /* Do a global reduction to know the buffer space req for incoming messages */ 685 { 686 PetscMPIInt *rw1; 687 688 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 689 690 for (i=0; i<nrqr; ++i) { 691 proc = recv_status[i].MPI_SOURCE; 692 693 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 694 rw1[proc] = isz1[i]; 695 } 696 ierr = PetscFree(onodes1);CHKERRQ(ierr); 697 ierr = PetscFree(olengths1);CHKERRQ(ierr); 698 699 /* Determine the number of messages to expect, their lengths, from from-ids */ 700 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 701 ierr = PetscFree(rw1);CHKERRQ(ierr); 702 } 703 /* Now post the Irecvs corresponding to these messages */ 704 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 705 706 /* Now post the sends */ 707 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 708 for (i=0; i<nrqr; ++i) { 709 j = recv_status[i].MPI_SOURCE; 710 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 711 } 712 713 /* receive work done on other processors */ 714 { 715 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 716 PetscMPIInt idex; 717 PetscBT table_i; 718 MPI_Status *status2; 719 720 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 721 for (i=0; i<nrqs; ++i) { 722 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 723 /* Process the message */ 724 rbuf2_i = rbuf2[idex]; 725 ct1 = 2*rbuf2_i[0]+1; 726 jmax = rbuf2[idex][0]; 727 for (j=1; j<=jmax; j++) { 728 max = rbuf2_i[2*j]; 729 is_no = rbuf2_i[2*j-1]; 730 isz_i = isz[is_no]; 731 table_i = table[is_no]; 732 #if defined(PETSC_USE_CTABLE) 733 table_data_i = table_data[is_no]; 734 #else 735 data_i = data[is_no]; 736 #endif 737 for (k=0; k<max; k++,ct1++) { 738 row = rbuf2_i[ct1]; 739 if (!PetscBTLookupSet(table_i,row)) { 740 #if defined(PETSC_USE_CTABLE) 741 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 742 #else 743 data_i[isz_i] = row; 744 #endif 745 isz_i++; 746 } 747 } 748 isz[is_no] = isz_i; 749 } 750 } 751 752 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 753 ierr = PetscFree(status2);CHKERRQ(ierr); 754 } 755 756 #if defined(PETSC_USE_CTABLE) 757 tcount_max = 0; 758 for (i=0; i<imax; ++i) { 759 table_data_i = table_data[i]; 760 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 761 if (tcount_max < tcount) tcount_max = tcount; 762 } 763 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 764 #endif 765 766 for (i=0; i<imax; ++i) { 767 #if defined(PETSC_USE_CTABLE) 768 PetscTablePosition tpos; 769 table_data_i = table_data[i]; 770 771 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 772 while (tpos) { 773 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 774 tdata[--j] = --k; 775 } 776 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 777 #else 778 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 779 #endif 780 } 781 782 ierr = PetscFree(onodes2);CHKERRQ(ierr); 783 ierr = PetscFree(olengths2);CHKERRQ(ierr); 784 785 ierr = PetscFree(pa);CHKERRQ(ierr); 786 
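/* all replies have been folded into the local tables above; free the remaining receive buffers, request/status arrays, and per-IS work storage */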
ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 787 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 788 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 789 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 790 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 791 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 792 ierr = PetscFree(s_status);CHKERRQ(ierr); 793 ierr = PetscFree(recv_status);CHKERRQ(ierr); 794 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 795 ierr = PetscFree(xdata);CHKERRQ(ierr); 796 ierr = PetscFree(isz1);CHKERRQ(ierr); 797 #if defined(PETSC_USE_CTABLE) 798 for (i=0; i<imax; i++) { 799 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 800 } 801 ierr = PetscFree(table_data);CHKERRQ(ierr); 802 ierr = PetscFree(tdata);CHKERRQ(ierr); 803 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 804 #else 805 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 806 #endif 807 PetscFunctionReturn(0); 808 } 809 810 #undef __FUNCT__ 811 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 812 /* 813 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 814 the work on the local processor. 815 816 Inputs: 817 C - MAT_MPIAIJ; 818 imax - total no of index sets processed at a time; 819 table - an array of char - size = m bits. 820 821 Output: 822 isz - array containing the count of the solution elements corresponding 823 to each index set; 824 data or table_data - pointer to the solutions 825 */ 826 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 827 { 828 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 829 Mat A = c->A,B = c->B; 830 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 831 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 832 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 833 PetscBT table_i; 834 #if defined(PETSC_USE_CTABLE) 835 PetscTable table_data_i; 836 PetscErrorCode ierr; 837 PetscTablePosition tpos; 838 PetscInt tcount,*tdata; 839 #else 840 PetscInt *data_i; 841 #endif 842 843 PetscFunctionBegin; 844 rstart = C->rmap->rstart; 845 cstart = C->cmap->rstart; 846 ai = a->i; 847 aj = a->j; 848 bi = b->i; 849 bj = b->j; 850 garray = c->garray; 851 852 for (i=0; i<imax; i++) { 853 #if defined(PETSC_USE_CTABLE) 854 /* copy existing entries of table_data_i into tdata[] */ 855 table_data_i = table_data[i]; 856 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 857 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 858 859 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 860 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 861 while (tpos) { 862 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 863 tdata[--j] = --row; 864 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 865 } 866 #else 867 data_i = data[i]; 868 #endif 869 table_i = table[i]; 870 isz_i = isz[i]; 871 max = isz[i]; 872 873 for (j=0; j<max; j++) { 874 #if defined(PETSC_USE_CTABLE) 875 row = tdata[j] - rstart; 876 #else 877 row = data_i[j] - rstart; 878 #endif 879 start = ai[row]; 880 end = ai[row+1]; 881 for (k=start; k<end; k++) { /* Amat */ 882 val = aj[k] + cstart; 883 if (!PetscBTLookupSet(table_i,val)) { 884 #if defined(PETSC_USE_CTABLE) 885 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 886 #else 887 data_i[isz_i] = val; 888 #endif 889 isz_i++; 890 } 891 } 892 start = bi[row]; 893 end = bi[row+1]; 894 for (k=start; k<end; k++) { /* Bmat */ 895 val = garray[bj[k]]; 896 if 
(!PetscBTLookupSet(table_i,val)) { 897 #if defined(PETSC_USE_CTABLE) 898 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 899 #else 900 data_i[isz_i] = val; 901 #endif 902 isz_i++; 903 } 904 } 905 } 906 isz[i] = isz_i; 907 908 #if defined(PETSC_USE_CTABLE) 909 ierr = PetscFree(tdata);CHKERRQ(ierr); 910 #endif 911 } 912 PetscFunctionReturn(0); 913 } 914 915 #undef __FUNCT__ 916 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 917 /* 918 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 919 and return the output 920 921 Input: 922 C - the matrix 923 nrqr - no of messages being processed. 924 rbuf - an array of pointers to the received requests 925 926 Output: 927 xdata - array of messages to be sent back 928 isz1 - size of each message 929 930 For better efficiency perhaps we should malloc each xdata[i] separately; 931 then, if a realloc is required, we would only need to copy the data for that one message 932 rather than for all previous messages, as happens now where a single large chunk of 933 memory is used. 934 935 */ 936 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 937 { 938 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 939 Mat A = c->A,B = c->B; 940 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 941 PetscErrorCode ierr; 942 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 943 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 944 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 945 PetscInt *rbuf_i,kmax,rbuf_0; 946 PetscBT xtable; 947 948 PetscFunctionBegin; 949 m = C->rmap->N; 950 rstart = C->rmap->rstart; 951 cstart = C->cmap->rstart; 952 ai = a->i; 953 aj = a->j; 954 bi = b->i; 955 bj = b->j; 956 garray = c->garray; 957 958 959 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 960 rbuf_i = rbuf[i]; 961 rbuf_0 = rbuf_i[0]; 962 ct += rbuf_0; 963 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 964 } 965 966 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 967 else max1 = 1; 968 mem_estimate = 3*((total_sz > max1 ?
total_sz : max1)+1); 969 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 970 ++no_malloc; 971 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 972 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 973 974 ct3 = 0; 975 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 976 rbuf_i = rbuf[i]; 977 rbuf_0 = rbuf_i[0]; 978 ct1 = 2*rbuf_0+1; 979 ct2 = ct1; 980 ct3 += ct1; 981 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 982 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 983 oct2 = ct2; 984 kmax = rbuf_i[2*j]; 985 for (k=0; k<kmax; k++,ct1++) { 986 row = rbuf_i[ct1]; 987 if (!PetscBTLookupSet(xtable,row)) { 988 if (!(ct3 < mem_estimate)) { 989 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 990 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 991 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 992 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 993 xdata[0] = tmp; 994 mem_estimate = new_estimate; ++no_malloc; 995 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 996 } 997 xdata[i][ct2++] = row; 998 ct3++; 999 } 1000 } 1001 for (k=oct2,max2=ct2; k<max2; k++) { 1002 row = xdata[i][k] - rstart; 1003 start = ai[row]; 1004 end = ai[row+1]; 1005 for (l=start; l<end; l++) { 1006 val = aj[l] + cstart; 1007 if (!PetscBTLookupSet(xtable,val)) { 1008 if (!(ct3 < mem_estimate)) { 1009 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1010 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1011 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1012 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1013 xdata[0] = tmp; 1014 mem_estimate = new_estimate; ++no_malloc; 1015 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1016 } 1017 xdata[i][ct2++] = val; 1018 ct3++; 1019 } 1020 } 1021 start = bi[row]; 1022 end = bi[row+1]; 1023 for (l=start; l<end; l++) { 1024 val = garray[bj[l]]; 1025 if (!PetscBTLookupSet(xtable,val)) { 1026 if (!(ct3 < mem_estimate)) { 1027 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1028 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1029 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1030 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1031 xdata[0] = tmp; 1032 mem_estimate = new_estimate; ++no_malloc; 1033 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1034 } 1035 xdata[i][ct2++] = val; 1036 ct3++; 1037 } 1038 } 1039 } 1040 /* Update the header*/ 1041 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1042 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1043 } 1044 xdata[i][0] = rbuf_0; 1045 xdata[i+1] = xdata[i] + ct2; 1046 isz1[i] = ct2; /* size of each message */ 1047 } 1048 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1049 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1050 PetscFunctionReturn(0); 1051 } 1052 /* -------------------------------------------------------------------------*/ 1053 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 1054 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1055 /* 1056 Every processor gets the entire matrix 1057 */ 1058 #undef __FUNCT__ 1059 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 1060 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1061 { 1062 Mat B; 1063 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1064 Mat_SeqAIJ *b,*ad = 
(Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1065 PetscErrorCode ierr; 1066 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1067 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1068 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1069 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1070 1071 PetscFunctionBegin; 1072 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1073 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1074 1075 if (scall == MAT_INITIAL_MATRIX) { 1076 /* ---------------------------------------------------------------- 1077 Tell every processor the number of nonzeros per row 1078 */ 1079 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1080 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1081 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1082 } 1083 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1084 for (i=0; i<size; i++) { 1085 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1086 displs[i] = A->rmap->range[i]; 1087 } 1088 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1089 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1090 #else 1091 sendcount = A->rmap->rend - A->rmap->rstart; 1092 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1093 #endif 1094 /* --------------------------------------------------------------- 1095 Create the sequential matrix of the same type as the local block diagonal 1096 */ 1097 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1098 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1099 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1100 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1101 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1102 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1103 **Bin = B; 1104 b = (Mat_SeqAIJ*)B->data; 1105 1106 /*-------------------------------------------------------------------- 1107 Copy my part of matrix column indices over 1108 */ 1109 sendcount = ad->nz + bd->nz; 1110 jsendbuf = b->j + b->i[rstarts[rank]]; 1111 a_jsendbuf = ad->j; 1112 b_jsendbuf = bd->j; 1113 n = A->rmap->rend - A->rmap->rstart; 1114 cnt = 0; 1115 for (i=0; i<n; i++) { 1116 1117 /* put in lower diagonal portion */ 1118 m = bd->i[i+1] - bd->i[i]; 1119 while (m > 0) { 1120 /* is it above diagonal (in bd (compressed) numbering) */ 1121 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1122 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1123 m--; 1124 } 1125 1126 /* put in diagonal portion */ 1127 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1128 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1129 } 1130 1131 /* put in upper diagonal portion */ 1132 while (m-- > 0) { 1133 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1134 } 1135 } 1136 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1137 1138 /*-------------------------------------------------------------------- 1139 Gather all column indices to all processors 1140 */ 1141 for (i=0; i<size; i++) { 1142 recvcounts[i] = 0; 1143 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1144 recvcounts[i] += lens[j]; 1145 } 1146 } 1147 displs[0] = 0; 1148 for (i=1; i<size; i++) { 
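/* exclusive prefix sum of recvcounts: displs[i] is the offset in b->j at which rank i's column indices are placed by the Allgatherv below */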
1149 displs[i] = displs[i-1] + recvcounts[i-1]; 1150 } 1151 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1152 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1153 #else 1154 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1155 #endif 1156 /*-------------------------------------------------------------------- 1157 Assemble the matrix into useable form (note numerical values not yet set) 1158 */ 1159 /* set the b->ilen (length of each row) values */ 1160 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1161 /* set the b->i indices */ 1162 b->i[0] = 0; 1163 for (i=1; i<=A->rmap->N; i++) { 1164 b->i[i] = b->i[i-1] + lens[i-1]; 1165 } 1166 ierr = PetscFree(lens);CHKERRQ(ierr); 1167 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1168 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1169 1170 } else { 1171 B = **Bin; 1172 b = (Mat_SeqAIJ*)B->data; 1173 } 1174 1175 /*-------------------------------------------------------------------- 1176 Copy my part of matrix numerical values into the values location 1177 */ 1178 if (flag == MAT_GET_VALUES) { 1179 sendcount = ad->nz + bd->nz; 1180 sendbuf = b->a + b->i[rstarts[rank]]; 1181 a_sendbuf = ad->a; 1182 b_sendbuf = bd->a; 1183 b_sendj = bd->j; 1184 n = A->rmap->rend - A->rmap->rstart; 1185 cnt = 0; 1186 for (i=0; i<n; i++) { 1187 1188 /* put in lower diagonal portion */ 1189 m = bd->i[i+1] - bd->i[i]; 1190 while (m > 0) { 1191 /* is it above diagonal (in bd (compressed) numbering) */ 1192 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1193 sendbuf[cnt++] = *b_sendbuf++; 1194 m--; 1195 b_sendj++; 1196 } 1197 1198 /* put in diagonal portion */ 1199 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1200 sendbuf[cnt++] = *a_sendbuf++; 1201 } 1202 1203 /* put in upper diagonal portion */ 1204 while (m-- > 0) { 1205 sendbuf[cnt++] = *b_sendbuf++; 1206 b_sendj++; 1207 } 1208 } 1209 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1210 1211 /* ----------------------------------------------------------------- 1212 Gather all numerical values to all processors 1213 */ 1214 if (!recvcounts) { 1215 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1216 } 1217 for (i=0; i<size; i++) { 1218 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1219 } 1220 displs[0] = 0; 1221 for (i=1; i<size; i++) { 1222 displs[i] = displs[i-1] + recvcounts[i-1]; 1223 } 1224 recvbuf = b->a; 1225 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1226 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1227 #else 1228 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1229 #endif 1230 } /* endof (flag == MAT_GET_VALUES) */ 1231 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1232 1233 if (A->symmetric) { 1234 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1235 } else if (A->hermitian) { 1236 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1237 } else if (A->structurally_symmetric) { 1238 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1239 } 1240 PetscFunctionReturn(0); 1241 } 1242 #undef __FUNCT__ 1243 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices" 1244 
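/*
  MatDestroy_MPIAIJ_MatGetSubmatrices - destroy routine used for the sequential submatrices
  created by the single-IS MatGetSubMatrices path in this file; it calls the submatrix's
  original destroy function and then releases the communication buffers, request bookkeeping,
  and row/column maps kept in the Mat_SubMat context (c->submatis1).
*/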
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1245 { 1246 PetscErrorCode ierr; 1247 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1248 Mat_SubMat *submatj = c->submatis1; 1249 PetscInt i; 1250 1251 PetscFunctionBegin; 1252 ierr = submatj->destroy(C);CHKERRQ(ierr); 1253 1254 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1255 1256 for (i=0; i<submatj->nrqr; ++i) { 1257 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1258 } 1259 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1260 1261 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1262 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1263 1264 for (i=0; i<submatj->nrqs; ++i) { 1265 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1266 } 1267 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1268 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1269 1270 #if defined(PETSC_USE_CTABLE) 1271 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1272 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1273 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1274 #else 1275 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1276 #endif 1277 1278 if (!submatj->allcolumns) { 1279 #if defined(PETSC_USE_CTABLE) 1280 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1281 #else 1282 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1283 #endif 1284 } 1285 1286 ierr = PetscFree(submatj);CHKERRQ(ierr); 1287 PetscFunctionReturn(0); 1288 } 1289 1290 #undef __FUNCT__ 1291 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local" 1292 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1293 { 1294 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1295 Mat submat,A = c->A,B = c->B; 1296 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1297 PetscInt *ai = a->i,*bi = b->i,nzA,nzB; 1298 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1299 const PetscInt *icol,*irow; 1300 PetscInt nrow,ncol,start; 1301 PetscErrorCode ierr; 1302 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 1303 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1304 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1305 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1306 PetscInt *lens,ncols,*cols,Crow; 1307 #if defined(PETSC_USE_CTABLE) 1308 PetscTable cmap,rmap; 1309 PetscInt *cmap_loc,*rmap_loc; 1310 #else 1311 PetscInt *cmap,*rmap; 1312 #endif 1313 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1314 PetscInt *cworkB,lwrite,*bj = b->j,*subcols; 1315 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1316 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1317 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1318 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1319 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1320 MPI_Comm comm; 1321 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1322 PetscMPIInt *onodes1,*olengths1,idex,end; 1323 Mat_SubMat *smatis1; 1324 #if defined(PETSC_USE_LOG) 1325 PetscLogStage stage[5]; 1326 #endif 1327 1328 PetscFunctionBegin; 1329 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1330 1331 ierr = 
PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1332 size = c->size; 1333 rank = c->rank; 1334 1335 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1336 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1337 if (allcolumns) { 1338 icol = NULL; 1339 ncol = C->cmap->N; 1340 } else { 1341 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1342 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1343 } 1344 1345 if (scall == MAT_INITIAL_MATRIX) { 1346 PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp; 1347 1348 ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr); 1349 ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr); 1350 ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr); 1351 ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr); 1352 ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr); 1353 1354 ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr); 1355 1356 /* Get some new tags to keep the communication clean */ 1357 tag1 = ((PetscObject)C)->tag; 1358 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1359 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1360 1361 /* evaluate communication - mesg to who, length of mesg, and buffer space 1362 required. Based on this, buffers are allocated, and data copied into them */ 1363 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 1364 1365 for (j=0; j<nrow; j++) { 1366 proc = 0; 1367 row = irow[j]; 1368 while (row >= C->rmap->range[proc+1]) proc++; 1369 w4[proc]++; 1370 } 1371 for (j=0; j<size; j++) { 1372 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 1373 } 1374 1375 nrqs = 0; /* no of outgoing messages */ 1376 msz = 0; /* total mesg length (for all procs) */ 1377 w1[rank] = 0; /* no mesg sent to self */ 1378 w3[rank] = 0; 1379 for (i=0; i<size; i++) { 1380 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 1381 } 1382 1383 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1384 for (i=0,j=0; i<size; i++) { 1385 if (w1[i]) { pa[j] = i; j++; } 1386 } 1387 1388 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1389 for (i=0; i<nrqs; i++) { 1390 proc = pa[i]; 1391 w1[proc] += w2[proc] + 2* w3[proc]; 1392 msz += w1[proc]; 1393 } 1394 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1395 1396 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1397 /* if w2[i]=1, a message of length w1[i] will be sent to proc[i]; */ 1398 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1399 1400 /* nrqs: nsend; nrqr: nrecv; w1: send message length; onodes1: recv id; olengths1: recv message length */ 1401 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1402 1403 /* Now post the Irecvs corresponding to these messages */ 1404 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1405 1406 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1407 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1408 1409 /* Allocate Memory for outgoing messages */ 1410 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1411 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1412 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1413 1414 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1415 iptr = tmp; 1416 k = 0; 1417 for (i=0; i<nrqs; i++) { 1418 proc = pa[i]; 1419 iptr += k; 1420 
sbuf1[proc] = iptr; 1421 k = w1[proc]; 1422 } 1423 1424 /* Form the outgoing messages */ 1425 /* Initialize the header space */ 1426 for (i=0; i<nrqs; i++) { 1427 proc = pa[i]; 1428 sbuf1[proc][0] = 0; 1429 ierr = PetscMemzero(sbuf1[proc]+1,2*w3[proc]*sizeof(PetscInt));CHKERRQ(ierr); 1430 ptr[proc] = sbuf1[proc] + 2*w3[proc] + 1; 1431 } 1432 1433 /* Parse the isrow and copy data into outbuf */ 1434 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1435 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1436 proc = 0; 1437 row = irow[j]; 1438 while (row >= C->rmap->range[proc+1]) proc++; 1439 if (proc != rank) { /* copy to the outgoing buf*/ 1440 ctr[proc]++; 1441 *ptr[proc] = row; 1442 ptr[proc]++; 1443 } 1444 } 1445 1446 /* Update the headers for the current IS */ 1447 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1448 if ((ctr_j = ctr[j])) { 1449 sbuf1_j = sbuf1[j]; 1450 k = ++sbuf1_j[0]; 1451 sbuf1_j[2*k] = ctr_j; 1452 sbuf1_j[2*k-1] = 0; 1453 } 1454 } 1455 1456 /* Now post the sends */ 1457 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1458 for (i=0; i<nrqs; ++i) { 1459 proc = pa[i]; 1460 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1461 } 1462 1463 /* Post Receives to capture the buffer size */ 1464 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1465 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1466 1467 rbuf2[0] = tmp + msz; 1468 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1469 1470 for (i=0; i<nrqs; ++i) { 1471 proc = pa[i]; 1472 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1473 } 1474 1475 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1476 1477 /* Send to other procs the buf size they should allocate */ 1478 /* Receive messages*/ 1479 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1480 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1481 1482 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1483 for (i=0; i<nrqr; ++i) { 1484 req_size[i] = 0; 1485 rbuf1_i = rbuf1[i]; 1486 start = 2*rbuf1_i[0] + 1; 1487 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1488 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1489 sbuf2_i = sbuf2[i]; 1490 for (j=start; j<end; j++) { 1491 k = rbuf1_i[j] - rstart; 1492 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1493 sbuf2_i[j] = ncols; 1494 req_size[i] += ncols; 1495 } 1496 req_source1[i] = r_status1[i].MPI_SOURCE; 1497 1498 /* form the header */ 1499 sbuf2_i[0] = req_size[i]; 1500 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1501 1502 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1503 } 1504 1505 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1506 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1507 1508 /* rbuf2 is received, Post recv column indices a->j */ 1509 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1510 1511 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1512 for (i=0; i<nrqs; ++i) { 1513 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1514 req_source2[i] = r_status2[i].MPI_SOURCE; 1515 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1516 } 1517 1518 /* Wait on sends1 and sends2 */ 1519 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1520 ierr = 
MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1521 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1522 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1523 1524 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1525 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1526 1527 ierr = PetscLogStagePop();CHKERRQ(ierr); 1528 ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr); 1529 1530 /* Now allocate sending buffers for a->j, and send them off */ 1531 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1532 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1533 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1534 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1535 1536 for (i=0; i<nrqr; i++) { /* for each requested message */ 1537 rbuf1_i = rbuf1[i]; 1538 sbuf_aj_i = sbuf_aj[i]; 1539 ct1 = 2*rbuf1_i[0] + 1; 1540 ct2 = 0; 1541 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1542 1543 kmax = rbuf1[i][2]; 1544 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1545 row = rbuf1_i[ct1] - rstart; 1546 nzA = ai[row+1] - ai[row]; nzB = bi[row+1] - bi[row]; 1547 ncols = nzA + nzB; 1548 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1549 1550 /* load the column indices for this row into cols*/ 1551 cols = sbuf_aj_i + ct2; 1552 1553 lwrite = 0; 1554 for (l=0; l<nzB; l++) { 1555 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1556 } 1557 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1558 for (l=0; l<nzB; l++) { 1559 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1560 } 1561 1562 ct2 += ncols; 1563 } 1564 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1565 } 1566 1567 /* create column map (cmap): global col of C -> local col of submat */ 1568 #if defined(PETSC_USE_CTABLE) 1569 if (!allcolumns) { 1570 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1571 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1572 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1573 if (icol[j] >= cstart && icol[j] <cend) { 1574 cmap_loc[icol[j] - cstart] = j+1; 1575 } else { /* use PetscTable for non-local col indices */ 1576 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1577 } 1578 } 1579 } else { 1580 cmap = NULL; 1581 cmap_loc = NULL; 1582 } 1583 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1584 #else 1585 if (!allcolumns) { 1586 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1587 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1588 } else { 1589 cmap = NULL; 1590 } 1591 #endif 1592 1593 /* Create lens for MatSeqAIJSetPreallocation() */ 1594 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1595 1596 /* Compute lens from local part of C */ 1597 for (j=0; j<nrow; j++) { 1598 proc = 0; 1599 row = irow[j]; 1600 while (row >= C->rmap->range[proc+1]) proc++; 1601 if (proc == rank) { 1602 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1603 if (!allcolumns) { 1604 for (k=0; k<ncols; k++) { 1605 #if defined(PETSC_USE_CTABLE) 1606 if (cols[k] >= cstart && cols[k] <cend) { 1607 tcol = cmap_loc[cols[k] - cstart]; 1608 } else { 1609 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1610 } 1611 #else 1612 tcol = cmap[cols[k]]; 1613 #endif 1614 if (tcol) lens[j]++; 1615 } 1616 } else { /* allcolumns */ 1617 lens[j] = ncols; 1618 } 1619 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1620 } 1621 } 1622 1623 /* Create row map (rmap): global row of 
C -> local row of submat */ 1624 #if defined(PETSC_USE_CTABLE) 1625 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1626 for (j=0; j<nrow; j++) { 1627 proc = 0; 1628 row = irow[j]; 1629 while (row >= C->rmap->range[proc+1]) proc++; 1630 if (proc == rank) { /* a local row */ 1631 rmap_loc[row - rstart] = j; 1632 } else { 1633 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1634 } 1635 } 1636 #else 1637 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1638 for (j=0; j<nrow; j++) { 1639 rmap[irow[j]] = j; 1640 } 1641 #endif 1642 1643 /* Update lens from offproc data */ 1644 /* recv a->j is done */ 1645 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1646 for (i=0; i<nrqs; i++) { 1647 proc = pa[i]; 1648 sbuf1_i = sbuf1[proc]; 1649 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1650 ct1 = 2 + 1; 1651 ct2 = 0; 1652 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1653 rbuf3_i = rbuf3[i]; /* received C->j */ 1654 1655 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1656 max1 = sbuf1_i[2]; 1657 for (k=0; k<max1; k++,ct1++) { 1658 #if defined(PETSC_USE_CTABLE) 1659 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1660 row--; 1661 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1662 #else 1663 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1664 #endif 1665 /* Now, store row index of submat in sbuf1_i[ct1] */ 1666 sbuf1_i[ct1] = row; 1667 1668 nnz = rbuf2_i[ct1]; 1669 if (!allcolumns) { 1670 for (l=0; l<nnz; l++,ct2++) { 1671 #if defined(PETSC_USE_CTABLE) 1672 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1673 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1674 } else { 1675 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1676 } 1677 #else 1678 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1679 #endif 1680 if (tcol) { 1681 lens[row]++; 1682 1683 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1684 For reuse, we replace received C->j with index that should be inserted to submat */ 1685 /* rbuf3_i[ct3++] = ct2; ??? 
*/ 1686 } 1687 } 1688 } else { /* allcolumns */ 1689 lens[row] += nnz; 1690 } 1691 } 1692 } 1693 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1694 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1695 1696 /* Create the submatrices */ 1697 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1698 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1699 1700 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1701 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1702 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1703 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1704 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1705 1706 /* create struct Mat_SubMat and attached it to submat */ 1707 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1708 subc = (Mat_SeqAIJ*)submat->data; 1709 subc->submatis1 = smatis1; 1710 1711 smatis1->nrqs = nrqs; 1712 smatis1->nrqr = nrqr; 1713 smatis1->rbuf1 = rbuf1; 1714 smatis1->rbuf2 = rbuf2; 1715 smatis1->rbuf3 = rbuf3; 1716 smatis1->sbuf2 = sbuf2; 1717 smatis1->req_source2 = req_source2; 1718 1719 smatis1->sbuf1 = sbuf1; 1720 smatis1->ptr = ptr; 1721 smatis1->tmp = tmp; 1722 smatis1->ctr = ctr; 1723 1724 smatis1->pa = pa; 1725 smatis1->req_size = req_size; 1726 smatis1->req_source1 = req_source1; 1727 1728 smatis1->allcolumns = allcolumns; 1729 smatis1->rmap = rmap; 1730 smatis1->cmap = cmap; 1731 #if defined(PETSC_USE_CTABLE) 1732 smatis1->rmap_loc = rmap_loc; 1733 smatis1->cmap_loc = cmap_loc; 1734 #endif 1735 1736 smatis1->destroy = submat->ops->destroy; 1737 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1738 submat->factortype = C->factortype; 1739 1740 ierr = PetscLogStagePop();CHKERRQ(ierr); 1741 ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr); 1742 } else { /* scall == MAT_REUSE_MATRIX */ 1743 submat = submats[0]; 1744 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1745 1746 subc = (Mat_SeqAIJ*)submat->data; 1747 smatis1 = subc->submatis1; 1748 nrqs = smatis1->nrqs; 1749 nrqr = smatis1->nrqr; 1750 rbuf1 = smatis1->rbuf1; 1751 rbuf2 = smatis1->rbuf2; 1752 rbuf3 = smatis1->rbuf3; 1753 req_source2 = smatis1->req_source2; 1754 1755 sbuf1 = smatis1->sbuf1; 1756 sbuf2 = smatis1->sbuf2; 1757 ptr = smatis1->ptr; 1758 tmp = smatis1->tmp; 1759 ctr = smatis1->ctr; 1760 1761 pa = smatis1->pa; 1762 req_size = smatis1->req_size; 1763 req_source1 = smatis1->req_source1; 1764 1765 allcolumns = smatis1->allcolumns; 1766 rmap = smatis1->rmap; 1767 cmap = smatis1->cmap; 1768 #if defined(PETSC_USE_CTABLE) 1769 rmap_loc = smatis1->rmap_loc; 1770 cmap_loc = smatis1->cmap_loc; 1771 #endif 1772 } 1773 1774 /* Post recv matrix values */ 1775 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1776 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1777 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1778 for (i=0; i<nrqs; ++i) { 1779 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1780 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1781 } 1782 1783 /* Allocate sending buffers for a->a, and send them off */ 1784 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1785 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1786 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1787 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1788 1789 for (i=0; i<nrqr; i++) { 1790 rbuf1_i = rbuf1[i]; 1791 sbuf_aa_i = sbuf_aa[i]; 1792 ct1 = 2*rbuf1_i[0]+1; 1793 ct2 = 0; 1794 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1795 1796 kmax = rbuf1_i[2]; 1797 for (k=0; k<kmax; k++,ct1++) { 1798 row = rbuf1_i[ct1] - rstart; 1799 nzA = ai[row+1] - ai[row]; 1800 nzB = bi[row+1] - bi[row]; 1801 ncols = nzA + nzB; 1802 cworkB = bj + bi[row]; 1803 vworkA = a_a + ai[row]; 1804 vworkB = b_a + bi[row]; 1805 1806 /* load the column values for this row into vals*/ 1807 vals = sbuf_aa_i + ct2; 1808 1809 lwrite = 0; 1810 for (l=0; l<nzB; l++) { 1811 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1812 } 1813 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1814 for (l=0; l<nzB; l++) { 1815 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1816 } 1817 1818 ct2 += ncols; 1819 } 1820 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1821 } 1822 1823 if (scall == MAT_INITIAL_MATRIX) { 1824 ierr = PetscLogStagePop();CHKERRQ(ierr); 1825 ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr); 1826 } 1827 1828 /* Assemble submat */ 1829 if (!allcolumns) { 1830 ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr); 1831 } 1832 /* First assemble the local rows */ 1833 for (j=0; j<nrow; j++) { 1834 proc = 0; 1835 row = irow[j]; 1836 while (row >= C->rmap->range[proc+1]) proc++; 1837 if (proc == rank) { 1838 Crow = row; 1839 #if defined(PETSC_USE_CTABLE) 1840 row = rmap_loc[Crow - rstart]; /* row index of submat */ 1841 #else 1842 row = rmap[row]; 1843 #endif 1844 ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1845 1846 if (!allcolumns) { 1847 #if defined(PETSC_USE_CTABLE) 1848 i = 0; 1849 k = 0; 1850 while (cols[k] < cstart) { 1851 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1852 if (tcol) { 1853 subcols[i] = --tcol; 1854 subvals[i++] = vals[k]; 1855 } 1856 k++; 1857 } 1858 while (cols[k] < cend && k<ncols) { 
1859 tcol = cmap_loc[cols[k]-cstart]; 1860 if (tcol) { 1861 subcols[i] = --tcol; 1862 subvals[i++] = vals[k]; 1863 } 1864 k++; 1865 } 1866 while (k<ncols) { 1867 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1868 if (tcol) { 1869 subcols[i] = --tcol; 1870 subvals[i++] = vals[k]; 1871 } 1872 k++; 1873 } 1874 #else 1875 i = 0; 1876 for (k=0; k<ncols; k++) { 1877 tcol = cmap[cols[k]]; 1878 if (tcol) { 1879 subcols[i] = --tcol; 1880 subvals[i++] = vals[k]; 1881 } 1882 } 1883 #endif 1884 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1885 } else { /* allcolumns */ 1886 ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 1887 } 1888 ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1889 } 1890 } 1891 1892 if (scall == MAT_INITIAL_MATRIX) { 1893 ierr = PetscLogStagePop();CHKERRQ(ierr); 1894 ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr); 1895 } 1896 1897 /* Now assemble the off-proc rows */ 1898 for (i=0; i<nrqs; i++) { /* for each requested message */ 1899 /* recv values from other processes */ 1900 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1901 proc = pa[idex]; 1902 sbuf1_i = sbuf1[proc]; 1903 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1904 ct1 = 2 + 1; 1905 ct2 = 0; /* count of received C->j */ 1906 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1907 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1908 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1909 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1910 1911 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1912 max1 = sbuf1_i[2]; /* num of rows */ 1913 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1914 row = sbuf1_i[ct1]; /* row index of submat */ 1915 if (!allcolumns) { 1916 idex = 0; 1917 if (scall == MAT_INITIAL_MATRIX) { 1918 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1919 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1920 #if defined(PETSC_USE_CTABLE) 1921 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1922 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1923 } else { 1924 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1925 } 1926 #else 1927 tcol = cmap[rbuf3_i[ct2]]; 1928 #endif 1929 if (tcol) { 1930 subcols[idex] = --tcol; 1931 subvals[idex++] = rbuf4_i[ct2]; 1932 1933 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
1934 For reuse, we replace received C->j with index that should be inserted to submat */ 1935 rbuf3_i[ct3++] = ct2; 1936 } 1937 } 1938 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1939 1940 } else { /* scall == MAT_REUSE_MATRIX */ 1941 submat = submats[0]; 1942 subc = (Mat_SeqAIJ*)submat->data; 1943 1944 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1945 for (l=0; l<nnz; l++) { 1946 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1947 subvals[idex++] = rbuf4_i[ct2]; 1948 } 1949 1950 bj = subc->j + subc->i[row]; 1951 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1952 } 1953 } else { /* allcolumns */ 1954 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1955 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1956 ct2 += nnz; 1957 } 1958 } 1959 } 1960 1961 /* sending a->a are done */ 1962 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1963 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1964 1965 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1966 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1967 submats[0] = submat; 1968 1969 /* Restore the indices */ 1970 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1971 if (!allcolumns) { 1972 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1973 } 1974 1975 /* Destroy allocated memory */ 1976 for (i=0; i<nrqs; ++i) { 1977 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1978 } 1979 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1980 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1981 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1982 1983 if (!allcolumns) { 1984 ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr); 1985 } 1986 1987 if (scall == MAT_INITIAL_MATRIX) { 1988 ierr = PetscLogStagePop();CHKERRQ(ierr); 1989 1990 ierr = PetscFree(lens);CHKERRQ(ierr); 1991 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1992 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1993 } 1994 PetscFunctionReturn(0); 1995 } 1996 1997 #undef __FUNCT__ 1998 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS" 1999 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2000 { 2001 PetscErrorCode ierr; 2002 PetscInt ncol; 2003 PetscBool colflag,allcolumns=PETSC_FALSE; 2004 2005 PetscFunctionBegin; 2006 /* Allocate memory to hold all the submatrices */ 2007 if (scall != MAT_REUSE_MATRIX) { 2008 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2009 } 2010 2011 /* Check for special case: each processor gets entire matrix columns */ 2012 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2013 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2014 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2015 2016 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2017 PetscFunctionReturn(0); 2018 } 2019 2020 #undef __FUNCT__ 2021 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 2022 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2023 { 2024 PetscErrorCode ierr; 2025 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2026 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2027 2028 PetscFunctionBegin; 2029 /* Check for special case: each processor gets entire matrix */ 2030 if 
(C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2031 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2032 PetscFunctionReturn(0); 2033 } 2034 2035 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2036 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2037 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2038 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2039 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2040 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2041 wantallmatrix = PETSC_TRUE; 2042 2043 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2044 } 2045 } 2046 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2047 if (twantallmatrix) { 2048 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2049 PetscFunctionReturn(0); 2050 } 2051 2052 /* Allocate memory to hold all the submatrices */ 2053 if (scall != MAT_REUSE_MATRIX) { 2054 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2055 } 2056 2057 /* Check for special case: each processor gets entire matrix columns */ 2058 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2059 for (i=0; i<ismax; i++) { 2060 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2061 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2062 if (colflag && ncol == C->cmap->N) { 2063 allcolumns[i] = PETSC_TRUE; 2064 } else { 2065 allcolumns[i] = PETSC_FALSE; 2066 } 2067 } 2068 2069 /* Determine the number of stages through which submatrices are done */ 2070 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2071 2072 /* 2073 Each stage will extract nmax submatrices. 2074 nmax is determined by the matrix column dimension. 2075 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2076 */ 2077 if (!nmax) nmax = 1; 2078 nstages_local = ismax/nmax + ((ismax % nmax) ? 
1 : 0); 2079 2080 /* Make sure every processor loops through the nstages */ 2081 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2082 2083 for (i=0,pos=0; i<nstages; i++) { 2084 if (pos+nmax <= ismax) max_no = nmax; 2085 else if (pos == ismax) max_no = 0; 2086 else max_no = ismax-pos; 2087 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2088 pos += max_no; 2089 } 2090 2091 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2092 PetscFunctionReturn(0); 2093 } 2094 2095 /* -------------------------------------------------------------------------*/ 2096 #undef __FUNCT__ 2097 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 2098 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2099 { 2100 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2101 Mat A = c->A; 2102 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2103 const PetscInt **icol,**irow; 2104 PetscInt *nrow,*ncol,start; 2105 PetscErrorCode ierr; 2106 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2107 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2108 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2109 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2110 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2111 #if defined(PETSC_USE_CTABLE) 2112 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2113 #else 2114 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2115 #endif 2116 const PetscInt *irow_i; 2117 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2118 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2119 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2120 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2121 MPI_Status *r_status3,*r_status4,*s_status4; 2122 MPI_Comm comm; 2123 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2124 PetscMPIInt *onodes1,*olengths1; 2125 PetscMPIInt idex,idex2,end; 2126 2127 PetscFunctionBegin; 2128 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2129 tag0 = ((PetscObject)C)->tag; 2130 size = c->size; 2131 rank = c->rank; 2132 2133 /* Get some new tags to keep the communication clean */ 2134 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2135 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2136 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2137 2138 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2139 2140 for (i=0; i<ismax; i++) { 2141 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2142 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2143 if (allcolumns[i]) { 2144 icol[i] = NULL; 2145 ncol[i] = C->cmap->N; 2146 } else { 2147 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2148 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2149 } 2150 } 2151 2152 /* evaluate communication - mesg to who, length of mesg, and buffer space 2153 required. 
Based on this, buffers are allocated, and data copied into them*/ 2154 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2155 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2156 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2157 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2158 for (i=0; i<ismax; i++) { 2159 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2160 jmax = nrow[i]; 2161 irow_i = irow[i]; 2162 for (j=0; j<jmax; j++) { 2163 l = 0; 2164 row = irow_i[j]; 2165 while (row >= C->rmap->range[l+1]) l++; 2166 proc = l; 2167 w4[proc]++; 2168 } 2169 for (j=0; j<size; j++) { 2170 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2171 } 2172 } 2173 2174 nrqs = 0; /* no of outgoing messages */ 2175 msz = 0; /* total mesg length (for all procs) */ 2176 w1[rank] = 0; /* no mesg sent to self */ 2177 w3[rank] = 0; 2178 for (i=0; i<size; i++) { 2179 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2180 } 2181 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2182 for (i=0,j=0; i<size; i++) { 2183 if (w1[i]) { pa[j] = i; j++; } 2184 } 2185 2186 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2187 for (i=0; i<nrqs; i++) { 2188 j = pa[i]; 2189 w1[j] += w2[j] + 2* w3[j]; 2190 msz += w1[j]; 2191 } 2192 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2193 2194 /* Determine the number of messages to expect, their lengths, from from-ids */ 2195 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2196 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2197 2198 /* Now post the Irecvs corresponding to these messages */ 2199 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2200 2201 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2202 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2203 2204 /* Allocate Memory for outgoing messages */ 2205 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2206 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2207 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2208 2209 { 2210 PetscInt *iptr = tmp; 2211 k = 0; 2212 for (i=0; i<nrqs; i++) { 2213 j = pa[i]; 2214 iptr += k; 2215 sbuf1[j] = iptr; 2216 k = w1[j]; 2217 } 2218 } 2219 2220 /* Form the outgoing messages */ 2221 /* Initialize the header space */ 2222 for (i=0; i<nrqs; i++) { 2223 j = pa[i]; 2224 sbuf1[j][0] = 0; 2225 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2226 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2227 } 2228 2229 /* Parse the isrow and copy data into outbuf */ 2230 for (i=0; i<ismax; i++) { 2231 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2232 irow_i = irow[i]; 2233 jmax = nrow[i]; 2234 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2235 l = 0; 2236 row = irow_i[j]; 2237 while (row >= C->rmap->range[l+1]) l++; 2238 proc = l; 2239 if (proc != rank) { /* copy to the outgoing buf*/ 2240 ctr[proc]++; 2241 *ptr[proc] = row; 2242 ptr[proc]++; 2243 } 2244 } 2245 /* Update the headers for the current IS */ 2246 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2247 if ((ctr_j = ctr[j])) { 2248 sbuf1_j = sbuf1[j]; 2249 k = ++sbuf1_j[0]; 2250 sbuf1_j[2*k] = ctr_j; 2251 sbuf1_j[2*k-1] = i; 2252 } 2253 } 2254 } 
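/* At this point each outgoing buffer sbuf1[j] is laid out as
       sbuf1[j][0]               number of index sets that request rows from proc j
       sbuf1[j][2*k-1], [2*k]    (IS index, row count) pair for the k-th such index set, k = 1..sbuf1[j][0]
       sbuf1[j][2*w3[j]+1 ...]   the requested global row indices, grouped by index set */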
2255 2256 /* Now post the sends */ 2257 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2258 for (i=0; i<nrqs; ++i) { 2259 j = pa[i]; 2260 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2261 } 2262 2263 /* Post Receives to capture the buffer size */ 2264 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2265 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2266 rbuf2[0] = tmp + msz; 2267 for (i=1; i<nrqs; ++i) { 2268 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2269 } 2270 for (i=0; i<nrqs; ++i) { 2271 j = pa[i]; 2272 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2273 } 2274 2275 /* Send to other procs the buf size they should allocate */ 2276 2277 2278 /* Receive messages*/ 2279 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2280 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2281 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 2282 { 2283 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2284 PetscInt *sbuf2_i; 2285 2286 for (i=0; i<nrqr; ++i) { 2287 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2288 2289 req_size[idex] = 0; 2290 rbuf1_i = rbuf1[idex]; 2291 start = 2*rbuf1_i[0] + 1; 2292 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2293 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2294 sbuf2_i = sbuf2[idex]; 2295 for (j=start; j<end; j++) { 2296 id = rbuf1_i[j] - rstart; 2297 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2298 sbuf2_i[j] = ncols; 2299 req_size[idex] += ncols; 2300 } 2301 req_source[idex] = r_status1[i].MPI_SOURCE; 2302 /* form the header */ 2303 sbuf2_i[0] = req_size[idex]; 2304 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2305 2306 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2307 } 2308 } 2309 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2310 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2311 2312 /* recv buffer sizes */ 2313 /* Receive messages*/ 2314 2315 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 2316 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2317 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2318 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2319 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2320 2321 for (i=0; i<nrqs; ++i) { 2322 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2323 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2324 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2325 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2326 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2327 } 2328 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2329 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2330 2331 /* Wait on sends1 and sends2 */ 2332 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2333 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2334 2335 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2336 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2337 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2338 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2339 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2340 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2341 2342 /* Now allocate buffers for a->j, and send them off */ 2343 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2344 for 
(i=0,j=0; i<nrqr; i++) j += req_size[i]; 2345 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2346 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2347 2348 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2349 { 2350 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2351 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2352 PetscInt cend = C->cmap->rend; 2353 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2354 2355 for (i=0; i<nrqr; i++) { 2356 rbuf1_i = rbuf1[i]; 2357 sbuf_aj_i = sbuf_aj[i]; 2358 ct1 = 2*rbuf1_i[0] + 1; 2359 ct2 = 0; 2360 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2361 kmax = rbuf1[i][2*j]; 2362 for (k=0; k<kmax; k++,ct1++) { 2363 row = rbuf1_i[ct1] - rstart; 2364 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2365 ncols = nzA + nzB; 2366 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2367 2368 /* load the column indices for this row into cols*/ 2369 cols = sbuf_aj_i + ct2; 2370 2371 lwrite = 0; 2372 for (l=0; l<nzB; l++) { 2373 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2374 } 2375 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2376 for (l=0; l<nzB; l++) { 2377 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2378 } 2379 2380 ct2 += ncols; 2381 } 2382 } 2383 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2384 } 2385 } 2386 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2387 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2388 2389 /* Allocate buffers for a->a, and send them off */ 2390 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2391 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2392 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2393 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2394 2395 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2396 { 2397 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2398 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2399 PetscInt cend = C->cmap->rend; 2400 PetscInt *b_j = b->j; 2401 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2402 2403 for (i=0; i<nrqr; i++) { 2404 rbuf1_i = rbuf1[i]; 2405 sbuf_aa_i = sbuf_aa[i]; 2406 ct1 = 2*rbuf1_i[0]+1; 2407 ct2 = 0; 2408 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2409 kmax = rbuf1_i[2*j]; 2410 for (k=0; k<kmax; k++,ct1++) { 2411 row = rbuf1_i[ct1] - rstart; 2412 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2413 ncols = nzA + nzB; 2414 cworkB = b_j + b_i[row]; 2415 vworkA = a_a + a_i[row]; 2416 vworkB = b_a + b_i[row]; 2417 2418 /* load the column values for this row into vals*/ 2419 vals = sbuf_aa_i+ct2; 2420 2421 lwrite = 0; 2422 for (l=0; l<nzB; l++) { 2423 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2424 } 2425 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2426 for (l=0; l<nzB; l++) { 2427 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2428 } 2429 2430 ct2 += ncols; 2431 } 2432 } 2433 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2434 } 2435 } 2436 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2437 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2438 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2439 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2440 2441 /* Form the matrix */ 2442 /* create col map: global col of C -> local col of submatrices */ 2443 { 2444 const PetscInt *icol_i; 2445 #if defined(PETSC_USE_CTABLE) 2446 ierr = 
PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 2447 for (i=0; i<ismax; i++) { 2448 if (!allcolumns[i]) { 2449 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2450 2451 jmax = ncol[i]; 2452 icol_i = icol[i]; 2453 cmap_i = cmap[i]; 2454 for (j=0; j<jmax; j++) { 2455 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2456 } 2457 } else { 2458 cmap[i] = NULL; 2459 } 2460 } 2461 #else 2462 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2463 for (i=0; i<ismax; i++) { 2464 if (!allcolumns[i]) { 2465 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2466 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2467 jmax = ncol[i]; 2468 icol_i = icol[i]; 2469 cmap_i = cmap[i]; 2470 for (j=0; j<jmax; j++) { 2471 cmap_i[icol_i[j]] = j+1; 2472 } 2473 } else { 2474 cmap[i] = NULL; 2475 } 2476 } 2477 #endif 2478 } 2479 2480 /* Create lens which is required for MatCreate... */ 2481 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2482 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2483 if (ismax) { 2484 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2485 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2486 } 2487 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2488 2489 /* Update lens from local data */ 2490 for (i=0; i<ismax; i++) { 2491 jmax = nrow[i]; 2492 if (!allcolumns[i]) cmap_i = cmap[i]; 2493 irow_i = irow[i]; 2494 lens_i = lens[i]; 2495 for (j=0; j<jmax; j++) { 2496 l = 0; 2497 row = irow_i[j]; 2498 while (row >= C->rmap->range[l+1]) l++; 2499 proc = l; 2500 if (proc == rank) { 2501 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2502 if (!allcolumns[i]) { 2503 for (k=0; k<ncols; k++) { 2504 #if defined(PETSC_USE_CTABLE) 2505 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2506 #else 2507 tcol = cmap_i[cols[k]]; 2508 #endif 2509 if (tcol) lens_i[j]++; 2510 } 2511 } else { /* allcolumns */ 2512 lens_i[j] = ncols; 2513 } 2514 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2515 } 2516 } 2517 } 2518 2519 /* Create row map: global row of C -> local row of submatrices */ 2520 #if defined(PETSC_USE_CTABLE) 2521 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 2522 for (i=0; i<ismax; i++) { 2523 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2524 irow_i = irow[i]; 2525 jmax = nrow[i]; 2526 for (j=0; j<jmax; j++) { 2527 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2528 } 2529 } 2530 #else 2531 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2532 if (ismax) { 2533 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2534 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2535 } 2536 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2537 for (i=0; i<ismax; i++) { 2538 rmap_i = rmap[i]; 2539 irow_i = irow[i]; 2540 jmax = nrow[i]; 2541 for (j=0; j<jmax; j++) { 2542 rmap_i[irow_i[j]] = j; 2543 } 2544 } 2545 #endif 2546 2547 /* Update lens from offproc data */ 2548 { 2549 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2550 2551 for (tmp2=0; tmp2<nrqs; tmp2++) { 2552 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2553 idex = pa[idex2]; 2554 sbuf1_i = sbuf1[idex]; 2555 jmax = sbuf1_i[0]; 2556 ct1 = 2*jmax+1; 2557 ct2 = 0; 2558 rbuf2_i = rbuf2[idex2]; 2559 rbuf3_i = rbuf3[idex2]; 2560 for (j=1; j<=jmax; j++) { 2561 is_no = sbuf1_i[2*j-1]; 2562 max1 = sbuf1_i[2*j]; 2563 lens_i = lens[is_no]; 2564 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2565 rmap_i = rmap[is_no]; 2566 
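/* for each of the max1 rows this process requested from proc idex: map the global row index to its local row in the submatrix (rmap_i), then count the received column indices that will end up in the submatrix (all of them if allcolumns, otherwise those present in cmap_i) so that lens_i[] holds the final row length */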
for (k=0; k<max1; k++,ct1++) { 2567 #if defined(PETSC_USE_CTABLE) 2568 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2569 row--; 2570 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2571 #else 2572 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2573 #endif 2574 max2 = rbuf2_i[ct1]; 2575 for (l=0; l<max2; l++,ct2++) { 2576 if (!allcolumns[is_no]) { 2577 #if defined(PETSC_USE_CTABLE) 2578 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2579 #else 2580 tcol = cmap_i[rbuf3_i[ct2]]; 2581 #endif 2582 if (tcol) lens_i[row]++; 2583 } else { /* allcolumns */ 2584 lens_i[row]++; /* lens_i[row] += max2 ? */ 2585 } 2586 } 2587 } 2588 } 2589 } 2590 } 2591 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2592 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2593 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2594 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2595 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2596 2597 /* Create the submatrices */ 2598 if (scall == MAT_REUSE_MATRIX) { 2599 PetscBool flag; 2600 2601 /* 2602 Assumes new rows are same length as the old rows,hence bug! 2603 */ 2604 for (i=0; i<ismax; i++) { 2605 mat = (Mat_SeqAIJ*)(submats[i]->data); 2606 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2607 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2608 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 2609 /* Initial matrix as if empty */ 2610 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2611 2612 submats[i]->factortype = C->factortype; 2613 } 2614 } else { 2615 for (i=0; i<ismax; i++) { 2616 PetscInt rbs,cbs; 2617 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2618 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2619 2620 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2621 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2622 2623 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2624 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2625 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2626 } 2627 } 2628 2629 /* Assemble the matrices */ 2630 /* First assemble the local rows */ 2631 { 2632 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2633 PetscScalar *imat_a; 2634 2635 for (i=0; i<ismax; i++) { 2636 mat = (Mat_SeqAIJ*)submats[i]->data; 2637 imat_ilen = mat->ilen; 2638 imat_j = mat->j; 2639 imat_i = mat->i; 2640 imat_a = mat->a; 2641 2642 if (!allcolumns[i]) cmap_i = cmap[i]; 2643 rmap_i = rmap[i]; 2644 irow_i = irow[i]; 2645 jmax = nrow[i]; 2646 for (j=0; j<jmax; j++) { 2647 l = 0; 2648 row = irow_i[j]; 2649 while (row >= C->rmap->range[l+1]) l++; 2650 proc = l; 2651 if (proc == rank) { 2652 old_row = row; 2653 #if defined(PETSC_USE_CTABLE) 2654 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2655 row--; 2656 #else 2657 row = rmap_i[row]; 2658 #endif 2659 ilen_row = imat_ilen[row]; 2660 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2661 mat_i = imat_i[row]; 2662 mat_a = imat_a + mat_i; 2663 mat_j = imat_j + mat_i; 2664 if (!allcolumns[i]) { 2665 for (k=0; k<ncols; k++) { 2666 #if defined(PETSC_USE_CTABLE) 2667 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2668 #else 2669 tcol = 
cmap_i[cols[k]]; 2670 #endif 2671 if (tcol) { 2672 *mat_j++ = tcol - 1; 2673 *mat_a++ = vals[k]; 2674 ilen_row++; 2675 } 2676 } 2677 } else { /* allcolumns */ 2678 for (k=0; k<ncols; k++) { 2679 *mat_j++ = cols[k]; /* global col index! */ 2680 *mat_a++ = vals[k]; 2681 ilen_row++; 2682 } 2683 } 2684 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2685 2686 imat_ilen[row] = ilen_row; 2687 } 2688 } 2689 } 2690 } 2691 2692 /* Now assemble the off proc rows*/ 2693 { 2694 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2695 PetscInt *imat_j,*imat_i; 2696 PetscScalar *imat_a,*rbuf4_i; 2697 2698 for (tmp2=0; tmp2<nrqs; tmp2++) { 2699 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2700 idex = pa[idex2]; 2701 sbuf1_i = sbuf1[idex]; 2702 jmax = sbuf1_i[0]; 2703 ct1 = 2*jmax + 1; 2704 ct2 = 0; 2705 rbuf2_i = rbuf2[idex2]; 2706 rbuf3_i = rbuf3[idex2]; 2707 rbuf4_i = rbuf4[idex2]; 2708 for (j=1; j<=jmax; j++) { 2709 is_no = sbuf1_i[2*j-1]; 2710 rmap_i = rmap[is_no]; 2711 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2712 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2713 imat_ilen = mat->ilen; 2714 imat_j = mat->j; 2715 imat_i = mat->i; 2716 imat_a = mat->a; 2717 max1 = sbuf1_i[2*j]; 2718 for (k=0; k<max1; k++,ct1++) { 2719 row = sbuf1_i[ct1]; 2720 #if defined(PETSC_USE_CTABLE) 2721 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2722 row--; 2723 #else 2724 row = rmap_i[row]; 2725 #endif 2726 ilen = imat_ilen[row]; 2727 mat_i = imat_i[row]; 2728 mat_a = imat_a + mat_i; 2729 mat_j = imat_j + mat_i; 2730 max2 = rbuf2_i[ct1]; 2731 if (!allcolumns[is_no]) { 2732 for (l=0; l<max2; l++,ct2++) { 2733 2734 #if defined(PETSC_USE_CTABLE) 2735 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2736 #else 2737 tcol = cmap_i[rbuf3_i[ct2]]; 2738 #endif 2739 if (tcol) { 2740 *mat_j++ = tcol - 1; 2741 *mat_a++ = rbuf4_i[ct2]; 2742 ilen++; 2743 } 2744 } 2745 } else { /* allcolumns */ 2746 for (l=0; l<max2; l++,ct2++) { 2747 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2748 *mat_a++ = rbuf4_i[ct2]; 2749 ilen++; 2750 } 2751 } 2752 imat_ilen[row] = ilen; 2753 } 2754 } 2755 } 2756 } 2757 2758 /* sort the rows */ 2759 { 2760 PetscInt *imat_ilen,*imat_j,*imat_i; 2761 PetscScalar *imat_a; 2762 2763 for (i=0; i<ismax; i++) { 2764 mat = (Mat_SeqAIJ*)submats[i]->data; 2765 imat_j = mat->j; 2766 imat_i = mat->i; 2767 imat_a = mat->a; 2768 imat_ilen = mat->ilen; 2769 2770 if (allcolumns[i]) continue; 2771 jmax = nrow[i]; 2772 for (j=0; j<jmax; j++) { 2773 PetscInt ilen; 2774 2775 mat_i = imat_i[j]; 2776 mat_a = imat_a + mat_i; 2777 mat_j = imat_j + mat_i; 2778 ilen = imat_ilen[j]; 2779 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2780 } 2781 } 2782 } 2783 2784 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2785 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2786 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2787 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2788 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2789 2790 /* Restore the indices */ 2791 for (i=0; i<ismax; i++) { 2792 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2793 if (!allcolumns[i]) { 2794 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2795 } 2796 } 2797 2798 /* Destroy allocated memory */ 2799 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2800 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2801 ierr = PetscFree(pa);CHKERRQ(ierr); 2802 2803 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2804 ierr = 
PetscFree(rbuf2);CHKERRQ(ierr); 2805 for (i=0; i<nrqr; ++i) { 2806 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2807 } 2808 for (i=0; i<nrqs; ++i) { 2809 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2810 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2811 } 2812 2813 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 2814 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 2815 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2816 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2817 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2818 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2819 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2820 2821 #if defined(PETSC_USE_CTABLE) 2822 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 2823 #else 2824 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 2825 #endif 2826 ierr = PetscFree(rmap);CHKERRQ(ierr); 2827 2828 for (i=0; i<ismax; i++) { 2829 if (!allcolumns[i]) { 2830 #if defined(PETSC_USE_CTABLE) 2831 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 2832 #else 2833 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 2834 #endif 2835 } 2836 } 2837 ierr = PetscFree(cmap);CHKERRQ(ierr); 2838 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2839 ierr = PetscFree(lens);CHKERRQ(ierr); 2840 2841 for (i=0; i<ismax; i++) { 2842 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2843 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2844 } 2845 PetscFunctionReturn(0); 2846 } 2847 2848 /* 2849 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2850 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2851 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n). 2852 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2853 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2854 state, and needs to be "assembled" later by compressing B's column space. 2855 2856 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2857 Following this call, C->A & C->B have been created, even if empty. 2858 */ 2859 #undef __FUNCT__ 2860 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 2861 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2862 { 2863 /* If making this function public, change the error returned in this function away from _PLIB. */ 2864 PetscErrorCode ierr; 2865 Mat_MPIAIJ *aij; 2866 Mat_SeqAIJ *Baij; 2867 PetscBool seqaij,Bdisassembled; 2868 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2869 PetscScalar v; 2870 const PetscInt *rowindices,*colindices; 2871 2872 PetscFunctionBegin; 2873 /* Check to make sure the component matrices (and embeddings) are compatible with C.
*/ 2874 if (A) { 2875 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2876 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2877 if (rowemb) { 2878 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2879 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2880 } else { 2881 if (C->rmap->n != A->rmap->n) { 2882 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2883 } 2884 } 2885 if (dcolemb) { 2886 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2887 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2888 } else { 2889 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2890 } 2891 } 2892 if (B) { 2893 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2894 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2895 if (rowemb) { 2896 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2897 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n); 2898 } else { 2899 if (C->rmap->n != B->rmap->n) { 2900 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2901 } 2902 } 2903 if (ocolemb) { 2904 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2905 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2906 } else { 2907 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2908 } 2909 } 2910 2911 aij = (Mat_MPIAIJ*)(C->data); 2912 if (!aij->A) { 2913 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2914 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2915 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2916 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2917 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2918 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2919 } 2920 if (A) { 2921 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2922 } else { 2923 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2924 } 2925 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2926 /* 2927 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2928 need to "disassemble" B -- convert it to using C's global indices. 2929 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2930 2931 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2932 we zero aij->B with MatZeroEntries() first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2933 2934 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2935 At least avoid calling MatSetValues() and the implied searches?
2936 */ 2937 2938 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2939 #if defined(PETSC_USE_CTABLE) 2940 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2941 #else 2942 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2943 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2944 if (aij->B) { 2945 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2946 } 2947 #endif 2948 ngcol = 0; 2949 if (aij->lvec) { 2950 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2951 } 2952 if (aij->garray) { 2953 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2954 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2955 } 2956 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2957 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2958 } 2959 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2960 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2961 } 2962 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2963 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2964 } 2965 } 2966 Bdisassembled = PETSC_FALSE; 2967 if (!aij->B) { 2968 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2969 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2970 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2971 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2972 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2973 Bdisassembled = PETSC_TRUE; 2974 } 2975 if (B) { 2976 Baij = (Mat_SeqAIJ*)(B->data); 2977 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2978 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2979 for (i=0; i<B->rmap->n; i++) { 2980 nz[i] = Baij->i[i+1] - Baij->i[i]; 2981 } 2982 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2983 ierr = PetscFree(nz);CHKERRQ(ierr); 2984 } 2985 2986 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2987 shift = rend-rstart; 2988 count = 0; 2989 rowindices = NULL; 2990 colindices = NULL; 2991 if (rowemb) { 2992 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 2993 } 2994 if (ocolemb) { 2995 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 2996 } 2997 for (i=0; i<B->rmap->n; i++) { 2998 PetscInt row; 2999 row = i; 3000 if (rowindices) row = rowindices[i]; 3001 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3002 col = Baij->j[count]; 3003 if (colindices) col = colindices[col]; 3004 if (Bdisassembled && col>=rstart) col += shift; 3005 v = Baij->a[count]; 3006 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3007 ++count; 3008 } 3009 } 3010 /* No assembly for aij->B is necessary. */ 3011 /* FIXME: set aij->B's nonzerostate correctly. */ 3012 } else { 3013 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3014 } 3015 C->preallocated = PETSC_TRUE; 3016 C->was_assembled = PETSC_FALSE; 3017 C->assembled = PETSC_FALSE; 3018 /* 3019 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3020 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3021 */ 3022 PetscFunctionReturn(0); 3023 } 3024 3025 #undef __FUNCT__ 3026 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 3027 /* 3028 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
3029 */ 3030 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3031 { 3032 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3033 3034 PetscFunctionBegin; 3035 PetscValidPointer(A,2); 3036 PetscValidPointer(B,3); 3037 /* FIXME: make sure C is assembled */ 3038 *A = aij->A; 3039 *B = aij->B; 3040 /* Note that we don't incref *A and *B, so be careful! */ 3041 PetscFunctionReturn(0); 3042 } 3043 3044 /* 3045 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3046 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3047 */ 3048 #undef __FUNCT__ 3049 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 3050 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3051 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3052 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3053 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3054 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3055 { 3056 PetscErrorCode ierr; 3057 PetscMPIInt isize,flag; 3058 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3059 Mat *A,*B; 3060 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3061 3062 PetscFunctionBegin; 3063 if (!ismax) PetscFunctionReturn(0); 3064 3065 for (i = 0, cismax = 0; i < ismax; ++i) { 3066 PetscMPIInt isize; 3067 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3068 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3069 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3070 if (isize > 1) ++cismax; 3071 } 3072 /* 3073 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3074 ispar counts the number of parallel ISs across C's comm. 3075 */ 3076 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3077 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3078 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3079 PetscFunctionReturn(0); 3080 } 3081 3082 /* if (ispar) */ 3083 /* 3084 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3085 These are used to extract the off-diag portion of the resulting parallel matrix. 3086 The row IS for the off-diag portion is the same as for the diag portion, 3087 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3088 */ 3089 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3090 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3091 for (i = 0, ii = 0; i < ismax; ++i) { 3092 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3093 if (isize > 1) { 3094 /* 3095 TODO: This is the part that's ***NOT SCALABLE***. 3096 To fix this we need to extract just the indices of C's nonzero columns 3097 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3098 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3099 to be done without serializing on the IS list, so, most likely, it is best 3100 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 

/*
  Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                             PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                             PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                             PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                             PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar,ciscol_localsize;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize;
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     We can use purely sequential matrix extraction if and only if cismax is zero on all of C's ranks;
     ispar records the maximum per-rank count of parallel ISs over C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Only sequential ISs across C's comm, so we can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii].  This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into nondecreasing order.  Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
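  /*
    Illustrative sketch (not part of the build) of the sort-permutation bookkeeping used above,
    following the semantics described in the comments: for local indices {5,2,7}, ISSortPermutation()
    yields the permutation {1,0,2} -- the original position of each entry of the sorted list {2,5,7} --
    so that sorted[k] == original[perm[k]]; a NULL permutation indicates the IS was already strictly
    increasing.  The variable names are hypothetical.
  */
#if 0
  {
    IS             tmp,perm;
    const PetscInt tmpidx[3] = {5,2,7};

    ierr = ISCreateGeneral(PETSC_COMM_SELF,3,tmpidx,PETSC_COPY_VALUES,&tmp);CHKERRQ(ierr);
    ierr = ISSortPermutation(tmp,PETSC_FALSE,&perm);CHKERRQ(ierr); /* perm = {1,0,2} here */
    ierr = ISSort(tmp);CHKERRQ(ierr);                              /* tmp  = {2,5,7} now  */
    ierr = ISDestroy(&perm);CHKERRQ(ierr);
    ierr = ISDestroy(&tmp);CHKERRQ(ierr);
  }
#endif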
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying them.
    Which arrays to allocate is based on the value of MatReuse scall and on whether the ISs are sorted
    and/or contain duplicates.

    There are as many diag matrices as there are original index sets.  There are only as many parallel
    and off-diag matrices as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts.  Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects.  To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (or retrieve composed AA[i] or BB[ii]) and reuse them here.  AA[i] and BB[ii] are then used to permute
      the extracted values into the A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place, then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place.  If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, the sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /* Workaround (no clean fix is available yet): the local column size of B[i] differs from the size
   * of ciscol[i].  B[i]'s size is ultimately determined during matrix assembly, while ciscol[i] is
   * computed as the complement of iscol[i].  A better fix would keep only the nonzero column indices
   * when computing the complement ciscol[i].
   */
  if (scall == MAT_REUSE_MATRIX) {
    for (i=0; i<cismax; i++) {
      ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
      B[i]->cmap->n = ciscol_localsize;
    }
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C.  This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, the permutation ISs, and maybe cisrow?  What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX;
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = NULL,BB = NULL;

        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
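/*
  Illustrative usage sketch (not part of the build): extracting one parallel submatrix whose
  row/column index sets live on C's communicator, first with MAT_INITIAL_MATRIX and then reusing
  the result.  The IS construction is elided; only the calling sequence is of interest.  It is
  assumed here that the public entry point MatGetSubMatricesMPI() dispatches to
  MatGetSubMatricesMPI_MPIAIJ() for MATMPIAIJ matrices.
*/
#if 0
static PetscErrorCode ExtractParallelSubmatrix(Mat C,IS isrow,IS iscol)
{
  Mat            *submat;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
  /* ... use submat[0]; after C's values change (same pattern), refresh it ... */
  ierr = MatGetSubMatricesMPI(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
  ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif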