1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 #undef __FUNCT__ 23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 25 { 26 PetscErrorCode ierr; 27 PetscInt i; 28 29 PetscFunctionBegin; 30 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 31 for (i=0; i<ov; ++i) { 32 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 33 } 34 PetscFunctionReturn(0); 35 } 36 37 #undef __FUNCT__ 38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable" 39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 40 { 41 PetscErrorCode ierr; 42 PetscInt i; 43 44 PetscFunctionBegin; 45 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 46 for (i=0; i<ov; ++i) { 47 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 48 } 49 PetscFunctionReturn(0); 50 } 51 52 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 56 { 57 PetscErrorCode ierr; 58 MPI_Comm comm; 59 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 60 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 61 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 62 const PetscInt *indices_i,**indices; 63 PetscLayout rmap; 64 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 65 PetscSF sf; 66 PetscSFNode *remote; 67 68 PetscFunctionBegin; 69 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 70 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 71 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 72 /* get row map to determine where rows should be going */ 73 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 74 /* retrieve IS data and put all together so that we 75 * can optimize communication 76 * */ 77 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 78 for (i=0,tlength=0; i<nidx; i++){ 79 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 80 tlength += length[i]; 81 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 82 } 83 /* find these rows on remote processors */ 84 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 85 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 86 nrrows = 0; 87 for (i=0; i<nidx; i++){ 88 length_i = length[i]; 89 indices_i = indices[i]; 90 for (j=0; j<length_i; j++){ 91 owner = -1; 92 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 93 /* remote processors */ 94 if (owner != rank){ 95 tosizes_temp[owner]++; /* number of rows to owner */ 96 rrow_ranks[nrrows] = owner; /* processor */ 97 rrow_isids[nrrows] = i; /* is id */ 98 remoterows[nrrows++] = indices_i[j]; /* row */ 99 } 100 } 101 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 102 } 103 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 104 /* test if we need to exchange messages 105 * generally speaking, we do not need to exchange 106 * data when overlap is 1 107 * */ 108 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 109 /* we do not have any messages 110 * It usually corresponds to overlap 1 111 * */ 112 if (!reducednrrows){ 113 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 114 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 115 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 nto = 0; 119 /* send sizes and ranks for building a two-sided communcation */ 120 for (i=0; i<size; i++){ 121 if (tosizes_temp[i]){ 122 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 123 tosizes_temp[i] = nto; /* a map from processor to index */ 124 toranks[nto++] = i; /* processor */ 125 } 126 } 127 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 128 for (i=0; i<nto; i++){ 129 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 130 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 131 } 132 /* send information to other processors */ 133 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 134 nrecvrows = 0; 135 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 136 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 137 nrecvrows = 0; 138 for (i=0; i<nfrom; i++){ 139 for (j=0; j<fromsizes[2*i]; j++){ 140 remote[nrecvrows].rank = fromranks[i]; 141 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 142 } 143 } 144 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 145 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 146 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 147 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 148 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 149 /* message pair <no of is, row> */ 150 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 151 for (i=0; i<nrrows; i++){ 152 owner = rrow_ranks[i]; /* processor */ 153 j = tosizes_temp[owner]; /* index */ 154 todata[toffsets[j]++] = rrow_isids[i]; 155 todata[toffsets[j]++] = remoterows[i]; 156 } 157 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 158 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 159 ierr = PetscFree(toffsets);CHKERRQ(ierr); 160 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 161 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 162 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 163 /* send rows belonging to the remote so that then we could get the overlapping data back */ 164 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 165 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 166 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 167 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 168 ierr = PetscFree(fromranks);CHKERRQ(ierr); 169 nrecvrows = 0; 170 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 171 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 172 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 173 nrecvrows = 0; 174 for (i=0; i<nto; i++){ 175 for (j=0; j<tosizes[2*i]; j++){ 176 remote[nrecvrows].rank = toranks[i]; 177 remote[nrecvrows++].index = tosizes[2*i+1]+j; 178 } 179 } 180 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 181 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 182 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 183 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 184 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 185 /* overlap communication and computation */ 186 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 187 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 188 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 189 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 190 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 191 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 192 ierr = PetscFree(toranks);CHKERRQ(ierr); 193 ierr = PetscFree(tosizes);CHKERRQ(ierr); 194 ierr = PetscFree(todata);CHKERRQ(ierr); 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable" 200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 201 { 202 PetscInt *isz,isz_i,i,j,is_id, data_size; 203 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 204 const PetscInt *indices_i_temp; 205 PetscErrorCode ierr; 206 207 PetscFunctionBegin; 208 max_lsize = 0; 209 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 210 for (i=0; i<nidx; i++){ 211 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 212 max_lsize = lsize>max_lsize ? lsize:max_lsize; 213 isz[i] = lsize; 214 } 215 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 216 for (i=0; i<nidx; i++){ 217 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 218 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 219 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 220 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 221 } 222 /* retrieve information to get row id and its overlap */ 223 for (i=0; i<nrecvs; ){ 224 is_id = recvdata[i++]; 225 data_size = recvdata[i++]; 226 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 227 isz_i = isz[is_id]; 228 for (j=0; j< data_size; j++){ 229 col = recvdata[i++]; 230 indices_i[isz_i++] = col; 231 } 232 isz[is_id] = isz_i; 233 } 234 /* remove duplicate entities */ 235 for (i=0; i<nidx; i++){ 236 indices_i = indices_temp+(max_lsize+nrecvs)*i; 237 isz_i = isz[i]; 238 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 239 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 240 } 241 ierr = PetscFree(isz);CHKERRQ(ierr); 242 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 243 PetscFunctionReturn(0); 244 } 245 246 #undef __FUNCT__ 247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable" 248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 249 { 250 PetscLayout rmap,cmap; 251 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 252 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 253 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 254 const PetscInt *gcols,*ai,*aj,*bi,*bj; 255 Mat amat,bmat; 256 PetscMPIInt rank; 257 PetscBool done; 258 MPI_Comm comm; 259 PetscErrorCode ierr; 260 261 PetscFunctionBegin; 262 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 263 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 264 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 265 /* Even if the mat is symmetric, we still assume it is not symmetric */ 266 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 267 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 268 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 269 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 270 /* total number of nonzero values is used to estimate the memory usage in the next step */ 271 tnz = ai[an]+bi[bn]; 272 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 273 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 274 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 275 /* to find the longest message */ 276 max_fszs = 0; 277 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 278 /* better way to estimate number of nonzero in the mat??? */ 279 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 280 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 281 rows_pos = 0; 282 totalrows = 0; 283 for (i=0; i<nfrom; i++){ 284 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 285 /* group data together */ 286 for (j=0; j<fromsizes[2*i]; j+=2){ 287 is_id = fromrows[rows_pos++];/* no of is */ 288 rows_i = rows_data[is_id]; 289 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 290 } 291 /* estimate a space to avoid multiple allocations */ 292 for (j=0; j<nidx; j++){ 293 indvc_ij = 0; 294 rows_i = rows_data[j]; 295 for (l=0; l<rows_pos_i[j]; l++){ 296 row = rows_i[l]-rstart; 297 start = ai[row]; 298 end = ai[row+1]; 299 for (k=start; k<end; k++){ /* Amat */ 300 col = aj[k] + cstart; 301 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 302 } 303 start = bi[row]; 304 end = bi[row+1]; 305 for (k=start; k<end; k++) { /* Bmat */ 306 col = gcols[bj[k]]; 307 indices_tmp[indvc_ij++] = col; 308 } 309 } 310 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 311 indv_counts[i*nidx+j] = indvc_ij; 312 totalrows += indvc_ij; 313 } 314 } 315 /* message triple <no of is, number of rows, rows> */ 316 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 317 totalrows = 0; 318 rows_pos = 0; 319 /* use this code again */ 320 for (i=0;i<nfrom;i++){ 321 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 322 for (j=0; j<fromsizes[2*i]; j+=2){ 323 is_id = fromrows[rows_pos++]; 324 rows_i = rows_data[is_id]; 325 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 326 } 327 /* add data */ 328 for (j=0; j<nidx; j++){ 329 if (!indv_counts[i*nidx+j]) continue; 330 indvc_ij = 0; 331 sbdata[totalrows++] = j; 332 sbdata[totalrows++] = indv_counts[i*nidx+j]; 333 sbsizes[2*i] += 2; 334 rows_i = rows_data[j]; 335 for (l=0; l<rows_pos_i[j]; l++){ 336 row = rows_i[l]-rstart; 337 start = ai[row]; 338 end = ai[row+1]; 339 for (k=start; k<end; k++){ /* Amat */ 340 col = aj[k] + cstart; 341 indices_tmp[indvc_ij++] = col; 342 } 343 start = bi[row]; 344 end = bi[row+1]; 345 for (k=start; k<end; k++) { /* Bmat */ 346 col = gcols[bj[k]]; 347 indices_tmp[indvc_ij++] = col; 348 } 349 } 350 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 351 sbsizes[2*i] += indvc_ij; 352 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 353 totalrows += indvc_ij; 354 } 355 } 356 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 357 for (i=0; i<nfrom; i++){ 358 offsets[i+1] = offsets[i] + sbsizes[2*i]; 359 sbsizes[2*i+1] = offsets[i]; 360 } 361 ierr = PetscFree(offsets);CHKERRQ(ierr); 362 if (sbrowsizes) *sbrowsizes = sbsizes; 363 if (sbrows) *sbrows = sbdata; 364 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 365 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 366 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 373 { 374 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 375 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 376 PetscInt lsize,lsize_tmp,owner; 377 PetscMPIInt rank; 378 Mat amat,bmat; 379 PetscBool done; 380 PetscLayout cmap,rmap; 381 MPI_Comm comm; 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 386 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 387 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 388 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 389 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 390 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 391 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 392 /* is it a safe way to compute number of nonzero values ? */ 393 tnz = ai[an]+bi[bn]; 394 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 395 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 396 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 397 /* it is a better way to estimate memory than the old implementation 398 * where global size of matrix is used 399 * */ 400 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 401 for (i=0; i<nidx; i++) { 402 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 403 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 404 lsize_tmp = 0; 405 for (j=0; j<lsize; j++) { 406 owner = -1; 407 row = indices[j]; 408 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 409 if (owner != rank) continue; 410 /* local number */ 411 row -= rstart; 412 start = ai[row]; 413 end = ai[row+1]; 414 for (k=start; k<end; k++) { /* Amat */ 415 col = aj[k] + cstart; 416 indices_temp[lsize_tmp++] = col; 417 } 418 start = bi[row]; 419 end = bi[row+1]; 420 for (k=start; k<end; k++) { /* Bmat */ 421 col = gcols[bj[k]]; 422 indices_temp[lsize_tmp++] = col; 423 } 424 } 425 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 426 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 427 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 428 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 429 } 430 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 431 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 432 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 433 PetscFunctionReturn(0); 434 } 435 436 437 /* 438 Sample message format: 439 If a processor A wants processor B to process some elements corresponding 440 to index sets is[1],is[5] 441 mesg [0] = 2 (no of index sets in the mesg) 442 ----------- 443 mesg [1] = 1 => is[1] 444 mesg [2] = sizeof(is[1]); 445 ----------- 446 mesg [3] = 5 => is[5] 447 mesg [4] = sizeof(is[5]); 448 ----------- 449 mesg [5] 450 mesg [n] datas[1] 451 ----------- 452 mesg[n+1] 453 mesg[m] data(is[5]) 454 ----------- 455 456 Notes: 457 nrqs - no of requests sent (or to be sent out) 458 nrqr - no of requests recieved (which have to be or which have been processed 459 */ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 463 { 464 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 465 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 466 const PetscInt **idx,*idx_i; 467 PetscInt *n,**data,len; 468 #if defined(PETSC_USE_CTABLE) 469 PetscTable *table_data,table_data_i; 470 PetscInt *tdata,tcount,tcount_max; 471 #else 472 PetscInt *data_i,*d_p; 473 #endif 474 PetscErrorCode ierr; 475 PetscMPIInt size,rank,tag1,tag2; 476 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 477 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 478 PetscBT *table; 479 MPI_Comm comm; 480 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 481 MPI_Status *s_status,*recv_status; 482 char *t_p; 483 484 PetscFunctionBegin; 485 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 486 size = c->size; 487 rank = c->rank; 488 M = C->rmap->N; 489 490 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 491 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 492 493 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 494 495 for (i=0; i<imax; i++) { 496 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 497 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 498 } 499 500 /* evaluate communication - mesg to who,length of mesg, and buffer space 501 required. Based on this, buffers are allocated, and data copied into them */ 502 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 503 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 504 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 505 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 506 for (i=0; i<imax; i++) { 507 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 508 idx_i = idx[i]; 509 len = n[i]; 510 for (j=0; j<len; j++) { 511 row = idx_i[j]; 512 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 513 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 514 w4[proc]++; 515 } 516 for (j=0; j<size; j++) { 517 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 518 } 519 } 520 521 nrqs = 0; /* no of outgoing messages */ 522 msz = 0; /* total mesg length (for all proc */ 523 w1[rank] = 0; /* no mesg sent to intself */ 524 w3[rank] = 0; 525 for (i=0; i<size; i++) { 526 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 527 } 528 /* pa - is list of processors to communicate with */ 529 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 530 for (i=0,j=0; i<size; i++) { 531 if (w1[i]) {pa[j] = i; j++;} 532 } 533 534 /* Each message would have a header = 1 + 2*(no of IS) + data */ 535 for (i=0; i<nrqs; i++) { 536 j = pa[i]; 537 w1[j] += w2[j] + 2*w3[j]; 538 msz += w1[j]; 539 } 540 541 /* Determine the number of messages to expect, their lengths, from from-ids */ 542 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 543 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 544 545 /* Now post the Irecvs corresponding to these messages */ 546 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 547 548 /* Allocate Memory for outgoing messages */ 549 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 550 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 551 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 552 553 { 554 PetscInt *iptr = tmp,ict = 0; 555 for (i=0; i<nrqs; i++) { 556 j = pa[i]; 557 iptr += ict; 558 outdat[j] = iptr; 559 ict = w1[j]; 560 } 561 } 562 563 /* Form the outgoing messages */ 564 /* plug in the headers */ 565 for (i=0; i<nrqs; i++) { 566 j = pa[i]; 567 outdat[j][0] = 0; 568 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 569 ptr[j] = outdat[j] + 2*w3[j] + 1; 570 } 571 572 /* Memory for doing local proc's work */ 573 { 574 #if defined(PETSC_USE_CTABLE) 575 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 576 for (i=0; i<imax; i++) { 577 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 578 } 579 ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 580 for (i=0; i<imax; i++) { 581 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 582 } 583 #else 584 PetscInt Mimax = 0,M_BPB_imax = 0; 585 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 586 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 587 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 588 ierr = PetscCalloc5(imax,&table,imax,&data,imax,&isz,M*imax,&d_p,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 589 for (i=0; i<imax; i++) { 590 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 591 data[i] = d_p + M*i; 592 } 593 #endif 594 } 595 596 /* Parse the IS and update local tables and the outgoing buf with the data */ 597 { 598 PetscInt n_i,isz_i,*outdat_j,ctr_j; 599 PetscBT table_i; 600 601 for (i=0; i<imax; i++) { 602 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 603 n_i = n[i]; 604 table_i = table[i]; 605 idx_i = idx[i]; 606 #if defined(PETSC_USE_CTABLE) 607 table_data_i = table_data[i]; 608 #else 609 data_i = data[i]; 610 #endif 611 isz_i = isz[i]; 612 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 613 row = idx_i[j]; 614 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 615 if (proc != rank) { /* copy to the outgoing buffer */ 616 ctr[proc]++; 617 *ptr[proc] = row; 618 ptr[proc]++; 619 } else if (!PetscBTLookupSet(table_i,row)) { 620 #if defined(PETSC_USE_CTABLE) 621 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 622 #else 623 data_i[isz_i] = row; /* Update the local table */ 624 #endif 625 isz_i++; 626 } 627 } 628 /* Update the headers for the current IS */ 629 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 630 if ((ctr_j = ctr[j])) { 631 outdat_j = outdat[j]; 632 k = ++outdat_j[0]; 633 outdat_j[2*k] = ctr_j; 634 outdat_j[2*k-1] = i; 635 } 636 } 637 isz[i] = isz_i; 638 } 639 } 640 641 /* Now post the sends */ 642 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 643 for (i=0; i<nrqs; ++i) { 644 j = pa[i]; 645 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 646 } 647 648 /* No longer need the original indices */ 649 for (i=0; i<imax; ++i) { 650 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 651 } 652 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 653 654 for (i=0; i<imax; ++i) { 655 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 656 } 657 658 /* Do Local work */ 659 #if defined(PETSC_USE_CTABLE) 660 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 661 #else 662 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 663 #endif 664 665 /* Receive messages */ 666 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 667 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 668 669 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 670 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 671 672 /* Phase 1 sends are complete - deallocate buffers */ 673 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 674 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 675 676 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 677 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 678 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 679 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 680 ierr = PetscFree(rbuf);CHKERRQ(ierr); 681 682 683 /* Send the data back */ 684 /* Do a global reduction to know the buffer space req for incoming messages */ 685 { 686 PetscMPIInt *rw1; 687 688 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 689 690 for (i=0; i<nrqr; ++i) { 691 proc = recv_status[i].MPI_SOURCE; 692 693 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 694 rw1[proc] = isz1[i]; 695 } 696 ierr = PetscFree(onodes1);CHKERRQ(ierr); 697 ierr = PetscFree(olengths1);CHKERRQ(ierr); 698 699 /* Determine the number of messages to expect, their lengths, from from-ids */ 700 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 701 ierr = PetscFree(rw1);CHKERRQ(ierr); 702 } 703 /* Now post the Irecvs corresponding to these messages */ 704 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 705 706 /* Now post the sends */ 707 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 708 for (i=0; i<nrqr; ++i) { 709 j = recv_status[i].MPI_SOURCE; 710 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 711 } 712 713 /* receive work done on other processors */ 714 { 715 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 716 PetscMPIInt idex; 717 PetscBT table_i; 718 MPI_Status *status2; 719 720 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 721 for (i=0; i<nrqs; ++i) { 722 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 723 /* Process the message */ 724 rbuf2_i = rbuf2[idex]; 725 ct1 = 2*rbuf2_i[0]+1; 726 jmax = rbuf2[idex][0]; 727 for (j=1; j<=jmax; j++) { 728 max = rbuf2_i[2*j]; 729 is_no = rbuf2_i[2*j-1]; 730 isz_i = isz[is_no]; 731 table_i = table[is_no]; 732 #if defined(PETSC_USE_CTABLE) 733 table_data_i = table_data[is_no]; 734 #else 735 data_i = data[is_no]; 736 #endif 737 for (k=0; k<max; k++,ct1++) { 738 row = rbuf2_i[ct1]; 739 if (!PetscBTLookupSet(table_i,row)) { 740 #if defined(PETSC_USE_CTABLE) 741 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 742 #else 743 data_i[isz_i] = row; 744 #endif 745 isz_i++; 746 } 747 } 748 isz[is_no] = isz_i; 749 } 750 } 751 752 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 753 ierr = PetscFree(status2);CHKERRQ(ierr); 754 } 755 756 #if defined(PETSC_USE_CTABLE) 757 tcount_max = 0; 758 for (i=0; i<imax; ++i) { 759 table_data_i = table_data[i]; 760 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 761 if (tcount_max < tcount) tcount_max = tcount; 762 } 763 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 764 #endif 765 766 for (i=0; i<imax; ++i) { 767 #if defined(PETSC_USE_CTABLE) 768 PetscTablePosition tpos; 769 table_data_i = table_data[i]; 770 771 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 772 while (tpos) { 773 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 774 tdata[--j] = --k; 775 } 776 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 777 #else 778 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 779 #endif 780 } 781 782 ierr = PetscFree(onodes2);CHKERRQ(ierr); 783 ierr = PetscFree(olengths2);CHKERRQ(ierr); 784 785 ierr = PetscFree(pa);CHKERRQ(ierr); 786 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 787 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 788 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 789 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 790 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 791 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 792 ierr = PetscFree(s_status);CHKERRQ(ierr); 793 ierr = PetscFree(recv_status);CHKERRQ(ierr); 794 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 795 ierr = PetscFree(xdata);CHKERRQ(ierr); 796 ierr = PetscFree(isz1);CHKERRQ(ierr); 797 #if defined(PETSC_USE_CTABLE) 798 for (i=0; i<imax; i++) { 799 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 800 } 801 ierr = PetscFree(table_data);CHKERRQ(ierr); 802 ierr = PetscFree(tdata);CHKERRQ(ierr); 803 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 804 #else 805 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 806 #endif 807 PetscFunctionReturn(0); 808 } 809 810 #undef __FUNCT__ 811 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 812 /* 813 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 814 the work on the local processor. 815 816 Inputs: 817 C - MAT_MPIAIJ; 818 imax - total no of index sets processed at a time; 819 table - an array of char - size = m bits. 820 821 Output: 822 isz - array containing the count of the solution elements corresponding 823 to each index set; 824 data or table_data - pointer to the solutions 825 */ 826 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 827 { 828 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 829 Mat A = c->A,B = c->B; 830 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 831 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 832 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 833 PetscBT table_i; 834 #if defined(PETSC_USE_CTABLE) 835 PetscTable table_data_i; 836 PetscErrorCode ierr; 837 PetscTablePosition tpos; 838 PetscInt tcount,*tdata; 839 #else 840 PetscInt *data_i; 841 #endif 842 843 PetscFunctionBegin; 844 rstart = C->rmap->rstart; 845 cstart = C->cmap->rstart; 846 ai = a->i; 847 aj = a->j; 848 bi = b->i; 849 bj = b->j; 850 garray = c->garray; 851 852 for (i=0; i<imax; i++) { 853 #if defined(PETSC_USE_CTABLE) 854 /* copy existing entries of table_data_i into tdata[] */ 855 table_data_i = table_data[i]; 856 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 857 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 858 859 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 860 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 861 while (tpos) { 862 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 863 tdata[--j] = --row; 864 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 865 } 866 #else 867 data_i = data[i]; 868 #endif 869 table_i = table[i]; 870 isz_i = isz[i]; 871 max = isz[i]; 872 873 for (j=0; j<max; j++) { 874 #if defined(PETSC_USE_CTABLE) 875 row = tdata[j] - rstart; 876 #else 877 row = data_i[j] - rstart; 878 #endif 879 start = ai[row]; 880 end = ai[row+1]; 881 for (k=start; k<end; k++) { /* Amat */ 882 val = aj[k] + cstart; 883 if (!PetscBTLookupSet(table_i,val)) { 884 #if defined(PETSC_USE_CTABLE) 885 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 886 #else 887 data_i[isz_i] = val; 888 #endif 889 isz_i++; 890 } 891 } 892 start = bi[row]; 893 end = bi[row+1]; 894 for (k=start; k<end; k++) { /* Bmat */ 895 val = garray[bj[k]]; 896 if (!PetscBTLookupSet(table_i,val)) { 897 #if defined(PETSC_USE_CTABLE) 898 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 899 #else 900 data_i[isz_i] = val; 901 #endif 902 isz_i++; 903 } 904 } 905 } 906 isz[i] = isz_i; 907 908 #if defined(PETSC_USE_CTABLE) 909 ierr = PetscFree(tdata);CHKERRQ(ierr); 910 #endif 911 } 912 PetscFunctionReturn(0); 913 } 914 915 #undef __FUNCT__ 916 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 917 /* 918 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 919 and return the output 920 921 Input: 922 C - the matrix 923 nrqr - no of messages being processed. 924 rbuf - an array of pointers to the recieved requests 925 926 Output: 927 xdata - array of messages to be sent back 928 isz1 - size of each message 929 930 For better efficiency perhaps we should malloc separately each xdata[i], 931 then if a remalloc is required we need only copy the data for that one row 932 rather then all previous rows as it is now where a single large chunck of 933 memory is used. 934 935 */ 936 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 937 { 938 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 939 Mat A = c->A,B = c->B; 940 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 941 PetscErrorCode ierr; 942 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 943 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 944 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 945 PetscInt *rbuf_i,kmax,rbuf_0; 946 PetscBT xtable; 947 948 PetscFunctionBegin; 949 m = C->rmap->N; 950 rstart = C->rmap->rstart; 951 cstart = C->cmap->rstart; 952 ai = a->i; 953 aj = a->j; 954 bi = b->i; 955 bj = b->j; 956 garray = c->garray; 957 958 959 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 960 rbuf_i = rbuf[i]; 961 rbuf_0 = rbuf_i[0]; 962 ct += rbuf_0; 963 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 964 } 965 966 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 967 else max1 = 1; 968 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 969 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 970 ++no_malloc; 971 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 972 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 973 974 ct3 = 0; 975 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 976 rbuf_i = rbuf[i]; 977 rbuf_0 = rbuf_i[0]; 978 ct1 = 2*rbuf_0+1; 979 ct2 = ct1; 980 ct3 += ct1; 981 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 982 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 983 oct2 = ct2; 984 kmax = rbuf_i[2*j]; 985 for (k=0; k<kmax; k++,ct1++) { 986 row = rbuf_i[ct1]; 987 if (!PetscBTLookupSet(xtable,row)) { 988 if (!(ct3 < mem_estimate)) { 989 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 990 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 991 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 992 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 993 xdata[0] = tmp; 994 mem_estimate = new_estimate; ++no_malloc; 995 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 996 } 997 xdata[i][ct2++] = row; 998 ct3++; 999 } 1000 } 1001 for (k=oct2,max2=ct2; k<max2; k++) { 1002 row = xdata[i][k] - rstart; 1003 start = ai[row]; 1004 end = ai[row+1]; 1005 for (l=start; l<end; l++) { 1006 val = aj[l] + cstart; 1007 if (!PetscBTLookupSet(xtable,val)) { 1008 if (!(ct3 < mem_estimate)) { 1009 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1010 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1011 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1012 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1013 xdata[0] = tmp; 1014 mem_estimate = new_estimate; ++no_malloc; 1015 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1016 } 1017 xdata[i][ct2++] = val; 1018 ct3++; 1019 } 1020 } 1021 start = bi[row]; 1022 end = bi[row+1]; 1023 for (l=start; l<end; l++) { 1024 val = garray[bj[l]]; 1025 if (!PetscBTLookupSet(xtable,val)) { 1026 if (!(ct3 < mem_estimate)) { 1027 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1028 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1029 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1030 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1031 xdata[0] = tmp; 1032 mem_estimate = new_estimate; ++no_malloc; 1033 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1034 } 1035 xdata[i][ct2++] = val; 1036 ct3++; 1037 } 1038 } 1039 } 1040 /* Update the header*/ 1041 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1042 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1043 } 1044 xdata[i][0] = rbuf_0; 1045 xdata[i+1] = xdata[i] + ct2; 1046 isz1[i] = ct2; /* size of each message */ 1047 } 1048 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1049 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1050 PetscFunctionReturn(0); 1051 } 1052 /* -------------------------------------------------------------------------*/ 1053 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 1054 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1055 /* 1056 Every processor gets the entire matrix 1057 */ 1058 #undef __FUNCT__ 1059 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 1060 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1061 { 1062 Mat B; 1063 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1064 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1065 PetscErrorCode ierr; 1066 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1067 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1068 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1069 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1070 1071 PetscFunctionBegin; 1072 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1073 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1074 1075 if (scall == MAT_INITIAL_MATRIX) { 1076 /* ---------------------------------------------------------------- 1077 Tell every processor the number of nonzeros per row 1078 */ 1079 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1080 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1081 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1082 } 1083 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1084 for (i=0; i<size; i++) { 1085 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1086 displs[i] = A->rmap->range[i]; 1087 } 1088 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1089 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1090 #else 1091 sendcount = A->rmap->rend - A->rmap->rstart; 1092 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1093 #endif 1094 /* --------------------------------------------------------------- 1095 Create the sequential matrix of the same type as the local block diagonal 1096 */ 1097 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1098 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1099 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1100 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1101 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1102 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1103 **Bin = B; 1104 b = (Mat_SeqAIJ*)B->data; 1105 1106 /*-------------------------------------------------------------------- 1107 Copy my part of matrix column indices over 1108 */ 1109 sendcount = ad->nz + bd->nz; 1110 jsendbuf = b->j + b->i[rstarts[rank]]; 1111 a_jsendbuf = ad->j; 1112 b_jsendbuf = bd->j; 1113 n = A->rmap->rend - A->rmap->rstart; 1114 cnt = 0; 1115 for (i=0; i<n; i++) { 1116 1117 /* put in lower diagonal portion */ 1118 m = bd->i[i+1] - bd->i[i]; 1119 while (m > 0) { 1120 /* is it above diagonal (in bd (compressed) numbering) */ 1121 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1122 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1123 m--; 1124 } 1125 1126 /* put in diagonal portion */ 1127 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1128 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1129 } 1130 1131 /* put in upper diagonal portion */ 1132 while (m-- > 0) { 1133 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1134 } 1135 } 1136 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1137 1138 /*-------------------------------------------------------------------- 1139 Gather all column indices to all processors 1140 */ 1141 for (i=0; i<size; i++) { 1142 recvcounts[i] = 0; 1143 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1144 recvcounts[i] += lens[j]; 1145 } 1146 } 1147 displs[0] = 0; 1148 for (i=1; i<size; i++) { 1149 displs[i] = displs[i-1] + recvcounts[i-1]; 1150 } 1151 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1152 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1153 #else 1154 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1155 #endif 1156 /*-------------------------------------------------------------------- 1157 Assemble the matrix into useable form (note numerical values not yet set) 1158 */ 1159 /* set the b->ilen (length of each row) values */ 1160 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1161 /* set the b->i indices */ 1162 b->i[0] = 0; 1163 for (i=1; i<=A->rmap->N; i++) { 1164 b->i[i] = b->i[i-1] + lens[i-1]; 1165 } 1166 ierr = PetscFree(lens);CHKERRQ(ierr); 1167 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1168 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1169 1170 } else { 1171 B = **Bin; 1172 b = (Mat_SeqAIJ*)B->data; 1173 } 1174 1175 /*-------------------------------------------------------------------- 1176 Copy my part of matrix numerical values into the values location 1177 */ 1178 if (flag == MAT_GET_VALUES) { 1179 sendcount = ad->nz + bd->nz; 1180 sendbuf = b->a + b->i[rstarts[rank]]; 1181 a_sendbuf = ad->a; 1182 b_sendbuf = bd->a; 1183 b_sendj = bd->j; 1184 n = A->rmap->rend - A->rmap->rstart; 1185 cnt = 0; 1186 for (i=0; i<n; i++) { 1187 1188 /* put in lower diagonal portion */ 1189 m = bd->i[i+1] - bd->i[i]; 1190 while (m > 0) { 1191 /* is it above diagonal (in bd (compressed) numbering) */ 1192 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1193 sendbuf[cnt++] = *b_sendbuf++; 1194 m--; 1195 b_sendj++; 1196 } 1197 1198 /* put in diagonal portion */ 1199 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1200 sendbuf[cnt++] = *a_sendbuf++; 1201 } 1202 1203 /* put in upper diagonal portion */ 1204 while (m-- > 0) { 1205 sendbuf[cnt++] = *b_sendbuf++; 1206 b_sendj++; 1207 } 1208 } 1209 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1210 1211 /* ----------------------------------------------------------------- 1212 Gather all numerical values to all processors 1213 */ 1214 if (!recvcounts) { 1215 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1216 } 1217 for (i=0; i<size; i++) { 1218 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1219 } 1220 displs[0] = 0; 1221 for (i=1; i<size; i++) { 1222 displs[i] = displs[i-1] + recvcounts[i-1]; 1223 } 1224 recvbuf = b->a; 1225 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1226 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1227 #else 1228 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1229 #endif 1230 } /* endof (flag == MAT_GET_VALUES) */ 1231 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1232 1233 if (A->symmetric) { 1234 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1235 } else if (A->hermitian) { 1236 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1237 } else if (A->structurally_symmetric) { 1238 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1239 } 1240 PetscFunctionReturn(0); 1241 } 1242 #undef __FUNCT__ 1243 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices" 1244 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1245 { 1246 PetscErrorCode ierr; 1247 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1248 Mat_SubMat *submatj = c->submatis1; 1249 PetscInt i; 1250 1251 PetscFunctionBegin; 1252 ierr = submatj->destroy(C);CHKERRQ(ierr); 1253 1254 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1255 1256 for (i=0; i<submatj->nrqr; ++i) { 1257 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1258 } 1259 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1260 1261 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1262 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1263 1264 for (i=0; i<submatj->nrqs; ++i) { 1265 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1266 } 1267 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1268 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1269 1270 ierr = PetscFree(submatj->r2pmap);CHKERRQ(ierr); 1271 1272 #if defined(PETSC_USE_CTABLE) 1273 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1274 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1275 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1276 #else 1277 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1278 #endif 1279 1280 if (!submatj->allcolumns) { 1281 #if defined(PETSC_USE_CTABLE) 1282 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1283 #else 1284 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1285 #endif 1286 } 1287 1288 ierr = PetscFree(submatj);CHKERRQ(ierr); 1289 PetscFunctionReturn(0); 1290 } 1291 1292 #undef __FUNCT__ 1293 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local" 1294 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1295 { 1296 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1297 Mat submat,A = c->A,B = c->B; 1298 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1299 PetscInt *ai = a->i,*bi = b->i,nzA,nzB; 1300 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1301 const PetscInt *icol,*irow; 1302 PetscInt nrow,ncol,start; 1303 PetscErrorCode ierr; 1304 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 1305 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1306 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1307 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1308 PetscInt *lens,ncols,*cols,Crow; 1309 #if defined(PETSC_USE_CTABLE) 1310 PetscTable cmap,rmap; 1311 PetscInt *cmap_loc,*rmap_loc; 1312 #else 1313 PetscInt *cmap,*rmap; 1314 #endif 1315 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1316 PetscInt *cworkB,lwrite,*bj = b->j,*subcols,*r2pmap; 1317 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1318 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1319 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1320 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1321 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1322 MPI_Comm comm; 1323 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1324 PetscMPIInt *onodes1,*olengths1,idex,end; 1325 Mat_SubMat *smatis1; 1326 #if defined(PETSC_USE_LOG) 1327 PetscLogStage stage[5]; 1328 #endif 1329 1330 PetscFunctionBegin; 1331 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1332 1333 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1334 size = c->size; 1335 rank = c->rank; 1336 1337 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1338 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1339 if (allcolumns) { 1340 icol = NULL; 1341 ncol = C->cmap->N; 1342 } else { 1343 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1344 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1345 } 1346 1347 if (scall == MAT_INITIAL_MATRIX) { 1348 PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp; 1349 1350 ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr); 1351 ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr); 1352 ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr); 1353 ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr); 1354 ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr); 1355 1356 ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr); 1357 1358 /* Get some new tags to keep the communication clean */ 1359 tag1 = ((PetscObject)C)->tag; 1360 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1361 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1362 1363 /* evaluate communication - mesg to who, length of mesg, and buffer space 1364 required. Based on this, buffers are allocated, and data copied into them */ 1365 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 1366 ierr = PetscMalloc1(nrow,&r2pmap);CHKERRQ(ierr); 1367 1368 /* w4[proc] = num of rows owned by this proc */ 1369 for (j=0; j<nrow; j++) { 1370 proc = 0; 1371 row = irow[j]; 1372 while (row >= C->rmap->range[proc+1]) proc++; 1373 w4[proc]++; 1374 r2pmap[j] = proc; /* map row to proc */ 1375 } 1376 for (proc=0; proc<size; proc++) { 1377 if (w4[proc]) { w1[proc] += w4[proc]; w3[proc]++;} 1378 } 1379 1380 nrqs = 0; /* no of outgoing messages */ 1381 msz = 0; /* total mesg length (for all procs) */ 1382 w1[rank] = 0; /* no mesg sent to self */ 1383 w3[rank] = 0; 1384 for (proc=0; proc<size; proc++) { 1385 if (w1[proc]) { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */ 1386 } 1387 1388 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1389 for (i=0,j=0; i<size; i++) { 1390 if (w1[i]) { pa[j] = i; j++; } 1391 } 1392 1393 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1394 for (i=0; i<nrqs; i++) { 1395 proc = pa[i]; 1396 w1[proc] += w2[proc] + 2* w3[proc]; 1397 msz += w1[proc]; 1398 } 1399 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1400 1401 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1402 /* if w2[i]=1, a message of length w1[i] will be sent to proc[i]; */ 1403 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1404 1405 /* nrqs: nsend; nrqr: nrecv; w1: send message length; onodes1: recv id; olengths1: recv message length */ 1406 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1407 1408 /* Now post the Irecvs corresponding to these messages */ 1409 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1410 1411 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1412 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1413 1414 /* Allocate Memory for outgoing messages */ 1415 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1416 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1417 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1418 1419 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1420 iptr = tmp; 1421 k = 0; 1422 for (i=0; i<nrqs; i++) { 1423 proc = pa[i]; 1424 iptr += k; 1425 sbuf1[proc] = iptr; 1426 k = w1[proc]; 1427 } 1428 1429 /* Form the outgoing messages */ 1430 /* Initialize the header space */ 1431 for (i=0; i<nrqs; i++) { 1432 proc = pa[i]; 1433 sbuf1[proc][0] = 0; 1434 ierr = PetscMemzero(sbuf1[proc]+1,2*w3[proc]*sizeof(PetscInt));CHKERRQ(ierr); 1435 ptr[proc] = sbuf1[proc] + 2*w3[proc] + 1; 1436 } 1437 1438 /* Parse the isrow and copy data into outbuf */ 1439 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1440 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1441 proc = 0; 1442 row = irow[j]; 1443 while (row >= C->rmap->range[proc+1]) proc++; 1444 if (proc != rank) { /* copy to the outgoing buf*/ 1445 ctr[proc]++; 1446 *ptr[proc] = row; 1447 ptr[proc]++; 1448 } 1449 } 1450 1451 /* Update the headers for the current IS */ 1452 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1453 if ((ctr_j = ctr[j])) { 1454 sbuf1_j = sbuf1[j]; 1455 k = ++sbuf1_j[0]; 1456 sbuf1_j[2*k] = ctr_j; 1457 sbuf1_j[2*k-1] = 0; 1458 } 1459 } 1460 1461 /* Now post the sends */ 1462 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1463 for (i=0; i<nrqs; ++i) { 1464 proc = pa[i]; 1465 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1466 } 1467 1468 /* Post Receives to capture the buffer size */ 1469 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1470 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1471 1472 rbuf2[0] = tmp + msz; 1473 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1474 1475 for (i=0; i<nrqs; ++i) { 1476 proc = pa[i]; 1477 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1478 } 1479 1480 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1481 1482 /* Send to other procs the buf size they should allocate */ 1483 /* Receive messages*/ 1484 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1485 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1486 1487 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1488 for (i=0; i<nrqr; ++i) { 1489 req_size[i] = 0; 1490 rbuf1_i = rbuf1[i]; 1491 start = 2*rbuf1_i[0] + 1; 1492 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1493 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1494 sbuf2_i = sbuf2[i]; 1495 for (j=start; j<end; j++) { 1496 k = rbuf1_i[j] - rstart; 1497 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1498 sbuf2_i[j] = ncols; 1499 req_size[i] += ncols; 1500 } 1501 req_source1[i] = r_status1[i].MPI_SOURCE; 1502 1503 /* form the header */ 1504 sbuf2_i[0] = req_size[i]; 1505 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1506 1507 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1508 } 1509 1510 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1511 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1512 1513 /* rbuf2 is received, Post recv column indices a->j */ 1514 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1515 1516 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1517 for (i=0; i<nrqs; ++i) { 1518 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1519 req_source2[i] = r_status2[i].MPI_SOURCE; 1520 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1521 } 1522 1523 /* Wait on sends1 and sends2 */ 1524 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1525 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1526 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1527 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1528 1529 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1530 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1531 1532 ierr = PetscLogStagePop();CHKERRQ(ierr); 1533 ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr); 1534 1535 /* Now allocate sending buffers for a->j, and send them off */ 1536 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1537 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1538 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1539 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1540 1541 for (i=0; i<nrqr; i++) { /* for each requested message */ 1542 rbuf1_i = rbuf1[i]; 1543 sbuf_aj_i = sbuf_aj[i]; 1544 ct1 = 2*rbuf1_i[0] + 1; 1545 ct2 = 0; 1546 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1547 1548 kmax = rbuf1[i][2]; 1549 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1550 row = rbuf1_i[ct1] - rstart; 1551 nzA = ai[row+1] - ai[row]; nzB = bi[row+1] - bi[row]; 1552 ncols = nzA + nzB; 1553 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1554 1555 /* load the column indices for this row into cols*/ 1556 cols = sbuf_aj_i + ct2; 1557 1558 lwrite = 0; 1559 for (l=0; l<nzB; l++) { 1560 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1561 } 1562 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1563 for (l=0; l<nzB; l++) { 1564 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1565 } 1566 1567 ct2 += ncols; 1568 } 1569 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1570 } 1571 1572 /* create column map (cmap): global col of C -> local col of submat */ 1573 #if defined(PETSC_USE_CTABLE) 1574 if (!allcolumns) { 1575 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1576 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1577 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1578 if (icol[j] >= cstart && icol[j] <cend) { 1579 cmap_loc[icol[j] - cstart] = j+1; 1580 } else { /* use PetscTable for non-local col indices */ 1581 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1582 } 1583 } 1584 } else { 1585 cmap = NULL; 1586 cmap_loc = NULL; 1587 } 1588 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1589 #else 1590 if (!allcolumns) { 1591 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1592 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1593 } else { 1594 cmap = NULL; 1595 } 1596 #endif 1597 1598 /* Create lens for MatSeqAIJSetPreallocation() */ 1599 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1600 1601 /* Compute lens from local part of C */ 1602 for (j=0; j<nrow; j++) { 1603 proc = 0; 1604 row = irow[j]; 1605 while (row >= C->rmap->range[proc+1]) proc++; 1606 if (proc == rank) { 1607 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1608 if (!allcolumns) { 1609 for (k=0; k<ncols; k++) { 1610 #if defined(PETSC_USE_CTABLE) 1611 if (cols[k] >= cstart && cols[k] <cend) { 1612 tcol = cmap_loc[cols[k] - cstart]; 1613 } else { 1614 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1615 } 1616 #else 1617 tcol = cmap[cols[k]]; 1618 #endif 1619 if (tcol) lens[j]++; 1620 } 1621 } else { /* allcolumns */ 1622 lens[j] = ncols; 1623 } 1624 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1625 } 1626 } 1627 1628 /* Create row map (rmap): global row of C -> local row of submat */ 1629 #if defined(PETSC_USE_CTABLE) 1630 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1631 for (j=0; j<nrow; j++) { 1632 row = irow[j]; 1633 /* 1634 proc = 0; 1635 while (row >= C->rmap->range[proc+1]) proc++; 1636 */ 1637 proc = r2pmap[j]; 1638 if (proc == rank) { /* a local row */ 1639 rmap_loc[row - rstart] = j; 1640 } else { 1641 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1642 } 1643 } 1644 #else 1645 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1646 for (j=0; j<nrow; j++) { 1647 rmap[irow[j]] = j; 1648 } 1649 #endif 1650 1651 /* Update lens from offproc data */ 1652 /* recv a->j is done */ 1653 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1654 for (i=0; i<nrqs; i++) { 1655 proc = pa[i]; 1656 sbuf1_i = sbuf1[proc]; 1657 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1658 ct1 = 2 + 1; 1659 ct2 = 0; 1660 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1661 rbuf3_i = rbuf3[i]; /* received C->j */ 1662 1663 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1664 max1 = sbuf1_i[2]; 1665 for (k=0; k<max1; k++,ct1++) { 1666 #if defined(PETSC_USE_CTABLE) 1667 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1668 row--; 1669 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1670 #else 1671 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1672 #endif 1673 /* Now, store row index of submat in sbuf1_i[ct1] */ 1674 sbuf1_i[ct1] = row; 1675 1676 nnz = rbuf2_i[ct1]; 1677 if (!allcolumns) { 1678 for (l=0; l<nnz; l++,ct2++) { 1679 #if defined(PETSC_USE_CTABLE) 1680 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1681 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1682 } else { 1683 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1684 } 1685 #else 1686 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1687 #endif 1688 if (tcol) { 1689 lens[row]++; 1690 1691 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1692 For reuse, we replace received C->j with index that should be inserted to submat */ 1693 /* rbuf3_i[ct3++] = ct2; ??? */ 1694 } 1695 } 1696 } else { /* allcolumns */ 1697 lens[row] += nnz; 1698 } 1699 } 1700 } 1701 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1702 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1703 1704 /* Create the submatrices */ 1705 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1706 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1707 1708 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1709 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1710 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1711 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1712 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1713 1714 /* create struct Mat_SubMat and attached it to submat */ 1715 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1716 subc = (Mat_SeqAIJ*)submat->data; 1717 subc->submatis1 = smatis1; 1718 1719 smatis1->nrqs = nrqs; 1720 smatis1->nrqr = nrqr; 1721 smatis1->rbuf1 = rbuf1; 1722 smatis1->rbuf2 = rbuf2; 1723 smatis1->rbuf3 = rbuf3; 1724 smatis1->sbuf2 = sbuf2; 1725 smatis1->req_source2 = req_source2; 1726 1727 smatis1->sbuf1 = sbuf1; 1728 smatis1->ptr = ptr; 1729 smatis1->tmp = tmp; 1730 smatis1->ctr = ctr; 1731 1732 smatis1->pa = pa; 1733 smatis1->req_size = req_size; 1734 smatis1->req_source1 = req_source1; 1735 1736 smatis1->allcolumns = allcolumns; 1737 smatis1->r2pmap = r2pmap; 1738 smatis1->rmap = rmap; 1739 smatis1->cmap = cmap; 1740 #if defined(PETSC_USE_CTABLE) 1741 smatis1->rmap_loc = rmap_loc; 1742 smatis1->cmap_loc = cmap_loc; 1743 #endif 1744 1745 smatis1->destroy = submat->ops->destroy; 1746 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1747 submat->factortype = C->factortype; 1748 1749 ierr = PetscLogStagePop();CHKERRQ(ierr); 1750 ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr); 1751 } else { /* scall == MAT_REUSE_MATRIX */ 1752 submat = submats[0]; 1753 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1754 1755 subc = (Mat_SeqAIJ*)submat->data; 1756 smatis1 = subc->submatis1; 1757 nrqs = smatis1->nrqs; 1758 nrqr = smatis1->nrqr; 1759 rbuf1 = smatis1->rbuf1; 1760 rbuf2 = smatis1->rbuf2; 1761 rbuf3 = smatis1->rbuf3; 1762 req_source2 = smatis1->req_source2; 1763 1764 sbuf1 = smatis1->sbuf1; 1765 sbuf2 = smatis1->sbuf2; 1766 ptr = smatis1->ptr; 1767 tmp = smatis1->tmp; 1768 ctr = smatis1->ctr; 1769 1770 pa = smatis1->pa; 1771 req_size = smatis1->req_size; 1772 req_source1 = smatis1->req_source1; 1773 1774 allcolumns = smatis1->allcolumns; 1775 r2pmap = smatis1->r2pmap; 1776 rmap = smatis1->rmap; 1777 cmap = smatis1->cmap; 1778 #if defined(PETSC_USE_CTABLE) 1779 rmap_loc = smatis1->rmap_loc; 1780 cmap_loc = smatis1->cmap_loc; 1781 #endif 1782 } 1783 1784 /* Post recv matrix values */ 1785 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1786 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1787 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1788 for (i=0; i<nrqs; ++i) { 1789 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1790 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1791 } 1792 1793 /* Allocate sending buffers for a->a, and send them off */ 1794 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1795 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1796 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1797 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1798 1799 for (i=0; i<nrqr; i++) { 1800 rbuf1_i = rbuf1[i]; 1801 sbuf_aa_i = sbuf_aa[i]; 1802 ct1 = 2*rbuf1_i[0]+1; 1803 ct2 = 0; 1804 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1805 1806 kmax = rbuf1_i[2]; 1807 for (k=0; k<kmax; k++,ct1++) { 1808 row = rbuf1_i[ct1] - rstart; 1809 nzA = ai[row+1] - ai[row]; 1810 nzB = bi[row+1] - bi[row]; 1811 ncols = nzA + nzB; 1812 cworkB = bj + bi[row]; 1813 vworkA = a_a + ai[row]; 1814 vworkB = b_a + bi[row]; 1815 1816 /* load the column values for this row into vals*/ 1817 vals = sbuf_aa_i + ct2; 1818 1819 lwrite = 0; 1820 for (l=0; l<nzB; l++) { 1821 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1822 } 1823 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1824 for (l=0; l<nzB; l++) { 1825 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1826 } 1827 1828 ct2 += ncols; 1829 } 1830 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1831 } 1832 1833 if (scall == MAT_INITIAL_MATRIX) { 1834 ierr = PetscLogStagePop();CHKERRQ(ierr); 1835 ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr); 1836 } 1837 1838 /* Assemble submat */ 1839 if (!allcolumns) { 1840 ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr); 1841 } 1842 /* First assemble the local rows */ 1843 for (j=0; j<nrow; j++) { 1844 row = irow[j]; 1845 /* 1846 proc = 0; 1847 while (row >= C->rmap->range[proc+1]) proc++; 1848 */ 1849 proc = r2pmap[j]; 1850 if (proc == rank) { 1851 Crow = row; 1852 #if defined(PETSC_USE_CTABLE) 1853 row = rmap_loc[Crow - rstart]; /* row index of submat */ 1854 #else 1855 row = rmap[row]; 1856 #endif 1857 ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1858 1859 if (!allcolumns) { 1860 #if defined(PETSC_USE_CTABLE) 1861 i = 0; 1862 k = 0; 1863 while (cols[k] < cstart) { 1864 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1865 if (tcol) { 1866 subcols[i] = --tcol; 1867 subvals[i++] = vals[k]; 1868 } 1869 k++; 1870 } 1871 while (cols[k] < cend && k<ncols) { 1872 tcol = cmap_loc[cols[k]-cstart]; 1873 if (tcol) { 1874 subcols[i] = --tcol; 1875 subvals[i++] = vals[k]; 1876 } 1877 k++; 1878 } 1879 while (k<ncols) { 1880 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1881 if (tcol) { 1882 subcols[i] = --tcol; 1883 subvals[i++] = vals[k]; 1884 } 1885 k++; 1886 } 1887 #else 1888 i = 0; 1889 for (k=0; k<ncols; k++) { 1890 tcol = cmap[cols[k]]; 1891 if (tcol) { 1892 subcols[i] = --tcol; 1893 subvals[i++] = vals[k]; 1894 } 1895 } 1896 #endif 1897 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1898 } else { /* allcolumns */ 1899 ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr); 1900 } 1901 ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr); 1902 } 1903 } 1904 1905 if (scall == MAT_INITIAL_MATRIX) { 1906 ierr = PetscLogStagePop();CHKERRQ(ierr); 1907 ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr); 1908 } 1909 1910 /* Now assemble the off-proc rows */ 1911 for (i=0; i<nrqs; i++) { /* for each requested message */ 1912 /* recv values from other processes */ 1913 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1914 proc = pa[idex]; 1915 sbuf1_i = sbuf1[proc]; 1916 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1917 ct1 = 2 + 1; 1918 ct2 = 0; /* count of received C->j */ 1919 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1920 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1921 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1922 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1923 1924 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1925 max1 = sbuf1_i[2]; /* num of rows */ 1926 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1927 row = sbuf1_i[ct1]; /* row index of submat */ 1928 if (!allcolumns) { 1929 idex = 0; 1930 if (scall == MAT_INITIAL_MATRIX) { 1931 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1932 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1933 #if defined(PETSC_USE_CTABLE) 1934 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1935 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1936 } else { 1937 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1938 } 1939 #else 1940 tcol = cmap[rbuf3_i[ct2]]; 1941 #endif 1942 if (tcol) { 1943 subcols[idex] = --tcol; 1944 subvals[idex++] = rbuf4_i[ct2]; 1945 1946 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1947 For reuse, we replace received C->j with index that should be inserted to submat */ 1948 rbuf3_i[ct3++] = ct2; 1949 } 1950 } 1951 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1952 1953 } else { /* scall == MAT_REUSE_MATRIX */ 1954 submat = submats[0]; 1955 subc = (Mat_SeqAIJ*)submat->data; 1956 1957 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1958 for (l=0; l<nnz; l++) { 1959 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1960 subvals[idex++] = rbuf4_i[ct2]; 1961 } 1962 1963 bj = subc->j + subc->i[row]; 1964 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1965 } 1966 } else { /* allcolumns */ 1967 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1968 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1969 ct2 += nnz; 1970 } 1971 } 1972 } 1973 1974 /* sending a->a are done */ 1975 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1976 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1977 1978 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1979 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1980 submats[0] = submat; 1981 1982 /* Restore the indices */ 1983 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1984 if (!allcolumns) { 1985 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1986 } 1987 1988 /* Destroy allocated memory */ 1989 for (i=0; i<nrqs; ++i) { 1990 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1991 } 1992 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1993 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1994 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1995 1996 if (!allcolumns) { 1997 ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr); 1998 } 1999 2000 if (scall == MAT_INITIAL_MATRIX) { 2001 ierr = PetscLogStagePop();CHKERRQ(ierr); 2002 2003 ierr = PetscFree(lens);CHKERRQ(ierr); 2004 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2005 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2006 } 2007 PetscFunctionReturn(0); 2008 } 2009 2010 #undef __FUNCT__ 2011 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS" 2012 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2013 { 2014 PetscErrorCode ierr; 2015 PetscInt ncol; 2016 PetscBool colflag,allcolumns=PETSC_FALSE; 2017 2018 PetscFunctionBegin; 2019 /* Allocate memory to hold all the submatrices */ 2020 if (scall != MAT_REUSE_MATRIX) { 2021 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2022 } 2023 2024 /* Check for special case: each processor gets entire matrix columns */ 2025 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2026 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2027 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2028 2029 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2030 PetscFunctionReturn(0); 2031 } 2032 2033 #undef __FUNCT__ 2034 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 2035 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2036 { 2037 PetscErrorCode ierr; 2038 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2039 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2040 2041 PetscFunctionBegin; 2042 /* Check for special case: each processor gets entire matrix */ 2043 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2044 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2045 PetscFunctionReturn(0); 2046 } 2047 2048 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2049 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2050 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2051 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2052 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2053 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2054 wantallmatrix = PETSC_TRUE; 2055 2056 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2057 } 2058 } 2059 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2060 if (twantallmatrix) { 2061 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2062 PetscFunctionReturn(0); 2063 } 2064 2065 /* Allocate memory to hold all the submatrices */ 2066 if (scall != MAT_REUSE_MATRIX) { 2067 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2068 } 2069 2070 /* Check for special case: each processor gets entire matrix columns */ 2071 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2072 for (i=0; i<ismax; i++) { 2073 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2074 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2075 if (colflag && ncol == C->cmap->N) { 2076 allcolumns[i] = PETSC_TRUE; 2077 } else { 2078 allcolumns[i] = PETSC_FALSE; 2079 } 2080 } 2081 2082 /* Determine the number of stages through which submatrices are done */ 2083 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2084 2085 /* 2086 Each stage will extract nmax submatrices. 2087 nmax is determined by the matrix column dimension. 2088 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2089 */ 2090 if (!nmax) nmax = 1; 2091 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 2092 2093 /* Make sure every processor loops through the nstages */ 2094 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2095 2096 for (i=0,pos=0; i<nstages; i++) { 2097 if (pos+nmax <= ismax) max_no = nmax; 2098 else if (pos == ismax) max_no = 0; 2099 else max_no = ismax-pos; 2100 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2101 pos += max_no; 2102 } 2103 2104 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2105 PetscFunctionReturn(0); 2106 } 2107 2108 /* -------------------------------------------------------------------------*/ 2109 #undef __FUNCT__ 2110 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 2111 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2112 { 2113 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2114 Mat A = c->A; 2115 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2116 const PetscInt **icol,**irow; 2117 PetscInt *nrow,*ncol,start; 2118 PetscErrorCode ierr; 2119 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2120 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2121 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2122 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2123 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2124 #if defined(PETSC_USE_CTABLE) 2125 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2126 #else 2127 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2128 #endif 2129 const PetscInt *irow_i; 2130 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2131 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2132 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2133 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2134 MPI_Status *r_status3,*r_status4,*s_status4; 2135 MPI_Comm comm; 2136 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2137 PetscMPIInt *onodes1,*olengths1; 2138 PetscMPIInt idex,idex2,end; 2139 2140 PetscFunctionBegin; 2141 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2142 tag0 = ((PetscObject)C)->tag; 2143 size = c->size; 2144 rank = c->rank; 2145 2146 /* Get some new tags to keep the communication clean */ 2147 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2148 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2149 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2150 2151 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2152 2153 for (i=0; i<ismax; i++) { 2154 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2155 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2156 if (allcolumns[i]) { 2157 icol[i] = NULL; 2158 ncol[i] = C->cmap->N; 2159 } else { 2160 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2161 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2162 } 2163 } 2164 2165 /* evaluate communication - mesg to who, length of mesg, and buffer space 2166 required. Based on this, buffers are allocated, and data copied into them*/ 2167 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2168 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2169 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2170 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2171 for (i=0; i<ismax; i++) { 2172 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2173 jmax = nrow[i]; 2174 irow_i = irow[i]; 2175 for (j=0; j<jmax; j++) { 2176 l = 0; 2177 row = irow_i[j]; 2178 while (row >= C->rmap->range[l+1]) l++; 2179 proc = l; 2180 w4[proc]++; 2181 } 2182 for (j=0; j<size; j++) { 2183 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2184 } 2185 } 2186 2187 nrqs = 0; /* no of outgoing messages */ 2188 msz = 0; /* total mesg length (for all procs) */ 2189 w1[rank] = 0; /* no mesg sent to self */ 2190 w3[rank] = 0; 2191 for (i=0; i<size; i++) { 2192 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2193 } 2194 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2195 for (i=0,j=0; i<size; i++) { 2196 if (w1[i]) { pa[j] = i; j++; } 2197 } 2198 2199 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2200 for (i=0; i<nrqs; i++) { 2201 j = pa[i]; 2202 w1[j] += w2[j] + 2* w3[j]; 2203 msz += w1[j]; 2204 } 2205 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2206 2207 /* Determine the number of messages to expect, their lengths, from from-ids */ 2208 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2209 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2210 2211 /* Now post the Irecvs corresponding to these messages */ 2212 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2213 2214 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2215 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2216 2217 /* Allocate Memory for outgoing messages */ 2218 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2219 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2220 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2221 2222 { 2223 PetscInt *iptr = tmp; 2224 k = 0; 2225 for (i=0; i<nrqs; i++) { 2226 j = pa[i]; 2227 iptr += k; 2228 sbuf1[j] = iptr; 2229 k = w1[j]; 2230 } 2231 } 2232 2233 /* Form the outgoing messages */ 2234 /* Initialize the header space */ 2235 for (i=0; i<nrqs; i++) { 2236 j = pa[i]; 2237 sbuf1[j][0] = 0; 2238 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2239 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2240 } 2241 2242 /* Parse the isrow and copy data into outbuf */ 2243 for (i=0; i<ismax; i++) { 2244 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2245 irow_i = irow[i]; 2246 jmax = nrow[i]; 2247 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2248 l = 0; 2249 row = irow_i[j]; 2250 while (row >= C->rmap->range[l+1]) l++; 2251 proc = l; 2252 if (proc != rank) { /* copy to the outgoing buf*/ 2253 ctr[proc]++; 2254 *ptr[proc] = row; 2255 ptr[proc]++; 2256 } 2257 } 2258 /* Update the headers for the current IS */ 2259 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2260 if ((ctr_j = ctr[j])) { 2261 sbuf1_j = sbuf1[j]; 2262 k = ++sbuf1_j[0]; 2263 sbuf1_j[2*k] = ctr_j; 2264 sbuf1_j[2*k-1] = i; 2265 } 2266 } 2267 } 2268 2269 /* Now post the sends */ 2270 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2271 for (i=0; i<nrqs; ++i) { 2272 j = pa[i]; 2273 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2274 } 2275 2276 /* Post Receives to capture the buffer size */ 2277 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2278 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2279 rbuf2[0] = tmp + msz; 2280 for (i=1; i<nrqs; ++i) { 2281 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2282 } 2283 for (i=0; i<nrqs; ++i) { 2284 j = pa[i]; 2285 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2286 } 2287 2288 /* Send to other procs the buf size they should allocate */ 2289 2290 2291 /* Receive messages*/ 2292 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2293 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2294 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 2295 { 2296 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2297 PetscInt *sbuf2_i; 2298 2299 for (i=0; i<nrqr; ++i) { 2300 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2301 2302 req_size[idex] = 0; 2303 rbuf1_i = rbuf1[idex]; 2304 start = 2*rbuf1_i[0] + 1; 2305 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2306 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2307 sbuf2_i = sbuf2[idex]; 2308 for (j=start; j<end; j++) { 2309 id = rbuf1_i[j] - rstart; 2310 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2311 sbuf2_i[j] = ncols; 2312 req_size[idex] += ncols; 2313 } 2314 req_source[idex] = r_status1[i].MPI_SOURCE; 2315 /* form the header */ 2316 sbuf2_i[0] = req_size[idex]; 2317 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2318 2319 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2320 } 2321 } 2322 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2323 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2324 2325 /* recv buffer sizes */ 2326 /* Receive messages*/ 2327 2328 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 2329 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2330 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2331 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2332 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2333 2334 for (i=0; i<nrqs; ++i) { 2335 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2336 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2337 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2338 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2339 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2340 } 2341 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2342 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2343 2344 /* Wait on sends1 and sends2 */ 2345 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2346 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2347 2348 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2349 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2350 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2351 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2352 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2353 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2354 2355 /* Now allocate buffers for a->j, and send them off */ 2356 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2357 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2358 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2359 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2360 2361 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2362 { 2363 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2364 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2365 PetscInt cend = C->cmap->rend; 2366 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2367 2368 for (i=0; i<nrqr; i++) { 2369 rbuf1_i = rbuf1[i]; 2370 sbuf_aj_i = sbuf_aj[i]; 2371 ct1 = 2*rbuf1_i[0] + 1; 2372 ct2 = 0; 2373 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2374 kmax = rbuf1[i][2*j]; 2375 for (k=0; k<kmax; k++,ct1++) { 2376 row = rbuf1_i[ct1] - rstart; 2377 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2378 ncols = nzA + nzB; 2379 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2380 2381 /* load the column indices for this row into cols*/ 2382 cols = sbuf_aj_i + ct2; 2383 2384 lwrite = 0; 2385 for (l=0; l<nzB; l++) { 2386 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2387 } 2388 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2389 for (l=0; l<nzB; l++) { 2390 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2391 } 2392 2393 ct2 += ncols; 2394 } 2395 } 2396 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2397 } 2398 } 2399 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2400 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2401 2402 /* Allocate buffers for a->a, and send them off */ 2403 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2404 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2405 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2406 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2407 2408 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2409 { 2410 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2411 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2412 PetscInt cend = C->cmap->rend; 2413 PetscInt *b_j = b->j; 2414 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2415 2416 for (i=0; i<nrqr; i++) { 2417 rbuf1_i = rbuf1[i]; 2418 sbuf_aa_i = sbuf_aa[i]; 2419 ct1 = 2*rbuf1_i[0]+1; 2420 ct2 = 0; 2421 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2422 kmax = rbuf1_i[2*j]; 2423 for (k=0; k<kmax; k++,ct1++) { 2424 row = rbuf1_i[ct1] - rstart; 2425 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2426 ncols = nzA + nzB; 2427 cworkB = b_j + b_i[row]; 2428 vworkA = a_a + a_i[row]; 2429 vworkB = b_a + b_i[row]; 2430 2431 /* load the column values for this row into vals*/ 2432 vals = sbuf_aa_i+ct2; 2433 2434 lwrite = 0; 2435 for (l=0; l<nzB; l++) { 2436 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2437 } 2438 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2439 for (l=0; l<nzB; l++) { 2440 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2441 } 2442 2443 ct2 += ncols; 2444 } 2445 } 2446 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2447 } 2448 } 2449 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2450 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2451 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2452 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2453 2454 /* Form the matrix */ 2455 /* create col map: global col of C -> local col of submatrices */ 2456 { 2457 const PetscInt *icol_i; 2458 #if defined(PETSC_USE_CTABLE) 2459 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 2460 for (i=0; i<ismax; i++) { 2461 if (!allcolumns[i]) { 2462 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2463 2464 jmax = ncol[i]; 2465 icol_i = icol[i]; 2466 cmap_i = cmap[i]; 2467 for (j=0; j<jmax; j++) { 2468 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2469 } 2470 } else { 2471 cmap[i] = NULL; 2472 } 2473 } 2474 #else 2475 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2476 for (i=0; i<ismax; i++) { 2477 if (!allcolumns[i]) { 2478 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2479 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2480 jmax = ncol[i]; 2481 icol_i = icol[i]; 2482 cmap_i = cmap[i]; 2483 for (j=0; j<jmax; j++) { 2484 cmap_i[icol_i[j]] = j+1; 2485 } 2486 } else { 2487 cmap[i] = NULL; 2488 } 2489 } 2490 #endif 2491 } 2492 2493 /* Create lens which is required for MatCreate... */ 2494 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2495 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2496 if (ismax) { 2497 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2498 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2499 } 2500 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2501 2502 /* Update lens from local data */ 2503 for (i=0; i<ismax; i++) { 2504 jmax = nrow[i]; 2505 if (!allcolumns[i]) cmap_i = cmap[i]; 2506 irow_i = irow[i]; 2507 lens_i = lens[i]; 2508 for (j=0; j<jmax; j++) { 2509 l = 0; 2510 row = irow_i[j]; 2511 while (row >= C->rmap->range[l+1]) l++; 2512 proc = l; 2513 if (proc == rank) { 2514 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2515 if (!allcolumns[i]) { 2516 for (k=0; k<ncols; k++) { 2517 #if defined(PETSC_USE_CTABLE) 2518 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2519 #else 2520 tcol = cmap_i[cols[k]]; 2521 #endif 2522 if (tcol) lens_i[j]++; 2523 } 2524 } else { /* allcolumns */ 2525 lens_i[j] = ncols; 2526 } 2527 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2528 } 2529 } 2530 } 2531 2532 /* Create row map: global row of C -> local row of submatrices */ 2533 #if defined(PETSC_USE_CTABLE) 2534 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 2535 for (i=0; i<ismax; i++) { 2536 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2537 irow_i = irow[i]; 2538 jmax = nrow[i]; 2539 for (j=0; j<jmax; j++) { 2540 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2541 } 2542 } 2543 #else 2544 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2545 if (ismax) { 2546 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2547 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2548 } 2549 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2550 for (i=0; i<ismax; i++) { 2551 rmap_i = rmap[i]; 2552 irow_i = irow[i]; 2553 jmax = nrow[i]; 2554 for (j=0; j<jmax; j++) { 2555 rmap_i[irow_i[j]] = j; 2556 } 2557 } 2558 #endif 2559 2560 /* Update lens from offproc data */ 2561 { 2562 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2563 2564 for (tmp2=0; tmp2<nrqs; tmp2++) { 2565 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2566 idex = pa[idex2]; 2567 sbuf1_i = sbuf1[idex]; 2568 jmax = sbuf1_i[0]; 2569 ct1 = 2*jmax+1; 2570 ct2 = 0; 2571 rbuf2_i = rbuf2[idex2]; 2572 rbuf3_i = rbuf3[idex2]; 2573 for (j=1; j<=jmax; j++) { 2574 is_no = sbuf1_i[2*j-1]; 2575 max1 = sbuf1_i[2*j]; 2576 lens_i = lens[is_no]; 2577 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2578 rmap_i = rmap[is_no]; 2579 for (k=0; k<max1; k++,ct1++) { 2580 #if defined(PETSC_USE_CTABLE) 2581 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2582 row--; 2583 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2584 #else 2585 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2586 #endif 2587 max2 = rbuf2_i[ct1]; 2588 for (l=0; l<max2; l++,ct2++) { 2589 if (!allcolumns[is_no]) { 2590 #if defined(PETSC_USE_CTABLE) 2591 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2592 #else 2593 tcol = cmap_i[rbuf3_i[ct2]]; 2594 #endif 2595 if (tcol) lens_i[row]++; 2596 } else { /* allcolumns */ 2597 lens_i[row]++; /* lens_i[row] += max2 ? */ 2598 } 2599 } 2600 } 2601 } 2602 } 2603 } 2604 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2605 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2606 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2607 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2608 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2609 2610 /* Create the submatrices */ 2611 if (scall == MAT_REUSE_MATRIX) { 2612 PetscBool flag; 2613 2614 /* 2615 Assumes new rows are same length as the old rows,hence bug! 2616 */ 2617 for (i=0; i<ismax; i++) { 2618 mat = (Mat_SeqAIJ*)(submats[i]->data); 2619 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2620 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2621 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 2622 /* Initial matrix as if empty */ 2623 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2624 2625 submats[i]->factortype = C->factortype; 2626 } 2627 } else { 2628 for (i=0; i<ismax; i++) { 2629 PetscInt rbs,cbs; 2630 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2631 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2632 2633 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2634 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2635 2636 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2637 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2638 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2639 } 2640 } 2641 2642 /* Assemble the matrices */ 2643 /* First assemble the local rows */ 2644 { 2645 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2646 PetscScalar *imat_a; 2647 2648 for (i=0; i<ismax; i++) { 2649 mat = (Mat_SeqAIJ*)submats[i]->data; 2650 imat_ilen = mat->ilen; 2651 imat_j = mat->j; 2652 imat_i = mat->i; 2653 imat_a = mat->a; 2654 2655 if (!allcolumns[i]) cmap_i = cmap[i]; 2656 rmap_i = rmap[i]; 2657 irow_i = irow[i]; 2658 jmax = nrow[i]; 2659 for (j=0; j<jmax; j++) { 2660 l = 0; 2661 row = irow_i[j]; 2662 while (row >= C->rmap->range[l+1]) l++; 2663 proc = l; 2664 if (proc == rank) { 2665 old_row = row; 2666 #if defined(PETSC_USE_CTABLE) 2667 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2668 row--; 2669 #else 2670 row = rmap_i[row]; 2671 #endif 2672 ilen_row = imat_ilen[row]; 2673 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2674 mat_i = imat_i[row]; 2675 mat_a = imat_a + mat_i; 2676 mat_j = imat_j + mat_i; 2677 if (!allcolumns[i]) { 2678 for (k=0; k<ncols; k++) { 2679 #if defined(PETSC_USE_CTABLE) 2680 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2681 #else 2682 tcol = cmap_i[cols[k]]; 2683 #endif 2684 if (tcol) { 2685 *mat_j++ = tcol - 1; 2686 *mat_a++ = vals[k]; 2687 ilen_row++; 2688 } 2689 } 2690 } else { /* allcolumns */ 2691 for (k=0; k<ncols; k++) { 2692 *mat_j++ = cols[k]; /* global col index! */ 2693 *mat_a++ = vals[k]; 2694 ilen_row++; 2695 } 2696 } 2697 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2698 2699 imat_ilen[row] = ilen_row; 2700 } 2701 } 2702 } 2703 } 2704 2705 /* Now assemble the off proc rows*/ 2706 { 2707 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2708 PetscInt *imat_j,*imat_i; 2709 PetscScalar *imat_a,*rbuf4_i; 2710 2711 for (tmp2=0; tmp2<nrqs; tmp2++) { 2712 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2713 idex = pa[idex2]; 2714 sbuf1_i = sbuf1[idex]; 2715 jmax = sbuf1_i[0]; 2716 ct1 = 2*jmax + 1; 2717 ct2 = 0; 2718 rbuf2_i = rbuf2[idex2]; 2719 rbuf3_i = rbuf3[idex2]; 2720 rbuf4_i = rbuf4[idex2]; 2721 for (j=1; j<=jmax; j++) { 2722 is_no = sbuf1_i[2*j-1]; 2723 rmap_i = rmap[is_no]; 2724 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2725 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2726 imat_ilen = mat->ilen; 2727 imat_j = mat->j; 2728 imat_i = mat->i; 2729 imat_a = mat->a; 2730 max1 = sbuf1_i[2*j]; 2731 for (k=0; k<max1; k++,ct1++) { 2732 row = sbuf1_i[ct1]; 2733 #if defined(PETSC_USE_CTABLE) 2734 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2735 row--; 2736 #else 2737 row = rmap_i[row]; 2738 #endif 2739 ilen = imat_ilen[row]; 2740 mat_i = imat_i[row]; 2741 mat_a = imat_a + mat_i; 2742 mat_j = imat_j + mat_i; 2743 max2 = rbuf2_i[ct1]; 2744 if (!allcolumns[is_no]) { 2745 for (l=0; l<max2; l++,ct2++) { 2746 2747 #if defined(PETSC_USE_CTABLE) 2748 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2749 #else 2750 tcol = cmap_i[rbuf3_i[ct2]]; 2751 #endif 2752 if (tcol) { 2753 *mat_j++ = tcol - 1; 2754 *mat_a++ = rbuf4_i[ct2]; 2755 ilen++; 2756 } 2757 } 2758 } else { /* allcolumns */ 2759 for (l=0; l<max2; l++,ct2++) { 2760 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2761 *mat_a++ = rbuf4_i[ct2]; 2762 ilen++; 2763 } 2764 } 2765 imat_ilen[row] = ilen; 2766 } 2767 } 2768 } 2769 } 2770 2771 /* sort the rows */ 2772 { 2773 PetscInt *imat_ilen,*imat_j,*imat_i; 2774 PetscScalar *imat_a; 2775 2776 for (i=0; i<ismax; i++) { 2777 mat = (Mat_SeqAIJ*)submats[i]->data; 2778 imat_j = mat->j; 2779 imat_i = mat->i; 2780 imat_a = mat->a; 2781 imat_ilen = mat->ilen; 2782 2783 if (allcolumns[i]) continue; 2784 jmax = nrow[i]; 2785 for (j=0; j<jmax; j++) { 2786 PetscInt ilen; 2787 2788 mat_i = imat_i[j]; 2789 mat_a = imat_a + mat_i; 2790 mat_j = imat_j + mat_i; 2791 ilen = imat_ilen[j]; 2792 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2793 } 2794 } 2795 } 2796 2797 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2798 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2799 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2800 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2801 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2802 2803 /* Restore the indices */ 2804 for (i=0; i<ismax; i++) { 2805 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2806 if (!allcolumns[i]) { 2807 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2808 } 2809 } 2810 2811 /* Destroy allocated memory */ 2812 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2813 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2814 ierr = PetscFree(pa);CHKERRQ(ierr); 2815 2816 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2817 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 2818 for (i=0; i<nrqr; ++i) { 2819 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2820 } 2821 for (i=0; i<nrqs; ++i) { 2822 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2823 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2824 } 2825 2826 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 2827 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 2828 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2829 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2830 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2831 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2832 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2833 2834 #if defined(PETSC_USE_CTABLE) 2835 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 2836 #else 2837 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 2838 #endif 2839 ierr = PetscFree(rmap);CHKERRQ(ierr); 2840 2841 for (i=0; i<ismax; i++) { 2842 if (!allcolumns[i]) { 2843 #if defined(PETSC_USE_CTABLE) 2844 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 2845 #else 2846 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 2847 #endif 2848 } 2849 } 2850 ierr = PetscFree(cmap);CHKERRQ(ierr); 2851 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2852 ierr = PetscFree(lens);CHKERRQ(ierr); 2853 2854 for (i=0; i<ismax; i++) { 2855 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2856 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2857 } 2858 PetscFunctionReturn(0); 2859 } 2860 2861 /* 2862 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2863 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2864 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2865 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2866 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2867 state, and needs to be "assembled" later by compressing B's column space. 2868 2869 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2870 Following this call, C->A & C->B have been created, even if empty. 2871 */ 2872 #undef __FUNCT__ 2873 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 2874 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2875 { 2876 /* If making this function public, change the error returned in this function away from _PLIB. */ 2877 PetscErrorCode ierr; 2878 Mat_MPIAIJ *aij; 2879 Mat_SeqAIJ *Baij; 2880 PetscBool seqaij,Bdisassembled; 2881 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2882 PetscScalar v; 2883 const PetscInt *rowindices,*colindices; 2884 2885 PetscFunctionBegin; 2886 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2887 if (A) { 2888 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2889 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2890 if (rowemb) { 2891 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2892 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2893 } else { 2894 if (C->rmap->n != A->rmap->n) { 2895 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2896 } 2897 } 2898 if (dcolemb) { 2899 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2900 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2901 } else { 2902 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2903 } 2904 } 2905 if (B) { 2906 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2907 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2908 if (rowemb) { 2909 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2910 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2911 } else { 2912 if (C->rmap->n != B->rmap->n) { 2913 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2914 } 2915 } 2916 if (ocolemb) { 2917 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2918 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2919 } else { 2920 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2921 } 2922 } 2923 2924 aij = (Mat_MPIAIJ*)(C->data); 2925 if (!aij->A) { 2926 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2927 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2928 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2929 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2930 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2931 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2932 } 2933 if (A) { 2934 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2935 } else { 2936 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2937 } 2938 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2939 /* 2940 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2941 need to "disassemble" B -- convert it to using C's global indices. 2942 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2943 2944 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2945 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2946 2947 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2948 At least avoid calling MatSetValues() and the implied searches? 2949 */ 2950 2951 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2952 #if defined(PETSC_USE_CTABLE) 2953 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2954 #else 2955 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2956 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2957 if (aij->B) { 2958 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2959 } 2960 #endif 2961 ngcol = 0; 2962 if (aij->lvec) { 2963 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2964 } 2965 if (aij->garray) { 2966 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2967 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2968 } 2969 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2970 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2971 } 2972 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2973 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2974 } 2975 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2976 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2977 } 2978 } 2979 Bdisassembled = PETSC_FALSE; 2980 if (!aij->B) { 2981 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2982 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2983 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2984 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2985 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2986 Bdisassembled = PETSC_TRUE; 2987 } 2988 if (B) { 2989 Baij = (Mat_SeqAIJ*)(B->data); 2990 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2991 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2992 for (i=0; i<B->rmap->n; i++) { 2993 nz[i] = Baij->i[i+1] - Baij->i[i]; 2994 } 2995 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2996 ierr = PetscFree(nz);CHKERRQ(ierr); 2997 } 2998 2999 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3000 shift = rend-rstart; 3001 count = 0; 3002 rowindices = NULL; 3003 colindices = NULL; 3004 if (rowemb) { 3005 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3006 } 3007 if (ocolemb) { 3008 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3009 } 3010 for (i=0; i<B->rmap->n; i++) { 3011 PetscInt row; 3012 row = i; 3013 if (rowindices) row = rowindices[i]; 3014 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3015 col = Baij->j[count]; 3016 if (colindices) col = colindices[col]; 3017 if (Bdisassembled && col>=rstart) col += shift; 3018 v = Baij->a[count]; 3019 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3020 ++count; 3021 } 3022 } 3023 /* No assembly for aij->B is necessary. */ 3024 /* FIXME: set aij->B's nonzerostate correctly. */ 3025 } else { 3026 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3027 } 3028 C->preallocated = PETSC_TRUE; 3029 C->was_assembled = PETSC_FALSE; 3030 C->assembled = PETSC_FALSE; 3031 /* 3032 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3033 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3034 */ 3035 PetscFunctionReturn(0); 3036 } 3037 3038 #undef __FUNCT__ 3039 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 3040 /* 3041 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3042 */ 3043 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3044 { 3045 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3046 3047 PetscFunctionBegin; 3048 PetscValidPointer(A,2); 3049 PetscValidPointer(B,3); 3050 /* FIXME: make sure C is assembled */ 3051 *A = aij->A; 3052 *B = aij->B; 3053 /* Note that we don't incref *A and *B, so be careful! */ 3054 PetscFunctionReturn(0); 3055 } 3056 3057 /* 3058 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3059 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3060 */ 3061 #undef __FUNCT__ 3062 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 3063 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3064 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3065 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3066 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3067 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3068 { 3069 PetscErrorCode ierr; 3070 PetscMPIInt isize,flag; 3071 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3072 Mat *A,*B; 3073 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3074 3075 PetscFunctionBegin; 3076 if (!ismax) PetscFunctionReturn(0); 3077 3078 for (i = 0, cismax = 0; i < ismax; ++i) { 3079 PetscMPIInt isize; 3080 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3081 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3082 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3083 if (isize > 1) ++cismax; 3084 } 3085 /* 3086 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3087 ispar counts the number of parallel ISs across C's comm. 3088 */ 3089 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3090 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3091 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3092 PetscFunctionReturn(0); 3093 } 3094 3095 /* if (ispar) */ 3096 /* 3097 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3098 These are used to extract the off-diag portion of the resulting parallel matrix. 3099 The row IS for the off-diag portion is the same as for the diag portion, 3100 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3101 */ 3102 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3103 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3104 for (i = 0, ii = 0; i < ismax; ++i) { 3105 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3106 if (isize > 1) { 3107 /* 3108 TODO: This is the part that's ***NOT SCALABLE***. 3109 To fix this we need to extract just the indices of C's nonzero columns 3110 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3111 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3112 to be done without serializing on the IS list, so, most likely, it is best 3113 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3114 */ 3115 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3116 /* Now we have to 3117 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3118 were sorted on each rank, concatenated they might no longer be sorted; 3119 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3120 indices in the nondecreasing order to the original index positions. 3121 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3122 */ 3123 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3124 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3125 ++ii; 3126 } 3127 } 3128 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3129 for (i = 0, ii = 0; i < ismax; ++i) { 3130 PetscInt j,issize; 3131 const PetscInt *indices; 3132 3133 /* 3134 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3135 */ 3136 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3137 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3138 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3139 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3140 for (j = 1; j < issize; ++j) { 3141 if (indices[j] == indices[j-1]) { 3142 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3143 } 3144 } 3145 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3146 3147 3148 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3149 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3150 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3151 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3152 for (j = 1; j < issize; ++j) { 3153 if (indices[j-1] == indices[j]) { 3154 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3155 } 3156 } 3157 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3158 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3159 if (isize > 1) { 3160 cisrow[ii] = isrow[i]; 3161 ++ii; 3162 } 3163 } 3164 /* 3165 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3166 array of sequential matrices underlying the resulting parallel matrices. 3167 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3168 contain duplicates. 3169 3170 There are as many diag matrices as there are original index sets. There are only as many parallel 3171 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3172 3173 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3174 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3175 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3176 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3177 with A[i] and B[ii] extracted from the corresponding MPI submat. 3178 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3179 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3180 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3181 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3182 values into A[i] and B[ii] sitting inside the corresponding submat. 3183 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3184 A[i], B[ii], AA[i] or BB[ii] matrices. 3185 */ 3186 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3187 if (scall == MAT_INITIAL_MATRIX) { 3188 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3189 /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */ 3190 } else { 3191 ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr); 3192 ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr); 3193 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */ 3194 for (i = 0, ii = 0; i < ismax; ++i) { 3195 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3196 if (isize > 1) { 3197 Mat AA,BB; 3198 ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr); 3199 if (!isrow_p[i] && !iscol_p[i]) { 3200 A[i] = AA; 3201 } else { 3202 /* TODO: extract A[i] composed on AA. */ 3203 ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3204 } 3205 if (!isrow_p[i] && !ciscol_p[ii]) { 3206 B[ii] = BB; 3207 } else { 3208 /* TODO: extract B[ii] composed on BB. */ 3209 ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr); 3210 } 3211 ++ii; 3212 } else { 3213 if (!isrow_p[i] && !iscol_p[i]) { 3214 A[i] = (*submat)[i]; 3215 } else { 3216 /* TODO: extract A[i] composed on (*submat)[i]. */ 3217 ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3218 } 3219 } 3220 } 3221 } 3222 /* Now obtain the sequential A and B submatrices separately. */ 3223 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr); 3224 /* I did not figure out a good way to fix it right now. 3225 * Local column size of B[i] is different from the size of ciscol[i]. 3226 * B[i]'s size is finally determined by MatAssembly, while 3227 * ciscol[i] is computed as the complement of iscol[i]. 3228 * It is better to keep only nonzero indices when computing 3229 * the complement ciscol[i]. 3230 * */ 3231 if(scall==MAT_REUSE_MATRIX){ 3232 for(i=0; i<cismax; i++){ 3233 ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr); 3234 B[i]->cmap->n = ciscol_localsize; 3235 } 3236 } 3237 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr); 3238 /* 3239 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3240 matrices A & B have been extracted directly into the parallel matrices containing them, or 3241 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3242 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3243 to have the same sparsity pattern. 3244 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3245 must be constructed for C. This is done by setseqmat(s). 3246 */ 3247 for (i = 0, ii = 0; i < ismax; ++i) { 3248 /* 3249 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3250 That way we can avoid sorting and computing permutations when reusing. 3251 To this end: 3252 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3253 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3254 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3255 */ 3256 MatStructure pattern; 3257 if (scall == MAT_INITIAL_MATRIX) { 3258 pattern = DIFFERENT_NONZERO_PATTERN; 3259 } else { 3260 pattern = SAME_NONZERO_PATTERN; 3261 } 3262 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3263 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3264 if (isize > 1) { 3265 if (scall == MAT_INITIAL_MATRIX) { 3266 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3267 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3268 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3269 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3270 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3271 } 3272 /* 3273 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3274 */ 3275 { 3276 Mat AA,BB; 3277 AA = NULL; 3278 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i]; 3279 BB = NULL; 3280 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii]; 3281 if (AA || BB) { 3282 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3283 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3284 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3285 } 3286 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) { 3287 /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */ 3288 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3289 } 3290 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) { 3291 /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */ 3292 ierr = MatDestroy(&BB);CHKERRQ(ierr); 3293 } 3294 } 3295 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3296 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3297 ++ii; 3298 } else { /* if (isize == 1) */ 3299 if (scall == MAT_INITIAL_MATRIX) { 3300 if (isrow_p[i] || iscol_p[i]) { 3301 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3302 } else (*submat)[i] = A[i]; 3303 } 3304 if (isrow_p[i] || iscol_p[i]) { 3305 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3306 /* Otherwise A is extracted straight into (*submats)[i]. */ 3307 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3308 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3309 } 3310 } 3311 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3312 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3313 } 3314 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3315 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3316 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3317 ierr = PetscFree(A);CHKERRQ(ierr); 3318 ierr = PetscFree(B);CHKERRQ(ierr); 3319 PetscFunctionReturn(0); 3320 } 3321 3322 3323 3324 #undef __FUNCT__ 3325 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ" 3326 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3327 { 3328 PetscErrorCode ierr; 3329 3330 PetscFunctionBegin; 3331 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3332 PetscFunctionReturn(0); 3333 } 3334