/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt*,PetscInt*,PetscInt*,PetscInt**,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt*);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
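/*
   Usage sketch (illustrative only, not part of this file): both drivers above
   are normally reached through the public MatIncreaseOverlap() interface, e.g.

     IS is;                                          (initial index set of rows)
     ISCreateGeneral(PETSC_COMM_SELF,n,rows,PETSC_COPY_VALUES,&is);
     MatIncreaseOverlap(C,1,&is,2);                  (grow the set by 2 levels of overlap)
     ISDestroy(&is);

   Each driver simply applies its one-level kernel ov times.
*/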
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should go */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put it all together so that we can optimize communication */
  ierr = PetscCalloc2(nidx,(PetscInt***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr     = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr     = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages; generally speaking,
   * no data needs to be exchanged when the overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages; this usually corresponds to overlap 1 */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
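  /* Illustrative layout (values made up): if this rank must ask rank 2 about
     4 rows and rank 5 about 6 rows, then after the loops above
       toranks = {2, 5}
       tosizes = {8,0, 12,8}    (pairs <message length, offset>; each row
                                 contributes 2 ints: <is id, row>)
     which is the <size,offset> pair format PetscCommBuildTwoSided()
     forwards to the receivers below. */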
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote ranks so that we can get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
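/*
   The reply stream consumed by the receiver below is a sequence of triples
   produced by MatIncreaseOverlap_MPIAIJ_Send_Scalable():

     <is id> <count> <col_0 ... col_{count-1}>

   For example (values illustrative only), the stream {1,3,10,11,12, 0,2,7,9}
   appends columns 10,11,12 to is[1] and columns 7,9 to is[0].
*/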
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx,IS is[],PetscInt nrecvs,PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id,data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp,*indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr      = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp,sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get the row id and its overlap */
  for (i=0; i<nrecvs; ) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j<data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx,PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes,PetscInt *fromrows,PetscInt **sbrowsizes,PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* even if the matrix is symmetric, we still treat it as non-symmetric here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* the total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* is there a better way to estimate the number of nonzeros in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate the space needed so as to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i=0; i<nfrom; i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
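/*
   Note on the double pass above (explanatory, not normative): the first sweep
   over the requested rows only sizes each reply (indv_counts) so that sbdata
   can be allocated once; the second sweep repeats the traversal and actually
   writes the <is id, count, columns...> triples. This trades a second
   PetscSortRemoveDupsInt() pass for a single allocation.
*/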
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj,*indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* this is a better way to estimate memory than the old implementation,
   * which used the global size of the matrix
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
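/*
  Worked instance of the format above (values illustrative only): to ask the
  owner about rows {3,7} of is[1] and row {12} of is[5], A sends B

    mesg = {2,  1,2,  5,1,  3,7,  12}

  i.e. the header first (count, then <is id, size> pairs), row data last.
*/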
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and their from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }
  /* Memory for doing local proc's work */
  {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,(M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0,M_BPB_imax = 0;
    ierr = PetscIntMultError(M,imax,&Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    /* a single overflow-checked allocation; allocating a second time here would leak */
    ierr = PetscCalloc5(imax,&table,imax,&data,imax,&isz,Mimax,&d_p,M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and their from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a realloc is required we need only copy the data for that one row
  rather than all previous rows as it is now, where a single large chunk of
  memory is used.

*/
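/* A minimal sketch of the growth scheme used below (values illustrative):
   the single buffer xdata[0] starts at mem_estimate ints and is regrown
   geometrically, new_estimate = (PetscInt)(1.5*mem_estimate)+1, e.g.
   100 -> 151 -> 227 -> 341; after each regrow the per-message pointers
   xdata[1..i] are rebased into the new buffer. */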
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
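/*
   Calling-sequence sketch for the routine below (illustrative; the caller
   owns the one-element array allocated on the first call):

     Mat *Bseq;
     MatGetSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_INITIAL_MATRIX,&Bseq);   (builds Bseq[0] and fills values)
     MatGetSubMatrix_MPIAIJ_All(A,MAT_GET_VALUES,MAT_REUSE_MATRIX,&Bseq);     (regathers numerical values only)

   With MAT_DO_NOT_GET_VALUES only the nonzero structure is assembled.
*/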
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
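    /* Explanatory note (standard MPI semantics): with MPI_IN_PLACE each rank
       contributes the slice of lens it already holds in the receive buffer,
       so the send arguments are ignored (hence the 0 count and
       MPI_DATATYPE_NULL above); the fallback branch sends its slice
       explicitly. */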
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else { /* scall == MAT_REUSE_MATRIX */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices"
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
  Mat_SubMat     *submatj = c->submatis1;
  PetscInt       i;

  PetscFunctionBegin;
  ierr = submatj->destroy(C);CHKERRQ(ierr);

  ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);

  for (i=0; i<submatj->nrqr; ++i) {
    ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);

  ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);

  for (i=0; i<submatj->nrqs; ++i) {
    ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(submatj->pa);CHKERRQ(ierr);

  ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
  if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
  ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
#else
  ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
#endif

  if (!submatj->allcolumns) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
#else
    ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(submatj);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
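/*
   Overview of the routine below (matching the log stages it registers):
   "Setup" sizes and posts the row-request traffic, "Aj comm" ships column
   indices, "Aval comm" ships numerical values, and "MAssem_loc"/"MAssem_off"
   assemble the local and off-process parts of the single sequential submatrix.
*/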
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai = a->i,*bi = b->i,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*bj = b->j,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt    *onodes1,*olengths1,idex,end;
  Mat_SubMat     *smatis1;
#if defined(PETSC_USE_LOG)
  PetscLogStage stage[5];
#endif
  PetscBool isrowsorted;

  PetscFunctionBegin;
  if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");

  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
  if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");

  ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
  ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
  if (allcolumns) {
    icol = NULL;
    ncol = C->cmap->N;
  } else {
    ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i,*cworkA,lwrite,*aj = a->j,ctmp;

    ierr = PetscLogStageRegister("Setup",&stage[0]);CHKERRQ(ierr);
    ierr = PetscLogStageRegister("Aj comm",&stage[1]);CHKERRQ(ierr);
    ierr = PetscLogStageRegister("Aval comm",&stage[2]);CHKERRQ(ierr);
    ierr = PetscLogStageRegister("MAssem_loc",&stage[3]);CHKERRQ(ierr);
    ierr = PetscLogStageRegister("MAssem_off",&stage[4]);CHKERRQ(ierr);

    ierr = PetscLogStagePush(stage[0]);CHKERRQ(ierr);

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to whom, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);

    /* w1[proc] = num of rows owned by proc */
    proc = 0;
    for (j=0; j<nrow; j++) {
      row = irow[j]; /* sorted! */
      while (row >= C->rmap->range[proc+1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */
    }
    nrqs     = 0; /* num of outgoing messages */
    msz      = 0; /* total mesg length (for all procs) */
    w1[rank] = 0; /* num mesg sent to self */
    for (proc=0; proc<size; proc++) {
      if (w1[proc]) { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */
    }

    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc-array) */
    for (proc=0,j=0; proc<size; proc++) {
      if (w1[proc]) { pa[j++] = proc;}
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      w1[proc] += 3;
      msz      += w1[proc];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine nrqr, the number of messages to expect, their lengths, and their from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);

    /* Input:  nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

    /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i=0; i<nrqs; i++) {
      proc        = pa[i];
      sbuf1[proc] = iptr;
      iptr       += w1[proc];
    }

    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
      ptr[proc] = sbuf1[proc] + 3;
    }

    /* Parse the isrow and copy data into outbuf */
    ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    for (j=0; j<nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf */
        *ptr[proc] = irow[j];
        ctr[proc]++; ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = 0;
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);

    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree2(w1,w2);CHKERRQ(ierr);

    /* Send to other procs the buf size they should allocate */
    /* Receive messages */
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
    for (i=0; i<nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i     = rbuf1[i];
      start       = 2*rbuf1_i[0] + 1;
      ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
      sbuf2_i     = sbuf2[i];
      for (j=start; j<end; j++) {
        k            = rbuf1_i[j] - rstart;
        ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
        sbuf2_i[j]   = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);

    /* rbuf2 is received, post recv column indices a->j */
    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);

    ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_status1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
    ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);

    ierr = PetscLogStagePop();CHKERRQ(ierr);
    ierr = PetscLogStagePush(stage[1]);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    for (i=0; i<nrqr; i++) { /* for each requested message */
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k=0; k<kmax; k++,ct1++) { /* for each row */
        row    = rbuf1_i[ct1] - rstart;
        nzA    = ai[row+1] - ai[row]; nzB = bi[row+1] - bi[row];
        ncols  = nzA + nzB;
        cworkA = aj + ai[row]; cworkB = bj + bi[row];

        /* load the column indices for this row into cols */
        cols = sbuf_aj_i + ct2;

        lwrite = 0;
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
    }

    /* create column map (cmap): global col of C -> local col of submat */
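    /* Illustrative example of the map built below: for ncol = 3 and
       icol[] = {4,7,9}, cmap gives 4->1, 7->2, 9->3 (entries are stored
       1-based so that a lookup returning 0 means "column not in submat"). */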
ncols; 1563 } 1564 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1565 } 1566 1567 /* create column map (cmap): global col of C -> local col of submat */ 1568 #if defined(PETSC_USE_CTABLE) 1569 if (!allcolumns) { 1570 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1571 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1572 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1573 if (icol[j] >= cstart && icol[j] <cend) { 1574 cmap_loc[icol[j] - cstart] = j+1; 1575 } else { /* use PetscTable for non-local col indices */ 1576 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1577 } 1578 } 1579 } else { 1580 cmap = NULL; 1581 cmap_loc = NULL; 1582 } 1583 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1584 #else 1585 if (!allcolumns) { 1586 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1587 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1588 } else { 1589 cmap = NULL; 1590 } 1591 #endif 1592 1593 /* Create lens for MatSeqAIJSetPreallocation() */ 1594 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1595 1596 /* Compute lens from local part of C */ 1597 for (j=0; j<nrow; j++) { 1598 row = irow[j]; 1599 proc = row2proc[j]; 1600 if (proc == rank) { 1601 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1602 if (!allcolumns) { 1603 for (k=0; k<ncols; k++) { 1604 #if defined(PETSC_USE_CTABLE) 1605 if (cols[k] >= cstart && cols[k] <cend) { 1606 tcol = cmap_loc[cols[k] - cstart]; 1607 } else { 1608 ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr); 1609 } 1610 #else 1611 tcol = cmap[cols[k]]; 1612 #endif 1613 if (tcol) lens[j]++; 1614 } 1615 } else { /* allcolumns */ 1616 lens[j] = ncols; 1617 } 1618 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1619 } 1620 } 1621 1622 /* Create row map (rmap): global row of C -> local row of submat */ 1623 #if defined(PETSC_USE_CTABLE) 1624 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1625 for (j=0; j<nrow; j++) { 1626 row = irow[j]; 1627 proc = row2proc[j]; 1628 if (proc == rank) { /* a local row */ 1629 rmap_loc[row - rstart] = j; 1630 } else { 1631 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1632 } 1633 } 1634 #else 1635 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1636 for (j=0; j<nrow; j++) { 1637 rmap[irow[j]] = j; 1638 } 1639 #endif 1640 1641 /* Update lens from offproc data */ 1642 /* recv a->j is done */ 1643 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1644 for (i=0; i<nrqs; i++) { 1645 proc = pa[i]; 1646 sbuf1_i = sbuf1[proc]; 1647 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1648 ct1 = 2 + 1; 1649 ct2 = 0; 1650 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1651 rbuf3_i = rbuf3[i]; /* received C->j */ 1652 1653 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1654 max1 = sbuf1_i[2]; 1655 for (k=0; k<max1; k++,ct1++) { 1656 #if defined(PETSC_USE_CTABLE) 1657 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1658 row--; 1659 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1660 #else 1661 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1662 #endif 1663 /* Now, store row index of submat in sbuf1_i[ct1] */ 1664 sbuf1_i[ct1] = row; 1665 1666 nnz = rbuf2_i[ct1]; 1667 if (!allcolumns) { 1668 for (l=0; l<nnz; l++,ct2++) { 1669 #if defined(PETSC_USE_CTABLE) 1670 if (rbuf3_i[ct2] 
>= cstart && rbuf3_i[ct2] < cend) {
            tcol = cmap_loc[rbuf3_i[ct2] - cstart];
          } else {
            ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
          }
#else
          tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
#endif
          if (tcol) {
            lens[row]++;

            /* We receive an entire row of C (its full set of column indices), but only a subset of it
               needs to be inserted into submat. For reuse, we replace the received C->j with the indices
               that should be inserted into submat */
            /* rbuf3_i[ct3++] = ct2; ??? */
          }
        }
      } else { /* allcolumns */
        lens[row] += nnz;
      }
    }
  }
  ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
  ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);

    /* Create the submatrices */
    ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
    ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

    ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
    ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
    ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
    ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);

    /* create struct Mat_SubMat and attach it to submat */
    ierr = PetscNew(&smatis1);CHKERRQ(ierr);
    subc = (Mat_SeqAIJ*)submat->data;
    subc->submatis1 = smatis1;

    smatis1->nrqs        = nrqs;
    smatis1->nrqr        = nrqr;
    smatis1->rbuf1       = rbuf1;
    smatis1->rbuf2       = rbuf2;
    smatis1->rbuf3       = rbuf3;
    smatis1->sbuf2       = sbuf2;
    smatis1->req_source2 = req_source2;

    smatis1->sbuf1       = sbuf1;
    smatis1->ptr         = ptr;
    smatis1->tmp         = tmp;
    smatis1->ctr         = ctr;

    smatis1->pa          = pa;
    smatis1->req_size    = req_size;
    smatis1->req_source1 = req_source1;

    smatis1->allcolumns  = allcolumns;
    smatis1->row2proc    = row2proc;
    smatis1->rmap        = rmap;
    smatis1->cmap        = cmap;
#if defined(PETSC_USE_CTABLE)
    smatis1->rmap_loc    = rmap_loc;
    smatis1->cmap_loc    = cmap_loc;
#endif

    smatis1->destroy     = submat->ops->destroy;
    submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
    submat->factortype   = C->factortype;

    ierr = PetscLogStagePop();CHKERRQ(ierr);
    ierr = PetscLogStagePush(stage[2]);CHKERRQ(ierr);
  } else { /* scall == MAT_REUSE_MATRIX */
    submat = submats[0];
    if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix, wrong size");

    subc    = (Mat_SeqAIJ*)submat->data;
    smatis1 = subc->submatis1;
    nrqs        = smatis1->nrqs;
    nrqr        = smatis1->nrqr;
    rbuf1       = smatis1->rbuf1;
    rbuf2       = smatis1->rbuf2;
    rbuf3       = smatis1->rbuf3;
    req_source2 = smatis1->req_source2;

    sbuf1 = smatis1->sbuf1;
    sbuf2 = smatis1->sbuf2;
    ptr   = smatis1->ptr;
    tmp   = smatis1->tmp;
    ctr   = smatis1->ctr;

    pa          = smatis1->pa;
    req_size    = smatis1->req_size;
    req_source1 = smatis1->req_source1;

    allcolumns = smatis1->allcolumns;
    row2proc   = smatis1->row2proc;
    rmap       = smatis1->rmap;
    cmap       = smatis1->cmap;
#if defined(PETSC_USE_CTABLE)
    rmap_loc = smatis1->rmap_loc;
    cmap_loc = smatis1->cmap_loc;
#endif
  }

  /* Post recv matrix values */
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  for (i=0; i<nrqr; i++) {
    rbuf1_i   = rbuf1[i];
    sbuf_aa_i = sbuf_aa[i];
    ct1       = 2*rbuf1_i[0]+1;
    ct2       = 0;
    /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */

    kmax = rbuf1_i[2];
    for (k=0; k<kmax; k++,ct1++) {
      row   = rbuf1_i[ct1] - rstart;
      nzA   = ai[row+1] - ai[row];
      nzB   = bi[row+1] - bi[row];
      ncols = nzA + nzB;
      cworkB = bj + bi[row];
      vworkA = a_a + ai[row];
      vworkB = b_a + bi[row];

      /* load the column values for this row into vals */
      vals = sbuf_aa_i + ct2;

      lwrite = 0;
      for (l=0; l<nzB; l++) {
        if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
      }
      for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
      for (l=0; l<nzB; l++) {
        if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
      }

      ct2 += ncols;
    }
    ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogStagePop();CHKERRQ(ierr);
    ierr = PetscLogStagePush(stage[3]);CHKERRQ(ierr);
  }

  /* Assemble submat */
  if (!allcolumns) {
    ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr);
  }
  /* First assemble the local rows */
  for (j=0; j<nrow; j++) {
    row  = irow[j];
    proc = row2proc[j];
    if (proc == rank) {
      Crow = row;
#if defined(PETSC_USE_CTABLE)
      row = rmap_loc[Crow - rstart]; /* row index of submat */
#else
      row = rmap[row];
#endif
      ierr = MatGetRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);

      if (!allcolumns) {
#if defined(PETSC_USE_CTABLE)
        i = 0;
        k = 0;
        while (k < ncols && cols[k] < cstart) {
          ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
          k++;
        }
        while (k < ncols && cols[k] < cend) {
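          /* Middle pass: columns in [cstart,cend) belong to this rank's diagonal block, so the
             submat column index is read from the dense cmap_loc[] array in O(1); the passes
             before and after this one handle off-diagonal columns (below cstart, at/above cend)
             through the PetscTable hash lookup. The k < ncols guards keep each pass from scanning
             past the end of a row whose columns never reach cstart or cend. */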
          tcol = cmap_loc[cols[k]-cstart];
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
          k++;
        }
        while (k < ncols) {
          ierr = PetscTableFind(cmap,cols[k]+1,&tcol);CHKERRQ(ierr);
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
          k++;
        }
#else
        i = 0;
        for (k=0; k<ncols; k++) {
          tcol = cmap[cols[k]];
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }
#endif
        ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
      } else { /* allcolumns */
        ierr = MatSetValues_SeqAIJ(submat,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
      }
      ierr = MatRestoreRow_MPIAIJ(C,Crow,&ncols,&cols,&vals);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscLogStagePop();CHKERRQ(ierr);
    ierr = PetscLogStagePush(stage[4]);CHKERRQ(ierr);
  }

  /* Now assemble the off-proc rows */
  for (i=0; i<nrqs; i++) { /* for each requested message */
    /* recv values from other processes */
    ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
    proc    = pa[idex];
    sbuf1_i = sbuf1[proc];
    /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
    ct1     = 2 + 1;
    ct2     = 0; /* count of received C->j */
    ct3     = 0; /* count of received C->j that will be inserted into submat */
    rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
    rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
    rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

    /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
    max1 = sbuf1_i[2]; /* num of rows */
    for (k=0; k<max1; k++,ct1++) { /* for each received row */
      row = sbuf1_i[ct1]; /* row index of submat */
      if (!allcolumns) {
        idex = 0;
        if (scall == MAT_INITIAL_MATRIX) {
          nnz = rbuf2_i[ct1]; /* num of C entries in this row */
          for (l=0; l<nnz; l++,ct2++) { /* for each received column */
#if defined(PETSC_USE_CTABLE)
            if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
              tcol = cmap_loc[rbuf3_i[ct2] - cstart];
            } else {
              ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            }
#else
            tcol = cmap[rbuf3_i[ct2]];
#endif
            if (tcol) {
              subcols[idex]   = --tcol;
              subvals[idex++] = rbuf4_i[ct2];

              /* We receive an entire row of C (its full set of column indices), but only a subset of it needs to be inserted into submat. 
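                 (For example, with hypothetical numbers: a received row of C with column indices {2,7,9},
                 of which iscol selects only 7, contributes a single entry to submat.)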
1933 For reuse, we replace received C->j with index that should be inserted to submat */ 1934 rbuf3_i[ct3++] = ct2; 1935 } 1936 } 1937 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1938 1939 } else { /* scall == MAT_REUSE_MATRIX */ 1940 submat = submats[0]; 1941 subc = (Mat_SeqAIJ*)submat->data; 1942 1943 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1944 for (l=0; l<nnz; l++) { 1945 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1946 subvals[idex++] = rbuf4_i[ct2]; 1947 } 1948 1949 bj = subc->j + subc->i[row]; 1950 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1951 } 1952 } else { /* allcolumns */ 1953 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1954 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1955 ct2 += nnz; 1956 } 1957 } 1958 } 1959 1960 /* sending a->a are done */ 1961 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1962 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1963 1964 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1965 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1966 submats[0] = submat; 1967 1968 /* Restore the indices */ 1969 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1970 if (!allcolumns) { 1971 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1972 } 1973 1974 /* Destroy allocated memory */ 1975 for (i=0; i<nrqs; ++i) { 1976 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1977 } 1978 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1979 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1980 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1981 1982 if (!allcolumns) { 1983 ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr); 1984 } 1985 1986 if (scall == MAT_INITIAL_MATRIX) { 1987 ierr = PetscLogStagePop();CHKERRQ(ierr); 1988 1989 ierr = PetscFree(lens);CHKERRQ(ierr); 1990 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1991 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1992 } 1993 PetscFunctionReturn(0); 1994 } 1995 1996 #undef __FUNCT__ 1997 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS" 1998 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1999 { 2000 PetscErrorCode ierr; 2001 PetscInt ncol; 2002 PetscBool colflag,allcolumns=PETSC_FALSE; 2003 2004 PetscFunctionBegin; 2005 /* Allocate memory to hold all the submatrices */ 2006 if (scall != MAT_REUSE_MATRIX) { 2007 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2008 } 2009 2010 /* Check for special case: each processor gets entire matrix columns */ 2011 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2012 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2013 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2014 2015 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2016 PetscFunctionReturn(0); 2017 } 2018 2019 #undef __FUNCT__ 2020 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 2021 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2022 { 2023 PetscErrorCode ierr; 2024 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2025 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2026 2027 PetscFunctionBegin; 2028 /* Check for special case: each processor gets entire matrix */ 2029 if 
(C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
    ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages over which the submatrices are extracted */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage extracts at most nmax submatrices; nmax bounds the number of submatrices per stage
     so that the N-length integer column maps allocated per submatrix total roughly 20 MB per stage.
     For example, with 5M columns and a 4-byte PetscInt, nmax = 20e6/(5e6*4) = 1, so only one
     submatrix is extracted per stage; with 20M columns the quotient is 0 and is bumped to 1 below.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 
1 : 0); 2078 2079 /* Make sure every processor loops through the nstages */ 2080 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2081 2082 for (i=0,pos=0; i<nstages; i++) { 2083 if (pos+nmax <= ismax) max_no = nmax; 2084 else if (pos == ismax) max_no = 0; 2085 else max_no = ismax-pos; 2086 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2087 pos += max_no; 2088 } 2089 2090 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2091 PetscFunctionReturn(0); 2092 } 2093 2094 /* -------------------------------------------------------------------------*/ 2095 #undef __FUNCT__ 2096 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 2097 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2098 { 2099 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2100 Mat A = c->A; 2101 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2102 const PetscInt **icol,**irow; 2103 PetscInt *nrow,*ncol,start; 2104 PetscErrorCode ierr; 2105 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2106 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2107 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2108 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2109 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2110 #if defined(PETSC_USE_CTABLE) 2111 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2112 #else 2113 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2114 #endif 2115 const PetscInt *irow_i; 2116 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2117 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2118 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2119 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2120 MPI_Status *r_status3,*r_status4,*s_status4; 2121 MPI_Comm comm; 2122 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2123 PetscMPIInt *onodes1,*olengths1; 2124 PetscMPIInt idex,idex2,end; 2125 2126 PetscFunctionBegin; 2127 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2128 tag0 = ((PetscObject)C)->tag; 2129 size = c->size; 2130 rank = c->rank; 2131 2132 /* Get some new tags to keep the communication clean */ 2133 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2134 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2135 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2136 2137 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2138 2139 for (i=0; i<ismax; i++) { 2140 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2141 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2142 if (allcolumns[i]) { 2143 icol[i] = NULL; 2144 ncol[i] = C->cmap->N; 2145 } else { 2146 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2147 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2148 } 2149 } 2150 2151 /* evaluate communication - mesg to who, length of mesg, and buffer space 2152 required. 
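   For a hypothetical example: if two ISs each contain 3 rows owned by rank 2, each pass sets
   w4[2]=3, accumulating w1[2]=6 data entries and w3[2]=2 contributing ISs, so the message to
   rank 2 carries a header of 1+2*2 entries ahead of its 6 row indices.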
Based on this, buffers are allocated, and data copied into them*/ 2153 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2154 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2155 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2156 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2157 for (i=0; i<ismax; i++) { 2158 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2159 jmax = nrow[i]; 2160 irow_i = irow[i]; 2161 for (j=0; j<jmax; j++) { 2162 l = 0; 2163 row = irow_i[j]; 2164 while (row >= C->rmap->range[l+1]) l++; 2165 proc = l; 2166 w4[proc]++; 2167 } 2168 for (j=0; j<size; j++) { 2169 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2170 } 2171 } 2172 2173 nrqs = 0; /* no of outgoing messages */ 2174 msz = 0; /* total mesg length (for all procs) */ 2175 w1[rank] = 0; /* no mesg sent to self */ 2176 w3[rank] = 0; 2177 for (i=0; i<size; i++) { 2178 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2179 } 2180 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2181 for (i=0,j=0; i<size; i++) { 2182 if (w1[i]) { pa[j] = i; j++; } 2183 } 2184 2185 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2186 for (i=0; i<nrqs; i++) { 2187 j = pa[i]; 2188 w1[j] += w2[j] + 2* w3[j]; 2189 msz += w1[j]; 2190 } 2191 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2192 2193 /* Determine the number of messages to expect, their lengths, from from-ids */ 2194 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2195 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2196 2197 /* Now post the Irecvs corresponding to these messages */ 2198 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2199 2200 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2201 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2202 2203 /* Allocate Memory for outgoing messages */ 2204 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2205 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2206 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2207 2208 { 2209 PetscInt *iptr = tmp; 2210 k = 0; 2211 for (i=0; i<nrqs; i++) { 2212 j = pa[i]; 2213 iptr += k; 2214 sbuf1[j] = iptr; 2215 k = w1[j]; 2216 } 2217 } 2218 2219 /* Form the outgoing messages */ 2220 /* Initialize the header space */ 2221 for (i=0; i<nrqs; i++) { 2222 j = pa[i]; 2223 sbuf1[j][0] = 0; 2224 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2225 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2226 } 2227 2228 /* Parse the isrow and copy data into outbuf */ 2229 for (i=0; i<ismax; i++) { 2230 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2231 irow_i = irow[i]; 2232 jmax = nrow[i]; 2233 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2234 l = 0; 2235 row = irow_i[j]; 2236 while (row >= C->rmap->range[l+1]) l++; 2237 proc = l; 2238 if (proc != rank) { /* copy to the outgoing buf*/ 2239 ctr[proc]++; 2240 *ptr[proc] = row; 2241 ptr[proc]++; 2242 } 2243 } 2244 /* Update the headers for the current IS */ 2245 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2246 if ((ctr_j = ctr[j])) { 2247 sbuf1_j = sbuf1[j]; 2248 k = ++sbuf1_j[0]; 2249 sbuf1_j[2*k] = ctr_j; 2250 sbuf1_j[2*k-1] = i; 2251 } 2252 } 2253 } 
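  /*
     Schematic of one outgoing request in sbuf1[j] (hypothetical values):
       sbuf1[j][0] = 2                    -- two ISs request rows from rank j
       sbuf1[j][1] = 0,  sbuf1[j][2] = 3  -- IS 0 requests 3 rows
       sbuf1[j][3] = 4,  sbuf1[j][4] = 2  -- IS 4 requests 2 rows
       sbuf1[j][5..9]                     -- the five requested global row indices, grouped by IS
     i.e. a header of 1 + 2*w3[j] entries followed by the row data; the total length is w1[j].
  */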
2254 2255 /* Now post the sends */ 2256 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2257 for (i=0; i<nrqs; ++i) { 2258 j = pa[i]; 2259 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2260 } 2261 2262 /* Post Receives to capture the buffer size */ 2263 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2264 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2265 rbuf2[0] = tmp + msz; 2266 for (i=1; i<nrqs; ++i) { 2267 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2268 } 2269 for (i=0; i<nrqs; ++i) { 2270 j = pa[i]; 2271 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2272 } 2273 2274 /* Send to other procs the buf size they should allocate */ 2275 2276 2277 /* Receive messages*/ 2278 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2279 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2280 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 2281 { 2282 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2283 PetscInt *sbuf2_i; 2284 2285 for (i=0; i<nrqr; ++i) { 2286 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2287 2288 req_size[idex] = 0; 2289 rbuf1_i = rbuf1[idex]; 2290 start = 2*rbuf1_i[0] + 1; 2291 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2292 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2293 sbuf2_i = sbuf2[idex]; 2294 for (j=start; j<end; j++) { 2295 id = rbuf1_i[j] - rstart; 2296 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2297 sbuf2_i[j] = ncols; 2298 req_size[idex] += ncols; 2299 } 2300 req_source[idex] = r_status1[i].MPI_SOURCE; 2301 /* form the header */ 2302 sbuf2_i[0] = req_size[idex]; 2303 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2304 2305 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2306 } 2307 } 2308 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2309 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2310 2311 /* recv buffer sizes */ 2312 /* Receive messages*/ 2313 2314 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 2315 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2316 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2317 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2318 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2319 2320 for (i=0; i<nrqs; ++i) { 2321 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2322 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2323 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2324 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2325 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2326 } 2327 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2328 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2329 2330 /* Wait on sends1 and sends2 */ 2331 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2332 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2333 2334 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2335 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2336 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2337 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2338 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2339 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2340 2341 /* Now allocate buffers for a->j, and send them off */ 2342 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2343 for 
(i=0,j=0; i<nrqr; i++) j += req_size[i]; 2344 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2345 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2346 2347 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2348 { 2349 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2350 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2351 PetscInt cend = C->cmap->rend; 2352 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2353 2354 for (i=0; i<nrqr; i++) { 2355 rbuf1_i = rbuf1[i]; 2356 sbuf_aj_i = sbuf_aj[i]; 2357 ct1 = 2*rbuf1_i[0] + 1; 2358 ct2 = 0; 2359 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2360 kmax = rbuf1[i][2*j]; 2361 for (k=0; k<kmax; k++,ct1++) { 2362 row = rbuf1_i[ct1] - rstart; 2363 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2364 ncols = nzA + nzB; 2365 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2366 2367 /* load the column indices for this row into cols*/ 2368 cols = sbuf_aj_i + ct2; 2369 2370 lwrite = 0; 2371 for (l=0; l<nzB; l++) { 2372 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2373 } 2374 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2375 for (l=0; l<nzB; l++) { 2376 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2377 } 2378 2379 ct2 += ncols; 2380 } 2381 } 2382 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2383 } 2384 } 2385 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2386 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2387 2388 /* Allocate buffers for a->a, and send them off */ 2389 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2390 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2391 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2392 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2393 2394 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2395 { 2396 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2397 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2398 PetscInt cend = C->cmap->rend; 2399 PetscInt *b_j = b->j; 2400 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2401 2402 for (i=0; i<nrqr; i++) { 2403 rbuf1_i = rbuf1[i]; 2404 sbuf_aa_i = sbuf_aa[i]; 2405 ct1 = 2*rbuf1_i[0]+1; 2406 ct2 = 0; 2407 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2408 kmax = rbuf1_i[2*j]; 2409 for (k=0; k<kmax; k++,ct1++) { 2410 row = rbuf1_i[ct1] - rstart; 2411 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2412 ncols = nzA + nzB; 2413 cworkB = b_j + b_i[row]; 2414 vworkA = a_a + a_i[row]; 2415 vworkB = b_a + b_i[row]; 2416 2417 /* load the column values for this row into vals*/ 2418 vals = sbuf_aa_i+ct2; 2419 2420 lwrite = 0; 2421 for (l=0; l<nzB; l++) { 2422 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2423 } 2424 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2425 for (l=0; l<nzB; l++) { 2426 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2427 } 2428 2429 ct2 += ncols; 2430 } 2431 } 2432 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2433 } 2434 } 2435 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2436 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2437 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2438 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2439 2440 /* Form the matrix */ 2441 /* create col map: global col of C -> local col of submatrices */ 2442 { 2443 const PetscInt *icol_i; 2444 #if defined(PETSC_USE_CTABLE) 2445 ierr = 
PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 2446 for (i=0; i<ismax; i++) { 2447 if (!allcolumns[i]) { 2448 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2449 2450 jmax = ncol[i]; 2451 icol_i = icol[i]; 2452 cmap_i = cmap[i]; 2453 for (j=0; j<jmax; j++) { 2454 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2455 } 2456 } else { 2457 cmap[i] = NULL; 2458 } 2459 } 2460 #else 2461 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2462 for (i=0; i<ismax; i++) { 2463 if (!allcolumns[i]) { 2464 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2465 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2466 jmax = ncol[i]; 2467 icol_i = icol[i]; 2468 cmap_i = cmap[i]; 2469 for (j=0; j<jmax; j++) { 2470 cmap_i[icol_i[j]] = j+1; 2471 } 2472 } else { 2473 cmap[i] = NULL; 2474 } 2475 } 2476 #endif 2477 } 2478 2479 /* Create lens which is required for MatCreate... */ 2480 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2481 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2482 if (ismax) { 2483 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2484 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2485 } 2486 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2487 2488 /* Update lens from local data */ 2489 for (i=0; i<ismax; i++) { 2490 jmax = nrow[i]; 2491 if (!allcolumns[i]) cmap_i = cmap[i]; 2492 irow_i = irow[i]; 2493 lens_i = lens[i]; 2494 for (j=0; j<jmax; j++) { 2495 l = 0; 2496 row = irow_i[j]; 2497 while (row >= C->rmap->range[l+1]) l++; 2498 proc = l; 2499 if (proc == rank) { 2500 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2501 if (!allcolumns[i]) { 2502 for (k=0; k<ncols; k++) { 2503 #if defined(PETSC_USE_CTABLE) 2504 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2505 #else 2506 tcol = cmap_i[cols[k]]; 2507 #endif 2508 if (tcol) lens_i[j]++; 2509 } 2510 } else { /* allcolumns */ 2511 lens_i[j] = ncols; 2512 } 2513 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2514 } 2515 } 2516 } 2517 2518 /* Create row map: global row of C -> local row of submatrices */ 2519 #if defined(PETSC_USE_CTABLE) 2520 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 2521 for (i=0; i<ismax; i++) { 2522 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2523 irow_i = irow[i]; 2524 jmax = nrow[i]; 2525 for (j=0; j<jmax; j++) { 2526 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2527 } 2528 } 2529 #else 2530 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2531 if (ismax) { 2532 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2533 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2534 } 2535 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2536 for (i=0; i<ismax; i++) { 2537 rmap_i = rmap[i]; 2538 irow_i = irow[i]; 2539 jmax = nrow[i]; 2540 for (j=0; j<jmax; j++) { 2541 rmap_i[irow_i[j]] = j; 2542 } 2543 } 2544 #endif 2545 2546 /* Update lens from offproc data */ 2547 { 2548 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2549 2550 for (tmp2=0; tmp2<nrqs; tmp2++) { 2551 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2552 idex = pa[idex2]; 2553 sbuf1_i = sbuf1[idex]; 2554 jmax = sbuf1_i[0]; 2555 ct1 = 2*jmax+1; 2556 ct2 = 0; 2557 rbuf2_i = rbuf2[idex2]; 2558 rbuf3_i = rbuf3[idex2]; 2559 for (j=1; j<=jmax; j++) { 2560 is_no = sbuf1_i[2*j-1]; 2561 max1 = sbuf1_i[2*j]; 2562 lens_i = lens[is_no]; 2563 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2564 rmap_i = rmap[is_no]; 2565 
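          /* For each row requested from this rank: translate the received global row index
             (sbuf1_i[ct1]) to its local row in submats[is_no] via rmap_i, then count toward
             lens_i only those received column indices that cmap_i maps into the requested
             column set. */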
for (k=0; k<max1; k++,ct1++) { 2566 #if defined(PETSC_USE_CTABLE) 2567 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2568 row--; 2569 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2570 #else 2571 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2572 #endif 2573 max2 = rbuf2_i[ct1]; 2574 for (l=0; l<max2; l++,ct2++) { 2575 if (!allcolumns[is_no]) { 2576 #if defined(PETSC_USE_CTABLE) 2577 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2578 #else 2579 tcol = cmap_i[rbuf3_i[ct2]]; 2580 #endif 2581 if (tcol) lens_i[row]++; 2582 } else { /* allcolumns */ 2583 lens_i[row]++; /* lens_i[row] += max2 ? */ 2584 } 2585 } 2586 } 2587 } 2588 } 2589 } 2590 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2591 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2592 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2593 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2594 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2595 2596 /* Create the submatrices */ 2597 if (scall == MAT_REUSE_MATRIX) { 2598 PetscBool flag; 2599 2600 /* 2601 Assumes new rows are same length as the old rows,hence bug! 2602 */ 2603 for (i=0; i<ismax; i++) { 2604 mat = (Mat_SeqAIJ*)(submats[i]->data); 2605 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2606 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2607 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 2608 /* Initial matrix as if empty */ 2609 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2610 2611 submats[i]->factortype = C->factortype; 2612 } 2613 } else { 2614 for (i=0; i<ismax; i++) { 2615 PetscInt rbs,cbs; 2616 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2617 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2618 2619 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2620 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2621 2622 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2623 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2624 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2625 } 2626 } 2627 2628 /* Assemble the matrices */ 2629 /* First assemble the local rows */ 2630 { 2631 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2632 PetscScalar *imat_a; 2633 2634 for (i=0; i<ismax; i++) { 2635 mat = (Mat_SeqAIJ*)submats[i]->data; 2636 imat_ilen = mat->ilen; 2637 imat_j = mat->j; 2638 imat_i = mat->i; 2639 imat_a = mat->a; 2640 2641 if (!allcolumns[i]) cmap_i = cmap[i]; 2642 rmap_i = rmap[i]; 2643 irow_i = irow[i]; 2644 jmax = nrow[i]; 2645 for (j=0; j<jmax; j++) { 2646 l = 0; 2647 row = irow_i[j]; 2648 while (row >= C->rmap->range[l+1]) l++; 2649 proc = l; 2650 if (proc == rank) { 2651 old_row = row; 2652 #if defined(PETSC_USE_CTABLE) 2653 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2654 row--; 2655 #else 2656 row = rmap_i[row]; 2657 #endif 2658 ilen_row = imat_ilen[row]; 2659 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2660 mat_i = imat_i[row]; 2661 mat_a = imat_a + mat_i; 2662 mat_j = imat_j + mat_i; 2663 if (!allcolumns[i]) { 2664 for (k=0; k<ncols; k++) { 2665 #if defined(PETSC_USE_CTABLE) 2666 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2667 #else 2668 tcol = 
cmap_i[cols[k]]; 2669 #endif 2670 if (tcol) { 2671 *mat_j++ = tcol - 1; 2672 *mat_a++ = vals[k]; 2673 ilen_row++; 2674 } 2675 } 2676 } else { /* allcolumns */ 2677 for (k=0; k<ncols; k++) { 2678 *mat_j++ = cols[k]; /* global col index! */ 2679 *mat_a++ = vals[k]; 2680 ilen_row++; 2681 } 2682 } 2683 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2684 2685 imat_ilen[row] = ilen_row; 2686 } 2687 } 2688 } 2689 } 2690 2691 /* Now assemble the off proc rows*/ 2692 { 2693 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2694 PetscInt *imat_j,*imat_i; 2695 PetscScalar *imat_a,*rbuf4_i; 2696 2697 for (tmp2=0; tmp2<nrqs; tmp2++) { 2698 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2699 idex = pa[idex2]; 2700 sbuf1_i = sbuf1[idex]; 2701 jmax = sbuf1_i[0]; 2702 ct1 = 2*jmax + 1; 2703 ct2 = 0; 2704 rbuf2_i = rbuf2[idex2]; 2705 rbuf3_i = rbuf3[idex2]; 2706 rbuf4_i = rbuf4[idex2]; 2707 for (j=1; j<=jmax; j++) { 2708 is_no = sbuf1_i[2*j-1]; 2709 rmap_i = rmap[is_no]; 2710 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2711 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2712 imat_ilen = mat->ilen; 2713 imat_j = mat->j; 2714 imat_i = mat->i; 2715 imat_a = mat->a; 2716 max1 = sbuf1_i[2*j]; 2717 for (k=0; k<max1; k++,ct1++) { 2718 row = sbuf1_i[ct1]; 2719 #if defined(PETSC_USE_CTABLE) 2720 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2721 row--; 2722 #else 2723 row = rmap_i[row]; 2724 #endif 2725 ilen = imat_ilen[row]; 2726 mat_i = imat_i[row]; 2727 mat_a = imat_a + mat_i; 2728 mat_j = imat_j + mat_i; 2729 max2 = rbuf2_i[ct1]; 2730 if (!allcolumns[is_no]) { 2731 for (l=0; l<max2; l++,ct2++) { 2732 2733 #if defined(PETSC_USE_CTABLE) 2734 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2735 #else 2736 tcol = cmap_i[rbuf3_i[ct2]]; 2737 #endif 2738 if (tcol) { 2739 *mat_j++ = tcol - 1; 2740 *mat_a++ = rbuf4_i[ct2]; 2741 ilen++; 2742 } 2743 } 2744 } else { /* allcolumns */ 2745 for (l=0; l<max2; l++,ct2++) { 2746 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2747 *mat_a++ = rbuf4_i[ct2]; 2748 ilen++; 2749 } 2750 } 2751 imat_ilen[row] = ilen; 2752 } 2753 } 2754 } 2755 } 2756 2757 /* sort the rows */ 2758 { 2759 PetscInt *imat_ilen,*imat_j,*imat_i; 2760 PetscScalar *imat_a; 2761 2762 for (i=0; i<ismax; i++) { 2763 mat = (Mat_SeqAIJ*)submats[i]->data; 2764 imat_j = mat->j; 2765 imat_i = mat->i; 2766 imat_a = mat->a; 2767 imat_ilen = mat->ilen; 2768 2769 if (allcolumns[i]) continue; 2770 jmax = nrow[i]; 2771 for (j=0; j<jmax; j++) { 2772 PetscInt ilen; 2773 2774 mat_i = imat_i[j]; 2775 mat_a = imat_a + mat_i; 2776 mat_j = imat_j + mat_i; 2777 ilen = imat_ilen[j]; 2778 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2779 } 2780 } 2781 } 2782 2783 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2784 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2785 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2786 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2787 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2788 2789 /* Restore the indices */ 2790 for (i=0; i<ismax; i++) { 2791 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2792 if (!allcolumns[i]) { 2793 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2794 } 2795 } 2796 2797 /* Destroy allocated memory */ 2798 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2799 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2800 ierr = PetscFree(pa);CHKERRQ(ierr); 2801 2802 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2803 ierr = 
PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
  Embeddings are supposed to be injections, which implies that the range of rowemb is a subset
  of [0,m), that of dcolemb a subset of [0,n), and that of ocolemb a subset of [0,N-n).
  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
  state, and needs to be "assembled" later by compressing B's column space.

  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
  Following this call, C->A & C->B have been created, even if empty.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSetSeqMats_MPIAIJ"
PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  Mat_SeqAIJ     *Baij;
  PetscBool      seqaij,Bdisassembled;
  PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
  PetscScalar    v;
  const PetscInt *rowindices,*colindices;

  PetscFunctionBegin;
  /* Check to make sure the component matrices (and embeddings) are compatible with C. 
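     For a hypothetical example: if C is 100x100 over 4 ranks with 25 rows and 25 columns per rank,
     then with NULL embeddings A must be 25x25 and B 25x75 (B covers the N-n columns owned by the
     other ranks); with a rowemb of local size 10, both A and B must instead have 10 rows.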
*/
  if (A) {
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
    } else {
      if (C->rmap->n != A->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (dcolemb) {
      ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
      if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
    } else {
      if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }
  if (B) {
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
    } else {
      if (C->rmap->n != B->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (ocolemb) {
      ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
      if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
    } else {
      if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }

  aij = (Mat_MPIAIJ*)(C->data);
  if (!aij->A) {
    /* Mimic parts of MatMPIAIJSetPreallocation() */
    ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
    ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
  }
  if (A) {
    ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
  } else {
    ierr = MatSetUp(aij->A);CHKERRQ(ierr);
  }
  if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
    /*
      If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
      need to "disassemble" B -- convert it to using C's global indices.
      To insert the values we take the safer, albeit more expensive, route of MatSetValues().

      If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
      we MatZeroEntries(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

      TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
      At least avoid calling MatSetValues() and the implied searches? 
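      A minimal sketch of such an in-place copy, assuming SAME_NONZERO_PATTERN, NULL embeddings,
      and identical column ordering (Bsrc and Bdst are hypothetical aliases for B's and aij->B's
      Mat_SeqAIJ data):

          for (i=0; i<B->rmap->n; i++) {
            nz   = Bsrc->i[i+1] - Bsrc->i[i];
            ierr = PetscMemcpy(Bdst->a + Bdst->i[i],Bsrc->a + Bsrc->i[i],nz*sizeof(PetscScalar));CHKERRQ(ierr);
          }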
2935 */ 2936 2937 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2938 #if defined(PETSC_USE_CTABLE) 2939 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2940 #else 2941 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2942 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2943 if (aij->B) { 2944 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2945 } 2946 #endif 2947 ngcol = 0; 2948 if (aij->lvec) { 2949 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2950 } 2951 if (aij->garray) { 2952 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2953 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2954 } 2955 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2956 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2957 } 2958 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2959 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2960 } 2961 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2962 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2963 } 2964 } 2965 Bdisassembled = PETSC_FALSE; 2966 if (!aij->B) { 2967 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2968 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2969 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2970 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2971 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2972 Bdisassembled = PETSC_TRUE; 2973 } 2974 if (B) { 2975 Baij = (Mat_SeqAIJ*)(B->data); 2976 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2977 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2978 for (i=0; i<B->rmap->n; i++) { 2979 nz[i] = Baij->i[i+1] - Baij->i[i]; 2980 } 2981 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2982 ierr = PetscFree(nz);CHKERRQ(ierr); 2983 } 2984 2985 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2986 shift = rend-rstart; 2987 count = 0; 2988 rowindices = NULL; 2989 colindices = NULL; 2990 if (rowemb) { 2991 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 2992 } 2993 if (ocolemb) { 2994 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 2995 } 2996 for (i=0; i<B->rmap->n; i++) { 2997 PetscInt row; 2998 row = i; 2999 if (rowindices) row = rowindices[i]; 3000 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3001 col = Baij->j[count]; 3002 if (colindices) col = colindices[col]; 3003 if (Bdisassembled && col>=rstart) col += shift; 3004 v = Baij->a[count]; 3005 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3006 ++count; 3007 } 3008 } 3009 /* No assembly for aij->B is necessary. */ 3010 /* FIXME: set aij->B's nonzerostate correctly. */ 3011 } else { 3012 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3013 } 3014 C->preallocated = PETSC_TRUE; 3015 C->was_assembled = PETSC_FALSE; 3016 C->assembled = PETSC_FALSE; 3017 /* 3018 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3019 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3020 */ 3021 PetscFunctionReturn(0); 3022 } 3023 3024 #undef __FUNCT__ 3025 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 3026 /* 3027 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
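   For example, if garray = {3,17,42}, local column 1 of B corresponds to global column 17 of C.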
3028 */ 3029 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3030 { 3031 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3032 3033 PetscFunctionBegin; 3034 PetscValidPointer(A,2); 3035 PetscValidPointer(B,3); 3036 /* FIXME: make sure C is assembled */ 3037 *A = aij->A; 3038 *B = aij->B; 3039 /* Note that we don't incref *A and *B, so be careful! */ 3040 PetscFunctionReturn(0); 3041 } 3042 3043 /* 3044 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3045 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3046 */ 3047 #undef __FUNCT__ 3048 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 3049 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3050 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3051 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3052 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3053 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3054 { 3055 PetscErrorCode ierr; 3056 PetscMPIInt isize,flag; 3057 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3058 Mat *A,*B; 3059 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3060 3061 PetscFunctionBegin; 3062 if (!ismax) PetscFunctionReturn(0); 3063 3064 for (i = 0, cismax = 0; i < ismax; ++i) { 3065 PetscMPIInt isize; 3066 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3067 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3068 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3069 if (isize > 1) ++cismax; 3070 } 3071 /* 3072 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3073 ispar counts the number of parallel ISs across C's comm. 3074 */ 3075 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3076 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3077 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3078 PetscFunctionReturn(0); 3079 } 3080 3081 /* if (ispar) */ 3082 /* 3083 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3084 These are used to extract the off-diag portion of the resulting parallel matrix. 3085 The row IS for the off-diag portion is the same as for the diag portion, 3086 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3087 */ 3088 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3089 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3090 for (i = 0, ii = 0; i < ismax; ++i) { 3091 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3092 if (isize > 1) { 3093 /* 3094 TODO: This is the part that's ***NOT SCALABLE***. 3095 To fix this we need to extract just the indices of C's nonzero columns 3096 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3097 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3098 to be done without serializing on the IS list, so, most likely, it is best 3099 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 
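        Concretely, ISGetNonlocalIS() gathers on each rank all of the IS's indices held by the
        other ranks, so per-rank memory grows with the global size of iscol[i] rather than with
        the number of columns this rank actually couples to.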
*/
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
             If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. 
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether the ISs are sorted
     and/or contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i] and B[ii] --> BB[ii]
       (or retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute
       their values into the A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* The parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, the sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /*
     FIXME: no good fix for this yet. The local column size of B[i] differs
     from the size of ciscol[i]: B[i]'s size is only finalized by MatAssembly,
     while ciscol[i] is computed as the complement of iscol[i]. It would be
     better to keep only the nonzero column indices when computing the
     complement ciscol[i].
  */
  if (scall == MAT_REUSE_MATRIX) {
    for (i = 0; i < cismax; ++i) {
      ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
      B[i]->cmap->n = ciscol_localsize;
    }
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
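  /*
     Illustration (hypothetical data) of the mismatch patched above: suppose
     the nonlocal part of iscol[i] on this rank is {7,9}, so ciscol[i] has
     local size 2. If the extracted rows of C carry nonzeros in column 7 but
     none in column 9, the assembled B[i] can end up with fewer columns than
     ciscol[i] has entries, hence the explicit cmap->n reset before reuse.
  */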
  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if isize == 1).
     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, the permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX;
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerSetUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /* For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. */
      {
        Mat AA = NULL,BB = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX)   AA = A[i];
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        /* Otherwise A is extracted straight into (*submat)[i]. */
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
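  /*
     Illustration (hypothetical layout) of what setseqmats() re-embeds above:
     on a rank owning two rows of a parallel submat, A[i] is the Seq block over
     the locally owned columns, while B[ii] holds the entries in the off-rank
     columns; B[ii]'s local column j corresponds to the global column
     ciscol[ii][j], which is the information from which the MPIAIJ colmap is
     rebuilt when the sparsity pattern changes.
  */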
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
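/*
   A minimal usage sketch (illustrative only; not compiled -- the function name
   is hypothetical): extract one parallel submatrix from C, then refresh its
   values with MAT_REUSE_MATRIX once C's values (same nonzero pattern) have
   changed. The index sets are assumed to have been created by the caller,
   e.g. with ISCreateGeneral(), possibly on a subcommunicator of C.
*/
#if 0
static PetscErrorCode ExampleExtractAndReuse(Mat C,IS isrow,IS iscol)
{
  PetscErrorCode ierr;
  Mat            *submat;

  PetscFunctionBegin;
  /* The first extraction allocates the submatrix array and the submatrix itself. */
  ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
  /* ... update C's values here, keeping the same sparsity pattern ... */
  ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,&isrow,&iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
  /* The caller owns the extracted submatrices and the array holding them. */
  ierr = MatDestroy(&submat[0]);CHKERRQ(ierr);
  ierr = PetscFree(submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif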