/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt*,PetscInt*,PetscInt*,PetscInt**,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt*);


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
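/*
   Illustrative sketch (not part of the build): how these drivers are typically
   reached. A caller such as PCASM obtains local index sets, enlarges each one
   by 'ov' levels via MatIncreaseOverlap(), and then extracts the corresponding
   submatrices. The surrounding setup (n, the Mat C) is assumed, not shown.

     IS  *is;
     Mat *submats;
     ierr = PCASMCreateSubdomains(C,n,&is);CHKERRQ(ierr);                       -- or any other source of index sets
     ierr = MatIncreaseOverlap(C,n,is,ov);CHKERRQ(ierr);                        -- dispatches to the routine above for MPIAIJ
     ierr = MatGetSubMatrices(C,n,is,is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
*/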
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get the row map to determine where rows should go */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve the IS data and put it all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages;
   * generally speaking, we do not need to exchange
   * data when the overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages;
   * this usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
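  /*
     Worked example of the <is id, row> pair layout just built (illustrative,
     values made up): if this rank needs rows 17 and 42 of is[0] and row 99 of
     is[2], all owned by the same remote rank, the contiguous block of todata
     destined for that rank is

         todata = { 0,17,  0,42,  2,99 }

     i.e. 2*nrrows integers, consumed pairwise on the receiving side.
  */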
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send back the rows that belong to remote processors so that we can get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
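/*
   Minimal sketch of the PetscSF pattern used above (illustrative only): each
   leaf (an entry this rank expects to receive) is wired to a root (an offset
   in the sender's contiguous send buffer), and a broadcast moves root data to
   the leaves. Independent local work can be overlapped between Begin and End.

     PetscSF     sf;
     PetscSFNode *remote;                     -- remote[k] = (rank, offset in sender's buffer)
     ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
     ierr = PetscSFSetGraph(sf,nleaves,nleaves,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
     ierr = PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
     -- ... do independent local work here ...
     ierr = PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
     ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
*/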
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx,IS is[],PetscInt nrecvs,PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id,data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp,*indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp,sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve the information to get each row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j<data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx,PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes,PetscInt *fromrows,PetscInt **sbrowsizes,PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* even if the matrix is symmetric, we still treat it as if it were not */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* the total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* is there a better way to estimate the number of nonzeros in the matrix??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate the space needed so as to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* reuse the grouping pass above, this time filling the send buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
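/*
   Worked example of the <is id, number of rows, rows> reply triple built above
   (illustrative, values made up): if the rows requested for is[0] touch
   columns {3,7,9} and those for is[2] touch columns {1,4}, the block sent back
   to that requester is

       sbdata = { 0,3, 3,7,9,  2,2, 1,4 }

   and sbsizes[2*i] = 9 for that destination;
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable consumes the stream in exactly
   this order.
*/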
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj,*indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* this estimates memory better than the old implementation,
   * which used the global size of the matrix
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2    (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]   data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
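/*
   Concrete instance of the format above (illustrative, values made up): to ask
   another rank to process rows {10,11,12} of is[1] and rows {40,41} of is[5],
   the outgoing buffer is

       mesg = { 2,  1,3,  5,2,  10,11,12,  40,41 }

   i.e. a header of 1 + 2*(no of index sets) integers followed by the row data,
   which matches the w1[j] += w2[j] + 2*w3[j] length computation below.
*/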
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }
  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax,&Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table,imax,&data,imax,&isz,Mimax,&d_p,M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
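/*
   Minimal sketch of the PetscBT idiom used throughout this file (illustrative
   only): PetscBTLookupSet() returns the previous state of a bit and sets it,
   so a zero return means "first time seen" and the value is appended exactly
   once. The buffer name and size below are made up.

     PetscBT  seen;
     PetscInt buf[16],nfound = 0;
     ierr = PetscBTCreate(M,&seen);CHKERRQ(ierr);          -- M = global number of rows
     if (!PetscBTLookupSet(seen,row)) buf[nfound++] = row; -- append on first sighting only
     ierr = PetscBTDestroy(&seen);CHKERRQ(ierr);
*/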
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C    - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table - an array of char - size = m bits.

     Output:
      isz  - array containing the count of the solution elements corresponding
             to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a remalloc is required we need only copy the data for that one row
  rather than all previous rows, as happens now where a single large chunk of
  memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of the matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above the diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of the matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above the diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
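/*
   Minimal sketch of the MPI_Allgatherv pattern used above (illustrative only;
   the names mine/all/nlocal/range are made up): each rank contributes its
   owned slice and every rank receives the whole array, with recvcounts/displs
   derived from the PetscLayout ownership ranges.

     for (i=0; i<size; i++) {
       recvcounts[i] = range[i+1] - range[i];  -- how many entries rank i owns
       displs[i]     = range[i];               -- where rank i's slice starts
     }
     ierr = MPI_Allgatherv(mine,nlocal,MPIU_INT,all,recvcounts,displs,MPIU_INT,comm);CHKERRQ(ierr);

   With MPI_IN_PLACE the send buffer is taken from the receive buffer at this
   rank's displacement, which avoids the local copy.
*/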
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices"
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
  Mat_SubMat     *submatj = c->submatis1;
  PetscInt       i;

  PetscFunctionBegin;
  ierr = submatj->destroy(C);CHKERRQ(ierr);

  ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);

  for (i=0; i<submatj->nrqr; ++i) {
    ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);

  ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);

  for (i=0; i<submatj->nrqs; ++i) {
    ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(submatj->pa);CHKERRQ(ierr);

  ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
  if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
  ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
#else
  ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
#endif

  if (!submatj->allcolumns) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
#else
    ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(submatj);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable     cmap,rmap;
  PetscInt       *cmap_loc,*rmap_loc;
#else
  PetscInt       *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt    *onodes1,*olengths1,idex,end;
  Mat_SubMat     *smatis1;
  PetscBool      isrowsorted;

  PetscFunctionBegin;
  if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");

  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
  if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted");

  ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
  ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
  if (allcolumns) {
    icol = NULL;
    ncol = C->cmap->N;
  } else {
    ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to whom, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);

    /* w1[proc] = num of rows owned by proc */
    proc = 0;
    for (j=0; j<nrow; j++) {
      row = irow[j]; /* sorted! */
      while (row >= C->rmap->range[proc+1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */
    }

    nrqs     = 0; /* num of outgoing messages */
    msz      = 0; /* total mesg length (for all procs) */
    w1[rank] = 0; /* num mesg sent to self */
    for (proc=0; proc<size; proc++) {
      if (w1[proc]) { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */
    }

    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc-array) */
    for (proc=0,j=0; proc<size; proc++) {
      if (w1[proc]) { pa[j++] = proc;}
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      w1[proc] += 3;
      msz      += w1[proc];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine nrqr, the number of messages to expect, their lengths, and from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);

    /* Input:  nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

    /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i=0; i<nrqs; i++) {
      proc        = pa[i];
      sbuf1[proc] = iptr;
      iptr       += w1[proc];
    }
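    /*
       Worked example of the single-IS request message built below
       (illustrative, values made up): to request rows {7,8} from another rank,
       the message is

           sbuf1[proc] = { 1,  0,2,  7,8 }

       i.e. the header <no of IS = 1, is no = 0, row count = 2> accounted for
       by w1[proc] += 3 above, followed by the requested global row indices.
    */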
    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      ierr      = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr);
      ptr[proc] = sbuf1[proc] + 3;
    }

    /* Parse the isrow and copy data into outbuf */
    ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    for (j=0; j<nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf */
        *ptr[proc] = irow[j];
        ctr[proc]++; ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = 0;
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);

    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree2(w1,w2);CHKERRQ(ierr);

    /* Send to other procs the buf size they should allocate */
    /* Receive messages */
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
    for (i=0; i<nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i     = rbuf1[i];
      start       = 2*rbuf1_i[0] + 1;
      ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
      sbuf2_i     = sbuf2[i];
      for (j=start; j<end; j++) {
        k            = rbuf1_i[j] - rstart;
        ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
        sbuf2_i[j]   = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
    }

    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);

    /* rbuf2 is received, Post recv column indices a->j */
    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);

    ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_status1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    for (i=0; i<nrqr; i++) { /* for each requested message */
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k=0; k<kmax; k++,ct1++) { /* for each row */
        row    = rbuf1_i[ct1] - rstart;
        nzA    = ai[row+1] - ai[row]; nzB = bi[row+1] - bi[row];
        ncols  = nzA + nzB;
        cworkA = aj + ai[row]; cworkB = bj + bi[row];

        /* load the column indices for this row into cols */
        cols = sbuf_aj_i + ct2;

        lwrite = 0;
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
    }
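    /* Note (added commentary): the three passes above emit, in order, the off-diagonal
       columns below cstart, then the diagonal-block columns shifted by cstart, then the
       off-diagonal columns at or beyond cend. Since bmap (= garray) and each cworkA row
       are ascending, every packed row leaves with its global column indices already sorted. */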
    /* Create the column map (cmap): global col of C -> local col of submat */
#if defined(PETSC_USE_CTABLE)
    if (!allcolumns) {
      ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr);
      ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
        if (icol[j] >= cstart && icol[j] < cend) {
          cmap_loc[icol[j] - cstart] = j+1;
        } else { /* use PetscTable for non-local col indices */
          ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      }
    } else {
      cmap     = NULL;
      cmap_loc = NULL;
    }
    ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
#else
    if (!allcolumns) {
      ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
    } else {
      cmap = NULL;
    }
#endif

    /* Create lens for MatSeqAIJSetPreallocation() */
    ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);

    /* Compute lens from local part of C */
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) {
        /* diagonal part A = c->A */
        ncols = ai[row-rstart+1] - ai[row-rstart];
        cols  = aj + ai[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            tcol = cmap_loc[cols[k]];
#else
            tcol = cmap[cols[k]+cstart];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] = ncols;
        }

        /* off-diagonal part B = c->B */
        ncols = bi[row-rstart+1] - bi[row-rstart];
        cols  = bj + bi[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap[bmap[cols[k]]];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] += ncols;
        }
      }
    }

    /* Create the row map (rmap): global row of C -> local row of submat */
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr);
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) { /* a local row */
        rmap_loc[row - rstart] = j;
      } else {
        ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr);
    for (j=0; j<nrow; j++) {
      rmap[irow[j]] = j;
    }
#endif
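    /* Note (added commentary): cmap stores j+1 rather than j so that 0 can act as the
       sentinel for "column not in the submatrix"; every lookup therefore tests if (tcol)
       and decrements the stored value before use. rmap needs no sentinel for local rows
       and stores j directly (the PetscTable variant still stores j+1, since a PetscTable
       cannot hold the value 0). */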
    /* Update lens from off-proc data */
    /* recv of a->j is done */
    ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
    for (i=0; i<nrqs; i++) {
      proc    = pa[i];
      sbuf1_i = sbuf1[proc];
      /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
      ct1     = 2 + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i]; /* received length of C->j */
      rbuf3_i = rbuf3[i]; /* received C->j */

      /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
      max1 = sbuf1_i[2];
      for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
        row--;
        if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
        row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
#endif
        /* Now, store the row index of submat in sbuf1_i[ct1] */
        sbuf1_i[ct1] = row;

        nnz = rbuf2_i[ct1];
        if (!allcolumns) {
          for (l=0; l<nnz; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
              tcol = cmap_loc[rbuf3_i[ct2] - cstart];
            } else {
              ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            }
#else
            tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
#endif
            if (tcol) {
              lens[row]++;

              /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
                 For reuse, we replace received C->j with the index that should be inserted into submat */
              /* rbuf3_i[ct3++] = ct2; ??? */
            }
          }
        } else { /* allcolumns */
          lens[row] += nnz;
        }
      }
    }
    ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr);

    /* Create the submatrices */
    ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr);
    ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

    ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr);
    ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr);
    ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr);
    ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr);

    /* create struct Mat_SubMat and attach it to submat */
    ierr = PetscNew(&smatis1);CHKERRQ(ierr);
    subc = (Mat_SeqAIJ*)submat->data;
    subc->submatis1 = smatis1;

    smatis1->nrqs        = nrqs;
    smatis1->nrqr        = nrqr;
    smatis1->rbuf1       = rbuf1;
    smatis1->rbuf2       = rbuf2;
    smatis1->rbuf3       = rbuf3;
    smatis1->sbuf2       = sbuf2;
    smatis1->req_source2 = req_source2;

    smatis1->sbuf1 = sbuf1;
    smatis1->ptr   = ptr;
    smatis1->tmp   = tmp;
    smatis1->ctr   = ctr;

    smatis1->pa          = pa;
    smatis1->req_size    = req_size;
    smatis1->req_source1 = req_source1;

    smatis1->allcolumns = allcolumns;
    smatis1->row2proc   = row2proc;
    smatis1->rmap       = rmap;
    smatis1->cmap       = cmap;
#if defined(PETSC_USE_CTABLE)
    smatis1->rmap_loc = rmap_loc;
    smatis1->cmap_loc = cmap_loc;
#endif

    smatis1->destroy     = submat->ops->destroy;
    submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices;
    submat->factortype   = C->factortype;

    /* compute rmax */
    rmax = 0;
    for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);

  } else { /* scall == MAT_REUSE_MATRIX */
    submat = submats[0];
    if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong size");
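    /* Note (added commentary): the communication metadata stashed in smatis1 by the
       MAT_INITIAL_MATRIX pass (buffers, request sizes, source ranks, row/column maps)
       is unpacked below so that a MAT_REUSE_MATRIX call can skip the symbolic phase
       entirely and go straight to exchanging and inserting numerical values. */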
wrong size"); 1746 1747 subc = (Mat_SeqAIJ*)submat->data; 1748 rmax = subc->rmax; 1749 smatis1 = subc->submatis1; 1750 nrqs = smatis1->nrqs; 1751 nrqr = smatis1->nrqr; 1752 rbuf1 = smatis1->rbuf1; 1753 rbuf2 = smatis1->rbuf2; 1754 rbuf3 = smatis1->rbuf3; 1755 req_source2 = smatis1->req_source2; 1756 1757 sbuf1 = smatis1->sbuf1; 1758 sbuf2 = smatis1->sbuf2; 1759 ptr = smatis1->ptr; 1760 tmp = smatis1->tmp; 1761 ctr = smatis1->ctr; 1762 1763 pa = smatis1->pa; 1764 req_size = smatis1->req_size; 1765 req_source1 = smatis1->req_source1; 1766 1767 allcolumns = smatis1->allcolumns; 1768 row2proc = smatis1->row2proc; 1769 rmap = smatis1->rmap; 1770 cmap = smatis1->cmap; 1771 #if defined(PETSC_USE_CTABLE) 1772 rmap_loc = smatis1->rmap_loc; 1773 cmap_loc = smatis1->cmap_loc; 1774 #endif 1775 } 1776 1777 /* Post recv matrix values */ 1778 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1779 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1780 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1781 for (i=0; i<nrqs; ++i) { 1782 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1783 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1784 } 1785 1786 /* Allocate sending buffers for a->a, and send them off */ 1787 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1788 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1789 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1790 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1791 1792 for (i=0; i<nrqr; i++) { 1793 rbuf1_i = rbuf1[i]; 1794 sbuf_aa_i = sbuf_aa[i]; 1795 ct1 = 2*rbuf1_i[0]+1; 1796 ct2 = 0; 1797 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1798 1799 kmax = rbuf1_i[2]; 1800 for (k=0; k<kmax; k++,ct1++) { 1801 row = rbuf1_i[ct1] - rstart; 1802 nzA = ai[row+1] - ai[row]; 1803 nzB = bi[row+1] - bi[row]; 1804 ncols = nzA + nzB; 1805 cworkB = bj + bi[row]; 1806 vworkA = a_a + ai[row]; 1807 vworkB = b_a + bi[row]; 1808 1809 /* load the column values for this row into vals*/ 1810 vals = sbuf_aa_i + ct2; 1811 1812 lwrite = 0; 1813 for (l=0; l<nzB; l++) { 1814 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1815 } 1816 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1817 for (l=0; l<nzB; l++) { 1818 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1819 } 1820 1821 ct2 += ncols; 1822 } 1823 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1824 } 1825 1826 /* Assemble submat */ 1827 ierr = PetscCalloc2(rmax,&subcols,rmax,&subvals);CHKERRQ(ierr); 1828 1829 /* First assemble the local rows */ 1830 for (j=0; j<nrow; j++) { 1831 row = irow[j]; 1832 proc = row2proc[j]; 1833 if (proc == rank) { 1834 Crow = row - rstart; /* local row index of C */ 1835 #if defined(PETSC_USE_CTABLE) 1836 row = rmap_loc[Crow]; /* row index of submat */ 1837 #else 1838 row = rmap[row]; 1839 #endif 1840 1841 if (allcolumns) { 1842 /* diagonal part A = c->A */ 1843 ncols = ai[Crow+1] - ai[Crow]; 1844 cols = aj + ai[Crow]; 1845 vals = a->a + ai[Crow]; 1846 i = 0; 1847 for (k=0; k<ncols; k++) { 1848 subcols[i] = cols[k] + cstart; 1849 subvals[i++] = vals[k]; 1850 } 1851 1852 /* off-diagonal part B = c->B */ 1853 ncols = bi[Crow+1] - bi[Crow]; 1854 cols = bj + bi[Crow]; 1855 vals = b->a + bi[Crow]; 1856 for (k=0; k<ncols; k++) { 1857 subcols[i] = bmap[cols[k]]; 1858 subvals[i++] = vals[k]; 1859 } 1860 1861 ierr = 
  /* Assemble submat */
  ierr = PetscCalloc2(rmax,&subcols,rmax,&subvals);CHKERRQ(ierr);

  /* First assemble the local rows */
  for (j=0; j<nrow; j++) {
    row  = irow[j];
    proc = row2proc[j];
    if (proc == rank) {
      Crow = row - rstart; /* local row index of C */
#if defined(PETSC_USE_CTABLE)
      row = rmap_loc[Crow]; /* row index of submat */
#else
      row = rmap[row];
#endif

      if (allcolumns) {
        /* diagonal part A = c->A */
        ncols = ai[Crow+1] - ai[Crow];
        cols  = aj + ai[Crow];
        vals  = a->a + ai[Crow];
        i     = 0;
        for (k=0; k<ncols; k++) {
          subcols[i]   = cols[k] + cstart;
          subvals[i++] = vals[k];
        }

        /* off-diagonal part B = c->B */
        ncols = bi[Crow+1] - bi[Crow];
        cols  = bj + bi[Crow];
        vals  = b->a + bi[Crow];
        for (k=0; k<ncols; k++) {
          subcols[i]   = bmap[cols[k]];
          subvals[i++] = vals[k];
        }

        ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);

      } else { /* !allcolumns */
#if defined(PETSC_USE_CTABLE)
        /* diagonal part A = c->A */
        ncols = ai[Crow+1] - ai[Crow];
        cols  = aj + ai[Crow];
        vals  = a->a + ai[Crow];
        i     = 0;
        for (k=0; k<ncols; k++) {
          tcol = cmap_loc[cols[k]];
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }

        /* off-diagonal part B = c->B */
        ncols = bi[Crow+1] - bi[Crow];
        cols  = bj + bi[Crow];
        vals  = b->a + bi[Crow];
        for (k=0; k<ncols; k++) {
          ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }
#else
        /* diagonal part A = c->A */
        ncols = ai[Crow+1] - ai[Crow];
        cols  = aj + ai[Crow];
        vals  = a->a + ai[Crow];
        i     = 0;
        for (k=0; k<ncols; k++) {
          tcol = cmap[cols[k]+cstart];
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }

        /* off-diagonal part B = c->B */
        ncols = bi[Crow+1] - bi[Crow];
        cols  = bj + bi[Crow];
        vals  = b->a + bi[Crow];
        for (k=0; k<ncols; k++) {
          tcol = cmap[bmap[cols[k]]];
          if (tcol) {
            subcols[i]   = --tcol;
            subvals[i++] = vals[k];
          }
        }
#endif
        ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
  }
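  /* Note (added commentary): the off-proc rows below are unpacked with MPI_Waitany in
     whatever order the messages arrive; this is harmless because each sending process
     contributes a disjoint set of submat rows, so the unpacking order does not affect
     the assembled result. */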
  /* Now assemble the off-proc rows */
  for (i=0; i<nrqs; i++) { /* for each requested message */
    /* recv values from other processes */
    ierr    = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr);
    proc    = pa[idex];
    sbuf1_i = sbuf1[proc];
    /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */
    ct1     = 2 + 1;
    ct2     = 0; /* count of received C->j */
    ct3     = 0; /* count of received C->j that will be inserted into submat */
    rbuf2_i = rbuf2[idex]; /* received lengths of C->j from other processes */
    rbuf3_i = rbuf3[idex]; /* received C->j from other processes */
    rbuf4_i = rbuf4[idex]; /* received C->a from other processes */

    /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
    max1 = sbuf1_i[2]; /* num of rows */
    for (k=0; k<max1; k++,ct1++) { /* for each received row */
      row = sbuf1_i[ct1]; /* row index of submat */
      if (!allcolumns) {
        idex = 0;
        if (scall == MAT_INITIAL_MATRIX) {
          nnz = rbuf2_i[ct1]; /* num of C entries in this row */
          for (l=0; l<nnz; l++,ct2++) { /* for each received column */
#if defined(PETSC_USE_CTABLE)
            if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] < cend) {
              tcol = cmap_loc[rbuf3_i[ct2] - cstart];
            } else {
              ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
            }
#else
            tcol = cmap[rbuf3_i[ct2]];
#endif
            if (tcol) {
              subcols[idex]   = --tcol;
              subvals[idex++] = rbuf4_i[ct2];

              /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
                 For reuse, we replace received C->j with the index that should be inserted into submat */
              rbuf3_i[ct3++] = ct2;
            }
          }
          ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);

        } else { /* scall == MAT_REUSE_MATRIX */
          submat = submats[0];
          subc   = (Mat_SeqAIJ*)submat->data;

          nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
          for (l=0; l<nnz; l++) {
            ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] that needs to be inserted into submat */
            subvals[idex++] = rbuf4_i[ct2];
          }

          bj   = subc->j + subc->i[row];
          ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else { /* allcolumns */
        nnz  = rbuf2_i[ct1]; /* num of C entries in this row */
        ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
        ct2 += nnz;
      }
    }
  }

  /* sends of a->a are done */
  ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
  ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);

  ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  submats[0] = submat;

  /* Restore the indices */
  ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
  if (!allcolumns) {
    ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS"
PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       ncol;
  PetscBool      colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
  }

  /* Check for the special case: each processor gets entire matrix columns */
  ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
  ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
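/* Hypothetical usage sketch (added for illustration; names nloc/first are placeholders,
   and this fragment is not part of the PETSc API documentation). The single-IS path is
   what a caller effectively exercises with one sorted row IS per process and an identity
   column IS:

     IS  isrow,iscol;
     Mat *submat;

     ierr = ISCreateStride(PETSC_COMM_SELF,nloc,first,1,&isrow);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);CHKERRQ(ierr);   (N = global col count)
     ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   Because iscol is the identity of length N, the allcolumns fast path above is taken. */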
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /* Check for the special case: a single IS per process; the flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */
  if (C->submat_singleis) {
    ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Check for the special case: each processor gets the entire matrix */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for the special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
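  /* Worked example (added for illustration, assuming the default 4-byte PetscInt):
     with C->cmap->N = 1,000,000 columns, nmax = 20000000/(1000000*4) = 5, i.e. five
     submatrices are extracted per stage; at 20M columns the quotient truncates to 0
     and the guard above resets it to 1, matching the comment. */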
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }
  /* Evaluate communication -- message to whom, length of message, and buffer space
     required. Based on this, buffers are allocated and data copied into them. */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* num of outgoing messages */
  msz      = 0; /* total message length (for all procs) */
  w1[rank] = 0; /* no message sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(num of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp;
    k = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += k;
      sbuf1[j] = iptr;
      k        = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) { /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }
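  /* Illustrative sketch (added commentary): in the multi-IS case each outgoing message
     sbuf1[j] is laid out as

       [ nIS | is_id_1 | count_1 | ... | is_id_w3 | count_w3 | rows of IS 1 | rows of IS 2 | ... ]

     which is why the header occupies 1 + 2*w3[j] slots above and ptr[j] starts just past it. */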
  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post receives to capture the buffer size */
  ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */
  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
    PetscInt *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* Receive the buffer sizes */
  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  /* Now allocate sending buffers for a->j, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i,*cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens, which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif
  /* Update lens from off-proc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the row in the new matrix to be filled */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
       Assumes new rows are the same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong number of nonzeros");
      /* Initialize the matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }
  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k]; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off-proc rows */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        rmap_i = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  /* Sort the rows: the submatrix column indices produced via cmap need not be monotone,
     while the SeqAIJ format requires sorted column indices within each row (allcolumns
     rows retain C's already-sorted ordering and are skipped) */
  {
    PetscInt    *imat_ilen,*imat_j,*imat_i;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      imat_ilen = mat->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscInt ilen;

        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ilen  = imat_ilen[j];
        ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }
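  /* Example of the per-row sort (added for illustration): PetscSortIntWithMatScalarArray
     reorders the column indices and applies the same permutation to the values, e.g.
     mat_j = {5,1,3} with mat_a = {50.,10.,30.} becomes mat_j = {1,3,5}, mat_a = {10.,30.,50.}. */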
  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
    Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.

    This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
    Following this call, C->A & C->B have been created, even if empty.
 */
#undef __FUNCT__
#define __FUNCT__ "MatSetSeqMats_MPIAIJ"
PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  Mat_SeqAIJ     *Baij;
  PetscBool      seqaij,Bdisassembled;
  PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
  PetscScalar    v;
  const PetscInt *rowindices,*colindices;

  PetscFunctionBegin;
  /* Check to make sure the component matrices (and embeddings) are compatible with C. */
  if (A) {
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
    } else {
      if (C->rmap->n != A->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (dcolemb) {
      ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
      if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
    } else {
      if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }
  if (B) {
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
    } else {
      if (C->rmap->n != B->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (ocolemb) {
      ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
      if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
    } else {
      if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }

  aij = (Mat_MPIAIJ*)(C->data);
  if (!aij->A) {
    /* Mimic parts of MatMPIAIJSetPreallocation() */
    ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
    ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
  }
  if (A) {
    ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
  } else {
    ierr = MatSetUp(aij->A);CHKERRQ(ierr);
  }
  if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
    /*
       If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
       need to "disassemble" B -- convert it to using C's global indices.
       To insert the values we take the safer, albeit more expensive, route of MatSetValues().

       If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
       we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

       TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
       At least avoid calling MatSetValues() and the implied searches?
    */
    if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
#else
      ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
      /* A bit of a HACK: ideally we should deal with the case aij->B all in one code block below. */
      if (aij->B) {
        ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      }
#endif
      ngcol = 0;
      if (aij->lvec) {
        ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
      }
      if (aij->garray) {
        ierr = PetscFree(aij->garray);CHKERRQ(ierr);
        ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
      }
      ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
      ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
    }
    if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
      ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
    }
    if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
      ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
    }
  }
  Bdisassembled = PETSC_FALSE;
  if (!aij->B) {
    ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
    ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
    Bdisassembled = PETSC_TRUE;
  }
  if (B) {
    Baij = (Mat_SeqAIJ*)(B->data);
    if (pattern == DIFFERENT_NONZERO_PATTERN) {
      ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
      for (i=0; i<B->rmap->n; i++) {
        nz[i] = Baij->i[i+1] - Baij->i[i];
      }
      ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
      ierr = PetscFree(nz);CHKERRQ(ierr);
    }

    ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
    shift = rend-rstart;
    count = 0;
    rowindices = NULL;
    colindices = NULL;
    if (rowemb) {
      ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
    }
    if (ocolemb) {
      ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
    }
    for (i=0; i<B->rmap->n; i++) {
      PetscInt row;
      row = i;
      if (rowindices) row = rowindices[i];
      for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
        col = Baij->j[count];
        if (colindices) col = colindices[col];
        if (Bdisassembled && col>=rstart) col += shift;
        v    = Baij->a[count];
        ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
        ++count;
      }
    }
    /* No assembly for aij->B is necessary. */
    /* FIXME: set aij->B's nonzerostate correctly. */
  } else {
    ierr = MatSetUp(aij->B);CHKERRQ(ierr);
  }
  C->preallocated  = PETSC_TRUE;
  C->was_assembled = PETSC_FALSE;
  C->assembled     = PETSC_FALSE;
  /*
     C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
     Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
  */
  PetscFunctionReturn(0);
}
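/* Note (added commentary): in the insertion loop above, when aij->B was freshly created in
   the "disassembled" state, the incoming B uses the compact off-diagonal numbering that
   omits the local diagonal block; a source column at or beyond rstart is therefore shifted
   by shift = rend-rstart to land in C's global column space, and MatSetUpMultiply_MPIAIJ()
   later compresses this back to local form. */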
#undef __FUNCT__
#define __FUNCT__ "MatGetSeqMats_MPIAIJ"
/*
   B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
 */
PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)(C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  /* FIXME: make sure C is assembled */
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
   Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
   NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                            PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                            PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                            PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                            PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar,ciscol_localsize;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize;
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar is positive if and only if some rank has a parallel IS.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so we can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
/*
  Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                            PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                            PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                            PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                            PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar,ciscol_localsize;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  for (i = 0, cismax = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar is the maximum over C's ranks of the number of parallel ISs, so it is nonzero
     if and only if at least one rank has a parallel IS.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so we can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) use ISSortPermutation() to construct ciscol_p[ii], the mapping from the
             indices in nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /* Permute the indices into nondecreasing order; reject row and col ISs with duplicate indices. */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
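  /*
     Illustration (a sketch of the semantics described in the comments above): for an IS
     holding {7,2,5},

       IS perm = NULL;
       ierr = ISSortPermutation(is,PETSC_FALSE,&perm);CHKERRQ(ierr);
       ierr = ISSort(is);CHKERRQ(ierr);

     leaves is = {2,5,7} and perm = {1,2,0}, mapping each sorted position back to the
     original position of its entry. For an already strictly increasing IS, perm comes
     back NULL, which is what the aliasing tests below rely on.
  */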
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether the ISs are sorted
    and/or contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
      values into the A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, the sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the permutations are non-NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /* FIXME: no good fix for this yet. The local column size of B[i] can differ from the
     size of ciscol[i]: B[i]'s column size is only finalized during MatAssembly, while
     ciscol[i] is computed as the complement of iscol[i]. It would be better to keep only
     the nonzero column indices when computing the complement ciscol[i]. */
  if (scall == MAT_REUSE_MATRIX) {
    for (i=0; i<cismax; i++) {
      ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
      B[i]->cmap->n = ciscol_localsize;
    }
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
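  /*
     Worked micro-example (illustrative only): on 2 ranks let C be 4x4 with rows {0,1} on
     rank 0 and rows {2,3} on rank 1, and let isrow = iscol be a parallel IS storing {0} on
     rank 0 and {3} on rank 1. Then ciscol -- the nonlocal part of iscol -- is {3} on rank 0
     and {0} on rank 1: the diag pieces A hold C(0,0) and C(3,3), respectively, while the
     off-diag pieces B hold C(0,3) and C(3,0). The loop below glues these Seq pieces into
     the parallel submat[0].
  */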
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, the permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX;
       - if caching arrays to hold the ISs, make and compose a container for them so that they can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        BB = NULL;
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
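        /* One possible realization of the TODO above -- a sketch only, not enabled here;
           "diag-seq-mat" is a hypothetical key:

             ierr = PetscObjectCompose((PetscObject)(*submat)[i],"diag-seq-mat",(PetscObject)A[i]);CHKERRQ(ierr);

           with a matching PetscObjectQuery() on the MAT_REUSE_MATRIX path to retrieve it
           instead of duplicating. */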
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
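/*
   Hedged caller-side sketch (not part of this file): the reuse contract follows that of
   MatGetSubMatrices(). Extract once with MAT_INITIAL_MATRIX; later, after C's values change
   while its nonzero pattern stays the same, refresh the same submatrices in place:

     Mat *submat;
     ierr = MatGetSubMatricesMPI_MPIAIJ(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     (update the values of C, keeping the same nonzero pattern)
     ierr = MatGetSubMatricesMPI_MPIAIJ(C,ismax,isrow,iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);

   The row and column ISs may be parallel and may live on subcommunicators of C's comm;
   each parallel submat[i] is then created on the communicator of isrow[i].
*/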