/*
   Routines to compute overlapping regions of a parallel MPI matrix,
   and to find submatrices that are shared across processes.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
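
/*
   Usage sketch (added for orientation, not part of the original file): for an
   MPIAIJ matrix, MatIncreaseOverlap() dispatches to the routines above.  A
   typical caller grows the set of locally owned rows by one level of overlap
   and then extracts the corresponding sequential submatrix, e.g. for an
   additive Schwarz subdomain.  The snippet below is a minimal illustration
   assuming "A" is an assembled MPIAIJ matrix; "is" and "sub" are placeholder
   names.

     IS             is;
     Mat            *sub;
     PetscInt       rstart,rend;
     PetscErrorCode ierr;

     ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&is);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(A,1,&is,1);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(A,1,&is,&is,MAT_INITIAL_MATRIX,&sub);CHKERRQ(ierr);
     ierr = ISDestroy(&is);CHKERRQ(ierr);
*/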
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get the row map to determine where rows should go */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve the IS data and put it all together so that we can optimize communication */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages; generally speaking, no data
   * exchange is needed when the overlap is 1 */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages; this usually corresponds to overlap 1 */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <is id, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send the rows belonging to remote processors so that we can get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
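
/*
   Reply format for the scalable path (summary added for clarity; compare the
   "Sample message format" comment before MatIncreaseOverlap_MPIAIJ_Once()
   further below).  MatIncreaseOverlap_MPIAIJ_Send_Scalable() packs, for every
   requesting rank, a sequence of triples

      <is id> <count> <count row indices to add to that IS>

   and MatIncreaseOverlap_MPIAIJ_Receive_Scalable() unpacks them as in this
   sketch (buffer and index names are illustrative only; is_indices and is_len
   stand for the index array and current length of IS is_id):

      for (i=0; i<nrecvs; ) {
        is_id = recvdata[i++];
        count = recvdata[i++];
        for (j=0; j<count; j++) is_indices[is_len++] = recvdata[i++];
      }

   followed by a sort and removal of duplicates before each IS is rebuilt.
*/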
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve the received information to get each row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* even if the matrix is symmetric, we do not exploit symmetry here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ \n");
  /* the total number of nonzeros is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate the number of nonzeros in the mat???
*/ 279 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 280 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 281 rows_pos = 0; 282 totalrows = 0; 283 for (i=0; i<nfrom; i++){ 284 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 285 /* group data together */ 286 for (j=0; j<fromsizes[2*i]; j+=2){ 287 is_id = fromrows[rows_pos++];/* no of is */ 288 rows_i = rows_data[is_id]; 289 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 290 } 291 /* estimate a space to avoid multiple allocations */ 292 for (j=0; j<nidx; j++){ 293 indvc_ij = 0; 294 rows_i = rows_data[j]; 295 for (l=0; l<rows_pos_i[j]; l++){ 296 row = rows_i[l]-rstart; 297 start = ai[row]; 298 end = ai[row+1]; 299 for (k=start; k<end; k++){ /* Amat */ 300 col = aj[k] + cstart; 301 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 302 } 303 start = bi[row]; 304 end = bi[row+1]; 305 for (k=start; k<end; k++) { /* Bmat */ 306 col = gcols[bj[k]]; 307 indices_tmp[indvc_ij++] = col; 308 } 309 } 310 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 311 indv_counts[i*nidx+j] = indvc_ij; 312 totalrows += indvc_ij; 313 } 314 } 315 /* message triple <no of is, number of rows, rows> */ 316 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 317 totalrows = 0; 318 rows_pos = 0; 319 /* use this code again */ 320 for (i=0;i<nfrom;i++){ 321 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 322 for (j=0; j<fromsizes[2*i]; j+=2){ 323 is_id = fromrows[rows_pos++]; 324 rows_i = rows_data[is_id]; 325 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 326 } 327 /* add data */ 328 for (j=0; j<nidx; j++){ 329 if (!indv_counts[i*nidx+j]) continue; 330 indvc_ij = 0; 331 sbdata[totalrows++] = j; 332 sbdata[totalrows++] = indv_counts[i*nidx+j]; 333 sbsizes[2*i] += 2; 334 rows_i = rows_data[j]; 335 for (l=0; l<rows_pos_i[j]; l++){ 336 row = rows_i[l]-rstart; 337 start = ai[row]; 338 end = ai[row+1]; 339 for (k=start; k<end; k++){ /* Amat */ 340 col = aj[k] + cstart; 341 indices_tmp[indvc_ij++] = col; 342 } 343 start = bi[row]; 344 end = bi[row+1]; 345 for (k=start; k<end; k++) { /* Bmat */ 346 col = gcols[bj[k]]; 347 indices_tmp[indvc_ij++] = col; 348 } 349 } 350 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 351 sbsizes[2*i] += indvc_ij; 352 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 353 totalrows += indvc_ij; 354 } 355 } 356 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 357 for (i=0; i<nfrom; i++){ 358 offsets[i+1] = offsets[i] + sbsizes[2*i]; 359 sbsizes[2*i+1] = offsets[i]; 360 } 361 ierr = PetscFree(offsets);CHKERRQ(ierr); 362 if (sbrowsizes) *sbrowsizes = sbsizes; 363 if (sbrows) *sbrows = sbdata; 364 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 365 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 366 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 373 { 374 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 375 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 376 PetscInt 
lsize,lsize_tmp,owner; 377 PetscMPIInt rank; 378 Mat amat,bmat; 379 PetscBool done; 380 PetscLayout cmap,rmap; 381 MPI_Comm comm; 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 386 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 387 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 388 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 389 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 390 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 391 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 392 /* is it a safe way to compute number of nonzero values ? */ 393 tnz = ai[an]+bi[bn]; 394 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 395 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 396 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 397 /* it is a better way to estimate memory than the old implementation 398 * where global size of matrix is used 399 * */ 400 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 401 for (i=0; i<nidx; i++) { 402 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 403 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 404 lsize_tmp = 0; 405 for (j=0; j<lsize; j++) { 406 owner = -1; 407 row = indices[j]; 408 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 409 if (owner != rank) continue; 410 /* local number */ 411 row -= rstart; 412 start = ai[row]; 413 end = ai[row+1]; 414 for (k=start; k<end; k++) { /* Amat */ 415 col = aj[k] + cstart; 416 indices_temp[lsize_tmp++] = col; 417 } 418 start = bi[row]; 419 end = bi[row+1]; 420 for (k=start; k<end; k++) { /* Bmat */ 421 col = gcols[bj[k]]; 422 indices_temp[lsize_tmp++] = col; 423 } 424 } 425 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 426 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 427 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 428 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 429 } 430 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 431 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 432 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 433 PetscFunctionReturn(0); 434 } 435 436 437 /* 438 Sample message format: 439 If a processor A wants processor B to process some elements corresponding 440 to index sets is[1],is[5] 441 mesg [0] = 2 (no of index sets in the mesg) 442 ----------- 443 mesg [1] = 1 => is[1] 444 mesg [2] = sizeof(is[1]); 445 ----------- 446 mesg [3] = 5 => is[5] 447 mesg [4] = sizeof(is[5]); 448 ----------- 449 mesg [5] 450 mesg [n] datas[1] 451 ----------- 452 mesg[n+1] 453 mesg[m] data(is[5]) 454 ----------- 455 456 Notes: 457 nrqs - no of requests sent (or to be sent out) 458 nrqr - no of requests recieved (which have to be or which have been processed 459 */ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 463 { 464 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 465 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 466 const PetscInt **idx,*idx_i; 467 PetscInt *n,**data,len; 468 #if defined(PETSC_USE_CTABLE) 469 PetscTable *table_data,table_data_i; 470 PetscInt *tdata,tcount,tcount_max; 471 #else 472 PetscInt *data_i,*d_p; 
473 #endif 474 PetscErrorCode ierr; 475 PetscMPIInt size,rank,tag1,tag2; 476 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 477 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 478 PetscBT *table; 479 MPI_Comm comm; 480 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 481 MPI_Status *s_status,*recv_status; 482 char *t_p; 483 484 PetscFunctionBegin; 485 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 486 size = c->size; 487 rank = c->rank; 488 M = C->rmap->N; 489 490 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 491 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 492 493 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 494 495 for (i=0; i<imax; i++) { 496 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 497 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 498 } 499 500 /* evaluate communication - mesg to who,length of mesg, and buffer space 501 required. Based on this, buffers are allocated, and data copied into them */ 502 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 503 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 504 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 505 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 506 for (i=0; i<imax; i++) { 507 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 508 idx_i = idx[i]; 509 len = n[i]; 510 for (j=0; j<len; j++) { 511 row = idx_i[j]; 512 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 513 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 514 w4[proc]++; 515 } 516 for (j=0; j<size; j++) { 517 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 518 } 519 } 520 521 nrqs = 0; /* no of outgoing messages */ 522 msz = 0; /* total mesg length (for all proc */ 523 w1[rank] = 0; /* no mesg sent to intself */ 524 w3[rank] = 0; 525 for (i=0; i<size; i++) { 526 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 527 } 528 /* pa - is list of processors to communicate with */ 529 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 530 for (i=0,j=0; i<size; i++) { 531 if (w1[i]) {pa[j] = i; j++;} 532 } 533 534 /* Each message would have a header = 1 + 2*(no of IS) + data */ 535 for (i=0; i<nrqs; i++) { 536 j = pa[i]; 537 w1[j] += w2[j] + 2*w3[j]; 538 msz += w1[j]; 539 } 540 541 /* Determine the number of messages to expect, their lengths, from from-ids */ 542 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 543 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 544 545 /* Now post the Irecvs corresponding to these messages */ 546 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 547 548 /* Allocate Memory for outgoing messages */ 549 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 550 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 551 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 552 553 { 554 PetscInt *iptr = tmp,ict = 0; 555 for (i=0; i<nrqs; i++) { 556 j = pa[i]; 557 iptr += ict; 558 outdat[j] = iptr; 559 ict = w1[j]; 560 } 561 } 562 563 /* Form the outgoing messages */ 564 /* plug in the headers */ 565 for (i=0; i<nrqs; i++) { 566 j = pa[i]; 567 outdat[j][0] = 0; 568 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 569 ptr[j] 
= outdat[j] + 2*w3[j] + 1; 570 } 571 572 /* Memory for doing local proc's work */ 573 { 574 PetscInt M_BPB_imax = 0; 575 #if defined(PETSC_USE_CTABLE) 576 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 577 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 578 for (i=0; i<imax; i++) { 579 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 580 } 581 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 582 for (i=0; i<imax; i++) { 583 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 584 } 585 #else 586 PetscInt Mimax = 0; 587 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 588 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 589 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 590 for (i=0; i<imax; i++) { 591 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 592 data[i] = d_p + M*i; 593 } 594 #endif 595 } 596 597 /* Parse the IS and update local tables and the outgoing buf with the data */ 598 { 599 PetscInt n_i,isz_i,*outdat_j,ctr_j; 600 PetscBT table_i; 601 602 for (i=0; i<imax; i++) { 603 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 604 n_i = n[i]; 605 table_i = table[i]; 606 idx_i = idx[i]; 607 #if defined(PETSC_USE_CTABLE) 608 table_data_i = table_data[i]; 609 #else 610 data_i = data[i]; 611 #endif 612 isz_i = isz[i]; 613 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 614 row = idx_i[j]; 615 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 616 if (proc != rank) { /* copy to the outgoing buffer */ 617 ctr[proc]++; 618 *ptr[proc] = row; 619 ptr[proc]++; 620 } else if (!PetscBTLookupSet(table_i,row)) { 621 #if defined(PETSC_USE_CTABLE) 622 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 623 #else 624 data_i[isz_i] = row; /* Update the local table */ 625 #endif 626 isz_i++; 627 } 628 } 629 /* Update the headers for the current IS */ 630 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 631 if ((ctr_j = ctr[j])) { 632 outdat_j = outdat[j]; 633 k = ++outdat_j[0]; 634 outdat_j[2*k] = ctr_j; 635 outdat_j[2*k-1] = i; 636 } 637 } 638 isz[i] = isz_i; 639 } 640 } 641 642 /* Now post the sends */ 643 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 644 for (i=0; i<nrqs; ++i) { 645 j = pa[i]; 646 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 647 } 648 649 /* No longer need the original indices */ 650 for (i=0; i<imax; ++i) { 651 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 652 } 653 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 654 655 for (i=0; i<imax; ++i) { 656 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 657 } 658 659 /* Do Local work */ 660 #if defined(PETSC_USE_CTABLE) 661 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 662 #else 663 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 664 #endif 665 666 /* Receive messages */ 667 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 668 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 669 670 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 671 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 672 673 /* Phase 1 sends are complete - deallocate buffers */ 674 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 675 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 676 677 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 678 ierr = 
PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 679 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 680 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 681 ierr = PetscFree(rbuf);CHKERRQ(ierr); 682 683 684 /* Send the data back */ 685 /* Do a global reduction to know the buffer space req for incoming messages */ 686 { 687 PetscMPIInt *rw1; 688 689 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 690 691 for (i=0; i<nrqr; ++i) { 692 proc = recv_status[i].MPI_SOURCE; 693 694 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 695 rw1[proc] = isz1[i]; 696 } 697 ierr = PetscFree(onodes1);CHKERRQ(ierr); 698 ierr = PetscFree(olengths1);CHKERRQ(ierr); 699 700 /* Determine the number of messages to expect, their lengths, from from-ids */ 701 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 702 ierr = PetscFree(rw1);CHKERRQ(ierr); 703 } 704 /* Now post the Irecvs corresponding to these messages */ 705 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 706 707 /* Now post the sends */ 708 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 709 for (i=0; i<nrqr; ++i) { 710 j = recv_status[i].MPI_SOURCE; 711 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 712 } 713 714 /* receive work done on other processors */ 715 { 716 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 717 PetscMPIInt idex; 718 PetscBT table_i; 719 MPI_Status *status2; 720 721 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 722 for (i=0; i<nrqs; ++i) { 723 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 724 /* Process the message */ 725 rbuf2_i = rbuf2[idex]; 726 ct1 = 2*rbuf2_i[0]+1; 727 jmax = rbuf2[idex][0]; 728 for (j=1; j<=jmax; j++) { 729 max = rbuf2_i[2*j]; 730 is_no = rbuf2_i[2*j-1]; 731 isz_i = isz[is_no]; 732 table_i = table[is_no]; 733 #if defined(PETSC_USE_CTABLE) 734 table_data_i = table_data[is_no]; 735 #else 736 data_i = data[is_no]; 737 #endif 738 for (k=0; k<max; k++,ct1++) { 739 row = rbuf2_i[ct1]; 740 if (!PetscBTLookupSet(table_i,row)) { 741 #if defined(PETSC_USE_CTABLE) 742 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 743 #else 744 data_i[isz_i] = row; 745 #endif 746 isz_i++; 747 } 748 } 749 isz[is_no] = isz_i; 750 } 751 } 752 753 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 754 ierr = PetscFree(status2);CHKERRQ(ierr); 755 } 756 757 #if defined(PETSC_USE_CTABLE) 758 tcount_max = 0; 759 for (i=0; i<imax; ++i) { 760 table_data_i = table_data[i]; 761 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 762 if (tcount_max < tcount) tcount_max = tcount; 763 } 764 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 765 #endif 766 767 for (i=0; i<imax; ++i) { 768 #if defined(PETSC_USE_CTABLE) 769 PetscTablePosition tpos; 770 table_data_i = table_data[i]; 771 772 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 773 while (tpos) { 774 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 775 tdata[--j] = --k; 776 } 777 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 778 #else 779 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 780 #endif 781 } 782 783 ierr = PetscFree(onodes2);CHKERRQ(ierr); 784 ierr = PetscFree(olengths2);CHKERRQ(ierr); 785 786 ierr = PetscFree(pa);CHKERRQ(ierr); 787 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 788 ierr = 
PetscFree(rbuf2);CHKERRQ(ierr); 789 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 790 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 791 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 792 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 793 ierr = PetscFree(s_status);CHKERRQ(ierr); 794 ierr = PetscFree(recv_status);CHKERRQ(ierr); 795 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 796 ierr = PetscFree(xdata);CHKERRQ(ierr); 797 ierr = PetscFree(isz1);CHKERRQ(ierr); 798 #if defined(PETSC_USE_CTABLE) 799 for (i=0; i<imax; i++) { 800 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 801 } 802 ierr = PetscFree(table_data);CHKERRQ(ierr); 803 ierr = PetscFree(tdata);CHKERRQ(ierr); 804 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 805 #else 806 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 807 #endif 808 PetscFunctionReturn(0); 809 } 810 811 #undef __FUNCT__ 812 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 813 /* 814 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 815 the work on the local processor. 816 817 Inputs: 818 C - MAT_MPIAIJ; 819 imax - total no of index sets processed at a time; 820 table - an array of char - size = m bits. 821 822 Output: 823 isz - array containing the count of the solution elements corresponding 824 to each index set; 825 data or table_data - pointer to the solutions 826 */ 827 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 828 { 829 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 830 Mat A = c->A,B = c->B; 831 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 832 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 833 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 834 PetscBT table_i; 835 #if defined(PETSC_USE_CTABLE) 836 PetscTable table_data_i; 837 PetscErrorCode ierr; 838 PetscTablePosition tpos; 839 PetscInt tcount,*tdata; 840 #else 841 PetscInt *data_i; 842 #endif 843 844 PetscFunctionBegin; 845 rstart = C->rmap->rstart; 846 cstart = C->cmap->rstart; 847 ai = a->i; 848 aj = a->j; 849 bi = b->i; 850 bj = b->j; 851 garray = c->garray; 852 853 for (i=0; i<imax; i++) { 854 #if defined(PETSC_USE_CTABLE) 855 /* copy existing entries of table_data_i into tdata[] */ 856 table_data_i = table_data[i]; 857 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 858 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 859 860 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 861 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 862 while (tpos) { 863 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 864 tdata[--j] = --row; 865 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 866 } 867 #else 868 data_i = data[i]; 869 #endif 870 table_i = table[i]; 871 isz_i = isz[i]; 872 max = isz[i]; 873 874 for (j=0; j<max; j++) { 875 #if defined(PETSC_USE_CTABLE) 876 row = tdata[j] - rstart; 877 #else 878 row = data_i[j] - rstart; 879 #endif 880 start = ai[row]; 881 end = ai[row+1]; 882 for (k=start; k<end; k++) { /* Amat */ 883 val = aj[k] + cstart; 884 if (!PetscBTLookupSet(table_i,val)) { 885 #if defined(PETSC_USE_CTABLE) 886 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 887 #else 888 data_i[isz_i] = val; 889 #endif 890 isz_i++; 891 } 892 } 893 start = bi[row]; 894 end = bi[row+1]; 895 for (k=start; k<end; k++) { /* Bmat */ 896 val = garray[bj[k]]; 897 if (!PetscBTLookupSet(table_i,val)) { 898 #if 
defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
      C    - the matrix
      nrqr - number of messages being processed.
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a reallocation is required we need only copy the data for that one
   row rather than all previous rows, as it is now where a single large chunk
   of memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ?
total_sz : max1)+1); 970 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 971 ++no_malloc; 972 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 973 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 974 975 ct3 = 0; 976 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 977 rbuf_i = rbuf[i]; 978 rbuf_0 = rbuf_i[0]; 979 ct1 = 2*rbuf_0+1; 980 ct2 = ct1; 981 ct3 += ct1; 982 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 983 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 984 oct2 = ct2; 985 kmax = rbuf_i[2*j]; 986 for (k=0; k<kmax; k++,ct1++) { 987 row = rbuf_i[ct1]; 988 if (!PetscBTLookupSet(xtable,row)) { 989 if (!(ct3 < mem_estimate)) { 990 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 991 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 992 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 993 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 994 xdata[0] = tmp; 995 mem_estimate = new_estimate; ++no_malloc; 996 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 997 } 998 xdata[i][ct2++] = row; 999 ct3++; 1000 } 1001 } 1002 for (k=oct2,max2=ct2; k<max2; k++) { 1003 row = xdata[i][k] - rstart; 1004 start = ai[row]; 1005 end = ai[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = aj[l] + cstart; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 start = bi[row]; 1023 end = bi[row+1]; 1024 for (l=start; l<end; l++) { 1025 val = garray[bj[l]]; 1026 if (!PetscBTLookupSet(xtable,val)) { 1027 if (!(ct3 < mem_estimate)) { 1028 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1029 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1030 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1031 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1032 xdata[0] = tmp; 1033 mem_estimate = new_estimate; ++no_malloc; 1034 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1035 } 1036 xdata[i][ct2++] = val; 1037 ct3++; 1038 } 1039 } 1040 } 1041 /* Update the header*/ 1042 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1043 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1044 } 1045 xdata[i][0] = rbuf_0; 1046 xdata[i+1] = xdata[i] + ct2; 1047 isz1[i] = ct2; /* size of each message */ 1048 } 1049 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1050 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1051 PetscFunctionReturn(0); 1052 } 1053 /* -------------------------------------------------------------------------*/ 1054 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 1055 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1056 /* 1057 Every processor gets the entire matrix 1058 */ 1059 #undef __FUNCT__ 1060 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 1061 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1062 { 1063 Mat B; 1064 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1065 Mat_SeqAIJ *b,*ad = 
(Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1066 PetscErrorCode ierr; 1067 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1068 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1069 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1070 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1071 1072 PetscFunctionBegin; 1073 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1074 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1075 1076 if (scall == MAT_INITIAL_MATRIX) { 1077 /* ---------------------------------------------------------------- 1078 Tell every processor the number of nonzeros per row 1079 */ 1080 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1081 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1082 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1083 } 1084 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1085 for (i=0; i<size; i++) { 1086 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1087 displs[i] = A->rmap->range[i]; 1088 } 1089 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1090 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1091 #else 1092 sendcount = A->rmap->rend - A->rmap->rstart; 1093 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1094 #endif 1095 /* --------------------------------------------------------------- 1096 Create the sequential matrix of the same type as the local block diagonal 1097 */ 1098 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1099 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1100 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1101 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1102 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1103 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1104 **Bin = B; 1105 b = (Mat_SeqAIJ*)B->data; 1106 1107 /*-------------------------------------------------------------------- 1108 Copy my part of matrix column indices over 1109 */ 1110 sendcount = ad->nz + bd->nz; 1111 jsendbuf = b->j + b->i[rstarts[rank]]; 1112 a_jsendbuf = ad->j; 1113 b_jsendbuf = bd->j; 1114 n = A->rmap->rend - A->rmap->rstart; 1115 cnt = 0; 1116 for (i=0; i<n; i++) { 1117 1118 /* put in lower diagonal portion */ 1119 m = bd->i[i+1] - bd->i[i]; 1120 while (m > 0) { 1121 /* is it above diagonal (in bd (compressed) numbering) */ 1122 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1123 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1124 m--; 1125 } 1126 1127 /* put in diagonal portion */ 1128 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1129 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1130 } 1131 1132 /* put in upper diagonal portion */ 1133 while (m-- > 0) { 1134 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1135 } 1136 } 1137 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1138 1139 /*-------------------------------------------------------------------- 1140 Gather all column indices to all processors 1141 */ 1142 for (i=0; i<size; i++) { 1143 recvcounts[i] = 0; 1144 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1145 recvcounts[i] += lens[j]; 1146 } 1147 } 1148 displs[0] = 0; 1149 for (i=1; i<size; i++) { 
1150 displs[i] = displs[i-1] + recvcounts[i-1]; 1151 } 1152 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1153 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1154 #else 1155 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1156 #endif 1157 /*-------------------------------------------------------------------- 1158 Assemble the matrix into useable form (note numerical values not yet set) 1159 */ 1160 /* set the b->ilen (length of each row) values */ 1161 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1162 /* set the b->i indices */ 1163 b->i[0] = 0; 1164 for (i=1; i<=A->rmap->N; i++) { 1165 b->i[i] = b->i[i-1] + lens[i-1]; 1166 } 1167 ierr = PetscFree(lens);CHKERRQ(ierr); 1168 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1169 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1170 1171 } else { 1172 B = **Bin; 1173 b = (Mat_SeqAIJ*)B->data; 1174 } 1175 1176 /*-------------------------------------------------------------------- 1177 Copy my part of matrix numerical values into the values location 1178 */ 1179 if (flag == MAT_GET_VALUES) { 1180 sendcount = ad->nz + bd->nz; 1181 sendbuf = b->a + b->i[rstarts[rank]]; 1182 a_sendbuf = ad->a; 1183 b_sendbuf = bd->a; 1184 b_sendj = bd->j; 1185 n = A->rmap->rend - A->rmap->rstart; 1186 cnt = 0; 1187 for (i=0; i<n; i++) { 1188 1189 /* put in lower diagonal portion */ 1190 m = bd->i[i+1] - bd->i[i]; 1191 while (m > 0) { 1192 /* is it above diagonal (in bd (compressed) numbering) */ 1193 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1194 sendbuf[cnt++] = *b_sendbuf++; 1195 m--; 1196 b_sendj++; 1197 } 1198 1199 /* put in diagonal portion */ 1200 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1201 sendbuf[cnt++] = *a_sendbuf++; 1202 } 1203 1204 /* put in upper diagonal portion */ 1205 while (m-- > 0) { 1206 sendbuf[cnt++] = *b_sendbuf++; 1207 b_sendj++; 1208 } 1209 } 1210 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1211 1212 /* ----------------------------------------------------------------- 1213 Gather all numerical values to all processors 1214 */ 1215 if (!recvcounts) { 1216 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1217 } 1218 for (i=0; i<size; i++) { 1219 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1220 } 1221 displs[0] = 0; 1222 for (i=1; i<size; i++) { 1223 displs[i] = displs[i-1] + recvcounts[i-1]; 1224 } 1225 recvbuf = b->a; 1226 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1227 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1228 #else 1229 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1230 #endif 1231 } /* endof (flag == MAT_GET_VALUES) */ 1232 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1233 1234 if (A->symmetric) { 1235 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1236 } else if (A->hermitian) { 1237 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1238 } else if (A->structurally_symmetric) { 1239 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1240 } 1241 PetscFunctionReturn(0); 1242 } 1243 #undef __FUNCT__ 1244 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices" 1245 
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1246 { 1247 PetscErrorCode ierr; 1248 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1249 Mat_SubMat *submatj = c->submatis1; 1250 PetscInt i; 1251 1252 PetscFunctionBegin; 1253 ierr = submatj->destroy(C);CHKERRQ(ierr); 1254 1255 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1256 1257 for (i=0; i<submatj->nrqr; ++i) { 1258 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1259 } 1260 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1261 1262 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1263 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1264 1265 for (i=0; i<submatj->nrqs; ++i) { 1266 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1267 } 1268 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1269 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1270 1271 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 1272 1273 #if defined(PETSC_USE_CTABLE) 1274 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1275 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1276 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1277 #else 1278 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1279 #endif 1280 1281 if (!submatj->allcolumns) { 1282 #if defined(PETSC_USE_CTABLE) 1283 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1284 #else 1285 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1286 #endif 1287 } 1288 1289 ierr = PetscFree(submatj);CHKERRQ(ierr); 1290 PetscFunctionReturn(0); 1291 } 1292 1293 #undef __FUNCT__ 1294 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local" 1295 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1296 { 1297 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1298 Mat submat,A = c->A,B = c->B; 1299 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1300 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1301 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1302 const PetscInt *icol,*irow; 1303 PetscInt nrow,ncol,start; 1304 PetscErrorCode ierr; 1305 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1306 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1307 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1308 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1309 PetscInt *lens,ncols,*cols,Crow; 1310 #if defined(PETSC_USE_CTABLE) 1311 PetscTable cmap,rmap; 1312 PetscInt *cmap_loc,*rmap_loc; 1313 #else 1314 PetscInt *cmap,*rmap; 1315 #endif 1316 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1317 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1318 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1319 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1320 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1321 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1322 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1323 MPI_Comm comm; 1324 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1325 PetscMPIInt *onodes1,*olengths1,idex,end; 1326 Mat_SubMat *smatis1; 1327 PetscBool isrowsorted; 1328 1329 PetscFunctionBegin; 1330 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 
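  /*
     Calling convention for this single-IS path (illustration only; the
     variable names below are hypothetical): it is intended for the case in
     which every rank extracts exactly one submatrix, e.g.

       IS  isrow,iscol;      one sorted row IS and one column IS per rank
       Mat *submat;

       ierr = MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

     isrow must be sorted (checked just below); when allcolumns is PETSC_TRUE
     the column IS is ignored and all C->cmap->N columns are taken.
  */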
1331 1332 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1333 size = c->size; 1334 rank = c->rank; 1335 1336 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1337 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1338 1339 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1340 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1341 if (allcolumns) { 1342 icol = NULL; 1343 ncol = C->cmap->N; 1344 } else { 1345 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1346 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1347 } 1348 1349 if (scall == MAT_INITIAL_MATRIX) { 1350 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1351 1352 /* Get some new tags to keep the communication clean */ 1353 tag1 = ((PetscObject)C)->tag; 1354 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1355 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1356 1357 /* evaluate communication - mesg to who, length of mesg, and buffer space 1358 required. Based on this, buffers are allocated, and data copied into them */ 1359 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1360 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1361 1362 /* w1[proc] = num of rows owned by proc */ 1363 proc = 0; 1364 for (j=0; j<nrow; j++) { 1365 row = irow[j]; /* sorted! */ 1366 while (row >= C->rmap->range[proc+1]) proc++; 1367 w1[proc]++; 1368 row2proc[j] = proc; /* map row index to proc */ 1369 } 1370 1371 nrqs = 0; /* num of outgoing messages */ 1372 msz = 0; /* total mesg length (for all procs) */ 1373 w1[rank] = 0; /* num mesg sent to self */ 1374 for (proc=0; proc<size; proc++) { 1375 if (w1[proc]) { w2[proc] = 1; nrqs++;} /* there exists a message to this proc */ 1376 } 1377 1378 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1379 for (proc=0,j=0; proc<size; proc++) { 1380 if (w1[proc]) { pa[j++] = proc;} 1381 } 1382 1383 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1384 for (i=0; i<nrqs; i++) { 1385 proc = pa[i]; 1386 w1[proc] += 3; 1387 msz += w1[proc]; 1388 } 1389 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1390 1391 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1392 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1393 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1394 1395 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1396 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1397 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1398 1399 /* Now post the Irecvs corresponding to these messages */ 1400 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1401 1402 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1403 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1404 1405 /* Allocate Memory for outgoing messages */ 1406 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1407 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1408 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1409 1410 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1411 iptr = tmp; 1412 for (i=0; i<nrqs; i++) { 1413 proc = pa[i]; 1414 sbuf1[proc] = iptr; 1415 iptr += w1[proc]; 1416 } 1417 1418 /* Form the outgoing messages */ 1419 /* Initialize the header space */ 1420 for (i=0; 
i<nrqs; i++) { 1421 proc = pa[i]; 1422 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1423 ptr[proc] = sbuf1[proc] + 3; 1424 } 1425 1426 /* Parse the isrow and copy data into outbuf */ 1427 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1428 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1429 proc = row2proc[j]; 1430 if (proc != rank) { /* copy to the outgoing buf*/ 1431 *ptr[proc] = irow[j]; 1432 ctr[proc]++; ptr[proc]++; 1433 } 1434 } 1435 1436 /* Update the headers for the current IS */ 1437 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1438 if ((ctr_j = ctr[j])) { 1439 sbuf1_j = sbuf1[j]; 1440 k = ++sbuf1_j[0]; 1441 sbuf1_j[2*k] = ctr_j; 1442 sbuf1_j[2*k-1] = 0; 1443 } 1444 } 1445 1446 /* Now post the sends */ 1447 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1448 for (i=0; i<nrqs; ++i) { 1449 proc = pa[i]; 1450 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1451 } 1452 1453 /* Post Receives to capture the buffer size */ 1454 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1455 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1456 1457 rbuf2[0] = tmp + msz; 1458 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1459 1460 for (i=0; i<nrqs; ++i) { 1461 proc = pa[i]; 1462 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1463 } 1464 1465 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1466 1467 /* Send to other procs the buf size they should allocate */ 1468 /* Receive messages*/ 1469 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1470 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1471 1472 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1473 for (i=0; i<nrqr; ++i) { 1474 req_size[i] = 0; 1475 rbuf1_i = rbuf1[i]; 1476 start = 2*rbuf1_i[0] + 1; 1477 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1478 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1479 sbuf2_i = sbuf2[i]; 1480 for (j=start; j<end; j++) { 1481 k = rbuf1_i[j] - rstart; 1482 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1483 sbuf2_i[j] = ncols; 1484 req_size[i] += ncols; 1485 } 1486 req_source1[i] = r_status1[i].MPI_SOURCE; 1487 1488 /* form the header */ 1489 sbuf2_i[0] = req_size[i]; 1490 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1491 1492 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1493 } 1494 1495 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1496 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1497 1498 /* rbuf2 is received, Post recv column indices a->j */ 1499 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1500 1501 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1502 for (i=0; i<nrqs; ++i) { 1503 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1504 req_source2[i] = r_status2[i].MPI_SOURCE; 1505 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1506 } 1507 1508 /* Wait on sends1 and sends2 */ 1509 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1510 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1511 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1512 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1513 1514 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1515 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1516 1517 /* Now allocate sending buffers for a->j, and send them off */ 1518 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1519 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1520 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1521 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1522 1523 for (i=0; i<nrqr; i++) { /* for each requested message */ 1524 rbuf1_i = rbuf1[i]; 1525 sbuf_aj_i = sbuf_aj[i]; 1526 ct1 = 2*rbuf1_i[0] + 1; 1527 ct2 = 0; 1528 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1529 1530 kmax = rbuf1[i][2]; 1531 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1532 row = rbuf1_i[ct1] - rstart; 1533 nzA = ai[row+1] - ai[row]; nzB = bi[row+1] - bi[row]; 1534 ncols = nzA + nzB; 1535 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1536 1537 /* load the column indices for this row into cols*/ 1538 cols = sbuf_aj_i + ct2; 1539 1540 lwrite = 0; 1541 for (l=0; l<nzB; l++) { 1542 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1543 } 1544 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1545 for (l=0; l<nzB; l++) { 1546 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1547 } 1548 1549 ct2 += ncols; 1550 } 1551 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1552 } 1553 1554 /* create column map (cmap): global col of C -> local col of submat */ 1555 #if defined(PETSC_USE_CTABLE) 1556 if (!allcolumns) { 1557 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1558 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1559 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1560 if (icol[j] >= cstart && icol[j] <cend) { 1561 cmap_loc[icol[j] - cstart] = j+1; 1562 } else { /* use PetscTable for non-local col indices */ 1563 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1564 } 1565 } 1566 } else { 1567 cmap = NULL; 1568 cmap_loc = NULL; 1569 } 1570 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1571 #else 1572 if (!allcolumns) { 1573 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1574 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1575 } else { 1576 cmap = NULL; 1577 } 1578 #endif 1579 1580 /* Create lens for MatSeqAIJSetPreallocation() */ 1581 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1582 1583 /* Compute lens from local part of C */ 1584 for (j=0; j<nrow; j++) { 1585 row = irow[j]; 1586 proc = row2proc[j]; 1587 if (proc == rank) { 1588 /* diagonal part A = c->A */ 1589 ncols = ai[row-rstart+1] - ai[row-rstart]; 1590 cols = aj + ai[row-rstart]; 1591 if (!allcolumns) { 1592 for (k=0; k<ncols; k++) { 1593 #if defined(PETSC_USE_CTABLE) 1594 tcol = cmap_loc[cols[k]]; 1595 #else 1596 tcol = cmap[cols[k]+cstart]; 1597 #endif 1598 if (tcol) lens[j]++; 1599 } 1600 } else { /* allcolumns */ 1601 lens[j] = ncols; 1602 } 1603 1604 /* off-diagonal part B = c->B */ 1605 ncols = bi[row-rstart+1] - bi[row-rstart]; 1606 cols = bj + bi[row-rstart]; 1607 if (!allcolumns) { 1608 for (k=0; k<ncols; k++) { 1609 #if defined(PETSC_USE_CTABLE) 1610 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1611 #else 1612 tcol = cmap[bmap[cols[k]]]; 1613 #endif 1614 if (tcol) lens[j]++; 1615 } 1616 } else { /* allcolumns */ 1617 lens[j] += ncols; 1618 } 1619 } 1620 } 1621 1622 /* Create row map (rmap): global row of C -> local row of submat */ 1623 #if defined(PETSC_USE_CTABLE) 1624 ierr = 
PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1625 for (j=0; j<nrow; j++) { 1626 row = irow[j]; 1627 proc = row2proc[j]; 1628 if (proc == rank) { /* a local row */ 1629 rmap_loc[row - rstart] = j; 1630 } else { 1631 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1632 } 1633 } 1634 #else 1635 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1636 for (j=0; j<nrow; j++) { 1637 rmap[irow[j]] = j; 1638 } 1639 #endif 1640 1641 /* Update lens from offproc data */ 1642 /* recv a->j is done */ 1643 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1644 for (i=0; i<nrqs; i++) { 1645 proc = pa[i]; 1646 sbuf1_i = sbuf1[proc]; 1647 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1648 ct1 = 2 + 1; 1649 ct2 = 0; 1650 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1651 rbuf3_i = rbuf3[i]; /* received C->j */ 1652 1653 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1654 max1 = sbuf1_i[2]; 1655 for (k=0; k<max1; k++,ct1++) { 1656 #if defined(PETSC_USE_CTABLE) 1657 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1658 row--; 1659 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1660 #else 1661 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1662 #endif 1663 /* Now, store row index of submat in sbuf1_i[ct1] */ 1664 sbuf1_i[ct1] = row; 1665 1666 nnz = rbuf2_i[ct1]; 1667 if (!allcolumns) { 1668 for (l=0; l<nnz; l++,ct2++) { 1669 #if defined(PETSC_USE_CTABLE) 1670 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1671 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1672 } else { 1673 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1674 } 1675 #else 1676 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1677 #endif 1678 if (tcol) { 1679 lens[row]++; 1680 1681 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1682 For reuse, we replace received C->j with index that should be inserted to submat */ 1683 /* rbuf3_i[ct3++] = ct2; ??? 
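(This remapping is carried out in the value-insertion loop further below: for scall == MAT_INITIAL_MATRIX that loop stores rbuf3_i[ct3++] = ct2 for every received entry whose column survives the cmap lookup, so that a subsequent MAT_REUSE_MATRIX call can read the kept positions straight out of rbuf3.)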
*/ 1684 } 1685 } 1686 } else { /* allcolumns */ 1687 lens[row] += nnz; 1688 } 1689 } 1690 } 1691 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1692 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1693 1694 /* Create the submatrices */ 1695 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1696 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1697 1698 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1699 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1700 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1701 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1702 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1703 1704 /* create struct Mat_SubMat and attach it to submat */ 1705 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1706 subc = (Mat_SeqAIJ*)submat->data; 1707 subc->submatis1 = smatis1; 1708 1709 smatis1->nrqs = nrqs; 1710 smatis1->nrqr = nrqr; 1711 smatis1->rbuf1 = rbuf1; 1712 smatis1->rbuf2 = rbuf2; 1713 smatis1->rbuf3 = rbuf3; 1714 smatis1->sbuf2 = sbuf2; 1715 smatis1->req_source2 = req_source2; 1716 1717 smatis1->sbuf1 = sbuf1; 1718 smatis1->ptr = ptr; 1719 smatis1->tmp = tmp; 1720 smatis1->ctr = ctr; 1721 1722 smatis1->pa = pa; 1723 smatis1->req_size = req_size; 1724 smatis1->req_source1 = req_source1; 1725 1726 smatis1->allcolumns = allcolumns; 1727 smatis1->row2proc = row2proc; 1728 smatis1->rmap = rmap; 1729 smatis1->cmap = cmap; 1730 #if defined(PETSC_USE_CTABLE) 1731 smatis1->rmap_loc = rmap_loc; 1732 smatis1->cmap_loc = cmap_loc; 1733 #endif 1734 1735 smatis1->destroy = submat->ops->destroy; 1736 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1737 submat->factortype = C->factortype; 1738 1739 } else { /* scall == MAT_REUSE_MATRIX */ 1740 submat = submats[0]; 1741 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix.
wrong size"); 1742 1743 subc = (Mat_SeqAIJ*)submat->data; 1744 smatis1 = subc->submatis1; 1745 nrqs = smatis1->nrqs; 1746 nrqr = smatis1->nrqr; 1747 rbuf1 = smatis1->rbuf1; 1748 rbuf2 = smatis1->rbuf2; 1749 rbuf3 = smatis1->rbuf3; 1750 req_source2 = smatis1->req_source2; 1751 1752 sbuf1 = smatis1->sbuf1; 1753 sbuf2 = smatis1->sbuf2; 1754 ptr = smatis1->ptr; 1755 tmp = smatis1->tmp; 1756 ctr = smatis1->ctr; 1757 1758 pa = smatis1->pa; 1759 req_size = smatis1->req_size; 1760 req_source1 = smatis1->req_source1; 1761 1762 allcolumns = smatis1->allcolumns; 1763 row2proc = smatis1->row2proc; 1764 rmap = smatis1->rmap; 1765 cmap = smatis1->cmap; 1766 #if defined(PETSC_USE_CTABLE) 1767 rmap_loc = smatis1->rmap_loc; 1768 cmap_loc = smatis1->cmap_loc; 1769 #endif 1770 } 1771 1772 /* Post recv matrix values */ 1773 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1774 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1775 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1776 for (i=0; i<nrqs; ++i) { 1777 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1778 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1779 } 1780 1781 /* Allocate sending buffers for a->a, and send them off */ 1782 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1783 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1784 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1785 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1786 1787 for (i=0; i<nrqr; i++) { 1788 rbuf1_i = rbuf1[i]; 1789 sbuf_aa_i = sbuf_aa[i]; 1790 ct1 = 2*rbuf1_i[0]+1; 1791 ct2 = 0; 1792 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1793 1794 kmax = rbuf1_i[2]; 1795 for (k=0; k<kmax; k++,ct1++) { 1796 row = rbuf1_i[ct1] - rstart; 1797 nzA = ai[row+1] - ai[row]; 1798 nzB = bi[row+1] - bi[row]; 1799 ncols = nzA + nzB; 1800 cworkB = bj + bi[row]; 1801 vworkA = a_a + ai[row]; 1802 vworkB = b_a + bi[row]; 1803 1804 /* load the column values for this row into vals*/ 1805 vals = sbuf_aa_i + ct2; 1806 1807 lwrite = 0; 1808 for (l=0; l<nzB; l++) { 1809 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1810 } 1811 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1812 for (l=0; l<nzB; l++) { 1813 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1814 } 1815 1816 ct2 += ncols; 1817 } 1818 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1819 } 1820 1821 /* Assemble submat */ 1822 ierr = PetscCalloc2(ncol,&subcols,ncol,&subvals);CHKERRQ(ierr); 1823 1824 /* First assemble the local rows */ 1825 for (j=0; j<nrow; j++) { 1826 row = irow[j]; 1827 proc = row2proc[j]; 1828 if (proc == rank) { 1829 Crow = row - rstart; /* local row index of C */ 1830 #if defined(PETSC_USE_CTABLE) 1831 row = rmap_loc[Crow]; /* row index of submat */ 1832 #else 1833 row = rmap[row]; 1834 #endif 1835 1836 if (allcolumns) { 1837 /* diagonal part A = c->A */ 1838 ncols = ai[Crow+1] - ai[Crow]; 1839 cols = aj + ai[Crow]; 1840 vals = a->a + ai[Crow]; 1841 i = 0; 1842 for (k=0; k<ncols; k++) { 1843 subcols[i] = cols[k] + cstart; 1844 subvals[i++] = vals[k]; 1845 } 1846 1847 /* off-diagonal part B = c->B */ 1848 ncols = bi[Crow+1] - bi[Crow]; 1849 cols = bj + bi[Crow]; 1850 vals = b->a + bi[Crow]; 1851 for (k=0; k<ncols; k++) { 1852 subcols[i] = bmap[cols[k]]; 1853 subvals[i++] = vals[k]; 1854 } 1855 1856 ierr = 
MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1857 1858 } else { /* !allcolumns */ 1859 #if defined(PETSC_USE_CTABLE) 1860 /* diagonal part A = c->A */ 1861 ncols = ai[Crow+1] - ai[Crow]; 1862 cols = aj + ai[Crow]; 1863 vals = a->a + ai[Crow]; 1864 i = 0; 1865 for (k=0; k<ncols; k++) { 1866 tcol = cmap_loc[cols[k]]; 1867 if (tcol) { 1868 subcols[i] = --tcol; 1869 subvals[i++] = vals[k]; 1870 } 1871 } 1872 1873 /* off-diagonal part B = c->B */ 1874 ncols = bi[Crow+1] - bi[Crow]; 1875 cols = bj + bi[Crow]; 1876 vals = b->a + bi[Crow]; 1877 for (k=0; k<ncols; k++) { 1878 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1879 if (tcol) { 1880 subcols[i] = --tcol; 1881 subvals[i++] = vals[k]; 1882 } 1883 } 1884 #else 1885 /* diagonal part A = c->A */ 1886 ncols = ai[Crow+1] - ai[Crow]; 1887 cols = aj + ai[Crow]; 1888 vals = a->a + ai[Crow]; 1889 i = 0; 1890 for (k=0; k<ncols; k++) { 1891 tcol = cmap[cols[k]+cstart]; 1892 if (tcol) { 1893 subcols[i] = --tcol; 1894 subvals[i++] = vals[k]; 1895 } 1896 } 1897 1898 /* off-diagonal part B = c->B */ 1899 ncols = bi[Crow+1] - bi[Crow]; 1900 cols = bj + bi[Crow]; 1901 vals = b->a + bi[Crow]; 1902 for (k=0; k<ncols; k++) { 1903 tcol = cmap[bmap[cols[k]]]; 1904 if (tcol) { 1905 subcols[i] = --tcol; 1906 subvals[i++] = vals[k]; 1907 } 1908 } 1909 #endif 1910 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1911 } 1912 } 1913 } 1914 1915 /* Now assemble the off-proc rows */ 1916 for (i=0; i<nrqs; i++) { /* for each requested message */ 1917 /* recv values from other processes */ 1918 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1919 proc = pa[idex]; 1920 sbuf1_i = sbuf1[proc]; 1921 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1922 ct1 = 2 + 1; 1923 ct2 = 0; /* count of received C->j */ 1924 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1925 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1926 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1927 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1928 1929 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1930 max1 = sbuf1_i[2]; /* num of rows */ 1931 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1932 row = sbuf1_i[ct1]; /* row index of submat */ 1933 if (!allcolumns) { 1934 idex = 0; 1935 if (scall == MAT_INITIAL_MATRIX) { 1936 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1937 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1938 #if defined(PETSC_USE_CTABLE) 1939 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1940 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1941 } else { 1942 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1943 } 1944 #else 1945 tcol = cmap[rbuf3_i[ct2]]; 1946 #endif 1947 if (tcol) { 1948 subcols[idex] = --tcol; 1949 subvals[idex++] = rbuf4_i[ct2]; 1950 1951 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
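(Only the entries whose global column is selected by iscol[0] survive, i.e. those for which the cmap_loc/cmap lookup just above returned a nonzero tcol.)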
1952 For reuse, we replace received C->j with index that should be inserted to submat */ 1953 rbuf3_i[ct3++] = ct2; 1954 } 1955 } 1956 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1957 1958 } else { /* scall == MAT_REUSE_MATRIX */ 1959 submat = submats[0]; 1960 subc = (Mat_SeqAIJ*)submat->data; 1961 1962 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1963 for (l=0; l<nnz; l++) { 1964 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1965 subvals[idex++] = rbuf4_i[ct2]; 1966 } 1967 1968 bj = subc->j + subc->i[row]; 1969 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1970 } 1971 } else { /* allcolumns */ 1972 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1973 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1974 ct2 += nnz; 1975 } 1976 } 1977 } 1978 1979 /* sending a->a are done */ 1980 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1981 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1982 1983 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1984 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1985 submats[0] = submat; 1986 1987 /* Restore the indices */ 1988 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1989 if (!allcolumns) { 1990 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1991 } 1992 1993 /* Destroy allocated memory */ 1994 for (i=0; i<nrqs; ++i) { 1995 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1996 } 1997 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1998 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1999 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2000 ierr = PetscFree2(subcols,subvals);CHKERRQ(ierr); 2001 2002 if (scall == MAT_INITIAL_MATRIX) { 2003 ierr = PetscFree(lens);CHKERRQ(ierr); 2004 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2005 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2006 } 2007 PetscFunctionReturn(0); 2008 } 2009 2010 #undef __FUNCT__ 2011 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS" 2012 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2013 { 2014 PetscErrorCode ierr; 2015 PetscInt ncol; 2016 PetscBool colflag,allcolumns=PETSC_FALSE; 2017 2018 PetscFunctionBegin; 2019 /* Allocate memory to hold all the submatrices */ 2020 if (scall != MAT_REUSE_MATRIX) { 2021 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2022 } 2023 2024 /* Check for special case: each processor gets entire matrix columns */ 2025 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2026 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2027 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2028 2029 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2030 PetscFunctionReturn(0); 2031 } 2032 2033 #undef __FUNCT__ 2034 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 2035 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2036 { 2037 PetscErrorCode ierr; 2038 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2039 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2040 2041 PetscFunctionBegin; 2042 /* Check for special case: each processor gets entire matrix */ 2043 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 
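  /* This branch serves the PCASM-style case of a single, sorted row IS per process and skips the
     MPIU_Allreduce() calls below. A minimal sketch of how it is typically reached (hedged: assumes
     the public entry point MatGetSubMatrices() and hypothetical caller variables n/rows holding the
     block's global row indices; the submat_singleis flag itself is set internally by PCSetUp_ASM()):

         IS  is;
         Mat *submats;
         ierr = ISCreateGeneral(PETSC_COMM_SELF,n,rows,PETSC_COPY_VALUES,&is);CHKERRQ(ierr);
         ierr = ISSort(is);CHKERRQ(ierr);                          -- isrow[0] must be sorted
         ierr = MatGetSubMatrices(C,1,&is,&is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
  */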
2044 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2045 PetscFunctionReturn(0); 2046 } 2047 2048 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2049 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2050 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2051 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2052 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2053 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2054 wantallmatrix = PETSC_TRUE; 2055 2056 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2057 } 2058 } 2059 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2060 if (twantallmatrix) { 2061 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2062 PetscFunctionReturn(0); 2063 } 2064 2065 /* Allocate memory to hold all the submatrices */ 2066 if (scall != MAT_REUSE_MATRIX) { 2067 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2068 } 2069 2070 /* Check for special case: each processor gets entire matrix columns */ 2071 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2072 for (i=0; i<ismax; i++) { 2073 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2074 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2075 if (colflag && ncol == C->cmap->N) { 2076 allcolumns[i] = PETSC_TRUE; 2077 } else { 2078 allcolumns[i] = PETSC_FALSE; 2079 } 2080 } 2081 2082 /* Determine the number of stages through which submatrices are done */ 2083 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2084 2085 /* 2086 Each stage will extract nmax submatrices. 2087 nmax is determined by the matrix column dimension. 2088 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2089 */ 2090 if (!nmax) nmax = 1; 2091 nstages_local = ismax/nmax + ((ismax % nmax) ? 
1 : 0); 2092 2093 /* Make sure every processor loops through the nstages */ 2094 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2095 2096 for (i=0,pos=0; i<nstages; i++) { 2097 if (pos+nmax <= ismax) max_no = nmax; 2098 else if (pos == ismax) max_no = 0; 2099 else max_no = ismax-pos; 2100 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2101 pos += max_no; 2102 } 2103 2104 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2105 PetscFunctionReturn(0); 2106 } 2107 2108 /* -------------------------------------------------------------------------*/ 2109 #undef __FUNCT__ 2110 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 2111 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2112 { 2113 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2114 Mat A = c->A; 2115 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2116 const PetscInt **icol,**irow; 2117 PetscInt *nrow,*ncol,start; 2118 PetscErrorCode ierr; 2119 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2120 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2121 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2122 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2123 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2124 #if defined(PETSC_USE_CTABLE) 2125 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2126 #else 2127 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2128 #endif 2129 const PetscInt *irow_i; 2130 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2131 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2132 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2133 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2134 MPI_Status *r_status3,*r_status4,*s_status4; 2135 MPI_Comm comm; 2136 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2137 PetscMPIInt *onodes1,*olengths1; 2138 PetscMPIInt idex,idex2,end; 2139 2140 PetscFunctionBegin; 2141 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2142 tag0 = ((PetscObject)C)->tag; 2143 size = c->size; 2144 rank = c->rank; 2145 2146 /* Get some new tags to keep the communication clean */ 2147 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2148 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2149 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2150 2151 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2152 2153 for (i=0; i<ismax; i++) { 2154 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2155 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2156 if (allcolumns[i]) { 2157 icol[i] = NULL; 2158 ncol[i] = C->cmap->N; 2159 } else { 2160 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2161 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2162 } 2163 } 2164 2165 /* evaluate communication - mesg to who, length of mesg, and buffer space 2166 required. 
Based on this, buffers are allocated, and data copied into them*/ 2167 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2168 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2169 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2170 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2171 for (i=0; i<ismax; i++) { 2172 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2173 jmax = nrow[i]; 2174 irow_i = irow[i]; 2175 for (j=0; j<jmax; j++) { 2176 l = 0; 2177 row = irow_i[j]; 2178 while (row >= C->rmap->range[l+1]) l++; 2179 proc = l; 2180 w4[proc]++; 2181 } 2182 for (j=0; j<size; j++) { 2183 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2184 } 2185 } 2186 2187 nrqs = 0; /* no of outgoing messages */ 2188 msz = 0; /* total mesg length (for all procs) */ 2189 w1[rank] = 0; /* no mesg sent to self */ 2190 w3[rank] = 0; 2191 for (i=0; i<size; i++) { 2192 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2193 } 2194 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2195 for (i=0,j=0; i<size; i++) { 2196 if (w1[i]) { pa[j] = i; j++; } 2197 } 2198 2199 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2200 for (i=0; i<nrqs; i++) { 2201 j = pa[i]; 2202 w1[j] += w2[j] + 2* w3[j]; 2203 msz += w1[j]; 2204 } 2205 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2206 2207 /* Determine the number of messages to expect, their lengths, from from-ids */ 2208 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2209 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2210 2211 /* Now post the Irecvs corresponding to these messages */ 2212 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2213 2214 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2215 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2216 2217 /* Allocate Memory for outgoing messages */ 2218 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2219 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2220 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2221 2222 { 2223 PetscInt *iptr = tmp; 2224 k = 0; 2225 for (i=0; i<nrqs; i++) { 2226 j = pa[i]; 2227 iptr += k; 2228 sbuf1[j] = iptr; 2229 k = w1[j]; 2230 } 2231 } 2232 2233 /* Form the outgoing messages */ 2234 /* Initialize the header space */ 2235 for (i=0; i<nrqs; i++) { 2236 j = pa[i]; 2237 sbuf1[j][0] = 0; 2238 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2239 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2240 } 2241 2242 /* Parse the isrow and copy data into outbuf */ 2243 for (i=0; i<ismax; i++) { 2244 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2245 irow_i = irow[i]; 2246 jmax = nrow[i]; 2247 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2248 l = 0; 2249 row = irow_i[j]; 2250 while (row >= C->rmap->range[l+1]) l++; 2251 proc = l; 2252 if (proc != rank) { /* copy to the outgoing buf*/ 2253 ctr[proc]++; 2254 *ptr[proc] = row; 2255 ptr[proc]++; 2256 } 2257 } 2258 /* Update the headers for the current IS */ 2259 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2260 if ((ctr_j = ctr[j])) { 2261 sbuf1_j = sbuf1[j]; 2262 k = ++sbuf1_j[0]; 2263 sbuf1_j[2*k] = ctr_j; 2264 sbuf1_j[2*k-1] = i; 2265 } 2266 } 2267 } 
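  /* At this point each outgoing buffer sbuf1[j] (length w1[j]) has the layout
         [ nis | is_0 cnt_0 | is_1 cnt_1 | ... | rows of is_0 | rows of is_1 | ... ]
     where nis = sbuf1[j][0] is the number of index sets that request rows owned by proc j, each
     (is,cnt) pair records the IS id and how many of its rows follow, the header occupies
     1 + 2*w3[j] entries, and the requested global row indices are grouped by IS in increasing IS order. */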
2268 2269 /* Now post the sends */ 2270 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2271 for (i=0; i<nrqs; ++i) { 2272 j = pa[i]; 2273 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2274 } 2275 2276 /* Post Receives to capture the buffer size */ 2277 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2278 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2279 rbuf2[0] = tmp + msz; 2280 for (i=1; i<nrqs; ++i) { 2281 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2282 } 2283 for (i=0; i<nrqs; ++i) { 2284 j = pa[i]; 2285 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2286 } 2287 2288 /* Send to other procs the buf size they should allocate */ 2289 2290 2291 /* Receive messages*/ 2292 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2293 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2294 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 2295 { 2296 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2297 PetscInt *sbuf2_i; 2298 2299 for (i=0; i<nrqr; ++i) { 2300 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2301 2302 req_size[idex] = 0; 2303 rbuf1_i = rbuf1[idex]; 2304 start = 2*rbuf1_i[0] + 1; 2305 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2306 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2307 sbuf2_i = sbuf2[idex]; 2308 for (j=start; j<end; j++) { 2309 id = rbuf1_i[j] - rstart; 2310 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2311 sbuf2_i[j] = ncols; 2312 req_size[idex] += ncols; 2313 } 2314 req_source[idex] = r_status1[i].MPI_SOURCE; 2315 /* form the header */ 2316 sbuf2_i[0] = req_size[idex]; 2317 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2318 2319 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2320 } 2321 } 2322 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2323 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2324 2325 /* recv buffer sizes */ 2326 /* Receive messages*/ 2327 2328 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 2329 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2330 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2331 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2332 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2333 2334 for (i=0; i<nrqs; ++i) { 2335 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2336 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2337 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2338 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2339 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2340 } 2341 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2342 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2343 2344 /* Wait on sends1 and sends2 */ 2345 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2346 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2347 2348 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2349 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2350 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2351 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2352 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2353 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2354 2355 /* Now allocate buffers for a->j, and send them off */ 2356 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2357 for 
(i=0,j=0; i<nrqr; i++) j += req_size[i]; 2358 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2359 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2360 2361 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2362 { 2363 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2364 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2365 PetscInt cend = C->cmap->rend; 2366 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2367 2368 for (i=0; i<nrqr; i++) { 2369 rbuf1_i = rbuf1[i]; 2370 sbuf_aj_i = sbuf_aj[i]; 2371 ct1 = 2*rbuf1_i[0] + 1; 2372 ct2 = 0; 2373 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2374 kmax = rbuf1[i][2*j]; 2375 for (k=0; k<kmax; k++,ct1++) { 2376 row = rbuf1_i[ct1] - rstart; 2377 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2378 ncols = nzA + nzB; 2379 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2380 2381 /* load the column indices for this row into cols*/ 2382 cols = sbuf_aj_i + ct2; 2383 2384 lwrite = 0; 2385 for (l=0; l<nzB; l++) { 2386 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2387 } 2388 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2389 for (l=0; l<nzB; l++) { 2390 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2391 } 2392 2393 ct2 += ncols; 2394 } 2395 } 2396 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2397 } 2398 } 2399 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2400 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2401 2402 /* Allocate buffers for a->a, and send them off */ 2403 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2404 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2405 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2406 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2407 2408 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2409 { 2410 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2411 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2412 PetscInt cend = C->cmap->rend; 2413 PetscInt *b_j = b->j; 2414 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2415 2416 for (i=0; i<nrqr; i++) { 2417 rbuf1_i = rbuf1[i]; 2418 sbuf_aa_i = sbuf_aa[i]; 2419 ct1 = 2*rbuf1_i[0]+1; 2420 ct2 = 0; 2421 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2422 kmax = rbuf1_i[2*j]; 2423 for (k=0; k<kmax; k++,ct1++) { 2424 row = rbuf1_i[ct1] - rstart; 2425 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2426 ncols = nzA + nzB; 2427 cworkB = b_j + b_i[row]; 2428 vworkA = a_a + a_i[row]; 2429 vworkB = b_a + b_i[row]; 2430 2431 /* load the column values for this row into vals*/ 2432 vals = sbuf_aa_i+ct2; 2433 2434 lwrite = 0; 2435 for (l=0; l<nzB; l++) { 2436 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2437 } 2438 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2439 for (l=0; l<nzB; l++) { 2440 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2441 } 2442 2443 ct2 += ncols; 2444 } 2445 } 2446 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2447 } 2448 } 2449 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2450 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2451 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2452 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2453 2454 /* Form the matrix */ 2455 /* create col map: global col of C -> local col of submatrices */ 2456 { 2457 const PetscInt *icol_i; 2458 #if defined(PETSC_USE_CTABLE) 2459 ierr = 
PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 2460 for (i=0; i<ismax; i++) { 2461 if (!allcolumns[i]) { 2462 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2463 2464 jmax = ncol[i]; 2465 icol_i = icol[i]; 2466 cmap_i = cmap[i]; 2467 for (j=0; j<jmax; j++) { 2468 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2469 } 2470 } else { 2471 cmap[i] = NULL; 2472 } 2473 } 2474 #else 2475 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2476 for (i=0; i<ismax; i++) { 2477 if (!allcolumns[i]) { 2478 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2479 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2480 jmax = ncol[i]; 2481 icol_i = icol[i]; 2482 cmap_i = cmap[i]; 2483 for (j=0; j<jmax; j++) { 2484 cmap_i[icol_i[j]] = j+1; 2485 } 2486 } else { 2487 cmap[i] = NULL; 2488 } 2489 } 2490 #endif 2491 } 2492 2493 /* Create lens which is required for MatCreate... */ 2494 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2495 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2496 if (ismax) { 2497 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2498 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2499 } 2500 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2501 2502 /* Update lens from local data */ 2503 for (i=0; i<ismax; i++) { 2504 jmax = nrow[i]; 2505 if (!allcolumns[i]) cmap_i = cmap[i]; 2506 irow_i = irow[i]; 2507 lens_i = lens[i]; 2508 for (j=0; j<jmax; j++) { 2509 l = 0; 2510 row = irow_i[j]; 2511 while (row >= C->rmap->range[l+1]) l++; 2512 proc = l; 2513 if (proc == rank) { 2514 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2515 if (!allcolumns[i]) { 2516 for (k=0; k<ncols; k++) { 2517 #if defined(PETSC_USE_CTABLE) 2518 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2519 #else 2520 tcol = cmap_i[cols[k]]; 2521 #endif 2522 if (tcol) lens_i[j]++; 2523 } 2524 } else { /* allcolumns */ 2525 lens_i[j] = ncols; 2526 } 2527 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2528 } 2529 } 2530 } 2531 2532 /* Create row map: global row of C -> local row of submatrices */ 2533 #if defined(PETSC_USE_CTABLE) 2534 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 2535 for (i=0; i<ismax; i++) { 2536 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2537 irow_i = irow[i]; 2538 jmax = nrow[i]; 2539 for (j=0; j<jmax; j++) { 2540 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2541 } 2542 } 2543 #else 2544 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2545 if (ismax) { 2546 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2547 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2548 } 2549 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2550 for (i=0; i<ismax; i++) { 2551 rmap_i = rmap[i]; 2552 irow_i = irow[i]; 2553 jmax = nrow[i]; 2554 for (j=0; j<jmax; j++) { 2555 rmap_i[irow_i[j]] = j; 2556 } 2557 } 2558 #endif 2559 2560 /* Update lens from offproc data */ 2561 { 2562 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2563 2564 for (tmp2=0; tmp2<nrqs; tmp2++) { 2565 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2566 idex = pa[idex2]; 2567 sbuf1_i = sbuf1[idex]; 2568 jmax = sbuf1_i[0]; 2569 ct1 = 2*jmax+1; 2570 ct2 = 0; 2571 rbuf2_i = rbuf2[idex2]; 2572 rbuf3_i = rbuf3[idex2]; 2573 for (j=1; j<=jmax; j++) { 2574 is_no = sbuf1_i[2*j-1]; 2575 max1 = sbuf1_i[2*j]; 2576 lens_i = lens[is_no]; 2577 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2578 rmap_i = rmap[is_no]; 2579 
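  /* ct1 indexes this IS's requested-row entries in sbuf1_i and, at the same positions, the per-row
     column counts returned in rbuf2_i; ct2 walks the concatenated column indices in rbuf3_i,
     advancing by rbuf2_i[ct1] entries for each requested row. */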
for (k=0; k<max1; k++,ct1++) { 2580 #if defined(PETSC_USE_CTABLE) 2581 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2582 row--; 2583 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2584 #else 2585 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2586 #endif 2587 max2 = rbuf2_i[ct1]; 2588 for (l=0; l<max2; l++,ct2++) { 2589 if (!allcolumns[is_no]) { 2590 #if defined(PETSC_USE_CTABLE) 2591 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2592 #else 2593 tcol = cmap_i[rbuf3_i[ct2]]; 2594 #endif 2595 if (tcol) lens_i[row]++; 2596 } else { /* allcolumns */ 2597 lens_i[row]++; /* lens_i[row] += max2 ? */ 2598 } 2599 } 2600 } 2601 } 2602 } 2603 } 2604 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2605 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2606 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2607 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2608 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2609 2610 /* Create the submatrices */ 2611 if (scall == MAT_REUSE_MATRIX) { 2612 PetscBool flag; 2613 2614 /* 2615 Assumes new rows are same length as the old rows,hence bug! 2616 */ 2617 for (i=0; i<ismax; i++) { 2618 mat = (Mat_SeqAIJ*)(submats[i]->data); 2619 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2620 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2621 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 2622 /* Initial matrix as if empty */ 2623 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2624 2625 submats[i]->factortype = C->factortype; 2626 } 2627 } else { 2628 for (i=0; i<ismax; i++) { 2629 PetscInt rbs,cbs; 2630 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2631 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2632 2633 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2634 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2635 2636 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2637 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2638 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2639 } 2640 } 2641 2642 /* Assemble the matrices */ 2643 /* First assemble the local rows */ 2644 { 2645 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2646 PetscScalar *imat_a; 2647 2648 for (i=0; i<ismax; i++) { 2649 mat = (Mat_SeqAIJ*)submats[i]->data; 2650 imat_ilen = mat->ilen; 2651 imat_j = mat->j; 2652 imat_i = mat->i; 2653 imat_a = mat->a; 2654 2655 if (!allcolumns[i]) cmap_i = cmap[i]; 2656 rmap_i = rmap[i]; 2657 irow_i = irow[i]; 2658 jmax = nrow[i]; 2659 for (j=0; j<jmax; j++) { 2660 l = 0; 2661 row = irow_i[j]; 2662 while (row >= C->rmap->range[l+1]) l++; 2663 proc = l; 2664 if (proc == rank) { 2665 old_row = row; 2666 #if defined(PETSC_USE_CTABLE) 2667 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2668 row--; 2669 #else 2670 row = rmap_i[row]; 2671 #endif 2672 ilen_row = imat_ilen[row]; 2673 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2674 mat_i = imat_i[row]; 2675 mat_a = imat_a + mat_i; 2676 mat_j = imat_j + mat_i; 2677 if (!allcolumns[i]) { 2678 for (k=0; k<ncols; k++) { 2679 #if defined(PETSC_USE_CTABLE) 2680 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2681 #else 2682 tcol = 
cmap_i[cols[k]]; 2683 #endif 2684 if (tcol) { 2685 *mat_j++ = tcol - 1; 2686 *mat_a++ = vals[k]; 2687 ilen_row++; 2688 } 2689 } 2690 } else { /* allcolumns */ 2691 for (k=0; k<ncols; k++) { 2692 *mat_j++ = cols[k]; /* global col index! */ 2693 *mat_a++ = vals[k]; 2694 ilen_row++; 2695 } 2696 } 2697 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2698 2699 imat_ilen[row] = ilen_row; 2700 } 2701 } 2702 } 2703 } 2704 2705 /* Now assemble the off proc rows*/ 2706 { 2707 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2708 PetscInt *imat_j,*imat_i; 2709 PetscScalar *imat_a,*rbuf4_i; 2710 2711 for (tmp2=0; tmp2<nrqs; tmp2++) { 2712 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2713 idex = pa[idex2]; 2714 sbuf1_i = sbuf1[idex]; 2715 jmax = sbuf1_i[0]; 2716 ct1 = 2*jmax + 1; 2717 ct2 = 0; 2718 rbuf2_i = rbuf2[idex2]; 2719 rbuf3_i = rbuf3[idex2]; 2720 rbuf4_i = rbuf4[idex2]; 2721 for (j=1; j<=jmax; j++) { 2722 is_no = sbuf1_i[2*j-1]; 2723 rmap_i = rmap[is_no]; 2724 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2725 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2726 imat_ilen = mat->ilen; 2727 imat_j = mat->j; 2728 imat_i = mat->i; 2729 imat_a = mat->a; 2730 max1 = sbuf1_i[2*j]; 2731 for (k=0; k<max1; k++,ct1++) { 2732 row = sbuf1_i[ct1]; 2733 #if defined(PETSC_USE_CTABLE) 2734 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2735 row--; 2736 #else 2737 row = rmap_i[row]; 2738 #endif 2739 ilen = imat_ilen[row]; 2740 mat_i = imat_i[row]; 2741 mat_a = imat_a + mat_i; 2742 mat_j = imat_j + mat_i; 2743 max2 = rbuf2_i[ct1]; 2744 if (!allcolumns[is_no]) { 2745 for (l=0; l<max2; l++,ct2++) { 2746 2747 #if defined(PETSC_USE_CTABLE) 2748 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2749 #else 2750 tcol = cmap_i[rbuf3_i[ct2]]; 2751 #endif 2752 if (tcol) { 2753 *mat_j++ = tcol - 1; 2754 *mat_a++ = rbuf4_i[ct2]; 2755 ilen++; 2756 } 2757 } 2758 } else { /* allcolumns */ 2759 for (l=0; l<max2; l++,ct2++) { 2760 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2761 *mat_a++ = rbuf4_i[ct2]; 2762 ilen++; 2763 } 2764 } 2765 imat_ilen[row] = ilen; 2766 } 2767 } 2768 } 2769 } 2770 2771 /* sort the rows */ 2772 { 2773 PetscInt *imat_ilen,*imat_j,*imat_i; 2774 PetscScalar *imat_a; 2775 2776 for (i=0; i<ismax; i++) { 2777 mat = (Mat_SeqAIJ*)submats[i]->data; 2778 imat_j = mat->j; 2779 imat_i = mat->i; 2780 imat_a = mat->a; 2781 imat_ilen = mat->ilen; 2782 2783 if (allcolumns[i]) continue; 2784 jmax = nrow[i]; 2785 for (j=0; j<jmax; j++) { 2786 PetscInt ilen; 2787 2788 mat_i = imat_i[j]; 2789 mat_a = imat_a + mat_i; 2790 mat_j = imat_j + mat_i; 2791 ilen = imat_ilen[j]; 2792 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2793 } 2794 } 2795 } 2796 2797 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2798 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2799 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2800 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2801 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2802 2803 /* Restore the indices */ 2804 for (i=0; i<ismax; i++) { 2805 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2806 if (!allcolumns[i]) { 2807 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2808 } 2809 } 2810 2811 /* Destroy allocated memory */ 2812 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2813 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2814 ierr = PetscFree(pa);CHKERRQ(ierr); 2815 2816 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2817 ierr = 
PetscFree(rbuf2);CHKERRQ(ierr); 2818 for (i=0; i<nrqr; ++i) { 2819 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2820 } 2821 for (i=0; i<nrqs; ++i) { 2822 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2823 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2824 } 2825 2826 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 2827 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 2828 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2829 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2830 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2831 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2832 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2833 2834 #if defined(PETSC_USE_CTABLE) 2835 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 2836 #else 2837 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 2838 #endif 2839 ierr = PetscFree(rmap);CHKERRQ(ierr); 2840 2841 for (i=0; i<ismax; i++) { 2842 if (!allcolumns[i]) { 2843 #if defined(PETSC_USE_CTABLE) 2844 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 2845 #else 2846 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 2847 #endif 2848 } 2849 } 2850 ierr = PetscFree(cmap);CHKERRQ(ierr); 2851 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2852 ierr = PetscFree(lens);CHKERRQ(ierr); 2853 2854 for (i=0; i<ismax; i++) { 2855 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2856 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2857 } 2858 PetscFunctionReturn(0); 2859 } 2860 2861 /* 2862 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2863 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2864 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2865 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2866 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2867 state, and needs to be "assembled" later by compressing B's column space. 2868 2869 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2870 Following this call, C->A & C->B have been created, even if empty. 2871 */ 2872 #undef __FUNCT__ 2873 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 2874 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2875 { 2876 /* If making this function public, change the error returned in this function away from _PLIB. */ 2877 PetscErrorCode ierr; 2878 Mat_MPIAIJ *aij; 2879 Mat_SeqAIJ *Baij; 2880 PetscBool seqaij,Bdisassembled; 2881 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2882 PetscScalar v; 2883 const PetscInt *rowindices,*colindices; 2884 2885 PetscFunctionBegin; 2886 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
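Compatibility means: if rowemb (resp. dcolemb, ocolemb) is given, its local size must match the local row (resp. column) size of A or B; without an embedding, A must match C's local row and column sizes directly, and B must have C->rmap->n rows and C->cmap->N - C->cmap->n columns.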
*/ 2887 if (A) { 2888 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2889 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2890 if (rowemb) { 2891 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2892 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2893 } else { 2894 if (C->rmap->n != A->rmap->n) { 2895 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2896 } 2897 } 2898 if (dcolemb) { 2899 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2900 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2901 } else { 2902 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2903 } 2904 } 2905 if (B) { 2906 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2907 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2908 if (rowemb) { 2909 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2910 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2911 } else { 2912 if (C->rmap->n != B->rmap->n) { 2913 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2914 } 2915 } 2916 if (ocolemb) { 2917 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2918 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2919 } else { 2920 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2921 } 2922 } 2923 2924 aij = (Mat_MPIAIJ*)(C->data); 2925 if (!aij->A) { 2926 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2927 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2928 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2929 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2930 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2931 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2932 } 2933 if (A) { 2934 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2935 } else { 2936 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2937 } 2938 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2939 /* 2940 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2941 need to "disassemble" B -- convert it to using C's global indices. 2942 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2943 2944 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2945 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2946 2947 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2948 At least avoid calling MatSetValues() and the implied searches? 
2949 */ 2950 2951 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2952 #if defined(PETSC_USE_CTABLE) 2953 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2954 #else 2955 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2956 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2957 if (aij->B) { 2958 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2959 } 2960 #endif 2961 ngcol = 0; 2962 if (aij->lvec) { 2963 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2964 } 2965 if (aij->garray) { 2966 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2967 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2968 } 2969 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2970 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2971 } 2972 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2973 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2974 } 2975 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2976 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2977 } 2978 } 2979 Bdisassembled = PETSC_FALSE; 2980 if (!aij->B) { 2981 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2982 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2983 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2984 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2985 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2986 Bdisassembled = PETSC_TRUE; 2987 } 2988 if (B) { 2989 Baij = (Mat_SeqAIJ*)(B->data); 2990 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2991 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2992 for (i=0; i<B->rmap->n; i++) { 2993 nz[i] = Baij->i[i+1] - Baij->i[i]; 2994 } 2995 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2996 ierr = PetscFree(nz);CHKERRQ(ierr); 2997 } 2998 2999 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3000 shift = rend-rstart; 3001 count = 0; 3002 rowindices = NULL; 3003 colindices = NULL; 3004 if (rowemb) { 3005 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3006 } 3007 if (ocolemb) { 3008 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3009 } 3010 for (i=0; i<B->rmap->n; i++) { 3011 PetscInt row; 3012 row = i; 3013 if (rowindices) row = rowindices[i]; 3014 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3015 col = Baij->j[count]; 3016 if (colindices) col = colindices[col]; 3017 if (Bdisassembled && col>=rstart) col += shift; 3018 v = Baij->a[count]; 3019 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3020 ++count; 3021 } 3022 } 3023 /* No assembly for aij->B is necessary. */ 3024 /* FIXME: set aij->B's nonzerostate correctly. */ 3025 } else { 3026 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3027 } 3028 C->preallocated = PETSC_TRUE; 3029 C->was_assembled = PETSC_FALSE; 3030 C->assembled = PETSC_FALSE; 3031 /* 3032 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3033 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3034 */ 3035 PetscFunctionReturn(0); 3036 } 3037 3038 #undef __FUNCT__ 3039 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 3040 /* 3041 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
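That is, an entry stored in B at local column j corresponds to global column garray[j] of C; this is the same bmap[] = garray translation applied when column indices are shipped to other processes above.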
3042 */ 3043 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3044 { 3045 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3046 3047 PetscFunctionBegin; 3048 PetscValidPointer(A,2); 3049 PetscValidPointer(B,3); 3050 /* FIXME: make sure C is assembled */ 3051 *A = aij->A; 3052 *B = aij->B; 3053 /* Note that we don't incref *A and *B, so be careful! */ 3054 PetscFunctionReturn(0); 3055 } 3056 3057 /* 3058 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3059 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3060 */ 3061 #undef __FUNCT__ 3062 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 3063 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3064 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3065 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3066 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3067 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3068 { 3069 PetscErrorCode ierr; 3070 PetscMPIInt isize,flag; 3071 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3072 Mat *A,*B; 3073 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3074 3075 PetscFunctionBegin; 3076 if (!ismax) PetscFunctionReturn(0); 3077 3078 for (i = 0, cismax = 0; i < ismax; ++i) { 3079 PetscMPIInt isize; 3080 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3081 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3082 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3083 if (isize > 1) ++cismax; 3084 } 3085 /* 3086 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3087 ispar counts the number of parallel ISs across C's comm. 3088 */ 3089 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3090 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3091 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3092 PetscFunctionReturn(0); 3093 } 3094 3095 /* if (ispar) */ 3096 /* 3097 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3098 These are used to extract the off-diag portion of the resulting parallel matrix. 3099 The row IS for the off-diag portion is the same as for the diag portion, 3100 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3101 */ 3102 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3103 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3104 for (i = 0, ii = 0; i < ismax; ++i) { 3105 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3106 if (isize > 1) { 3107 /* 3108 TODO: This is the part that's ***NOT SCALABLE***. 3109 To fix this we need to extract just the indices of C's nonzero columns 3110 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3111 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3112 to be done without serializing on the IS list, so, most likely, it is best 3113 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /*
         Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /* Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
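  /*
     A concrete illustration (made-up indices, following the comments above) of what the sorting
     pass just performed produces: if isrow[i] initially held {7,2,9}, ISSortPermutation() returns
     in isrow_p[i] the permutation {1,0,2} -- the original positions of the entries listed in
     nondecreasing order -- and ISSort() reorders isrow[i] in place to {2,7,9}.  If an IS was
     already strictly increasing, the corresponding permutation IS is NULL and no extra
     permutation handling is needed further down.
  */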
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
       values into A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /*
     When reusing, the local column size of B[i] may differ from the local size of ciscol[i]:
     B[i]'s column size is ultimately determined by MatAssembly, whereas ciscol[i] is computed
     as the complement of iscol[i]. A proper fix would keep only the nonzero columns when
     computing the complement ciscol[i]; until then, force B[i]'s local column size to match
     ciscol[i] before re-extracting.
  */
  if (scall == MAT_REUSE_MATRIX) {
    for (i=0; i<cismax; i++) {
      ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
      B[i]->cmap->n = ciscol_localsize;
    }
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if isize == 1).
     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /* For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. */
      {
        Mat AA,BB;
        AA = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        BB = NULL;
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX).
        */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
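/*
   A minimal usage sketch of the functionality implemented above. It assumes the caller goes
   through the public MatGetSubMatricesMPI() interface, which is expected to dispatch to
   MatGetSubMatricesMPI_MPIAIJ() for MATMPIAIJ matrices; the communicator, sizes and index sets
   are hypothetical and error checking (ierr/CHKERRQ) is omitted for brevity:

     IS  isrow[1],iscol[1];
     Mat *submat;

     ISCreateStride(subcomm,nloc,first,1,&isrow[0]);
     ISCreateStride(subcomm,nloc,first,1,&iscol[0]);
     MatGetSubMatricesMPI(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);
     ...
     MatGetSubMatricesMPI(C,1,isrow,iscol,MAT_REUSE_MATRIX,&submat);

   submat[0] comes back parallel (MATMPIAIJ) when the index sets live on a communicator with
   more than one rank, and sequential otherwise.
*/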