1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++){ 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++){ 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank){ 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows){ 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++){ 115 if (tosizes_temp[i]){ 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 for (i=0; i<nto; i++){ 123 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 124 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 125 } 126 /* send information to other processors */ 127 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 128 nrecvrows = 0; 129 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 130 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 131 nrecvrows = 0; 132 for (i=0; i<nfrom; i++){ 133 for (j=0; j<fromsizes[2*i]; j++){ 134 remote[nrecvrows].rank = fromranks[i]; 135 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 136 } 137 } 138 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 139 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 140 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 141 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 142 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 143 /* message pair <no of is, row> */ 144 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 145 for (i=0; i<nrrows; i++){ 146 owner = rrow_ranks[i]; /* processor */ 147 j = tosizes_temp[owner]; /* index */ 148 todata[toffsets[j]++] = rrow_isids[i]; 149 todata[toffsets[j]++] = remoterows[i]; 150 } 151 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 152 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 153 ierr = PetscFree(toffsets);CHKERRQ(ierr); 154 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 155 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 157 /* send rows belonging to the remote so that then we could get the overlapping data back */ 158 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 159 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 160 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 161 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 162 ierr = PetscFree(fromranks);CHKERRQ(ierr); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 165 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 166 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 167 nrecvrows = 0; 168 for (i=0; i<nto; i++){ 169 for (j=0; j<tosizes[2*i]; j++){ 170 remote[nrecvrows].rank = toranks[i]; 171 remote[nrecvrows++].index = tosizes[2*i+1]+j; 172 } 173 } 174 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 175 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 176 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 177 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 178 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 179 /* overlap communication and computation */ 180 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 181 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 182 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 183 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 184 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 185 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 186 ierr = PetscFree(toranks);CHKERRQ(ierr); 187 ierr = PetscFree(tosizes);CHKERRQ(ierr); 188 ierr = PetscFree(todata);CHKERRQ(ierr); 189 PetscFunctionReturn(0); 190 } 191 192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 193 { 194 PetscInt *isz,isz_i,i,j,is_id, data_size; 195 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 196 const PetscInt *indices_i_temp; 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 max_lsize = 0; 201 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 202 for (i=0; i<nidx; i++){ 203 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 204 max_lsize = lsize>max_lsize ? lsize:max_lsize; 205 isz[i] = lsize; 206 } 207 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 208 for (i=0; i<nidx; i++){ 209 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 210 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 212 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 213 } 214 /* retrieve information to get row id and its overlap */ 215 for (i=0; i<nrecvs; ){ 216 is_id = recvdata[i++]; 217 data_size = recvdata[i++]; 218 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 219 isz_i = isz[is_id]; 220 for (j=0; j< data_size; j++){ 221 col = recvdata[i++]; 222 indices_i[isz_i++] = col; 223 } 224 isz[is_id] = isz_i; 225 } 226 /* remove duplicate entities */ 227 for (i=0; i<nidx; i++){ 228 indices_i = indices_temp+(max_lsize+nrecvs)*i; 229 isz_i = isz[i]; 230 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 231 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 232 } 233 ierr = PetscFree(isz);CHKERRQ(ierr); 234 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 239 { 240 PetscLayout rmap,cmap; 241 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 242 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 243 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 244 const PetscInt *gcols,*ai,*aj,*bi,*bj; 245 Mat amat,bmat; 246 PetscMPIInt rank; 247 PetscBool done; 248 MPI_Comm comm; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 253 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 254 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 255 /* Even if the mat is symmetric, we still assume it is not symmetric */ 256 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 257 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 258 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 259 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 260 /* total number of nonzero values is used to estimate the memory usage in the next step */ 261 tnz = ai[an]+bi[bn]; 262 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 263 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 264 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 265 /* to find the longest message */ 266 max_fszs = 0; 267 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 268 /* better way to estimate number of nonzero in the mat??? */ 269 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 270 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 271 rows_pos = 0; 272 totalrows = 0; 273 for (i=0; i<nfrom; i++){ 274 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 275 /* group data together */ 276 for (j=0; j<fromsizes[2*i]; j+=2){ 277 is_id = fromrows[rows_pos++];/* no of is */ 278 rows_i = rows_data[is_id]; 279 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 280 } 281 /* estimate a space to avoid multiple allocations */ 282 for (j=0; j<nidx; j++){ 283 indvc_ij = 0; 284 rows_i = rows_data[j]; 285 for (l=0; l<rows_pos_i[j]; l++){ 286 row = rows_i[l]-rstart; 287 start = ai[row]; 288 end = ai[row+1]; 289 for (k=start; k<end; k++){ /* Amat */ 290 col = aj[k] + cstart; 291 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 292 } 293 start = bi[row]; 294 end = bi[row+1]; 295 for (k=start; k<end; k++) { /* Bmat */ 296 col = gcols[bj[k]]; 297 indices_tmp[indvc_ij++] = col; 298 } 299 } 300 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 301 indv_counts[i*nidx+j] = indvc_ij; 302 totalrows += indvc_ij; 303 } 304 } 305 /* message triple <no of is, number of rows, rows> */ 306 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 307 totalrows = 0; 308 rows_pos = 0; 309 /* use this code again */ 310 for (i=0;i<nfrom;i++){ 311 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 312 for (j=0; j<fromsizes[2*i]; j+=2){ 313 is_id = fromrows[rows_pos++]; 314 rows_i = rows_data[is_id]; 315 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 316 } 317 /* add data */ 318 for (j=0; j<nidx; j++){ 319 if (!indv_counts[i*nidx+j]) continue; 320 indvc_ij = 0; 321 sbdata[totalrows++] = j; 322 sbdata[totalrows++] = indv_counts[i*nidx+j]; 323 sbsizes[2*i] += 2; 324 rows_i = rows_data[j]; 325 for (l=0; l<rows_pos_i[j]; l++){ 326 row = rows_i[l]-rstart; 327 start = ai[row]; 328 end = ai[row+1]; 329 for (k=start; k<end; k++){ /* Amat */ 330 col = aj[k] + cstart; 331 indices_tmp[indvc_ij++] = col; 332 } 333 start = bi[row]; 334 end = bi[row+1]; 335 for (k=start; k<end; k++) { /* Bmat */ 336 col = gcols[bj[k]]; 337 indices_tmp[indvc_ij++] = col; 338 } 339 } 340 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 341 sbsizes[2*i] += indvc_ij; 342 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 343 totalrows += indvc_ij; 344 } 345 } 346 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 347 for (i=0; i<nfrom; i++){ 348 offsets[i+1] = offsets[i] + sbsizes[2*i]; 349 sbsizes[2*i+1] = offsets[i]; 350 } 351 ierr = PetscFree(offsets);CHKERRQ(ierr); 352 if (sbrowsizes) *sbrowsizes = sbsizes; 353 if (sbrows) *sbrows = sbdata; 354 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 355 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 356 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 357 PetscFunctionReturn(0); 358 } 359 360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 361 { 362 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 363 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 364 PetscInt lsize,lsize_tmp,owner; 365 PetscMPIInt rank; 366 Mat amat,bmat; 367 PetscBool done; 368 PetscLayout cmap,rmap; 369 MPI_Comm comm; 370 PetscErrorCode ierr; 371 372 PetscFunctionBegin; 373 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 374 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 375 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 376 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 377 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 378 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 379 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 380 /* is it a safe way to compute number of nonzero values ? */ 381 tnz = ai[an]+bi[bn]; 382 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 383 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 384 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 385 /* it is a better way to estimate memory than the old implementation 386 * where global size of matrix is used 387 * */ 388 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 389 for (i=0; i<nidx; i++) { 390 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 391 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 392 lsize_tmp = 0; 393 for (j=0; j<lsize; j++) { 394 owner = -1; 395 row = indices[j]; 396 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 397 if (owner != rank) continue; 398 /* local number */ 399 row -= rstart; 400 start = ai[row]; 401 end = ai[row+1]; 402 for (k=start; k<end; k++) { /* Amat */ 403 col = aj[k] + cstart; 404 indices_temp[lsize_tmp++] = col; 405 } 406 start = bi[row]; 407 end = bi[row+1]; 408 for (k=start; k<end; k++) { /* Bmat */ 409 col = gcols[bj[k]]; 410 indices_temp[lsize_tmp++] = col; 411 } 412 } 413 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 414 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 415 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 416 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 417 } 418 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 419 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 420 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 421 PetscFunctionReturn(0); 422 } 423 424 425 /* 426 Sample message format: 427 If a processor A wants processor B to process some elements corresponding 428 to index sets is[1],is[5] 429 mesg [0] = 2 (no of index sets in the mesg) 430 ----------- 431 mesg [1] = 1 => is[1] 432 mesg [2] = sizeof(is[1]); 433 ----------- 434 mesg [3] = 5 => is[5] 435 mesg [4] = sizeof(is[5]); 436 ----------- 437 mesg [5] 438 mesg [n] datas[1] 439 ----------- 440 mesg[n+1] 441 mesg[m] data(is[5]) 442 ----------- 443 444 Notes: 445 nrqs - no of requests sent (or to be sent out) 446 nrqr - no of requests recieved (which have to be or which have been processed 447 */ 448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 449 { 450 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 451 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 452 const PetscInt **idx,*idx_i; 453 PetscInt *n,**data,len; 454 #if defined(PETSC_USE_CTABLE) 455 PetscTable *table_data,table_data_i; 456 PetscInt *tdata,tcount,tcount_max; 457 #else 458 PetscInt *data_i,*d_p; 459 #endif 460 PetscErrorCode ierr; 461 PetscMPIInt size,rank,tag1,tag2; 462 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 463 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 464 PetscBT *table; 465 MPI_Comm comm; 466 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 467 MPI_Status *s_status,*recv_status; 468 char *t_p; 469 470 PetscFunctionBegin; 471 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 472 size = c->size; 473 rank = c->rank; 474 M = C->rmap->N; 475 476 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 477 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 478 479 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 480 481 for (i=0; i<imax; i++) { 482 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 483 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 484 } 485 486 /* evaluate communication - mesg to who,length of mesg, and buffer space 487 required. Based on this, buffers are allocated, and data copied into them */ 488 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 489 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 490 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 491 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 492 for (i=0; i<imax; i++) { 493 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 494 idx_i = idx[i]; 495 len = n[i]; 496 for (j=0; j<len; j++) { 497 row = idx_i[j]; 498 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 499 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 500 w4[proc]++; 501 } 502 for (j=0; j<size; j++) { 503 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 504 } 505 } 506 507 nrqs = 0; /* no of outgoing messages */ 508 msz = 0; /* total mesg length (for all proc */ 509 w1[rank] = 0; /* no mesg sent to intself */ 510 w3[rank] = 0; 511 for (i=0; i<size; i++) { 512 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 513 } 514 /* pa - is list of processors to communicate with */ 515 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 516 for (i=0,j=0; i<size; i++) { 517 if (w1[i]) {pa[j] = i; j++;} 518 } 519 520 /* Each message would have a header = 1 + 2*(no of IS) + data */ 521 for (i=0; i<nrqs; i++) { 522 j = pa[i]; 523 w1[j] += w2[j] + 2*w3[j]; 524 msz += w1[j]; 525 } 526 527 /* Determine the number of messages to expect, their lengths, from from-ids */ 528 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 529 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 530 531 /* Now post the Irecvs corresponding to these messages */ 532 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 533 534 /* Allocate Memory for outgoing messages */ 535 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 536 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 537 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 538 539 { 540 PetscInt *iptr = tmp,ict = 0; 541 for (i=0; i<nrqs; i++) { 542 j = pa[i]; 543 iptr += ict; 544 outdat[j] = iptr; 545 ict = w1[j]; 546 } 547 } 548 549 /* Form the outgoing messages */ 550 /* plug in the headers */ 551 for (i=0; i<nrqs; i++) { 552 j = pa[i]; 553 outdat[j][0] = 0; 554 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 555 ptr[j] = outdat[j] + 2*w3[j] + 1; 556 } 557 558 /* Memory for doing local proc's work */ 559 { 560 PetscInt M_BPB_imax = 0; 561 #if defined(PETSC_USE_CTABLE) 562 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 563 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 564 for (i=0; i<imax; i++) { 565 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 566 } 567 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 568 for (i=0; i<imax; i++) { 569 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 570 } 571 #else 572 PetscInt Mimax = 0; 573 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 574 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 575 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 576 for (i=0; i<imax; i++) { 577 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 578 data[i] = d_p + M*i; 579 } 580 #endif 581 } 582 583 /* Parse the IS and update local tables and the outgoing buf with the data */ 584 { 585 PetscInt n_i,isz_i,*outdat_j,ctr_j; 586 PetscBT table_i; 587 588 for (i=0; i<imax; i++) { 589 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 590 n_i = n[i]; 591 table_i = table[i]; 592 idx_i = idx[i]; 593 #if defined(PETSC_USE_CTABLE) 594 table_data_i = table_data[i]; 595 #else 596 data_i = data[i]; 597 #endif 598 isz_i = isz[i]; 599 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 600 row = idx_i[j]; 601 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 602 if (proc != rank) { /* copy to the outgoing buffer */ 603 ctr[proc]++; 604 *ptr[proc] = row; 605 ptr[proc]++; 606 } else if (!PetscBTLookupSet(table_i,row)) { 607 #if defined(PETSC_USE_CTABLE) 608 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 609 #else 610 data_i[isz_i] = row; /* Update the local table */ 611 #endif 612 isz_i++; 613 } 614 } 615 /* Update the headers for the current IS */ 616 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 617 if ((ctr_j = ctr[j])) { 618 outdat_j = outdat[j]; 619 k = ++outdat_j[0]; 620 outdat_j[2*k] = ctr_j; 621 outdat_j[2*k-1] = i; 622 } 623 } 624 isz[i] = isz_i; 625 } 626 } 627 628 /* Now post the sends */ 629 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 630 for (i=0; i<nrqs; ++i) { 631 j = pa[i]; 632 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 633 } 634 635 /* No longer need the original indices */ 636 for (i=0; i<imax; ++i) { 637 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 638 } 639 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 640 641 for (i=0; i<imax; ++i) { 642 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 643 } 644 645 /* Do Local work */ 646 #if defined(PETSC_USE_CTABLE) 647 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 648 #else 649 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 650 #endif 651 652 /* Receive messages */ 653 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 654 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 655 656 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 657 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 658 659 /* Phase 1 sends are complete - deallocate buffers */ 660 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 661 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 662 663 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 664 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 665 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 666 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 667 ierr = PetscFree(rbuf);CHKERRQ(ierr); 668 669 670 /* Send the data back */ 671 /* Do a global reduction to know the buffer space req for incoming messages */ 672 { 673 PetscMPIInt *rw1; 674 675 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 676 677 for (i=0; i<nrqr; ++i) { 678 proc = recv_status[i].MPI_SOURCE; 679 680 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 681 rw1[proc] = isz1[i]; 682 } 683 ierr = PetscFree(onodes1);CHKERRQ(ierr); 684 ierr = PetscFree(olengths1);CHKERRQ(ierr); 685 686 /* Determine the number of messages to expect, their lengths, from from-ids */ 687 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 688 ierr = PetscFree(rw1);CHKERRQ(ierr); 689 } 690 /* Now post the Irecvs corresponding to these messages */ 691 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 692 693 /* Now post the sends */ 694 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 695 for (i=0; i<nrqr; ++i) { 696 j = recv_status[i].MPI_SOURCE; 697 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 698 } 699 700 /* receive work done on other processors */ 701 { 702 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 703 PetscMPIInt idex; 704 PetscBT table_i; 705 MPI_Status *status2; 706 707 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 708 for (i=0; i<nrqs; ++i) { 709 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 710 /* Process the message */ 711 rbuf2_i = rbuf2[idex]; 712 ct1 = 2*rbuf2_i[0]+1; 713 jmax = rbuf2[idex][0]; 714 for (j=1; j<=jmax; j++) { 715 max = rbuf2_i[2*j]; 716 is_no = rbuf2_i[2*j-1]; 717 isz_i = isz[is_no]; 718 table_i = table[is_no]; 719 #if defined(PETSC_USE_CTABLE) 720 table_data_i = table_data[is_no]; 721 #else 722 data_i = data[is_no]; 723 #endif 724 for (k=0; k<max; k++,ct1++) { 725 row = rbuf2_i[ct1]; 726 if (!PetscBTLookupSet(table_i,row)) { 727 #if defined(PETSC_USE_CTABLE) 728 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 729 #else 730 data_i[isz_i] = row; 731 #endif 732 isz_i++; 733 } 734 } 735 isz[is_no] = isz_i; 736 } 737 } 738 739 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 740 ierr = PetscFree(status2);CHKERRQ(ierr); 741 } 742 743 #if defined(PETSC_USE_CTABLE) 744 tcount_max = 0; 745 for (i=0; i<imax; ++i) { 746 table_data_i = table_data[i]; 747 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 748 if (tcount_max < tcount) tcount_max = tcount; 749 } 750 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 751 #endif 752 753 for (i=0; i<imax; ++i) { 754 #if defined(PETSC_USE_CTABLE) 755 PetscTablePosition tpos; 756 table_data_i = table_data[i]; 757 758 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 759 while (tpos) { 760 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 761 tdata[--j] = --k; 762 } 763 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 764 #else 765 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 766 #endif 767 } 768 769 ierr = PetscFree(onodes2);CHKERRQ(ierr); 770 ierr = PetscFree(olengths2);CHKERRQ(ierr); 771 772 ierr = PetscFree(pa);CHKERRQ(ierr); 773 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 774 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 775 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 776 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 777 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 778 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 779 ierr = PetscFree(s_status);CHKERRQ(ierr); 780 ierr = PetscFree(recv_status);CHKERRQ(ierr); 781 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 782 ierr = PetscFree(xdata);CHKERRQ(ierr); 783 ierr = PetscFree(isz1);CHKERRQ(ierr); 784 #if defined(PETSC_USE_CTABLE) 785 for (i=0; i<imax; i++) { 786 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 787 } 788 ierr = PetscFree(table_data);CHKERRQ(ierr); 789 ierr = PetscFree(tdata);CHKERRQ(ierr); 790 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 791 #else 792 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 793 #endif 794 PetscFunctionReturn(0); 795 } 796 797 /* 798 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 799 the work on the local processor. 800 801 Inputs: 802 C - MAT_MPIAIJ; 803 imax - total no of index sets processed at a time; 804 table - an array of char - size = m bits. 805 806 Output: 807 isz - array containing the count of the solution elements corresponding 808 to each index set; 809 data or table_data - pointer to the solutions 810 */ 811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 812 { 813 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 814 Mat A = c->A,B = c->B; 815 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 816 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 817 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 818 PetscBT table_i; 819 #if defined(PETSC_USE_CTABLE) 820 PetscTable table_data_i; 821 PetscErrorCode ierr; 822 PetscTablePosition tpos; 823 PetscInt tcount,*tdata; 824 #else 825 PetscInt *data_i; 826 #endif 827 828 PetscFunctionBegin; 829 rstart = C->rmap->rstart; 830 cstart = C->cmap->rstart; 831 ai = a->i; 832 aj = a->j; 833 bi = b->i; 834 bj = b->j; 835 garray = c->garray; 836 837 for (i=0; i<imax; i++) { 838 #if defined(PETSC_USE_CTABLE) 839 /* copy existing entries of table_data_i into tdata[] */ 840 table_data_i = table_data[i]; 841 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 842 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 843 844 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 845 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 846 while (tpos) { 847 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 848 tdata[--j] = --row; 849 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 850 } 851 #else 852 data_i = data[i]; 853 #endif 854 table_i = table[i]; 855 isz_i = isz[i]; 856 max = isz[i]; 857 858 for (j=0; j<max; j++) { 859 #if defined(PETSC_USE_CTABLE) 860 row = tdata[j] - rstart; 861 #else 862 row = data_i[j] - rstart; 863 #endif 864 start = ai[row]; 865 end = ai[row+1]; 866 for (k=start; k<end; k++) { /* Amat */ 867 val = aj[k] + cstart; 868 if (!PetscBTLookupSet(table_i,val)) { 869 #if defined(PETSC_USE_CTABLE) 870 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 871 #else 872 data_i[isz_i] = val; 873 #endif 874 isz_i++; 875 } 876 } 877 start = bi[row]; 878 end = bi[row+1]; 879 for (k=start; k<end; k++) { /* Bmat */ 880 val = garray[bj[k]]; 881 if (!PetscBTLookupSet(table_i,val)) { 882 #if defined(PETSC_USE_CTABLE) 883 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 884 #else 885 data_i[isz_i] = val; 886 #endif 887 isz_i++; 888 } 889 } 890 } 891 isz[i] = isz_i; 892 893 #if defined(PETSC_USE_CTABLE) 894 ierr = PetscFree(tdata);CHKERRQ(ierr); 895 #endif 896 } 897 PetscFunctionReturn(0); 898 } 899 900 /* 901 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 902 and return the output 903 904 Input: 905 C - the matrix 906 nrqr - no of messages being processed. 907 rbuf - an array of pointers to the recieved requests 908 909 Output: 910 xdata - array of messages to be sent back 911 isz1 - size of each message 912 913 For better efficiency perhaps we should malloc separately each xdata[i], 914 then if a remalloc is required we need only copy the data for that one row 915 rather then all previous rows as it is now where a single large chunck of 916 memory is used. 917 918 */ 919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 920 { 921 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 922 Mat A = c->A,B = c->B; 923 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 924 PetscErrorCode ierr; 925 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 926 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 927 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 928 PetscInt *rbuf_i,kmax,rbuf_0; 929 PetscBT xtable; 930 931 PetscFunctionBegin; 932 m = C->rmap->N; 933 rstart = C->rmap->rstart; 934 cstart = C->cmap->rstart; 935 ai = a->i; 936 aj = a->j; 937 bi = b->i; 938 bj = b->j; 939 garray = c->garray; 940 941 942 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 943 rbuf_i = rbuf[i]; 944 rbuf_0 = rbuf_i[0]; 945 ct += rbuf_0; 946 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 947 } 948 949 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 950 else max1 = 1; 951 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 952 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 953 ++no_malloc; 954 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 955 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 956 957 ct3 = 0; 958 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 959 rbuf_i = rbuf[i]; 960 rbuf_0 = rbuf_i[0]; 961 ct1 = 2*rbuf_0+1; 962 ct2 = ct1; 963 ct3 += ct1; 964 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 965 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 966 oct2 = ct2; 967 kmax = rbuf_i[2*j]; 968 for (k=0; k<kmax; k++,ct1++) { 969 row = rbuf_i[ct1]; 970 if (!PetscBTLookupSet(xtable,row)) { 971 if (!(ct3 < mem_estimate)) { 972 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 973 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 974 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 975 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 976 xdata[0] = tmp; 977 mem_estimate = new_estimate; ++no_malloc; 978 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 979 } 980 xdata[i][ct2++] = row; 981 ct3++; 982 } 983 } 984 for (k=oct2,max2=ct2; k<max2; k++) { 985 row = xdata[i][k] - rstart; 986 start = ai[row]; 987 end = ai[row+1]; 988 for (l=start; l<end; l++) { 989 val = aj[l] + cstart; 990 if (!PetscBTLookupSet(xtable,val)) { 991 if (!(ct3 < mem_estimate)) { 992 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 993 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 994 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 995 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 996 xdata[0] = tmp; 997 mem_estimate = new_estimate; ++no_malloc; 998 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 999 } 1000 xdata[i][ct2++] = val; 1001 ct3++; 1002 } 1003 } 1004 start = bi[row]; 1005 end = bi[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = garray[bj[l]]; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 } 1023 /* Update the header*/ 1024 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1025 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1026 } 1027 xdata[i][0] = rbuf_0; 1028 xdata[i+1] = xdata[i] + ct2; 1029 isz1[i] = ct2; /* size of each message */ 1030 } 1031 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1032 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1033 PetscFunctionReturn(0); 1034 } 1035 /* -------------------------------------------------------------------------*/ 1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1038 /* 1039 Every processor gets the entire matrix 1040 */ 1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1042 { 1043 Mat B; 1044 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1045 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1046 PetscErrorCode ierr; 1047 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1048 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1049 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1050 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1051 1052 PetscFunctionBegin; 1053 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1054 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1055 1056 if (scall == MAT_INITIAL_MATRIX) { 1057 /* ---------------------------------------------------------------- 1058 Tell every processor the number of nonzeros per row 1059 */ 1060 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1061 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1062 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1063 } 1064 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1065 for (i=0; i<size; i++) { 1066 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1067 displs[i] = A->rmap->range[i]; 1068 } 1069 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1070 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1071 #else 1072 sendcount = A->rmap->rend - A->rmap->rstart; 1073 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1074 #endif 1075 /* --------------------------------------------------------------- 1076 Create the sequential matrix of the same type as the local block diagonal 1077 */ 1078 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1079 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1080 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1081 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1082 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1083 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1084 **Bin = B; 1085 b = (Mat_SeqAIJ*)B->data; 1086 1087 /*-------------------------------------------------------------------- 1088 Copy my part of matrix column indices over 1089 */ 1090 sendcount = ad->nz + bd->nz; 1091 jsendbuf = b->j + b->i[rstarts[rank]]; 1092 a_jsendbuf = ad->j; 1093 b_jsendbuf = bd->j; 1094 n = A->rmap->rend - A->rmap->rstart; 1095 cnt = 0; 1096 for (i=0; i<n; i++) { 1097 1098 /* put in lower diagonal portion */ 1099 m = bd->i[i+1] - bd->i[i]; 1100 while (m > 0) { 1101 /* is it above diagonal (in bd (compressed) numbering) */ 1102 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1103 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1104 m--; 1105 } 1106 1107 /* put in diagonal portion */ 1108 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1109 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1110 } 1111 1112 /* put in upper diagonal portion */ 1113 while (m-- > 0) { 1114 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1115 } 1116 } 1117 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1118 1119 /*-------------------------------------------------------------------- 1120 Gather all column indices to all processors 1121 */ 1122 for (i=0; i<size; i++) { 1123 recvcounts[i] = 0; 1124 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1125 recvcounts[i] += lens[j]; 1126 } 1127 } 1128 displs[0] = 0; 1129 for (i=1; i<size; i++) { 1130 displs[i] = displs[i-1] + recvcounts[i-1]; 1131 } 1132 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1133 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1134 #else 1135 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1136 #endif 1137 /*-------------------------------------------------------------------- 1138 Assemble the matrix into useable form (note numerical values not yet set) 1139 */ 1140 /* set the b->ilen (length of each row) values */ 1141 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1142 /* set the b->i indices */ 1143 b->i[0] = 0; 1144 for (i=1; i<=A->rmap->N; i++) { 1145 b->i[i] = b->i[i-1] + lens[i-1]; 1146 } 1147 ierr = PetscFree(lens);CHKERRQ(ierr); 1148 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1149 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1150 1151 } else { 1152 B = **Bin; 1153 b = (Mat_SeqAIJ*)B->data; 1154 } 1155 1156 /*-------------------------------------------------------------------- 1157 Copy my part of matrix numerical values into the values location 1158 */ 1159 if (flag == MAT_GET_VALUES) { 1160 sendcount = ad->nz + bd->nz; 1161 sendbuf = b->a + b->i[rstarts[rank]]; 1162 a_sendbuf = ad->a; 1163 b_sendbuf = bd->a; 1164 b_sendj = bd->j; 1165 n = A->rmap->rend - A->rmap->rstart; 1166 cnt = 0; 1167 for (i=0; i<n; i++) { 1168 1169 /* put in lower diagonal portion */ 1170 m = bd->i[i+1] - bd->i[i]; 1171 while (m > 0) { 1172 /* is it above diagonal (in bd (compressed) numbering) */ 1173 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1174 sendbuf[cnt++] = *b_sendbuf++; 1175 m--; 1176 b_sendj++; 1177 } 1178 1179 /* put in diagonal portion */ 1180 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1181 sendbuf[cnt++] = *a_sendbuf++; 1182 } 1183 1184 /* put in upper diagonal portion */ 1185 while (m-- > 0) { 1186 sendbuf[cnt++] = *b_sendbuf++; 1187 b_sendj++; 1188 } 1189 } 1190 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1191 1192 /* ----------------------------------------------------------------- 1193 Gather all numerical values to all processors 1194 */ 1195 if (!recvcounts) { 1196 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1197 } 1198 for (i=0; i<size; i++) { 1199 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1200 } 1201 displs[0] = 0; 1202 for (i=1; i<size; i++) { 1203 displs[i] = displs[i-1] + recvcounts[i-1]; 1204 } 1205 recvbuf = b->a; 1206 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1207 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1208 #else 1209 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1210 #endif 1211 } /* endof (flag == MAT_GET_VALUES) */ 1212 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1213 1214 if (A->symmetric) { 1215 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1216 } else if (A->hermitian) { 1217 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1218 } else if (A->structurally_symmetric) { 1219 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1220 } 1221 PetscFunctionReturn(0); 1222 } 1223 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1224 { 1225 PetscErrorCode ierr; 1226 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1227 Mat_SubMat *submatj = c->submatis1; 1228 PetscInt i; 1229 1230 PetscFunctionBegin; 1231 ierr = submatj->destroy(C);CHKERRQ(ierr); 1232 1233 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1234 1235 for (i=0; i<submatj->nrqr; ++i) { 1236 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1237 } 1238 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1239 1240 if (submatj->rbuf1) { 1241 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1242 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1243 } 1244 1245 for (i=0; i<submatj->nrqs; ++i) { 1246 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1247 } 1248 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1249 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1250 1251 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 1252 1253 #if defined(PETSC_USE_CTABLE) 1254 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1255 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1256 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1257 #else 1258 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1259 #endif 1260 1261 if (!submatj->allcolumns) { 1262 #if defined(PETSC_USE_CTABLE) 1263 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1264 #else 1265 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1266 #endif 1267 } 1268 1269 ierr = PetscFree(submatj);CHKERRQ(ierr); 1270 PetscFunctionReturn(0); 1271 } 1272 1273 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1274 { 1275 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1276 Mat submat,A = c->A,B = c->B; 1277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1278 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1279 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1280 const PetscInt *icol,*irow; 1281 PetscInt nrow,ncol,start; 1282 PetscErrorCode ierr; 1283 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1284 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1285 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1286 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1287 PetscInt *lens,rmax,ncols,*cols,Crow; 1288 #if defined(PETSC_USE_CTABLE) 1289 PetscTable cmap,rmap; 1290 PetscInt *cmap_loc,*rmap_loc; 1291 #else 1292 PetscInt *cmap,*rmap; 1293 #endif 1294 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1295 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1296 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1297 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1298 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1299 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1300 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1301 MPI_Comm comm; 1302 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1303 PetscMPIInt *onodes1,*olengths1,idex,end; 1304 Mat_SubMat *smatis1; 1305 PetscBool isrowsorted; 1306 1307 PetscFunctionBegin; 1308 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1309 1310 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1311 size = c->size; 1312 rank = c->rank; 1313 1314 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1315 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1316 1317 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1318 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1319 if (allcolumns) { 1320 icol = NULL; 1321 ncol = C->cmap->N; 1322 } else { 1323 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1324 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1325 } 1326 1327 if (scall == MAT_INITIAL_MATRIX) { 1328 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1329 1330 /* Get some new tags to keep the communication clean */ 1331 tag1 = ((PetscObject)C)->tag; 1332 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1333 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1334 1335 /* evaluate communication - mesg to who, length of mesg, and buffer space 1336 required. Based on this, buffers are allocated, and data copied into them */ 1337 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1338 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1339 1340 /* w1[proc] = num of rows owned by proc -- to be requested */ 1341 proc = 0; 1342 nrqs = 0; /* num of outgoing messages */ 1343 for (j=0; j<nrow; j++) { 1344 row = irow[j]; /* sorted! */ 1345 while (row >= C->rmap->range[proc+1]) proc++; 1346 w1[proc]++; 1347 row2proc[j] = proc; /* map row index to proc */ 1348 1349 if (proc != rank && !w2[proc]) { 1350 w2[proc] = 1; nrqs++; 1351 } 1352 } 1353 w1[rank] = 0; /* rows owned by self will not be requested */ 1354 1355 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1356 for (proc=0,j=0; proc<size; proc++) { 1357 if (w1[proc]) { pa[j++] = proc;} 1358 } 1359 1360 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1361 msz = 0; /* total mesg length (for all procs) */ 1362 for (i=0; i<nrqs; i++) { 1363 proc = pa[i]; 1364 w1[proc] += 3; 1365 msz += w1[proc]; 1366 } 1367 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1368 1369 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1370 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1371 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1372 1373 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1374 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1375 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1376 1377 /* Now post the Irecvs corresponding to these messages */ 1378 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1379 1380 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1381 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1382 1383 /* Allocate Memory for outgoing messages */ 1384 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1385 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1386 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1387 1388 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1389 iptr = tmp; 1390 for (i=0; i<nrqs; i++) { 1391 proc = pa[i]; 1392 sbuf1[proc] = iptr; 1393 iptr += w1[proc]; 1394 } 1395 1396 /* Form the outgoing messages */ 1397 /* Initialize the header space */ 1398 for (i=0; i<nrqs; i++) { 1399 proc = pa[i]; 1400 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1401 ptr[proc] = sbuf1[proc] + 3; 1402 } 1403 1404 /* Parse the isrow and copy data into outbuf */ 1405 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1406 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1407 proc = row2proc[j]; 1408 if (proc != rank) { /* copy to the outgoing buf*/ 1409 *ptr[proc] = irow[j]; 1410 ctr[proc]++; ptr[proc]++; 1411 } 1412 } 1413 1414 /* Update the headers for the current IS */ 1415 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1416 if ((ctr_j = ctr[j])) { 1417 sbuf1_j = sbuf1[j]; 1418 k = ++sbuf1_j[0]; 1419 sbuf1_j[2*k] = ctr_j; 1420 sbuf1_j[2*k-1] = 0; 1421 } 1422 } 1423 1424 /* Now post the sends */ 1425 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1426 for (i=0; i<nrqs; ++i) { 1427 proc = pa[i]; 1428 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1429 } 1430 1431 /* Post Receives to capture the buffer size */ 1432 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1433 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1434 1435 rbuf2[0] = tmp + msz; 1436 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1437 1438 for (i=0; i<nrqs; ++i) { 1439 proc = pa[i]; 1440 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1441 } 1442 1443 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1444 1445 /* Send to other procs the buf size they should allocate */ 1446 /* Receive messages*/ 1447 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1448 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1449 1450 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1451 for (i=0; i<nrqr; ++i) { 1452 req_size[i] = 0; 1453 rbuf1_i = rbuf1[i]; 1454 start = 2*rbuf1_i[0] + 1; 1455 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1456 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1457 sbuf2_i = sbuf2[i]; 1458 for (j=start; j<end; j++) { 1459 k = rbuf1_i[j] - rstart; 1460 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1461 sbuf2_i[j] = ncols; 1462 req_size[i] += ncols; 1463 } 1464 req_source1[i] = r_status1[i].MPI_SOURCE; 1465 1466 /* form the header */ 1467 sbuf2_i[0] = req_size[i]; 1468 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1469 1470 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1471 } 1472 1473 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1474 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1475 1476 /* rbuf2 is received, Post recv column indices a->j */ 1477 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1478 1479 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1480 for (i=0; i<nrqs; ++i) { 1481 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1482 req_source2[i] = r_status2[i].MPI_SOURCE; 1483 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1484 } 1485 1486 /* Wait on sends1 and sends2 */ 1487 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1488 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1489 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1490 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1491 1492 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1493 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1494 1495 /* Now allocate sending buffers for a->j, and send them off */ 1496 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1497 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1498 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1499 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1500 1501 for (i=0; i<nrqr; i++) { /* for each requested message */ 1502 rbuf1_i = rbuf1[i]; 1503 sbuf_aj_i = sbuf_aj[i]; 1504 ct1 = 2*rbuf1_i[0] + 1; 1505 ct2 = 0; 1506 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1507 1508 kmax = rbuf1[i][2]; 1509 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1510 row = rbuf1_i[ct1] - rstart; 1511 nzA = ai[row+1] - ai[row]; 1512 nzB = bi[row+1] - bi[row]; 1513 ncols = nzA + nzB; 1514 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1515 1516 /* load the column indices for this row into cols*/ 1517 cols = sbuf_aj_i + ct2; 1518 1519 lwrite = 0; 1520 for (l=0; l<nzB; l++) { 1521 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1522 } 1523 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1524 for (l=0; l<nzB; l++) { 1525 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1526 } 1527 1528 ct2 += ncols; 1529 } 1530 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1531 } 1532 1533 /* create column map (cmap): global col of C -> local col of submat */ 1534 #if defined(PETSC_USE_CTABLE) 1535 if (!allcolumns) { 1536 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1537 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1538 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1539 if (icol[j] >= cstart && icol[j] <cend) { 1540 cmap_loc[icol[j] - cstart] = j+1; 1541 } else { /* use PetscTable for non-local col indices */ 1542 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1543 } 1544 } 1545 } else { 1546 cmap = NULL; 1547 cmap_loc = NULL; 1548 } 1549 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1550 #else 1551 if (!allcolumns) { 1552 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1553 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1554 } else { 1555 cmap = NULL; 1556 } 1557 #endif 1558 1559 /* Create lens for MatSeqAIJSetPreallocation() */ 1560 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1561 1562 /* Compute lens from local part of C */ 1563 for (j=0; j<nrow; j++) { 1564 row = irow[j]; 1565 proc = row2proc[j]; 1566 if (proc == rank) { 1567 /* diagonal part A = c->A */ 1568 ncols = ai[row-rstart+1] - ai[row-rstart]; 1569 cols = aj + ai[row-rstart]; 1570 if (!allcolumns) { 1571 for (k=0; k<ncols; k++) { 1572 #if defined(PETSC_USE_CTABLE) 1573 tcol = cmap_loc[cols[k]]; 1574 #else 1575 tcol = cmap[cols[k]+cstart]; 1576 #endif 1577 if (tcol) lens[j]++; 1578 } 1579 } else { /* allcolumns */ 1580 lens[j] = ncols; 1581 } 1582 1583 /* off-diagonal part B = c->B */ 1584 ncols = bi[row-rstart+1] - bi[row-rstart]; 1585 cols = bj + bi[row-rstart]; 1586 if (!allcolumns) { 1587 for (k=0; k<ncols; k++) { 1588 #if defined(PETSC_USE_CTABLE) 1589 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1590 #else 1591 tcol = cmap[bmap[cols[k]]]; 1592 #endif 1593 if (tcol) lens[j]++; 1594 } 1595 } else { /* allcolumns */ 1596 lens[j] += ncols; 1597 } 1598 } 1599 } 1600 1601 /* Create row map (rmap): global row of C -> local row of submat */ 1602 #if defined(PETSC_USE_CTABLE) 1603 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1604 for (j=0; j<nrow; j++) { 1605 row = irow[j]; 1606 proc = row2proc[j]; 1607 if (proc == rank) { /* a local row */ 1608 rmap_loc[row - rstart] = j; 1609 } else { 1610 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1611 } 1612 } 1613 #else 1614 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1615 for (j=0; j<nrow; j++) { 1616 rmap[irow[j]] = j; 1617 } 1618 #endif 1619 1620 /* Update lens from offproc data */ 1621 /* recv a->j is done */ 1622 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1623 for (i=0; i<nrqs; i++) { 1624 proc = pa[i]; 1625 sbuf1_i = sbuf1[proc]; 1626 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1627 ct1 = 2 + 1; 1628 ct2 = 0; 1629 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1630 rbuf3_i = rbuf3[i]; /* received C->j */ 1631 1632 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1633 max1 = sbuf1_i[2]; 1634 for (k=0; k<max1; k++,ct1++) { 1635 #if defined(PETSC_USE_CTABLE) 1636 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1637 row--; 1638 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1639 #else 1640 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1641 #endif 1642 /* Now, store row index of submat in sbuf1_i[ct1] */ 1643 sbuf1_i[ct1] = row; 1644 1645 nnz = rbuf2_i[ct1]; 1646 if (!allcolumns) { 1647 for (l=0; l<nnz; l++,ct2++) { 1648 #if defined(PETSC_USE_CTABLE) 1649 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1650 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1651 } else { 1652 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1653 } 1654 #else 1655 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1656 #endif 1657 if (tcol) lens[row]++; 1658 } 1659 } else { /* allcolumns */ 1660 lens[row] += nnz; 1661 } 1662 } 1663 } 1664 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1665 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1666 1667 /* Create the submatrices */ 1668 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1669 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1670 1671 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1672 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1673 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1674 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1675 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1676 1677 /* create struct Mat_SubMat and attached it to submat */ 1678 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1679 subc = (Mat_SeqAIJ*)submat->data; 1680 subc->submatis1 = smatis1; 1681 1682 smatis1->nrqs = nrqs; 1683 smatis1->nrqr = nrqr; 1684 smatis1->rbuf1 = rbuf1; 1685 smatis1->rbuf2 = rbuf2; 1686 smatis1->rbuf3 = rbuf3; 1687 smatis1->sbuf2 = sbuf2; 1688 smatis1->req_source2 = req_source2; 1689 1690 smatis1->sbuf1 = sbuf1; 1691 smatis1->ptr = ptr; 1692 smatis1->tmp = tmp; 1693 smatis1->ctr = ctr; 1694 1695 smatis1->pa = pa; 1696 smatis1->req_size = req_size; 1697 smatis1->req_source1 = req_source1; 1698 1699 smatis1->allcolumns = allcolumns; 1700 smatis1->row2proc = row2proc; 1701 smatis1->rmap = rmap; 1702 smatis1->cmap = cmap; 1703 #if defined(PETSC_USE_CTABLE) 1704 smatis1->rmap_loc = rmap_loc; 1705 smatis1->cmap_loc = cmap_loc; 1706 #endif 1707 1708 smatis1->destroy = submat->ops->destroy; 1709 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1710 submat->factortype = C->factortype; 1711 1712 /* compute rmax */ 1713 rmax = 0; 1714 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1715 1716 } else { /* scall == MAT_REUSE_MATRIX */ 1717 submat = submats[0]; 1718 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1719 1720 subc = (Mat_SeqAIJ*)submat->data; 1721 rmax = subc->rmax; 1722 smatis1 = subc->submatis1; 1723 nrqs = smatis1->nrqs; 1724 nrqr = smatis1->nrqr; 1725 rbuf1 = smatis1->rbuf1; 1726 rbuf2 = smatis1->rbuf2; 1727 rbuf3 = smatis1->rbuf3; 1728 req_source2 = smatis1->req_source2; 1729 1730 sbuf1 = smatis1->sbuf1; 1731 sbuf2 = smatis1->sbuf2; 1732 ptr = smatis1->ptr; 1733 tmp = smatis1->tmp; 1734 ctr = smatis1->ctr; 1735 1736 pa = smatis1->pa; 1737 req_size = smatis1->req_size; 1738 req_source1 = smatis1->req_source1; 1739 1740 allcolumns = smatis1->allcolumns; 1741 row2proc = smatis1->row2proc; 1742 rmap = smatis1->rmap; 1743 cmap = smatis1->cmap; 1744 #if defined(PETSC_USE_CTABLE) 1745 rmap_loc = smatis1->rmap_loc; 1746 cmap_loc = smatis1->cmap_loc; 1747 #endif 1748 } 1749 1750 /* Post recv matrix values */ 1751 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1752 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1753 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1754 for (i=0; i<nrqs; ++i) { 1755 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1756 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1757 } 1758 1759 /* Allocate sending buffers for a->a, and send them off */ 1760 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1761 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1762 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1763 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1764 1765 for (i=0; i<nrqr; i++) { 1766 rbuf1_i = rbuf1[i]; 1767 sbuf_aa_i = sbuf_aa[i]; 1768 ct1 = 2*rbuf1_i[0]+1; 1769 ct2 = 0; 1770 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1771 1772 kmax = rbuf1_i[2]; 1773 for (k=0; k<kmax; k++,ct1++) { 1774 row = rbuf1_i[ct1] - rstart; 1775 nzA = ai[row+1] - ai[row]; 1776 nzB = bi[row+1] - bi[row]; 1777 ncols = nzA + nzB; 1778 cworkB = bj + bi[row]; 1779 vworkA = a_a + ai[row]; 1780 vworkB = b_a + bi[row]; 1781 1782 /* load the column values for this row into vals*/ 1783 vals = sbuf_aa_i + ct2; 1784 1785 lwrite = 0; 1786 for (l=0; l<nzB; l++) { 1787 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1788 } 1789 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1790 for (l=0; l<nzB; l++) { 1791 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1792 } 1793 1794 ct2 += ncols; 1795 } 1796 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1797 } 1798 1799 /* Assemble submat */ 1800 /* First assemble the local rows */ 1801 for (j=0; j<nrow; j++) { 1802 row = irow[j]; 1803 proc = row2proc[j]; 1804 if (proc == rank) { 1805 Crow = row - rstart; /* local row index of C */ 1806 #if defined(PETSC_USE_CTABLE) 1807 row = rmap_loc[Crow]; /* row index of submat */ 1808 #else 1809 row = rmap[row]; 1810 #endif 1811 1812 if (allcolumns) { 1813 /* diagonal part A = c->A */ 1814 ncols = ai[Crow+1] - ai[Crow]; 1815 cols = aj + ai[Crow]; 1816 vals = a->a + ai[Crow]; 1817 i = 0; 1818 for (k=0; k<ncols; k++) { 1819 subcols[i] = cols[k] + cstart; 1820 subvals[i++] = vals[k]; 1821 } 1822 1823 /* off-diagonal part B = c->B */ 1824 ncols = bi[Crow+1] - bi[Crow]; 1825 cols = bj + bi[Crow]; 1826 vals = b->a + bi[Crow]; 1827 for (k=0; k<ncols; k++) { 1828 subcols[i] = bmap[cols[k]]; 1829 subvals[i++] = vals[k]; 1830 } 1831 1832 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1833 1834 } else { /* !allcolumns */ 1835 #if defined(PETSC_USE_CTABLE) 1836 /* diagonal part A = c->A */ 1837 ncols = ai[Crow+1] - ai[Crow]; 1838 cols = aj + ai[Crow]; 1839 vals = a->a + ai[Crow]; 1840 i = 0; 1841 for (k=0; k<ncols; k++) { 1842 tcol = cmap_loc[cols[k]]; 1843 if (tcol) { 1844 subcols[i] = --tcol; 1845 subvals[i++] = vals[k]; 1846 } 1847 } 1848 1849 /* off-diagonal part B = c->B */ 1850 ncols = bi[Crow+1] - bi[Crow]; 1851 cols = bj + bi[Crow]; 1852 vals = b->a + bi[Crow]; 1853 for (k=0; k<ncols; k++) { 1854 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1855 if (tcol) { 1856 subcols[i] = --tcol; 1857 subvals[i++] = vals[k]; 1858 } 1859 } 1860 #else 1861 /* diagonal part A = c->A */ 1862 ncols = ai[Crow+1] - ai[Crow]; 1863 cols = aj + ai[Crow]; 1864 vals = a->a + ai[Crow]; 1865 i = 0; 1866 for (k=0; k<ncols; k++) { 1867 tcol = cmap[cols[k]+cstart]; 1868 if (tcol) { 1869 subcols[i] = --tcol; 1870 subvals[i++] = vals[k]; 1871 } 1872 } 1873 1874 /* off-diagonal part B = c->B */ 1875 ncols = bi[Crow+1] - bi[Crow]; 1876 cols = bj + bi[Crow]; 1877 vals = b->a + bi[Crow]; 1878 for (k=0; k<ncols; k++) { 1879 tcol = cmap[bmap[cols[k]]]; 1880 if (tcol) { 1881 subcols[i] = --tcol; 1882 subvals[i++] = vals[k]; 1883 } 1884 } 1885 #endif 1886 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1887 } 1888 } 1889 } 1890 1891 /* Now assemble the off-proc rows */ 1892 for (i=0; i<nrqs; i++) { /* for each requested message */ 1893 /* recv values from other processes */ 1894 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1895 proc = pa[idex]; 1896 sbuf1_i = sbuf1[proc]; 1897 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1898 ct1 = 2 + 1; 1899 ct2 = 0; /* count of received C->j */ 1900 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1901 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1902 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1903 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1904 1905 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1906 max1 = sbuf1_i[2]; /* num of rows */ 1907 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1908 row = sbuf1_i[ct1]; /* row index of submat */ 1909 if (!allcolumns) { 1910 idex = 0; 1911 if (scall == MAT_INITIAL_MATRIX) { 1912 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1913 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1914 #if defined(PETSC_USE_CTABLE) 1915 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1916 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1917 } else { 1918 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1919 } 1920 #else 1921 tcol = cmap[rbuf3_i[ct2]]; 1922 #endif 1923 if (tcol) { 1924 subcols[idex] = --tcol; 1925 subvals[idex++] = rbuf4_i[ct2]; 1926 1927 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1928 For reuse, we replace received C->j with index that should be inserted to submat */ 1929 rbuf3_i[ct3++] = ct2; 1930 } 1931 } 1932 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1933 1934 } else { /* scall == MAT_REUSE_MATRIX */ 1935 submat = submats[0]; 1936 subc = (Mat_SeqAIJ*)submat->data; 1937 1938 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1939 for (l=0; l<nnz; l++) { 1940 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1941 subvals[idex++] = rbuf4_i[ct2]; 1942 } 1943 1944 bj = subc->j + subc->i[row]; 1945 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1946 } 1947 } else { /* allcolumns */ 1948 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1949 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1950 ct2 += nnz; 1951 } 1952 } 1953 } 1954 1955 /* sending a->a are done */ 1956 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1957 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1958 1959 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1960 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1961 submats[0] = submat; 1962 1963 /* Restore the indices */ 1964 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1965 if (!allcolumns) { 1966 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1967 } 1968 1969 /* Destroy allocated memory */ 1970 for (i=0; i<nrqs; ++i) { 1971 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1972 } 1973 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1974 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1975 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1976 1977 if (scall == MAT_INITIAL_MATRIX) { 1978 ierr = PetscFree(lens);CHKERRQ(ierr); 1979 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1980 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1981 } 1982 PetscFunctionReturn(0); 1983 } 1984 1985 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1986 { 1987 PetscErrorCode ierr; 1988 PetscInt ncol; 1989 PetscBool colflag,allcolumns=PETSC_FALSE; 1990 1991 PetscFunctionBegin; 1992 /* Allocate memory to hold all the submatrices */ 1993 if (scall != MAT_REUSE_MATRIX) { 1994 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 1995 } 1996 1997 /* Check for special case: each processor gets entire matrix columns */ 1998 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1999 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2000 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2001 2002 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2003 PetscFunctionReturn(0); 2004 } 2005 2006 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2007 { 2008 PetscErrorCode ierr; 2009 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2010 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2011 2012 PetscFunctionBegin; 2013 #if 0 2014 /* Check for special case: each processor gets entire matrix */ 2015 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2016 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2017 PetscFunctionReturn(0); 2018 } 2019 #endif 2020 2021 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2022 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2023 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2024 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2025 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2026 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2027 wantallmatrix = PETSC_TRUE; 2028 2029 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2030 } 2031 } 2032 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2033 if (twantallmatrix) { 2034 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2035 PetscFunctionReturn(0); 2036 } 2037 2038 /* Allocate memory to hold all the submatrices */ 2039 if (scall != MAT_REUSE_MATRIX) { 2040 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2041 } 2042 2043 /* Check for special case: each processor gets entire matrix columns */ 2044 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2045 for (i=0; i<ismax; i++) { 2046 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2047 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2048 if (colflag && ncol == C->cmap->N) { 2049 allcolumns[i] = PETSC_TRUE; 2050 } else { 2051 allcolumns[i] = PETSC_FALSE; 2052 } 2053 } 2054 2055 /* Determine the number of stages through which submatrices are done */ 2056 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2057 2058 /* 2059 Each stage will extract nmax submatrices. 2060 nmax is determined by the matrix column dimension. 2061 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2062 */ 2063 if (!nmax) nmax = 1; 2064 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 2065 2066 /* Make sure every processor loops through the nstages */ 2067 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2068 2069 for (i=0,pos=0; i<nstages; i++) { 2070 if (pos+nmax <= ismax) max_no = nmax; 2071 else if (pos == ismax) max_no = 0; 2072 else max_no = ismax-pos; 2073 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2074 pos += max_no; 2075 } 2076 2077 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2078 PetscFunctionReturn(0); 2079 } 2080 2081 /* -------------------------------------------------------------------------*/ 2082 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2083 { 2084 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2085 Mat A = c->A; 2086 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2087 const PetscInt **icol,**irow; 2088 PetscInt *nrow,*ncol,start; 2089 PetscErrorCode ierr; 2090 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2091 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2092 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2093 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2094 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2095 #if defined(PETSC_USE_CTABLE) 2096 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2097 #else 2098 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2099 #endif 2100 const PetscInt *irow_i; 2101 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2102 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2103 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2104 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2105 MPI_Status *r_status3,*r_status4,*s_status4; 2106 MPI_Comm comm; 2107 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2108 PetscMPIInt *onodes1,*olengths1; 2109 PetscMPIInt idex,idex2,end; 2110 PetscInt **row2proc,*row2proc_i; 2111 Mat_SubMat **smats,*smat_i; 2112 PetscBool *issorted; 2113 2114 PetscFunctionBegin; 2115 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2116 tag0 = ((PetscObject)C)->tag; 2117 size = c->size; 2118 rank = c->rank; 2119 //printf("[%d] MatGetSubMatrices_MPIAIJ_Local, scall %d\n",rank,scall); 2120 2121 /* Get some new tags to keep the communication clean */ 2122 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2123 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2124 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2125 2126 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2127 ierr = PetscMalloc1(ismax,&issorted);CHKERRQ(ierr); 2128 2129 for (i=0; i<ismax; i++) { 2130 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2131 2132 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2133 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2134 if (allcolumns[i]) { 2135 icol[i] = NULL; 2136 ncol[i] = C->cmap->N; 2137 } else { 2138 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2139 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2140 } 2141 } 2142 2143 2144 ierr = PetscMalloc1(ismax,&smats);CHKERRQ(ierr); 2145 ierr = PetscMalloc1(ismax,&row2proc);CHKERRQ(ierr); 2146 2147 if (scall == MAT_REUSE_MATRIX) { 2148 //PetscInt *req_source1,*req_source2; 2149 2150 /* Assumes new rows are same length as the old rows,hence bug! */ 2151 for (i=0; i<ismax; i++) { 2152 mat = (Mat_SeqAIJ*)(submats[i]->data); 2153 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2154 2155 /* Initial matrix as if empty -- rm later!!! */ 2156 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2157 2158 /* Initial matrix as if empty */ 2159 submats[i]->factortype = C->factortype; 2160 2161 smat_i = mat->submatis1; 2162 smats[i] = smat_i; 2163 2164 nrqs = smat_i->nrqs; 2165 nrqr = smat_i->nrqr; 2166 rbuf1 = smat_i->rbuf1; 2167 rbuf2 = smat_i->rbuf2; 2168 rbuf3 = smat_i->rbuf3; 2169 //req_source2 = smat_i->req_source2; 2170 2171 sbuf1 = smat_i->sbuf1; 2172 sbuf2 = smat_i->sbuf2; 2173 ptr = smat_i->ptr; 2174 tmp = smat_i->tmp; 2175 ctr = smat_i->ctr; 2176 2177 pa = smat_i->pa; 2178 req_size = smat_i->req_size; 2179 //req_source1 = smat_i->req_source1; 2180 2181 allcolumns[i] = smat_i->allcolumns; 2182 row2proc[i] = smat_i->row2proc; 2183 //rmap = smat_i->rmap; 2184 //cmap = smat_i->cmap; 2185 #if defined(PETSC_USE_CTABLE) 2186 //rmap_loc = smat_i->rmap_loc; 2187 //cmap_loc = smat_i->cmap_loc; 2188 #endif 2189 } 2190 } 2191 2192 /* evaluate communication - mesg to who, length of mesg, and buffer space 2193 required. Based on this, buffers are allocated, and data copied into them*/ 2194 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2195 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2196 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2197 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2198 2199 for (i=0; i<ismax; i++) { 2200 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2201 jmax = nrow[i]; 2202 irow_i = irow[i]; 2203 2204 if (scall == MAT_INITIAL_MATRIX) { 2205 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2206 row2proc[i] = row2proc_i; 2207 } else { 2208 row2proc_i = row2proc[i]; 2209 } 2210 2211 if (issorted[i]) proc = 0; 2212 for (j=0; j<jmax; j++) { 2213 if (!issorted[i]) proc = 0; 2214 row = irow_i[j]; 2215 while (row >= C->rmap->range[proc+1]) proc++; 2216 w4[proc]++; 2217 row2proc_i[j] = proc; /* map row index to proc */ 2218 } 2219 for (j=0; j<size; j++) { 2220 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2221 } 2222 } 2223 2224 nrqs = 0; /* no of outgoing messages */ 2225 msz = 0; /* total mesg length (for all procs) */ 2226 w1[rank] = 0; /* no mesg sent to self */ 2227 w3[rank] = 0; 2228 for (i=0; i<size; i++) { 2229 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2230 } 2231 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2232 for (i=0,j=0; i<size; i++) { 2233 if (w1[i]) { pa[j] = i; j++; } 2234 } 2235 2236 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2237 for (i=0; i<nrqs; i++) { 2238 j = pa[i]; 2239 w1[j] += w2[j] + 2* w3[j]; 2240 msz += w1[j]; 2241 } 2242 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2243 2244 /* Determine the number of messages to expect, their lengths, from from-ids */ 2245 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2246 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2247 2248 /* Now post the Irecvs corresponding to these messages */ 2249 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2250 2251 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2252 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2253 2254 /* Allocate Memory for outgoing messages */ 2255 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2256 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2257 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2258 2259 { 2260 PetscInt *iptr = tmp; 2261 k = 0; 2262 for (i=0; i<nrqs; i++) { 2263 j = pa[i]; 2264 iptr += k; 2265 sbuf1[j] = iptr; 2266 k = w1[j]; 2267 } 2268 } 2269 2270 /* Form the outgoing messages */ 2271 /* Initialize the header space */ 2272 for (i=0; i<nrqs; i++) { 2273 j = pa[i]; 2274 sbuf1[j][0] = 0; 2275 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2276 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2277 } 2278 2279 /* Parse the isrow and copy data into outbuf */ 2280 for (i=0; i<ismax; i++) { 2281 row2proc_i = row2proc[i]; 2282 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2283 irow_i = irow[i]; 2284 jmax = nrow[i]; 2285 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2286 proc = row2proc_i[j]; 2287 if (proc != rank) { /* copy to the outgoing buf*/ 2288 ctr[proc]++; 2289 *ptr[proc] = irow_i[j]; 2290 ptr[proc]++; 2291 } 2292 } 2293 /* Update the headers for the current IS */ 2294 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2295 if ((ctr_j = ctr[j])) { 2296 sbuf1_j = sbuf1[j]; 2297 k = ++sbuf1_j[0]; 2298 sbuf1_j[2*k] = ctr_j; 2299 sbuf1_j[2*k-1] = i; 2300 } 2301 } 2302 } 2303 2304 /* Now post the sends */ 2305 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2306 for (i=0; i<nrqs; ++i) { 2307 j = pa[i]; 2308 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2309 } 2310 2311 /* Post Receives to capture the buffer size */ 2312 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2313 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2314 rbuf2[0] = tmp + msz; 2315 for (i=1; i<nrqs; ++i) { 2316 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2317 } 2318 for (i=0; i<nrqs; ++i) { 2319 j = pa[i]; 2320 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2321 } 2322 2323 /* Send to other procs the buf size they should allocate */ 2324 2325 2326 /* Receive messages*/ 2327 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2328 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2329 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 2330 { 2331 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2332 PetscInt *sbuf2_i; 2333 2334 for (i=0; i<nrqr; ++i) { 2335 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2336 2337 req_size[idex] = 0; 2338 rbuf1_i = rbuf1[idex]; 2339 start = 2*rbuf1_i[0] + 1; 2340 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2341 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2342 sbuf2_i = sbuf2[idex]; 2343 for (j=start; j<end; j++) { 2344 id = rbuf1_i[j] - rstart; 2345 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2346 sbuf2_i[j] = ncols; 2347 req_size[idex] += ncols; 2348 } 2349 req_source[idex] = r_status1[i].MPI_SOURCE; 2350 /* form the header */ 2351 sbuf2_i[0] = req_size[idex]; 2352 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2353 2354 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2355 } 2356 } 2357 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2358 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2359 2360 /* recv buffer sizes */ 2361 /* Receive messages*/ 2362 2363 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 2364 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2365 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2366 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2367 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2368 2369 for (i=0; i<nrqs; ++i) { 2370 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2371 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2372 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2373 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2374 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2375 } 2376 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2377 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2378 2379 /* Wait on sends1 and sends2 */ 2380 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2381 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2382 2383 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2384 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2385 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2386 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2387 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2388 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2389 2390 /* Now allocate buffers for a->j, and send them off */ 2391 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2392 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2393 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2394 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2395 2396 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2397 { 2398 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2399 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2400 PetscInt cend = C->cmap->rend; 2401 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2402 2403 for (i=0; i<nrqr; i++) { 2404 rbuf1_i = rbuf1[i]; 2405 sbuf_aj_i = sbuf_aj[i]; 2406 ct1 = 2*rbuf1_i[0] + 1; 2407 ct2 = 0; 2408 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2409 kmax = rbuf1[i][2*j]; 2410 for (k=0; k<kmax; k++,ct1++) { 2411 row = rbuf1_i[ct1] - rstart; 2412 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2413 ncols = nzA + nzB; 2414 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2415 2416 /* load the column indices for this row into cols*/ 2417 cols = sbuf_aj_i + ct2; 2418 2419 lwrite = 0; 2420 for (l=0; l<nzB; l++) { 2421 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2422 } 2423 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2424 for (l=0; l<nzB; l++) { 2425 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2426 } 2427 2428 ct2 += ncols; 2429 } 2430 } 2431 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2432 } 2433 } 2434 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2435 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2436 2437 /* Allocate buffers for a->a, and send them off */ 2438 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2439 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2440 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2441 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2442 2443 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2444 { 2445 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2446 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2447 PetscInt cend = C->cmap->rend; 2448 PetscInt *b_j = b->j; 2449 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2450 2451 for (i=0; i<nrqr; i++) { 2452 rbuf1_i = rbuf1[i]; 2453 sbuf_aa_i = sbuf_aa[i]; 2454 ct1 = 2*rbuf1_i[0]+1; 2455 ct2 = 0; 2456 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2457 kmax = rbuf1_i[2*j]; 2458 for (k=0; k<kmax; k++,ct1++) { 2459 row = rbuf1_i[ct1] - rstart; 2460 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2461 ncols = nzA + nzB; 2462 cworkB = b_j + b_i[row]; 2463 vworkA = a_a + a_i[row]; 2464 vworkB = b_a + b_i[row]; 2465 2466 /* load the column values for this row into vals*/ 2467 vals = sbuf_aa_i+ct2; 2468 2469 lwrite = 0; 2470 for (l=0; l<nzB; l++) { 2471 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2472 } 2473 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2474 for (l=0; l<nzB; l++) { 2475 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2476 } 2477 2478 ct2 += ncols; 2479 } 2480 } 2481 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2482 } 2483 } 2484 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2485 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2486 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2487 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2488 2489 /* Form the matrix */ 2490 /* create col map: global col of C -> local col of submatrices */ 2491 { 2492 const PetscInt *icol_i; 2493 #if defined(PETSC_USE_CTABLE) 2494 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 2495 for (i=0; i<ismax; i++) { 2496 if (!allcolumns[i]) { 2497 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2498 2499 jmax = ncol[i]; 2500 icol_i = icol[i]; 2501 cmap_i = cmap[i]; 2502 for (j=0; j<jmax; j++) { 2503 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2504 } 2505 } else { 2506 cmap[i] = NULL; 2507 } 2508 } 2509 #else 2510 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2511 for (i=0; i<ismax; i++) { 2512 if (!allcolumns[i]) { 2513 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2514 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2515 jmax = ncol[i]; 2516 icol_i = icol[i]; 2517 cmap_i = cmap[i]; 2518 for (j=0; j<jmax; j++) { 2519 cmap_i[icol_i[j]] = j+1; 2520 } 2521 } else { 2522 cmap[i] = NULL; 2523 } 2524 } 2525 #endif 2526 } 2527 2528 /* Create lens which is required for MatCreate... */ 2529 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2530 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2531 if (ismax) { 2532 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2533 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2534 } 2535 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2536 2537 /* Update lens from local data */ 2538 for (i=0; i<ismax; i++) { 2539 row2proc_i = row2proc[i]; 2540 jmax = nrow[i]; 2541 if (!allcolumns[i]) cmap_i = cmap[i]; 2542 irow_i = irow[i]; 2543 lens_i = lens[i]; 2544 for (j=0; j<jmax; j++) { 2545 row = irow_i[j]; 2546 proc = row2proc_i[j]; 2547 if (proc == rank) { 2548 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2549 if (!allcolumns[i]) { 2550 for (k=0; k<ncols; k++) { 2551 #if defined(PETSC_USE_CTABLE) 2552 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2553 #else 2554 tcol = cmap_i[cols[k]]; 2555 #endif 2556 if (tcol) lens_i[j]++; 2557 } 2558 } else { /* allcolumns */ 2559 lens_i[j] = ncols; 2560 } 2561 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2562 } 2563 } 2564 } 2565 2566 /* Create row map: global row of C -> local row of submatrices */ 2567 #if defined(PETSC_USE_CTABLE) 2568 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 2569 for (i=0; i<ismax; i++) { 2570 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2571 irow_i = irow[i]; 2572 jmax = nrow[i]; 2573 for (j=0; j<jmax; j++) { 2574 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2575 } 2576 } 2577 #else 2578 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2579 if (ismax) { 2580 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2581 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2582 } 2583 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2584 for (i=0; i<ismax; i++) { 2585 rmap_i = rmap[i]; 2586 irow_i = irow[i]; 2587 jmax = nrow[i]; 2588 for (j=0; j<jmax; j++) { 2589 rmap_i[irow_i[j]] = j; 2590 } 2591 } 2592 #endif 2593 2594 /* Update lens from offproc data */ 2595 { 2596 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2597 2598 for (tmp2=0; tmp2<nrqs; tmp2++) { 2599 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2600 idex = pa[idex2]; 2601 sbuf1_i = sbuf1[idex]; 2602 jmax = sbuf1_i[0]; 2603 ct1 = 2*jmax+1; 2604 ct2 = 0; 2605 rbuf2_i = rbuf2[idex2]; 2606 rbuf3_i = rbuf3[idex2]; 2607 for (j=1; j<=jmax; j++) { 2608 is_no = sbuf1_i[2*j-1]; 2609 max1 = sbuf1_i[2*j]; 2610 lens_i = lens[is_no]; 2611 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2612 rmap_i = rmap[is_no]; 2613 for (k=0; k<max1; k++,ct1++) { 2614 #if defined(PETSC_USE_CTABLE) 2615 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2616 row--; 2617 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2618 #else 2619 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2620 #endif 2621 max2 = rbuf2_i[ct1]; 2622 for (l=0; l<max2; l++,ct2++) { 2623 if (!allcolumns[is_no]) { 2624 #if defined(PETSC_USE_CTABLE) 2625 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2626 #else 2627 tcol = cmap_i[rbuf3_i[ct2]]; 2628 #endif 2629 if (tcol) lens_i[row]++; 2630 } else { /* allcolumns */ 2631 lens_i[row]++; /* lens_i[row] += max2 ? */ 2632 } 2633 } 2634 } 2635 } 2636 } 2637 } 2638 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2639 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2640 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2641 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2642 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2643 2644 /* Create the submatrices */ 2645 if (scall == MAT_INITIAL_MATRIX) { 2646 for (i=0; i<ismax; i++) { 2647 PetscInt rbs,cbs; 2648 Mat_SeqAIJ* subc; 2649 2650 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2651 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2652 2653 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2654 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2655 2656 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2657 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2658 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2659 2660 /* create struct Mat_SubMat and attached it to submat */ 2661 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2662 subc = (Mat_SeqAIJ*)submats[i]->data; 2663 subc->submatis1 = smat_i; 2664 smats[i] = smat_i; 2665 2666 smat_i->destroy = submats[i]->ops->destroy; 2667 submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 2668 submats[i]->factortype = C->factortype; 2669 2670 smat_i->nrqs = 0;//nrqs; 2671 smat_i->nrqr = 0;//nrqr; 2672 smat_i->rbuf1 = NULL;//rbuf1; 2673 smat_i->rbuf2 = NULL;//rbuf2; 2674 smat_i->rbuf3 = NULL;//rbuf3; 2675 smat_i->sbuf2 = NULL;//sbuf2; 2676 smat_i->req_source2 = NULL;//req_source2; 2677 2678 smat_i->sbuf1 = NULL;//sbuf1; 2679 smat_i->ptr = NULL;//ptr; 2680 smat_i->tmp = NULL;//tmp; 2681 smat_i->ctr = NULL;//ctr; 2682 2683 smat_i->pa = NULL;//pa; 2684 smat_i->req_size = NULL;//req_size; 2685 smat_i->req_source1 = NULL;//req_source1; 2686 2687 smat_i->allcolumns = allcolumns[i]; 2688 smat_i->row2proc = row2proc[i]; 2689 smat_i->rmap = NULL;//rmap; 2690 smat_i->cmap = NULL;//cmap; 2691 #if defined(PETSC_USE_CTABLE) 2692 smat_i->rmap_loc = NULL;//rmap_loc; 2693 smat_i->cmap_loc = NULL;//cmap_loc; 2694 #endif 2695 2696 } 2697 } 2698 2699 /* Assemble the matrices */ 2700 /* First assemble the local rows */ 2701 { 2702 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2703 PetscScalar *imat_a; 2704 2705 for (i=0; i<ismax; i++) { 2706 row2proc_i = row2proc[i]; 2707 mat = (Mat_SeqAIJ*)submats[i]->data; 2708 imat_ilen = mat->ilen; 2709 imat_j = mat->j; 2710 imat_i = mat->i; 2711 imat_a = mat->a; 2712 2713 if (!allcolumns[i]) cmap_i = cmap[i]; 2714 rmap_i = rmap[i]; 2715 irow_i = irow[i]; 2716 jmax = nrow[i]; 2717 for (j=0; j<jmax; j++) { 2718 row = irow_i[j]; 2719 proc = row2proc_i[j]; 2720 if (proc == rank) { 2721 old_row = row; 2722 #if defined(PETSC_USE_CTABLE) 2723 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2724 row--; 2725 #else 2726 row = rmap_i[row]; 2727 #endif 2728 ilen_row = imat_ilen[row]; 2729 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2730 mat_i = imat_i[row]; 2731 mat_a = imat_a + mat_i; 2732 mat_j = imat_j + mat_i; 2733 if (!allcolumns[i]) { 2734 for (k=0; k<ncols; k++) { 2735 #if defined(PETSC_USE_CTABLE) 2736 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2737 #else 2738 tcol = cmap_i[cols[k]]; 2739 #endif 2740 if (tcol) { 2741 *mat_j++ = tcol - 1; 2742 *mat_a++ = vals[k]; 2743 ilen_row++; 2744 } 2745 } 2746 } else { /* allcolumns */ 2747 for (k=0; k<ncols; k++) { 2748 *mat_j++ = cols[k]; /* global col index! */ 2749 *mat_a++ = vals[k]; 2750 ilen_row++; 2751 } 2752 } 2753 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2754 2755 imat_ilen[row] = ilen_row; 2756 } 2757 } 2758 } 2759 } 2760 2761 /* Now assemble the off proc rows*/ 2762 { 2763 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2764 PetscInt *imat_j,*imat_i; 2765 PetscScalar *imat_a,*rbuf4_i; 2766 2767 for (tmp2=0; tmp2<nrqs; tmp2++) { 2768 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2769 idex = pa[idex2]; 2770 sbuf1_i = sbuf1[idex]; 2771 jmax = sbuf1_i[0]; 2772 ct1 = 2*jmax + 1; 2773 ct2 = 0; 2774 rbuf2_i = rbuf2[idex2]; 2775 rbuf3_i = rbuf3[idex2]; 2776 rbuf4_i = rbuf4[idex2]; 2777 for (j=1; j<=jmax; j++) { 2778 is_no = sbuf1_i[2*j-1]; 2779 rmap_i = rmap[is_no]; 2780 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2781 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2782 imat_ilen = mat->ilen; 2783 imat_j = mat->j; 2784 imat_i = mat->i; 2785 imat_a = mat->a; 2786 max1 = sbuf1_i[2*j]; 2787 for (k=0; k<max1; k++,ct1++) { 2788 row = sbuf1_i[ct1]; 2789 #if defined(PETSC_USE_CTABLE) 2790 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2791 row--; 2792 #else 2793 row = rmap_i[row]; 2794 #endif 2795 ilen = imat_ilen[row]; 2796 mat_i = imat_i[row]; 2797 mat_a = imat_a + mat_i; 2798 mat_j = imat_j + mat_i; 2799 max2 = rbuf2_i[ct1]; 2800 if (!allcolumns[is_no]) { 2801 for (l=0; l<max2; l++,ct2++) { 2802 2803 #if defined(PETSC_USE_CTABLE) 2804 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2805 #else 2806 tcol = cmap_i[rbuf3_i[ct2]]; 2807 #endif 2808 if (tcol) { 2809 *mat_j++ = tcol - 1; 2810 *mat_a++ = rbuf4_i[ct2]; 2811 ilen++; 2812 } 2813 } 2814 } else { /* allcolumns */ 2815 for (l=0; l<max2; l++,ct2++) { 2816 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2817 *mat_a++ = rbuf4_i[ct2]; 2818 ilen++; 2819 } 2820 } 2821 imat_ilen[row] = ilen; 2822 } 2823 } 2824 } 2825 } 2826 2827 /* sort the rows */ 2828 { 2829 PetscInt *imat_ilen,*imat_j,*imat_i; 2830 PetscScalar *imat_a; 2831 2832 for (i=0; i<ismax; i++) { 2833 mat = (Mat_SeqAIJ*)submats[i]->data; 2834 imat_j = mat->j; 2835 imat_i = mat->i; 2836 imat_a = mat->a; 2837 imat_ilen = mat->ilen; 2838 2839 if (allcolumns[i]) continue; 2840 jmax = nrow[i]; 2841 for (j=0; j<jmax; j++) { 2842 PetscInt ilen; 2843 2844 mat_i = imat_i[j]; 2845 mat_a = imat_a + mat_i; 2846 mat_j = imat_j + mat_i; 2847 ilen = imat_ilen[j]; 2848 ierr = PetscSortIntWithScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2849 } 2850 } 2851 } 2852 2853 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2854 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2855 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2856 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2857 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2858 2859 /* Restore the indices */ 2860 for (i=0; i<ismax; i++) { 2861 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2862 if (!allcolumns[i]) { 2863 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2864 } 2865 } 2866 2867 /* Destroy allocated memory */ 2868 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2869 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2870 ierr = PetscFree(pa);CHKERRQ(ierr); 2871 2872 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2873 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 2874 for (i=0; i<nrqr; ++i) { 2875 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2876 } 2877 for (i=0; i<nrqs; ++i) { 2878 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2879 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2880 } 2881 2882 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 2883 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 2884 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2885 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2886 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2887 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2888 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2889 2890 #if defined(PETSC_USE_CTABLE) 2891 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 2892 #else 2893 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 2894 #endif 2895 ierr = PetscFree(rmap);CHKERRQ(ierr); 2896 2897 for (i=0; i<ismax; i++) { 2898 if (!allcolumns[i]) { 2899 #if defined(PETSC_USE_CTABLE) 2900 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 2901 #else 2902 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 2903 #endif 2904 } 2905 } 2906 ierr = PetscFree(cmap);CHKERRQ(ierr); 2907 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2908 ierr = PetscFree(lens);CHKERRQ(ierr); 2909 2910 for (i=0; i<ismax; i++) { 2911 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2912 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2913 } 2914 2915 ierr = PetscFree(issorted);CHKERRQ(ierr); 2916 ierr = PetscFree(row2proc);CHKERRQ(ierr); 2917 ierr = PetscFree(smats);CHKERRQ(ierr); 2918 PetscFunctionReturn(0); 2919 } 2920 2921 /* 2922 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2923 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2924 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2925 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2926 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2927 state, and needs to be "assembled" later by compressing B's column space. 2928 2929 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2930 Following this call, C->A & C->B have been created, even if empty. 2931 */ 2932 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2933 { 2934 /* If making this function public, change the error returned in this function away from _PLIB. */ 2935 PetscErrorCode ierr; 2936 Mat_MPIAIJ *aij; 2937 Mat_SeqAIJ *Baij; 2938 PetscBool seqaij,Bdisassembled; 2939 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2940 PetscScalar v; 2941 const PetscInt *rowindices,*colindices; 2942 2943 PetscFunctionBegin; 2944 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2945 if (A) { 2946 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2947 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2948 if (rowemb) { 2949 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2950 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2951 } else { 2952 if (C->rmap->n != A->rmap->n) { 2953 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2954 } 2955 } 2956 if (dcolemb) { 2957 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2958 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2959 } else { 2960 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2961 } 2962 } 2963 if (B) { 2964 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2965 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2966 if (rowemb) { 2967 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2968 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2969 } else { 2970 if (C->rmap->n != B->rmap->n) { 2971 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2972 } 2973 } 2974 if (ocolemb) { 2975 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2976 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2977 } else { 2978 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2979 } 2980 } 2981 2982 aij = (Mat_MPIAIJ*)(C->data); 2983 if (!aij->A) { 2984 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2985 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2986 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2987 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2988 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2989 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2990 } 2991 if (A) { 2992 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2993 } else { 2994 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2995 } 2996 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2997 /* 2998 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2999 need to "disassemble" B -- convert it to using C's global indices. 3000 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 3001 3002 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 3003 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 3004 3005 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 3006 At least avoid calling MatSetValues() and the implied searches? 3007 */ 3008 3009 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 3010 #if defined(PETSC_USE_CTABLE) 3011 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 3012 #else 3013 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 3014 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 3015 if (aij->B) { 3016 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 3017 } 3018 #endif 3019 ngcol = 0; 3020 if (aij->lvec) { 3021 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 3022 } 3023 if (aij->garray) { 3024 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 3025 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 3026 } 3027 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 3028 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 3029 } 3030 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 3031 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 3032 } 3033 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3034 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3035 } 3036 } 3037 Bdisassembled = PETSC_FALSE; 3038 if (!aij->B) { 3039 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3040 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3041 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3042 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3043 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3044 Bdisassembled = PETSC_TRUE; 3045 } 3046 if (B) { 3047 Baij = (Mat_SeqAIJ*)(B->data); 3048 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3049 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3050 for (i=0; i<B->rmap->n; i++) { 3051 nz[i] = Baij->i[i+1] - Baij->i[i]; 3052 } 3053 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3054 ierr = PetscFree(nz);CHKERRQ(ierr); 3055 } 3056 3057 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3058 shift = rend-rstart; 3059 count = 0; 3060 rowindices = NULL; 3061 colindices = NULL; 3062 if (rowemb) { 3063 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3064 } 3065 if (ocolemb) { 3066 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3067 } 3068 for (i=0; i<B->rmap->n; i++) { 3069 PetscInt row; 3070 row = i; 3071 if (rowindices) row = rowindices[i]; 3072 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3073 col = Baij->j[count]; 3074 if (colindices) col = colindices[col]; 3075 if (Bdisassembled && col>=rstart) col += shift; 3076 v = Baij->a[count]; 3077 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3078 ++count; 3079 } 3080 } 3081 /* No assembly for aij->B is necessary. */ 3082 /* FIXME: set aij->B's nonzerostate correctly. */ 3083 } else { 3084 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3085 } 3086 C->preallocated = PETSC_TRUE; 3087 C->was_assembled = PETSC_FALSE; 3088 C->assembled = PETSC_FALSE; 3089 /* 3090 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3091 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3092 */ 3093 PetscFunctionReturn(0); 3094 } 3095 3096 /* 3097 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3098 */ 3099 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3100 { 3101 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3102 3103 PetscFunctionBegin; 3104 PetscValidPointer(A,2); 3105 PetscValidPointer(B,3); 3106 /* FIXME: make sure C is assembled */ 3107 *A = aij->A; 3108 *B = aij->B; 3109 /* Note that we don't incref *A and *B, so be careful! */ 3110 PetscFunctionReturn(0); 3111 } 3112 3113 /* 3114 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3115 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3116 */ 3117 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3118 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3119 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3120 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3121 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3122 { 3123 PetscErrorCode ierr; 3124 PetscMPIInt isize,flag; 3125 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3126 Mat *A,*B; 3127 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3128 3129 PetscFunctionBegin; 3130 if (!ismax) PetscFunctionReturn(0); 3131 3132 for (i = 0, cismax = 0; i < ismax; ++i) { 3133 PetscMPIInt isize; 3134 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3135 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3136 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3137 if (isize > 1) ++cismax; 3138 } 3139 /* 3140 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3141 ispar counts the number of parallel ISs across C's comm. 3142 */ 3143 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3144 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3145 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3146 PetscFunctionReturn(0); 3147 } 3148 3149 /* if (ispar) */ 3150 /* 3151 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3152 These are used to extract the off-diag portion of the resulting parallel matrix. 3153 The row IS for the off-diag portion is the same as for the diag portion, 3154 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3155 */ 3156 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3157 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3158 for (i = 0, ii = 0; i < ismax; ++i) { 3159 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3160 if (isize > 1) { 3161 /* 3162 TODO: This is the part that's ***NOT SCALABLE***. 3163 To fix this we need to extract just the indices of C's nonzero columns 3164 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3165 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3166 to be done without serializing on the IS list, so, most likely, it is best 3167 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3168 */ 3169 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3170 /* Now we have to 3171 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3172 were sorted on each rank, concatenated they might no longer be sorted; 3173 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3174 indices in the nondecreasing order to the original index positions. 3175 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3176 */ 3177 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3178 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3179 ++ii; 3180 } 3181 } 3182 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3183 for (i = 0, ii = 0; i < ismax; ++i) { 3184 PetscInt j,issize; 3185 const PetscInt *indices; 3186 3187 /* 3188 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3189 */ 3190 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3191 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3192 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3193 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3194 for (j = 1; j < issize; ++j) { 3195 if (indices[j] == indices[j-1]) { 3196 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3197 } 3198 } 3199 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3200 3201 3202 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3203 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3204 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3205 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3206 for (j = 1; j < issize; ++j) { 3207 if (indices[j-1] == indices[j]) { 3208 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3209 } 3210 } 3211 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3212 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3213 if (isize > 1) { 3214 cisrow[ii] = isrow[i]; 3215 ++ii; 3216 } 3217 } 3218 /* 3219 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3220 array of sequential matrices underlying the resulting parallel matrices. 3221 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3222 contain duplicates. 3223 3224 There are as many diag matrices as there are original index sets. There are only as many parallel 3225 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3226 3227 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3228 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3229 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3230 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3231 with A[i] and B[ii] extracted from the corresponding MPI submat. 3232 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3233 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3234 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3235 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3236 values into A[i] and B[ii] sitting inside the corresponding submat. 3237 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3238 A[i], B[ii], AA[i] or BB[ii] matrices. 3239 */ 3240 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3241 if (scall == MAT_INITIAL_MATRIX) { 3242 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3243 /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */ 3244 } else { 3245 ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr); 3246 ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr); 3247 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */ 3248 for (i = 0, ii = 0; i < ismax; ++i) { 3249 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3250 if (isize > 1) { 3251 Mat AA,BB; 3252 ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr); 3253 if (!isrow_p[i] && !iscol_p[i]) { 3254 A[i] = AA; 3255 } else { 3256 /* TODO: extract A[i] composed on AA. */ 3257 ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3258 } 3259 if (!isrow_p[i] && !ciscol_p[ii]) { 3260 B[ii] = BB; 3261 } else { 3262 /* TODO: extract B[ii] composed on BB. */ 3263 ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr); 3264 } 3265 ++ii; 3266 } else { 3267 if (!isrow_p[i] && !iscol_p[i]) { 3268 A[i] = (*submat)[i]; 3269 } else { 3270 /* TODO: extract A[i] composed on (*submat)[i]. */ 3271 ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3272 } 3273 } 3274 } 3275 } 3276 /* Now obtain the sequential A and B submatrices separately. */ 3277 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr); 3278 /* I did not figure out a good way to fix it right now. 3279 * Local column size of B[i] is different from the size of ciscol[i]. 3280 * B[i]'s size is finally determined by MatAssembly, while 3281 * ciscol[i] is computed as the complement of iscol[i]. 3282 * It is better to keep only nonzero indices when computing 3283 * the complement ciscol[i]. 3284 * */ 3285 if(scall==MAT_REUSE_MATRIX){ 3286 for(i=0; i<cismax; i++){ 3287 ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr); 3288 B[i]->cmap->n = ciscol_localsize; 3289 } 3290 } 3291 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr); 3292 /* 3293 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3294 matrices A & B have been extracted directly into the parallel matrices containing them, or 3295 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3296 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3297 to have the same sparsity pattern. 3298 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3299 must be constructed for C. This is done by setseqmat(s). 3300 */ 3301 for (i = 0, ii = 0; i < ismax; ++i) { 3302 /* 3303 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3304 That way we can avoid sorting and computing permutations when reusing. 3305 To this end: 3306 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3307 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3308 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3309 */ 3310 MatStructure pattern; 3311 if (scall == MAT_INITIAL_MATRIX) { 3312 pattern = DIFFERENT_NONZERO_PATTERN; 3313 } else { 3314 pattern = SAME_NONZERO_PATTERN; 3315 } 3316 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3317 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3318 if (isize > 1) { 3319 if (scall == MAT_INITIAL_MATRIX) { 3320 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3321 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3322 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3323 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3324 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3325 } 3326 /* 3327 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3328 */ 3329 { 3330 Mat AA,BB; 3331 AA = NULL; 3332 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i]; 3333 BB = NULL; 3334 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii]; 3335 if (AA || BB) { 3336 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3337 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3338 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3339 } 3340 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) { 3341 /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */ 3342 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3343 } 3344 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) { 3345 /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */ 3346 ierr = MatDestroy(&BB);CHKERRQ(ierr); 3347 } 3348 } 3349 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3350 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3351 ++ii; 3352 } else { /* if (isize == 1) */ 3353 if (scall == MAT_INITIAL_MATRIX) { 3354 if (isrow_p[i] || iscol_p[i]) { 3355 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3356 } else (*submat)[i] = A[i]; 3357 } 3358 if (isrow_p[i] || iscol_p[i]) { 3359 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3360 /* Otherwise A is extracted straight into (*submats)[i]. */ 3361 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3362 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3363 } 3364 } 3365 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3366 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3367 } 3368 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3369 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3370 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3371 ierr = PetscFree(A);CHKERRQ(ierr); 3372 ierr = PetscFree(B);CHKERRQ(ierr); 3373 PetscFunctionReturn(0); 3374 } 3375 3376 3377 3378 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3379 { 3380 PetscErrorCode ierr; 3381 3382 PetscFunctionBegin; 3383 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3384 PetscFunctionReturn(0); 3385 } 3386