/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *,PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put it all together so that we can optimize communication */
  ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
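    /* assign each requested row to its owning rank: rows this rank owns are handled by the
       purely local expansion, so only rows owned elsewhere are queued for communication */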
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages; generally speaking, we do not need to
     exchange data when the overlap is 1 */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
  /* we do not have any messages; this usually corresponds to overlap 1 */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote ranks so that we can then get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
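  /* second exchange: each remote rank has now computed, for every row we requested, the column
     indices that form its overlap (the MatIncreaseOverlap_MPIAIJ_Send_Scalable call above);
     build the reverse two-sided map so those reply buffers can be broadcast back to the requesters */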
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx,IS is[],PetscInt nrecvs,PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id,data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp,*indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize > max_lsize ? lsize : max_lsize;
    isz[i]    = lsize;
  }
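  /* each IS gets a contiguous slot of (max_lsize+nrecvs) entries in indices_temp: the existing
     indices are copied in first and the received overlap rows are appended behind them */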
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp,isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get the row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j<data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx,PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes,PetscInt *fromrows,PetscInt **sbrowsizes,PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* even if the matrix is symmetric, we still treat it as nonsymmetric here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* the total number of nonzeros is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* is there a better way to estimate the number of nonzeros in the mat??? */
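  /* rows_data_ptr is a single block of max_fszs*nidx entries; rows_data[i] points at slice i so
     requested rows can be regrouped by IS id, while indices_tmp (sized by the local nonzero
     count tnz) holds the expanded column indices before duplicates are removed */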
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate the space needed so as to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* repeat the grouping pass above, this time packing the data */
  for (i=0;i<nfrom;i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
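
/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - expands each index set by one level using only the
  locally owned rows: for every row of is[i] owned by this rank, the column indices of that row
  in both the diagonal (A) and off-diagonal (B) blocks are added, then sorted and deduplicated.
*/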
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj,*indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp;
  PetscMPIInt    rank,owner;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* this is a better way to estimate memory than the old implementation,
     which used the global size of the matrix */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]   data(is[1])
  -----------
  mesg[n+1]
  mesg[m]    data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
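/*
  A concrete (hypothetical) instance of the format above: if is[1] = {3,7} and is[5] = {9}, the
  message reads  2, 1, 2, 5, 1, 3, 7, 9  - the header 2 (two index sets), then the pairs <1,2>
  and <5,1> naming each set and its length, followed by the row indices themselves.
*/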
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt     *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt  **idx,*idx_i;
  PetscInt        *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable      *table_data,table_data_i;
  PetscInt        *tdata,tcount,tcount_max;
#else
  PetscInt        *data_i,*d_p;
#endif
  PetscErrorCode  ierr;
  PetscMPIInt     size,rank,tag1,tag2,proc = 0;
  PetscInt        M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt        *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT         *table;
  MPI_Comm        comm;
  MPI_Request     *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status      *s_status,*recv_status;
  MPI_Comm        *iscomms;
  char            *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr =
PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 570 } 571 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 572 for (i=0; i<imax; i++) { 573 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 574 } 575 #else 576 PetscInt Mimax = 0; 577 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 578 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 579 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 580 for (i=0; i<imax; i++) { 581 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 582 data[i] = d_p + M*i; 583 } 584 #endif 585 } 586 587 /* Parse the IS and update local tables and the outgoing buf with the data */ 588 { 589 PetscInt n_i,isz_i,*outdat_j,ctr_j; 590 PetscBT table_i; 591 592 for (i=0; i<imax; i++) { 593 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 594 n_i = n[i]; 595 table_i = table[i]; 596 idx_i = idx[i]; 597 #if defined(PETSC_USE_CTABLE) 598 table_data_i = table_data[i]; 599 #else 600 data_i = data[i]; 601 #endif 602 isz_i = isz[i]; 603 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 604 row = idx_i[j]; 605 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 606 if (proc != rank) { /* copy to the outgoing buffer */ 607 ctr[proc]++; 608 *ptr[proc] = row; 609 ptr[proc]++; 610 } else if (!PetscBTLookupSet(table_i,row)) { 611 #if defined(PETSC_USE_CTABLE) 612 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 613 #else 614 data_i[isz_i] = row; /* Update the local table */ 615 #endif 616 isz_i++; 617 } 618 } 619 /* Update the headers for the current IS */ 620 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 621 if ((ctr_j = ctr[j])) { 622 outdat_j = outdat[j]; 623 k = ++outdat_j[0]; 624 outdat_j[2*k] = ctr_j; 625 outdat_j[2*k-1] = i; 626 } 627 } 628 isz[i] = isz_i; 629 } 630 } 631 632 /* Now post the sends */ 633 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 634 for (i=0; i<nrqs; ++i) { 635 j = pa[i]; 636 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 637 } 638 639 /* No longer need the original indices */ 640 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 641 for (i=0; i<imax; ++i) { 642 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 643 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 644 } 645 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 646 647 for (i=0; i<imax; ++i) { 648 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 649 } 650 651 /* Do Local work */ 652 #if defined(PETSC_USE_CTABLE) 653 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 654 #else 655 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 656 #endif 657 658 /* Receive messages */ 659 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 660 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);} 661 662 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 663 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);} 664 665 /* Phase 1 sends are complete - deallocate buffers */ 666 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 667 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 668 669 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 670 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 671 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 672 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 673 ierr 
= PetscFree(rbuf);CHKERRQ(ierr); 674 675 /* Send the data back */ 676 /* Do a global reduction to know the buffer space req for incoming messages */ 677 { 678 PetscMPIInt *rw1; 679 680 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 681 682 for (i=0; i<nrqr; ++i) { 683 proc = recv_status[i].MPI_SOURCE; 684 685 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 686 rw1[proc] = isz1[i]; 687 } 688 ierr = PetscFree(onodes1);CHKERRQ(ierr); 689 ierr = PetscFree(olengths1);CHKERRQ(ierr); 690 691 /* Determine the number of messages to expect, their lengths, from from-ids */ 692 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 693 ierr = PetscFree(rw1);CHKERRQ(ierr); 694 } 695 /* Now post the Irecvs corresponding to these messages */ 696 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 697 698 /* Now post the sends */ 699 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 700 for (i=0; i<nrqr; ++i) { 701 j = recv_status[i].MPI_SOURCE; 702 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr); 703 } 704 705 /* receive work done on other processors */ 706 { 707 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 708 PetscMPIInt idex; 709 PetscBT table_i; 710 MPI_Status *status2; 711 712 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 713 for (i=0; i<nrqs; ++i) { 714 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr); 715 /* Process the message */ 716 rbuf2_i = rbuf2[idex]; 717 ct1 = 2*rbuf2_i[0]+1; 718 jmax = rbuf2[idex][0]; 719 for (j=1; j<=jmax; j++) { 720 max = rbuf2_i[2*j]; 721 is_no = rbuf2_i[2*j-1]; 722 isz_i = isz[is_no]; 723 table_i = table[is_no]; 724 #if defined(PETSC_USE_CTABLE) 725 table_data_i = table_data[is_no]; 726 #else 727 data_i = data[is_no]; 728 #endif 729 for (k=0; k<max; k++,ct1++) { 730 row = rbuf2_i[ct1]; 731 if (!PetscBTLookupSet(table_i,row)) { 732 #if defined(PETSC_USE_CTABLE) 733 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 734 #else 735 data_i[isz_i] = row; 736 #endif 737 isz_i++; 738 } 739 } 740 isz[is_no] = isz_i; 741 } 742 } 743 744 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);} 745 ierr = PetscFree(status2);CHKERRQ(ierr); 746 } 747 748 #if defined(PETSC_USE_CTABLE) 749 tcount_max = 0; 750 for (i=0; i<imax; ++i) { 751 table_data_i = table_data[i]; 752 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 753 if (tcount_max < tcount) tcount_max = tcount; 754 } 755 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 756 #endif 757 758 for (i=0; i<imax; ++i) { 759 #if defined(PETSC_USE_CTABLE) 760 PetscTablePosition tpos; 761 table_data_i = table_data[i]; 762 763 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 764 while (tpos) { 765 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 766 tdata[--j] = --k; 767 } 768 ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 769 #else 770 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 771 #endif 772 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 773 } 774 775 ierr = PetscFree(iscomms);CHKERRQ(ierr); 776 ierr = PetscFree(onodes2);CHKERRQ(ierr); 777 ierr = PetscFree(olengths2);CHKERRQ(ierr); 778 779 ierr = PetscFree(pa);CHKERRQ(ierr); 780 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 781 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 782 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 783 ierr = 
PetscFree(r_waits1);CHKERRQ(ierr); 784 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 785 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 786 ierr = PetscFree(s_status);CHKERRQ(ierr); 787 ierr = PetscFree(recv_status);CHKERRQ(ierr); 788 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 789 ierr = PetscFree(xdata);CHKERRQ(ierr); 790 ierr = PetscFree(isz1);CHKERRQ(ierr); 791 #if defined(PETSC_USE_CTABLE) 792 for (i=0; i<imax; i++) { 793 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 794 } 795 ierr = PetscFree(table_data);CHKERRQ(ierr); 796 ierr = PetscFree(tdata);CHKERRQ(ierr); 797 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 798 #else 799 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 800 #endif 801 PetscFunctionReturn(0); 802 } 803 804 /* 805 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 806 the work on the local processor. 807 808 Inputs: 809 C - MAT_MPIAIJ; 810 imax - total no of index sets processed at a time; 811 table - an array of char - size = m bits. 812 813 Output: 814 isz - array containing the count of the solution elements corresponding 815 to each index set; 816 data or table_data - pointer to the solutions 817 */ 818 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 819 { 820 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 821 Mat A = c->A,B = c->B; 822 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 823 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 824 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 825 PetscBT table_i; 826 #if defined(PETSC_USE_CTABLE) 827 PetscTable table_data_i; 828 PetscErrorCode ierr; 829 PetscTablePosition tpos; 830 PetscInt tcount,*tdata; 831 #else 832 PetscInt *data_i; 833 #endif 834 835 PetscFunctionBegin; 836 rstart = C->rmap->rstart; 837 cstart = C->cmap->rstart; 838 ai = a->i; 839 aj = a->j; 840 bi = b->i; 841 bj = b->j; 842 garray = c->garray; 843 844 for (i=0; i<imax; i++) { 845 #if defined(PETSC_USE_CTABLE) 846 /* copy existing entries of table_data_i into tdata[] */ 847 table_data_i = table_data[i]; 848 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 849 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]); 850 851 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 852 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 853 while (tpos) { 854 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 855 tdata[--j] = --row; 856 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount); 857 } 858 #else 859 data_i = data[i]; 860 #endif 861 table_i = table[i]; 862 isz_i = isz[i]; 863 max = isz[i]; 864 865 for (j=0; j<max; j++) { 866 #if defined(PETSC_USE_CTABLE) 867 row = tdata[j] - rstart; 868 #else 869 row = data_i[j] - rstart; 870 #endif 871 start = ai[row]; 872 end = ai[row+1]; 873 for (k=start; k<end; k++) { /* Amat */ 874 val = aj[k] + cstart; 875 if (!PetscBTLookupSet(table_i,val)) { 876 #if defined(PETSC_USE_CTABLE) 877 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 878 #else 879 data_i[isz_i] = val; 880 #endif 881 isz_i++; 882 } 883 } 884 start = bi[row]; 885 end = bi[row+1]; 886 for (k=start; k<end; k++) { /* Bmat */ 887 val = garray[bj[k]]; 888 if (!PetscBTLookupSet(table_i,val)) { 889 #if defined(PETSC_USE_CTABLE) 890 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 891 #else 892 data_i[isz_i] = val; 893 #endif 
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a realloc is required we need only copy the data for that one row
  rather than all previous rows as it is now, where a single large chunk of
  memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ?
total_sz : max1)+1); 958 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 959 ++no_malloc; 960 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 961 ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr); 962 963 ct3 = 0; 964 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 965 rbuf_i = rbuf[i]; 966 rbuf_0 = rbuf_i[0]; 967 ct1 = 2*rbuf_0+1; 968 ct2 = ct1; 969 ct3 += ct1; 970 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 971 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 972 oct2 = ct2; 973 kmax = rbuf_i[2*j]; 974 for (k=0; k<kmax; k++,ct1++) { 975 row = rbuf_i[ct1]; 976 if (!PetscBTLookupSet(xtable,row)) { 977 if (!(ct3 < mem_estimate)) { 978 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 979 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 980 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 981 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 982 xdata[0] = tmp; 983 mem_estimate = new_estimate; ++no_malloc; 984 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 985 } 986 xdata[i][ct2++] = row; 987 ct3++; 988 } 989 } 990 for (k=oct2,max2=ct2; k<max2; k++) { 991 row = xdata[i][k] - rstart; 992 start = ai[row]; 993 end = ai[row+1]; 994 for (l=start; l<end; l++) { 995 val = aj[l] + cstart; 996 if (!PetscBTLookupSet(xtable,val)) { 997 if (!(ct3 < mem_estimate)) { 998 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 999 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1000 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 1001 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1002 xdata[0] = tmp; 1003 mem_estimate = new_estimate; ++no_malloc; 1004 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1005 } 1006 xdata[i][ct2++] = val; 1007 ct3++; 1008 } 1009 } 1010 start = bi[row]; 1011 end = bi[row+1]; 1012 for (l=start; l<end; l++) { 1013 val = garray[bj[l]]; 1014 if (!PetscBTLookupSet(xtable,val)) { 1015 if (!(ct3 < mem_estimate)) { 1016 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1017 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1018 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 1019 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1020 xdata[0] = tmp; 1021 mem_estimate = new_estimate; ++no_malloc; 1022 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1023 } 1024 xdata[i][ct2++] = val; 1025 ct3++; 1026 } 1027 } 1028 } 1029 /* Update the header*/ 1030 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1031 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1032 } 1033 xdata[i][0] = rbuf_0; 1034 xdata[i+1] = xdata[i] + ct2; 1035 isz1[i] = ct2; /* size of each message */ 1036 } 1037 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1038 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1039 PetscFunctionReturn(0); 1040 } 1041 /* -------------------------------------------------------------------------*/ 1042 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1043 /* 1044 Every processor gets the entire matrix 1045 */ 1046 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1047 { 1048 Mat B; 1049 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1050 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1051 PetscErrorCode ierr; 1052 PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL; 1053 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1054 PetscInt m,*b_sendj,*garray = 
a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1055 1056 PetscFunctionBegin; 1057 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 1058 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr); 1059 if (scall == MAT_INITIAL_MATRIX) { 1060 /* ---------------------------------------------------------------- 1061 Tell every processor the number of nonzeros per row 1062 */ 1063 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1064 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1065 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1066 } 1067 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1068 for (i=0; i<size; i++) { 1069 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1070 displs[i] = A->rmap->range[i]; 1071 } 1072 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1073 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1074 #else 1075 sendcount = A->rmap->rend - A->rmap->rstart; 1076 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1077 #endif 1078 /* --------------------------------------------------------------- 1079 Create the sequential matrix of the same type as the local block diagonal 1080 */ 1081 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1082 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1083 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1084 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1085 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1086 ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr); 1087 **Bin = B; 1088 b = (Mat_SeqAIJ*)B->data; 1089 1090 /*-------------------------------------------------------------------- 1091 Copy my part of matrix column indices over 1092 */ 1093 sendcount = ad->nz + bd->nz; 1094 jsendbuf = b->j + b->i[rstarts[rank]]; 1095 a_jsendbuf = ad->j; 1096 b_jsendbuf = bd->j; 1097 n = A->rmap->rend - A->rmap->rstart; 1098 cnt = 0; 1099 for (i=0; i<n; i++) { 1100 /* put in lower diagonal portion */ 1101 m = bd->i[i+1] - bd->i[i]; 1102 while (m > 0) { 1103 /* is it above diagonal (in bd (compressed) numbering) */ 1104 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1105 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1106 m--; 1107 } 1108 1109 /* put in diagonal portion */ 1110 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1111 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1112 } 1113 1114 /* put in upper diagonal portion */ 1115 while (m-- > 0) { 1116 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1117 } 1118 } 1119 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1120 1121 /*-------------------------------------------------------------------- 1122 Gather all column indices to all processors 1123 */ 1124 for (i=0; i<size; i++) { 1125 recvcounts[i] = 0; 1126 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1127 recvcounts[i] += lens[j]; 1128 } 1129 } 1130 displs[0] = 0; 1131 for (i=1; i<size; i++) { 1132 displs[i] = displs[i-1] + recvcounts[i-1]; 1133 } 1134 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1135 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1136 #else 1137 ierr = 
MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1138 #endif 1139 /*-------------------------------------------------------------------- 1140 Assemble the matrix into useable form (note numerical values not yet set) 1141 */ 1142 /* set the b->ilen (length of each row) values */ 1143 ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr); 1144 /* set the b->i indices */ 1145 b->i[0] = 0; 1146 for (i=1; i<=A->rmap->N; i++) { 1147 b->i[i] = b->i[i-1] + lens[i-1]; 1148 } 1149 ierr = PetscFree(lens);CHKERRQ(ierr); 1150 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1151 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1152 1153 } else { 1154 B = **Bin; 1155 b = (Mat_SeqAIJ*)B->data; 1156 } 1157 1158 /*-------------------------------------------------------------------- 1159 Copy my part of matrix numerical values into the values location 1160 */ 1161 if (flag == MAT_GET_VALUES) { 1162 const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf; 1163 MatScalar *sendbuf,*recvbuf; 1164 1165 ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr); 1166 ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr); 1167 sendcount = ad->nz + bd->nz; 1168 sendbuf = b->a + b->i[rstarts[rank]]; 1169 a_sendbuf = ada; 1170 b_sendbuf = bda; 1171 b_sendj = bd->j; 1172 n = A->rmap->rend - A->rmap->rstart; 1173 cnt = 0; 1174 for (i=0; i<n; i++) { 1175 /* put in lower diagonal portion */ 1176 m = bd->i[i+1] - bd->i[i]; 1177 while (m > 0) { 1178 /* is it above diagonal (in bd (compressed) numbering) */ 1179 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1180 sendbuf[cnt++] = *b_sendbuf++; 1181 m--; 1182 b_sendj++; 1183 } 1184 1185 /* put in diagonal portion */ 1186 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1187 sendbuf[cnt++] = *a_sendbuf++; 1188 } 1189 1190 /* put in upper diagonal portion */ 1191 while (m-- > 0) { 1192 sendbuf[cnt++] = *b_sendbuf++; 1193 b_sendj++; 1194 } 1195 } 1196 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1197 1198 /* ----------------------------------------------------------------- 1199 Gather all numerical values to all processors 1200 */ 1201 if (!recvcounts) { 1202 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1203 } 1204 for (i=0; i<size; i++) { 1205 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1206 } 1207 displs[0] = 0; 1208 for (i=1; i<size; i++) { 1209 displs[i] = displs[i-1] + recvcounts[i-1]; 1210 } 1211 recvbuf = b->a; 1212 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1213 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1214 #else 1215 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1216 #endif 1217 ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr); 1218 ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr); 1219 } /* endof (flag == MAT_GET_VALUES) */ 1220 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1221 1222 if (A->symmetric) { 1223 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1224 } else if (A->hermitian) { 1225 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1226 } else if (A->structurally_symmetric) { 1227 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1228 } 1229 PetscFunctionReturn(0); 1230 } 1231 1232 PetscErrorCode 
MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1233 { 1234 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1235 Mat submat,A = c->A,B = c->B; 1236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1237 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1238 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1239 const PetscInt *icol,*irow; 1240 PetscInt nrow,ncol,start; 1241 PetscErrorCode ierr; 1242 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1243 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1244 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1245 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1246 PetscInt *lens,rmax,ncols,*cols,Crow; 1247 #if defined(PETSC_USE_CTABLE) 1248 PetscTable cmap,rmap; 1249 PetscInt *cmap_loc,*rmap_loc; 1250 #else 1251 PetscInt *cmap,*rmap; 1252 #endif 1253 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1254 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1255 PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL; 1256 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1257 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1258 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1259 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1260 MPI_Comm comm; 1261 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1262 PetscMPIInt *onodes1,*olengths1,idex,end; 1263 Mat_SubSppt *smatis1; 1264 PetscBool isrowsorted,iscolsorted; 1265 1266 PetscFunctionBegin; 1267 PetscValidLogicalCollectiveInt(C,ismax,2); 1268 PetscValidLogicalCollectiveEnum(C,scall,5); 1269 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1270 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1271 ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1272 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1273 size = c->size; 1274 rank = c->rank; 1275 1276 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1277 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1278 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1279 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1280 if (allcolumns) { 1281 icol = NULL; 1282 ncol = C->cmap->N; 1283 } else { 1284 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1285 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1286 } 1287 1288 if (scall == MAT_INITIAL_MATRIX) { 1289 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1290 1291 /* Get some new tags to keep the communication clean */ 1292 tag1 = ((PetscObject)C)->tag; 1293 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1294 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1295 1296 /* evaluate communication - mesg to who, length of mesg, and buffer space 1297 required. 
Based on this, buffers are allocated, and data copied into them */ 1298 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1299 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1300 1301 /* w1[proc] = num of rows owned by proc -- to be requested */ 1302 proc = 0; 1303 nrqs = 0; /* num of outgoing messages */ 1304 for (j=0; j<nrow; j++) { 1305 row = irow[j]; 1306 if (!isrowsorted) proc = 0; 1307 while (row >= C->rmap->range[proc+1]) proc++; 1308 w1[proc]++; 1309 row2proc[j] = proc; /* map row index to proc */ 1310 1311 if (proc != rank && !w2[proc]) { 1312 w2[proc] = 1; nrqs++; 1313 } 1314 } 1315 w1[rank] = 0; /* rows owned by self will not be requested */ 1316 1317 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1318 for (proc=0,j=0; proc<size; proc++) { 1319 if (w1[proc]) { pa[j++] = proc;} 1320 } 1321 1322 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1323 msz = 0; /* total mesg length (for all procs) */ 1324 for (i=0; i<nrqs; i++) { 1325 proc = pa[i]; 1326 w1[proc] += 3; 1327 msz += w1[proc]; 1328 } 1329 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1330 1331 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1332 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1333 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1334 1335 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1336 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1337 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1338 1339 /* Now post the Irecvs corresponding to these messages */ 1340 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1341 1342 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1343 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1344 1345 /* Allocate Memory for outgoing messages */ 1346 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1347 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1348 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1349 1350 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1351 iptr = tmp; 1352 for (i=0; i<nrqs; i++) { 1353 proc = pa[i]; 1354 sbuf1[proc] = iptr; 1355 iptr += w1[proc]; 1356 } 1357 1358 /* Form the outgoing messages */ 1359 /* Initialize the header space */ 1360 for (i=0; i<nrqs; i++) { 1361 proc = pa[i]; 1362 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1363 ptr[proc] = sbuf1[proc] + 3; 1364 } 1365 1366 /* Parse the isrow and copy data into outbuf */ 1367 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1368 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1369 proc = row2proc[j]; 1370 if (proc != rank) { /* copy to the outgoing buf*/ 1371 *ptr[proc] = irow[j]; 1372 ctr[proc]++; ptr[proc]++; 1373 } 1374 } 1375 1376 /* Update the headers for the current IS */ 1377 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1378 if ((ctr_j = ctr[j])) { 1379 sbuf1_j = sbuf1[j]; 1380 k = ++sbuf1_j[0]; 1381 sbuf1_j[2*k] = ctr_j; 1382 sbuf1_j[2*k-1] = 0; 1383 } 1384 } 1385 1386 /* Now post the sends */ 1387 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1388 for (i=0; i<nrqs; ++i) { 1389 proc = pa[i]; 1390 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 1391 } 1392 1393 /* Post Receives to capture the buffer size */ 1394 ierr = 
PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1395 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1396 1397 rbuf2[0] = tmp + msz; 1398 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1399 1400 for (i=0; i<nrqs; ++i) { 1401 proc = pa[i]; 1402 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 1403 } 1404 1405 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1406 1407 /* Send to other procs the buf size they should allocate */ 1408 /* Receive messages*/ 1409 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1410 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1411 1412 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 1413 for (i=0; i<nrqr; ++i) { 1414 req_size[i] = 0; 1415 rbuf1_i = rbuf1[i]; 1416 start = 2*rbuf1_i[0] + 1; 1417 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 1418 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1419 sbuf2_i = sbuf2[i]; 1420 for (j=start; j<end; j++) { 1421 k = rbuf1_i[j] - rstart; 1422 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1423 sbuf2_i[j] = ncols; 1424 req_size[i] += ncols; 1425 } 1426 req_source1[i] = r_status1[i].MPI_SOURCE; 1427 1428 /* form the header */ 1429 sbuf2_i[0] = req_size[i]; 1430 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1431 1432 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 1433 } 1434 1435 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1436 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1437 1438 /* rbuf2 is received, Post recv column indices a->j */ 1439 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 1440 1441 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1442 for (i=0; i<nrqs; ++i) { 1443 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1444 req_source2[i] = r_status2[i].MPI_SOURCE; 1445 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 1446 } 1447 1448 /* Wait on sends1 and sends2 */ 1449 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1450 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr); 1451 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1452 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1453 1454 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr); 1455 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1456 1457 /* Now allocate sending buffers for a->j, and send them off */ 1458 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1459 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1460 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1461 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1462 1463 for (i=0; i<nrqr; i++) { /* for each requested message */ 1464 rbuf1_i = rbuf1[i]; 1465 sbuf_aj_i = sbuf_aj[i]; 1466 ct1 = 2*rbuf1_i[0] + 1; 1467 ct2 = 0; 1468 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1469 1470 kmax = rbuf1[i][2]; 1471 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1472 row = rbuf1_i[ct1] - rstart; 1473 nzA = ai[row+1] - ai[row]; 1474 nzB = bi[row+1] - bi[row]; 1475 ncols = nzA + nzB; 1476 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1477 1478 /* load the column indices for this row into cols*/ 1479 cols = sbuf_aj_i + ct2; 1480 1481 lwrite = 0; 1482 for (l=0; l<nzB; l++) { 1483 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] 
= ctmp; 1484 } 1485 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1486 for (l=0; l<nzB; l++) { 1487 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1488 } 1489 1490 ct2 += ncols; 1491 } 1492 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 1493 } 1494 1495 /* create column map (cmap): global col of C -> local col of submat */ 1496 #if defined(PETSC_USE_CTABLE) 1497 if (!allcolumns) { 1498 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1499 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1500 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1501 if (icol[j] >= cstart && icol[j] <cend) { 1502 cmap_loc[icol[j] - cstart] = j+1; 1503 } else { /* use PetscTable for non-local col indices */ 1504 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1505 } 1506 } 1507 } else { 1508 cmap = NULL; 1509 cmap_loc = NULL; 1510 } 1511 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1512 #else 1513 if (!allcolumns) { 1514 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1515 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1516 } else { 1517 cmap = NULL; 1518 } 1519 #endif 1520 1521 /* Create lens for MatSeqAIJSetPreallocation() */ 1522 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1523 1524 /* Compute lens from local part of C */ 1525 for (j=0; j<nrow; j++) { 1526 row = irow[j]; 1527 proc = row2proc[j]; 1528 if (proc == rank) { 1529 /* diagonal part A = c->A */ 1530 ncols = ai[row-rstart+1] - ai[row-rstart]; 1531 cols = aj + ai[row-rstart]; 1532 if (!allcolumns) { 1533 for (k=0; k<ncols; k++) { 1534 #if defined(PETSC_USE_CTABLE) 1535 tcol = cmap_loc[cols[k]]; 1536 #else 1537 tcol = cmap[cols[k]+cstart]; 1538 #endif 1539 if (tcol) lens[j]++; 1540 } 1541 } else { /* allcolumns */ 1542 lens[j] = ncols; 1543 } 1544 1545 /* off-diagonal part B = c->B */ 1546 ncols = bi[row-rstart+1] - bi[row-rstart]; 1547 cols = bj + bi[row-rstart]; 1548 if (!allcolumns) { 1549 for (k=0; k<ncols; k++) { 1550 #if defined(PETSC_USE_CTABLE) 1551 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1552 #else 1553 tcol = cmap[bmap[cols[k]]]; 1554 #endif 1555 if (tcol) lens[j]++; 1556 } 1557 } else { /* allcolumns */ 1558 lens[j] += ncols; 1559 } 1560 } 1561 } 1562 1563 /* Create row map (rmap): global row of C -> local row of submat */ 1564 #if defined(PETSC_USE_CTABLE) 1565 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1566 for (j=0; j<nrow; j++) { 1567 row = irow[j]; 1568 proc = row2proc[j]; 1569 if (proc == rank) { /* a local row */ 1570 rmap_loc[row - rstart] = j; 1571 } else { 1572 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1573 } 1574 } 1575 #else 1576 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1577 for (j=0; j<nrow; j++) { 1578 rmap[irow[j]] = j; 1579 } 1580 #endif 1581 1582 /* Update lens from offproc data */ 1583 /* recv a->j is done */ 1584 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 1585 for (i=0; i<nrqs; i++) { 1586 proc = pa[i]; 1587 sbuf1_i = sbuf1[proc]; 1588 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1589 ct1 = 2 + 1; 1590 ct2 = 0; 1591 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1592 rbuf3_i = rbuf3[i]; /* received C->j */ 1593 1594 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1595 max1 = sbuf1_i[2]; 1596 for (k=0; k<max1; k++,ct1++) { 1597 #if 
defined(PETSC_USE_CTABLE) 1598 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1599 row--; 1600 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1601 #else 1602 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1603 #endif 1604 /* Now, store row index of submat in sbuf1_i[ct1] */ 1605 sbuf1_i[ct1] = row; 1606 1607 nnz = rbuf2_i[ct1]; 1608 if (!allcolumns) { 1609 for (l=0; l<nnz; l++,ct2++) { 1610 #if defined(PETSC_USE_CTABLE) 1611 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1612 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1613 } else { 1614 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1615 } 1616 #else 1617 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1618 #endif 1619 if (tcol) lens[row]++; 1620 } 1621 } else { /* allcolumns */ 1622 lens[row] += nnz; 1623 } 1624 } 1625 } 1626 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr); 1627 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1628 1629 /* Create the submatrices */ 1630 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1631 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1632 1633 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1634 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1635 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1636 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1637 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1638 1639 /* create struct Mat_SubSppt and attached it to submat */ 1640 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1641 subc = (Mat_SeqAIJ*)submat->data; 1642 subc->submatis1 = smatis1; 1643 1644 smatis1->id = 0; 1645 smatis1->nrqs = nrqs; 1646 smatis1->nrqr = nrqr; 1647 smatis1->rbuf1 = rbuf1; 1648 smatis1->rbuf2 = rbuf2; 1649 smatis1->rbuf3 = rbuf3; 1650 smatis1->sbuf2 = sbuf2; 1651 smatis1->req_source2 = req_source2; 1652 1653 smatis1->sbuf1 = sbuf1; 1654 smatis1->ptr = ptr; 1655 smatis1->tmp = tmp; 1656 smatis1->ctr = ctr; 1657 1658 smatis1->pa = pa; 1659 smatis1->req_size = req_size; 1660 smatis1->req_source1 = req_source1; 1661 1662 smatis1->allcolumns = allcolumns; 1663 smatis1->singleis = PETSC_TRUE; 1664 smatis1->row2proc = row2proc; 1665 smatis1->rmap = rmap; 1666 smatis1->cmap = cmap; 1667 #if defined(PETSC_USE_CTABLE) 1668 smatis1->rmap_loc = rmap_loc; 1669 smatis1->cmap_loc = cmap_loc; 1670 #endif 1671 1672 smatis1->destroy = submat->ops->destroy; 1673 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1674 submat->factortype = C->factortype; 1675 1676 /* compute rmax */ 1677 rmax = 0; 1678 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1679 1680 } else { /* scall == MAT_REUSE_MATRIX */ 1681 submat = submats[0]; 1682 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1683 1684 subc = (Mat_SeqAIJ*)submat->data; 1685 rmax = subc->rmax; 1686 smatis1 = subc->submatis1; 1687 nrqs = smatis1->nrqs; 1688 nrqr = smatis1->nrqr; 1689 rbuf1 = smatis1->rbuf1; 1690 rbuf2 = smatis1->rbuf2; 1691 rbuf3 = smatis1->rbuf3; 1692 req_source2 = smatis1->req_source2; 1693 1694 sbuf1 = smatis1->sbuf1; 1695 sbuf2 = smatis1->sbuf2; 1696 ptr = smatis1->ptr; 1697 tmp = smatis1->tmp; 1698 ctr = smatis1->ctr; 1699 1700 pa = smatis1->pa; 1701 req_size = smatis1->req_size; 1702 req_source1 = smatis1->req_source1; 1703 1704 allcolumns = smatis1->allcolumns; 1705 row2proc = smatis1->row2proc; 1706 rmap = smatis1->rmap; 1707 cmap = smatis1->cmap; 1708 #if defined(PETSC_USE_CTABLE) 1709 rmap_loc = smatis1->rmap_loc; 1710 cmap_loc = smatis1->cmap_loc; 1711 #endif 1712 } 1713 1714 /* Post recv matrix values */ 1715 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1716 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1717 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1718 for (i=0; i<nrqs; ++i) { 1719 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1720 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 1721 } 1722 1723 /* Allocate sending buffers for a->a, and send them off */ 1724 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1725 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1726 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1727 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1728 1729 for (i=0; i<nrqr; i++) { 1730 rbuf1_i = rbuf1[i]; 1731 sbuf_aa_i = sbuf_aa[i]; 1732 ct1 = 2*rbuf1_i[0]+1; 1733 ct2 = 0; 1734 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1735 1736 kmax = rbuf1_i[2]; 1737 for (k=0; k<kmax; k++,ct1++) { 1738 row = rbuf1_i[ct1] - rstart; 1739 nzA = ai[row+1] - ai[row]; 1740 nzB = bi[row+1] - bi[row]; 1741 ncols = nzA + nzB; 1742 cworkB = bj + bi[row]; 1743 vworkA = a_a + ai[row]; 1744 vworkB = b_a + bi[row]; 1745 1746 /* load the column values for this row into vals*/ 1747 vals = sbuf_aa_i + ct2; 1748 1749 lwrite = 0; 1750 for (l=0; l<nzB; l++) { 1751 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1752 } 1753 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1754 for (l=0; l<nzB; l++) { 1755 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1756 } 1757 1758 ct2 += ncols; 1759 } 1760 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 1761 } 1762 1763 /* Assemble submat */ 1764 /* First assemble the local rows */ 1765 for (j=0; j<nrow; j++) { 1766 row = irow[j]; 1767 proc = row2proc[j]; 1768 if (proc == rank) { 1769 Crow = row - rstart; /* local row index of C */ 1770 #if defined(PETSC_USE_CTABLE) 1771 row = rmap_loc[Crow]; /* row index of submat */ 1772 #else 1773 row = rmap[row]; 1774 #endif 1775 1776 if (allcolumns) { 1777 /* diagonal part A = c->A */ 1778 ncols = ai[Crow+1] - ai[Crow]; 1779 cols = aj + ai[Crow]; 1780 vals = a_a + ai[Crow]; 1781 i = 0; 1782 for (k=0; k<ncols; k++) { 1783 subcols[i] = cols[k] + cstart; 1784 subvals[i++] = vals[k]; 1785 } 1786 1787 /* off-diagonal part B = c->B */ 1788 ncols = bi[Crow+1] - bi[Crow]; 1789 cols = bj + bi[Crow]; 1790 vals = b_a + bi[Crow]; 1791 for (k=0; k<ncols; k++) { 1792 subcols[i] = bmap[cols[k]]; 1793 subvals[i++] = vals[k]; 1794 } 1795 1796 ierr = 
MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1797 1798 } else { /* !allcolumns */ 1799 #if defined(PETSC_USE_CTABLE) 1800 /* diagonal part A = c->A */ 1801 ncols = ai[Crow+1] - ai[Crow]; 1802 cols = aj + ai[Crow]; 1803 vals = a_a + ai[Crow]; 1804 i = 0; 1805 for (k=0; k<ncols; k++) { 1806 tcol = cmap_loc[cols[k]]; 1807 if (tcol) { 1808 subcols[i] = --tcol; 1809 subvals[i++] = vals[k]; 1810 } 1811 } 1812 1813 /* off-diagonal part B = c->B */ 1814 ncols = bi[Crow+1] - bi[Crow]; 1815 cols = bj + bi[Crow]; 1816 vals = b_a + bi[Crow]; 1817 for (k=0; k<ncols; k++) { 1818 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1819 if (tcol) { 1820 subcols[i] = --tcol; 1821 subvals[i++] = vals[k]; 1822 } 1823 } 1824 #else 1825 /* diagonal part A = c->A */ 1826 ncols = ai[Crow+1] - ai[Crow]; 1827 cols = aj + ai[Crow]; 1828 vals = a_a + ai[Crow]; 1829 i = 0; 1830 for (k=0; k<ncols; k++) { 1831 tcol = cmap[cols[k]+cstart]; 1832 if (tcol) { 1833 subcols[i] = --tcol; 1834 subvals[i++] = vals[k]; 1835 } 1836 } 1837 1838 /* off-diagonal part B = c->B */ 1839 ncols = bi[Crow+1] - bi[Crow]; 1840 cols = bj + bi[Crow]; 1841 vals = b_a + bi[Crow]; 1842 for (k=0; k<ncols; k++) { 1843 tcol = cmap[bmap[cols[k]]]; 1844 if (tcol) { 1845 subcols[i] = --tcol; 1846 subvals[i++] = vals[k]; 1847 } 1848 } 1849 #endif 1850 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1851 } 1852 } 1853 } 1854 1855 /* Now assemble the off-proc rows */ 1856 for (i=0; i<nrqs; i++) { /* for each requested message */ 1857 /* recv values from other processes */ 1858 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr); 1859 proc = pa[idex]; 1860 sbuf1_i = sbuf1[proc]; 1861 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1862 ct1 = 2 + 1; 1863 ct2 = 0; /* count of received C->j */ 1864 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1865 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1866 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1867 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1868 1869 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1870 max1 = sbuf1_i[2]; /* num of rows */ 1871 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1872 row = sbuf1_i[ct1]; /* row index of submat */ 1873 if (!allcolumns) { 1874 idex = 0; 1875 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1876 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1877 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1878 #if defined(PETSC_USE_CTABLE) 1879 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1880 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1881 } else { 1882 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1883 } 1884 #else 1885 tcol = cmap[rbuf3_i[ct2]]; 1886 #endif 1887 if (tcol) { 1888 subcols[idex] = --tcol; /* may not be sorted */ 1889 subvals[idex++] = rbuf4_i[ct2]; 1890 1891 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
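(That is, all column indices of a requested row of C arrive here, but only the columns selected by iscol survive the cmap lookup above and are inserted into submat.)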
1892 For reuse, we replace received C->j with index that should be inserted to submat */ 1893 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1894 } 1895 } 1896 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1897 } else { /* scall == MAT_REUSE_MATRIX */ 1898 submat = submats[0]; 1899 subc = (Mat_SeqAIJ*)submat->data; 1900 1901 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1902 for (l=0; l<nnz; l++) { 1903 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1904 subvals[idex++] = rbuf4_i[ct2]; 1905 } 1906 1907 bj = subc->j + subc->i[row]; /* sorted column indices */ 1908 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1909 } 1910 } else { /* allcolumns */ 1911 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1912 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1913 ct2 += nnz; 1914 } 1915 } 1916 } 1917 1918 /* sending a->a are done */ 1919 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr); 1920 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1921 1922 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1923 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1924 submats[0] = submat; 1925 1926 /* Restore the indices */ 1927 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1928 if (!allcolumns) { 1929 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1930 } 1931 1932 /* Destroy allocated memory */ 1933 for (i=0; i<nrqs; ++i) { 1934 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1935 } 1936 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1937 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1938 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1939 1940 if (scall == MAT_INITIAL_MATRIX) { 1941 ierr = PetscFree(lens);CHKERRQ(ierr); 1942 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1943 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1944 } 1945 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1946 ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1947 PetscFunctionReturn(0); 1948 } 1949 1950 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1951 { 1952 PetscErrorCode ierr; 1953 PetscInt ncol; 1954 PetscBool colflag,allcolumns=PETSC_FALSE; 1955 1956 PetscFunctionBegin; 1957 /* Allocate memory to hold all the submatrices */ 1958 if (scall == MAT_INITIAL_MATRIX) { 1959 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1960 } 1961 1962 /* Check for special case: each processor gets entire matrix columns */ 1963 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1964 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1965 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1966 1967 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1968 PetscFunctionReturn(0); 1969 } 1970 1971 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1972 { 1973 PetscErrorCode ierr; 1974 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1975 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1976 Mat_SeqAIJ *subc; 1977 Mat_SubSppt *smat; 1978 1979 PetscFunctionBegin; 1980 /* Check for special case: each processor has a single IS */ 1981 if (C->submat_singleis) { /* 
flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1982 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1983 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1984 PetscFunctionReturn(0); 1985 } 1986 1987 /* Collect global wantallmatrix and nstages */ 1988 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1989 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1990 if (!nmax) nmax = 1; 1991 1992 if (scall == MAT_INITIAL_MATRIX) { 1993 /* Collect global wantallmatrix and nstages */ 1994 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1995 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1996 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1997 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1998 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1999 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2000 wantallmatrix = PETSC_TRUE; 2001 2002 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2003 } 2004 } 2005 2006 /* Determine the number of stages through which submatrices are done 2007 Each stage will extract nmax submatrices. 2008 nmax is determined by the matrix column dimension. 2009 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2010 */ 2011 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 2012 2013 in[0] = -1*(PetscInt)wantallmatrix; 2014 in[1] = nstages; 2015 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr); 2016 wantallmatrix = (PetscBool)(-out[0]); 2017 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2018 2019 } else { /* MAT_REUSE_MATRIX */ 2020 if (ismax) { 2021 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2022 smat = subc->submatis1; 2023 } else { /* (*submat)[0] is a dummy matrix */ 2024 smat = (Mat_SubSppt*)(*submat)[0]->data; 2025 } 2026 if (!smat) { 2027 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) 
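(that routine does not attach a Mat_SubSppt to the submatrix, so a NULL smat here means the initial extraction took the whole-matrix fast path; take it again)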
*/ 2028 wantallmatrix = PETSC_TRUE; 2029 } else if (smat->singleis) { 2030 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2031 PetscFunctionReturn(0); 2032 } else { 2033 nstages = smat->nstages; 2034 } 2035 } 2036 2037 if (wantallmatrix) { 2038 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2039 PetscFunctionReturn(0); 2040 } 2041 2042 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2043 if (scall == MAT_INITIAL_MATRIX) { 2044 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2045 } 2046 2047 for (i=0,pos=0; i<nstages; i++) { 2048 if (pos+nmax <= ismax) max_no = nmax; 2049 else if (pos >= ismax) max_no = 0; 2050 else max_no = ismax-pos; 2051 2052 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2053 if (!max_no) { 2054 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2055 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2056 smat->nstages = nstages; 2057 } 2058 pos++; /* advance to next dummy matrix if any */ 2059 } else pos += max_no; 2060 } 2061 2062 if (ismax && scall == MAT_INITIAL_MATRIX) { 2063 /* save nstages for reuse */ 2064 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2065 smat = subc->submatis1; 2066 smat->nstages = nstages; 2067 } 2068 PetscFunctionReturn(0); 2069 } 2070 2071 /* -------------------------------------------------------------------------*/ 2072 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2073 { 2074 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2075 Mat A = c->A; 2076 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2077 const PetscInt **icol,**irow; 2078 PetscInt *nrow,*ncol,start; 2079 PetscErrorCode ierr; 2080 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2081 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2082 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2083 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2084 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2085 #if defined(PETSC_USE_CTABLE) 2086 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2087 #else 2088 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2089 #endif 2090 const PetscInt *irow_i; 2091 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2092 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2093 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2094 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2095 MPI_Status *r_status3,*r_status4,*s_status4; 2096 MPI_Comm comm; 2097 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2098 PetscMPIInt *onodes1,*olengths1,end; 2099 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2100 Mat_SubSppt *smat_i; 2101 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2102 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2103 2104 PetscFunctionBegin; 2105 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2106 size = c->size; 2107 rank = c->rank; 2108 2109 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2110 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2111 2112 for (i=0; i<ismax; i++) { 2113 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2114 if 
(!issorted[i]) iscsorted = issorted[i]; 2115 2116 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2117 2118 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2119 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2120 2121 /* Check for special case: allcolumn */ 2122 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2123 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2124 if (colflag && ncol[i] == C->cmap->N) { 2125 allcolumns[i] = PETSC_TRUE; 2126 icol[i] = NULL; 2127 } else { 2128 allcolumns[i] = PETSC_FALSE; 2129 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2130 } 2131 } 2132 2133 if (scall == MAT_REUSE_MATRIX) { 2134 /* Assumes new rows are same length as the old rows */ 2135 for (i=0; i<ismax; i++) { 2136 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2137 subc = (Mat_SeqAIJ*)submats[i]->data; 2138 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2139 2140 /* Initial matrix as if empty */ 2141 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2142 2143 smat_i = subc->submatis1; 2144 2145 nrqs = smat_i->nrqs; 2146 nrqr = smat_i->nrqr; 2147 rbuf1 = smat_i->rbuf1; 2148 rbuf2 = smat_i->rbuf2; 2149 rbuf3 = smat_i->rbuf3; 2150 req_source2 = smat_i->req_source2; 2151 2152 sbuf1 = smat_i->sbuf1; 2153 sbuf2 = smat_i->sbuf2; 2154 ptr = smat_i->ptr; 2155 tmp = smat_i->tmp; 2156 ctr = smat_i->ctr; 2157 2158 pa = smat_i->pa; 2159 req_size = smat_i->req_size; 2160 req_source1 = smat_i->req_source1; 2161 2162 allcolumns[i] = smat_i->allcolumns; 2163 row2proc[i] = smat_i->row2proc; 2164 rmap[i] = smat_i->rmap; 2165 cmap[i] = smat_i->cmap; 2166 } 2167 2168 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2169 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2170 smat_i = (Mat_SubSppt*)submats[0]->data; 2171 2172 nrqs = smat_i->nrqs; 2173 nrqr = smat_i->nrqr; 2174 rbuf1 = smat_i->rbuf1; 2175 rbuf2 = smat_i->rbuf2; 2176 rbuf3 = smat_i->rbuf3; 2177 req_source2 = smat_i->req_source2; 2178 2179 sbuf1 = smat_i->sbuf1; 2180 sbuf2 = smat_i->sbuf2; 2181 ptr = smat_i->ptr; 2182 tmp = smat_i->tmp; 2183 ctr = smat_i->ctr; 2184 2185 pa = smat_i->pa; 2186 req_size = smat_i->req_size; 2187 req_source1 = smat_i->req_source1; 2188 2189 allcolumns[0] = PETSC_FALSE; 2190 } 2191 } else { /* scall == MAT_INITIAL_MATRIX */ 2192 /* Get some new tags to keep the communication clean */ 2193 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2194 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2195 2196 /* evaluate communication - mesg to who, length of mesg, and buffer space 2197 required. 
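Here w4[] is per-IS scratch (how many rows of the current IS live on each proc), w1[proc] accumulates the total number of row indices destined for proc, w3[proc] counts how many ISs have at least one row on proc, and w2[proc] is later set to 1 if any message goes to proc at all.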
Based on this, buffers are allocated, and data copied into them*/ 2198 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2199 2200 for (i=0; i<ismax; i++) { 2201 jmax = nrow[i]; 2202 irow_i = irow[i]; 2203 2204 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2205 row2proc[i] = row2proc_i; 2206 2207 if (issorted[i]) proc = 0; 2208 for (j=0; j<jmax; j++) { 2209 if (!issorted[i]) proc = 0; 2210 row = irow_i[j]; 2211 while (row >= C->rmap->range[proc+1]) proc++; 2212 w4[proc]++; 2213 row2proc_i[j] = proc; /* map row index to proc */ 2214 } 2215 for (j=0; j<size; j++) { 2216 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2217 } 2218 } 2219 2220 nrqs = 0; /* no of outgoing messages */ 2221 msz = 0; /* total mesg length (for all procs) */ 2222 w1[rank] = 0; /* no mesg sent to self */ 2223 w3[rank] = 0; 2224 for (i=0; i<size; i++) { 2225 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2226 } 2227 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2228 for (i=0,j=0; i<size; i++) { 2229 if (w1[i]) { pa[j] = i; j++; } 2230 } 2231 2232 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2233 for (i=0; i<nrqs; i++) { 2234 j = pa[i]; 2235 w1[j] += w2[j] + 2* w3[j]; 2236 msz += w1[j]; 2237 } 2238 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2239 2240 /* Determine the number of messages to expect, their lengths, from from-ids */ 2241 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2242 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2243 2244 /* Now post the Irecvs corresponding to these messages */ 2245 tag0 = ((PetscObject)C)->tag; 2246 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2247 2248 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2249 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2250 2251 /* Allocate Memory for outgoing messages */ 2252 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2253 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2254 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2255 2256 { 2257 PetscInt *iptr = tmp; 2258 k = 0; 2259 for (i=0; i<nrqs; i++) { 2260 j = pa[i]; 2261 iptr += k; 2262 sbuf1[j] = iptr; 2263 k = w1[j]; 2264 } 2265 } 2266 2267 /* Form the outgoing messages. 
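Each outgoing sbuf1[j] is laid out roughly as
        [ nis | is_0, cnt_0 | is_1, cnt_1 | ... | rows for is_0 | rows for is_1 | ... ]
     where nis is the number of index sets requesting rows from proc j, and each pair (is_k, cnt_k) records the IS id and how many of its row indices follow in the data part.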
Initialize the header space */ 2268 for (i=0; i<nrqs; i++) { 2269 j = pa[i]; 2270 sbuf1[j][0] = 0; 2271 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2272 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2273 } 2274 2275 /* Parse the isrow and copy data into outbuf */ 2276 for (i=0; i<ismax; i++) { 2277 row2proc_i = row2proc[i]; 2278 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2279 irow_i = irow[i]; 2280 jmax = nrow[i]; 2281 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2282 proc = row2proc_i[j]; 2283 if (proc != rank) { /* copy to the outgoing buf*/ 2284 ctr[proc]++; 2285 *ptr[proc] = irow_i[j]; 2286 ptr[proc]++; 2287 } 2288 } 2289 /* Update the headers for the current IS */ 2290 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2291 if ((ctr_j = ctr[j])) { 2292 sbuf1_j = sbuf1[j]; 2293 k = ++sbuf1_j[0]; 2294 sbuf1_j[2*k] = ctr_j; 2295 sbuf1_j[2*k-1] = i; 2296 } 2297 } 2298 } 2299 2300 /* Now post the sends */ 2301 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2302 for (i=0; i<nrqs; ++i) { 2303 j = pa[i]; 2304 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr); 2305 } 2306 2307 /* Post Receives to capture the buffer size */ 2308 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2309 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2310 rbuf2[0] = tmp + msz; 2311 for (i=1; i<nrqs; ++i) { 2312 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2313 } 2314 for (i=0; i<nrqs; ++i) { 2315 j = pa[i]; 2316 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 2317 } 2318 2319 /* Send to other procs the buf size they should allocate */ 2320 /* Receive messages*/ 2321 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2322 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2323 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2324 { 2325 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2326 PetscInt *sbuf2_i; 2327 2328 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 2329 for (i=0; i<nrqr; ++i) { 2330 req_size[i] = 0; 2331 rbuf1_i = rbuf1[i]; 2332 start = 2*rbuf1_i[0] + 1; 2333 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 2334 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2335 sbuf2_i = sbuf2[i]; 2336 for (j=start; j<end; j++) { 2337 id = rbuf1_i[j] - rstart; 2338 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2339 sbuf2_i[j] = ncols; 2340 req_size[i] += ncols; 2341 } 2342 req_source1[i] = r_status1[i].MPI_SOURCE; 2343 /* form the header */ 2344 sbuf2_i[0] = req_size[i]; 2345 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2346 2347 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 2348 } 2349 } 2350 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2351 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2352 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2353 2354 /* Receive messages*/ 2355 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2356 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2357 2358 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 2359 for (i=0; i<nrqs; ++i) { 2360 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2361 req_source2[i] = r_status2[i].MPI_SOURCE; 2362 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 2363 } 2364 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2365 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2366 2367 /* Wait on sends1 and sends2 */ 2368 ierr = 
PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2369 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2370 2371 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);} 2372 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);} 2373 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2374 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2375 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2376 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2377 2378 /* Now allocate sending buffers for a->j, and send them off */ 2379 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2380 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2381 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2382 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2383 2384 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2385 { 2386 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2387 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2388 PetscInt cend = C->cmap->rend; 2389 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2390 2391 for (i=0; i<nrqr; i++) { 2392 rbuf1_i = rbuf1[i]; 2393 sbuf_aj_i = sbuf_aj[i]; 2394 ct1 = 2*rbuf1_i[0] + 1; 2395 ct2 = 0; 2396 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2397 kmax = rbuf1[i][2*j]; 2398 for (k=0; k<kmax; k++,ct1++) { 2399 row = rbuf1_i[ct1] - rstart; 2400 nzA = a_i[row+1] - a_i[row]; 2401 nzB = b_i[row+1] - b_i[row]; 2402 ncols = nzA + nzB; 2403 cworkA = a_j + a_i[row]; 2404 cworkB = b_j + b_i[row]; 2405 2406 /* load the column indices for this row into cols */ 2407 cols = sbuf_aj_i + ct2; 2408 2409 lwrite = 0; 2410 for (l=0; l<nzB; l++) { 2411 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2412 } 2413 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2414 for (l=0; l<nzB; l++) { 2415 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2416 } 2417 2418 ct2 += ncols; 2419 } 2420 } 2421 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 2422 } 2423 } 2424 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2425 2426 /* create col map: global col of C -> local col of submatrices */ 2427 { 2428 const PetscInt *icol_i; 2429 #if defined(PETSC_USE_CTABLE) 2430 for (i=0; i<ismax; i++) { 2431 if (!allcolumns[i]) { 2432 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2433 2434 jmax = ncol[i]; 2435 icol_i = icol[i]; 2436 cmap_i = cmap[i]; 2437 for (j=0; j<jmax; j++) { 2438 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2439 } 2440 } else cmap[i] = NULL; 2441 } 2442 #else 2443 for (i=0; i<ismax; i++) { 2444 if (!allcolumns[i]) { 2445 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2446 jmax = ncol[i]; 2447 icol_i = icol[i]; 2448 cmap_i = cmap[i]; 2449 for (j=0; j<jmax; j++) { 2450 cmap_i[icol_i[j]] = j+1; 2451 } 2452 } else cmap[i] = NULL; 2453 } 2454 #endif 2455 } 2456 2457 /* Create lens which is required for MatCreate... 
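lens[i][j] will hold the number of nonzeros of local row j of submatrix i and is later handed to MatSeqAIJSetPreallocation(); the ismax row-length arrays share one contiguous allocation.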
*/ 2458 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2459 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2460 2461 if (ismax) { 2462 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2463 } 2464 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2465 2466 /* Update lens from local data */ 2467 for (i=0; i<ismax; i++) { 2468 row2proc_i = row2proc[i]; 2469 jmax = nrow[i]; 2470 if (!allcolumns[i]) cmap_i = cmap[i]; 2471 irow_i = irow[i]; 2472 lens_i = lens[i]; 2473 for (j=0; j<jmax; j++) { 2474 row = irow_i[j]; 2475 proc = row2proc_i[j]; 2476 if (proc == rank) { 2477 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2478 if (!allcolumns[i]) { 2479 for (k=0; k<ncols; k++) { 2480 #if defined(PETSC_USE_CTABLE) 2481 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2482 #else 2483 tcol = cmap_i[cols[k]]; 2484 #endif 2485 if (tcol) lens_i[j]++; 2486 } 2487 } else { /* allcolumns */ 2488 lens_i[j] = ncols; 2489 } 2490 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2491 } 2492 } 2493 } 2494 2495 /* Create row map: global row of C -> local row of submatrices */ 2496 #if defined(PETSC_USE_CTABLE) 2497 for (i=0; i<ismax; i++) { 2498 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2499 irow_i = irow[i]; 2500 jmax = nrow[i]; 2501 for (j=0; j<jmax; j++) { 2502 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2503 } 2504 } 2505 #else 2506 for (i=0; i<ismax; i++) { 2507 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2508 rmap_i = rmap[i]; 2509 irow_i = irow[i]; 2510 jmax = nrow[i]; 2511 for (j=0; j<jmax; j++) { 2512 rmap_i[irow_i[j]] = j; 2513 } 2514 } 2515 #endif 2516 2517 /* Update lens from offproc data */ 2518 { 2519 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2520 2521 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 2522 for (tmp2=0; tmp2<nrqs; tmp2++) { 2523 sbuf1_i = sbuf1[pa[tmp2]]; 2524 jmax = sbuf1_i[0]; 2525 ct1 = 2*jmax+1; 2526 ct2 = 0; 2527 rbuf2_i = rbuf2[tmp2]; 2528 rbuf3_i = rbuf3[tmp2]; 2529 for (j=1; j<=jmax; j++) { 2530 is_no = sbuf1_i[2*j-1]; 2531 max1 = sbuf1_i[2*j]; 2532 lens_i = lens[is_no]; 2533 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2534 rmap_i = rmap[is_no]; 2535 for (k=0; k<max1; k++,ct1++) { 2536 #if defined(PETSC_USE_CTABLE) 2537 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2538 row--; 2539 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2540 #else 2541 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2542 #endif 2543 max2 = rbuf2_i[ct1]; 2544 for (l=0; l<max2; l++,ct2++) { 2545 if (!allcolumns[is_no]) { 2546 #if defined(PETSC_USE_CTABLE) 2547 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2548 #else 2549 tcol = cmap_i[rbuf3_i[ct2]]; 2550 #endif 2551 if (tcol) lens_i[row]++; 2552 } else { /* allcolumns */ 2553 lens_i[row]++; /* lens_i[row] += max2 ? 
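(equivalently, hoist this out of the l-loop as lens_i[row] += max2, with ct2 advanced by max2 as well)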
*/ 2554 } 2555 } 2556 } 2557 } 2558 } 2559 } 2560 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2561 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);} 2562 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2563 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2564 2565 /* Create the submatrices */ 2566 for (i=0; i<ismax; i++) { 2567 PetscInt rbs,cbs; 2568 2569 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2570 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2571 2572 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2573 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2574 2575 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2576 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2577 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2578 2579 /* create struct Mat_SubSppt and attached it to submat */ 2580 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2581 subc = (Mat_SeqAIJ*)submats[i]->data; 2582 subc->submatis1 = smat_i; 2583 2584 smat_i->destroy = submats[i]->ops->destroy; 2585 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2586 submats[i]->factortype = C->factortype; 2587 2588 smat_i->id = i; 2589 smat_i->nrqs = nrqs; 2590 smat_i->nrqr = nrqr; 2591 smat_i->rbuf1 = rbuf1; 2592 smat_i->rbuf2 = rbuf2; 2593 smat_i->rbuf3 = rbuf3; 2594 smat_i->sbuf2 = sbuf2; 2595 smat_i->req_source2 = req_source2; 2596 2597 smat_i->sbuf1 = sbuf1; 2598 smat_i->ptr = ptr; 2599 smat_i->tmp = tmp; 2600 smat_i->ctr = ctr; 2601 2602 smat_i->pa = pa; 2603 smat_i->req_size = req_size; 2604 smat_i->req_source1 = req_source1; 2605 2606 smat_i->allcolumns = allcolumns[i]; 2607 smat_i->singleis = PETSC_FALSE; 2608 smat_i->row2proc = row2proc[i]; 2609 smat_i->rmap = rmap[i]; 2610 smat_i->cmap = cmap[i]; 2611 } 2612 2613 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2614 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2615 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2616 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2617 2618 /* create struct Mat_SubSppt and attached it to submat */ 2619 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2620 submats[0]->data = (void*)smat_i; 2621 2622 smat_i->destroy = submats[0]->ops->destroy; 2623 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2624 submats[0]->factortype = C->factortype; 2625 2626 smat_i->id = 0; 2627 smat_i->nrqs = nrqs; 2628 smat_i->nrqr = nrqr; 2629 smat_i->rbuf1 = rbuf1; 2630 smat_i->rbuf2 = rbuf2; 2631 smat_i->rbuf3 = rbuf3; 2632 smat_i->sbuf2 = sbuf2; 2633 smat_i->req_source2 = req_source2; 2634 2635 smat_i->sbuf1 = sbuf1; 2636 smat_i->ptr = ptr; 2637 smat_i->tmp = tmp; 2638 smat_i->ctr = ctr; 2639 2640 smat_i->pa = pa; 2641 smat_i->req_size = req_size; 2642 smat_i->req_source1 = req_source1; 2643 2644 smat_i->allcolumns = PETSC_FALSE; 2645 smat_i->singleis = PETSC_FALSE; 2646 smat_i->row2proc = NULL; 2647 smat_i->rmap = NULL; 2648 smat_i->cmap = NULL; 2649 } 2650 2651 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2652 ierr = PetscFree(lens);CHKERRQ(ierr); 2653 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2654 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2655 2656 } /* endof scall == MAT_INITIAL_MATRIX */ 2657 2658 /* Post recv matrix values */ 2659 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2660 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2661 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2662 ierr = 
PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2663 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2664 for (i=0; i<nrqs; ++i) { 2665 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2666 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 2667 } 2668 2669 /* Allocate sending buffers for a->a, and send them off */ 2670 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2671 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2672 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2673 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2674 2675 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2676 { 2677 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2678 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2679 PetscInt cend = C->cmap->rend; 2680 PetscInt *b_j = b->j; 2681 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2682 2683 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2684 ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2685 for (i=0; i<nrqr; i++) { 2686 rbuf1_i = rbuf1[i]; 2687 sbuf_aa_i = sbuf_aa[i]; 2688 ct1 = 2*rbuf1_i[0]+1; 2689 ct2 = 0; 2690 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2691 kmax = rbuf1_i[2*j]; 2692 for (k=0; k<kmax; k++,ct1++) { 2693 row = rbuf1_i[ct1] - rstart; 2694 nzA = a_i[row+1] - a_i[row]; 2695 nzB = b_i[row+1] - b_i[row]; 2696 ncols = nzA + nzB; 2697 cworkB = b_j + b_i[row]; 2698 vworkA = a_a + a_i[row]; 2699 vworkB = b_a + b_i[row]; 2700 2701 /* load the column values for this row into vals*/ 2702 vals = sbuf_aa_i+ct2; 2703 2704 lwrite = 0; 2705 for (l=0; l<nzB; l++) { 2706 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2707 } 2708 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2709 for (l=0; l<nzB; l++) { 2710 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2711 } 2712 2713 ct2 += ncols; 2714 } 2715 } 2716 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 2717 } 2718 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2719 ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2720 } 2721 2722 /* Assemble the matrices */ 2723 /* First assemble the local rows */ 2724 for (i=0; i<ismax; i++) { 2725 row2proc_i = row2proc[i]; 2726 subc = (Mat_SeqAIJ*)submats[i]->data; 2727 imat_ilen = subc->ilen; 2728 imat_j = subc->j; 2729 imat_i = subc->i; 2730 imat_a = subc->a; 2731 2732 if (!allcolumns[i]) cmap_i = cmap[i]; 2733 rmap_i = rmap[i]; 2734 irow_i = irow[i]; 2735 jmax = nrow[i]; 2736 for (j=0; j<jmax; j++) { 2737 row = irow_i[j]; 2738 proc = row2proc_i[j]; 2739 if (proc == rank) { 2740 old_row = row; 2741 #if defined(PETSC_USE_CTABLE) 2742 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2743 row--; 2744 #else 2745 row = rmap_i[row]; 2746 #endif 2747 ilen_row = imat_ilen[row]; 2748 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2749 mat_i = imat_i[row]; 2750 mat_a = imat_a + mat_i; 2751 mat_j = imat_j + mat_i; 2752 if (!allcolumns[i]) { 2753 for (k=0; k<ncols; k++) { 2754 #if defined(PETSC_USE_CTABLE) 2755 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2756 #else 2757 tcol = cmap_i[cols[k]]; 2758 #endif 2759 if (tcol) { 2760 *mat_j++ = tcol - 1; 2761 *mat_a++ = vals[k]; 2762 ilen_row++; 2763 } 2764 } 2765 } else { /* allcolumns */ 2766 for (k=0; k<ncols; k++) { 2767 *mat_j++ = cols[k]; /* global col index! 
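(with allcolumns the submatrix keeps all N columns and iscol is the identity, so C's global column index is also the submatrix column index)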
*/ 2768 *mat_a++ = vals[k]; 2769 ilen_row++; 2770 } 2771 } 2772 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2773 2774 imat_ilen[row] = ilen_row; 2775 } 2776 } 2777 } 2778 2779 /* Now assemble the off proc rows */ 2780 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRMPI(ierr); 2781 for (tmp2=0; tmp2<nrqs; tmp2++) { 2782 sbuf1_i = sbuf1[pa[tmp2]]; 2783 jmax = sbuf1_i[0]; 2784 ct1 = 2*jmax + 1; 2785 ct2 = 0; 2786 rbuf2_i = rbuf2[tmp2]; 2787 rbuf3_i = rbuf3[tmp2]; 2788 rbuf4_i = rbuf4[tmp2]; 2789 for (j=1; j<=jmax; j++) { 2790 is_no = sbuf1_i[2*j-1]; 2791 rmap_i = rmap[is_no]; 2792 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2793 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2794 imat_ilen = subc->ilen; 2795 imat_j = subc->j; 2796 imat_i = subc->i; 2797 imat_a = subc->a; 2798 max1 = sbuf1_i[2*j]; 2799 for (k=0; k<max1; k++,ct1++) { 2800 row = sbuf1_i[ct1]; 2801 #if defined(PETSC_USE_CTABLE) 2802 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2803 row--; 2804 #else 2805 row = rmap_i[row]; 2806 #endif 2807 ilen = imat_ilen[row]; 2808 mat_i = imat_i[row]; 2809 mat_a = imat_a + mat_i; 2810 mat_j = imat_j + mat_i; 2811 max2 = rbuf2_i[ct1]; 2812 if (!allcolumns[is_no]) { 2813 for (l=0; l<max2; l++,ct2++) { 2814 #if defined(PETSC_USE_CTABLE) 2815 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2816 #else 2817 tcol = cmap_i[rbuf3_i[ct2]]; 2818 #endif 2819 if (tcol) { 2820 *mat_j++ = tcol - 1; 2821 *mat_a++ = rbuf4_i[ct2]; 2822 ilen++; 2823 } 2824 } 2825 } else { /* allcolumns */ 2826 for (l=0; l<max2; l++,ct2++) { 2827 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2828 *mat_a++ = rbuf4_i[ct2]; 2829 ilen++; 2830 } 2831 } 2832 imat_ilen[row] = ilen; 2833 } 2834 } 2835 } 2836 2837 if (!iscsorted) { /* sort column indices of the rows */ 2838 for (i=0; i<ismax; i++) { 2839 subc = (Mat_SeqAIJ*)submats[i]->data; 2840 imat_j = subc->j; 2841 imat_i = subc->i; 2842 imat_a = subc->a; 2843 imat_ilen = subc->ilen; 2844 2845 if (allcolumns[i]) continue; 2846 jmax = nrow[i]; 2847 for (j=0; j<jmax; j++) { 2848 mat_i = imat_i[j]; 2849 mat_a = imat_a + mat_i; 2850 mat_j = imat_j + mat_i; 2851 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2852 } 2853 } 2854 } 2855 2856 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2857 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2858 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);} 2859 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2860 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2861 2862 /* Restore the indices */ 2863 for (i=0; i<ismax; i++) { 2864 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2865 if (!allcolumns[i]) { 2866 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2867 } 2868 } 2869 2870 for (i=0; i<ismax; i++) { 2871 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2872 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2873 } 2874 2875 /* Destroy allocated memory */ 2876 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2877 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2878 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2879 2880 for (i=0; i<nrqs; ++i) { 2881 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2882 } 2883 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2884 2885 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2886 PetscFunctionReturn(0); 2887 } 2888 2889 /* 2890 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 
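(On each process, A supplies C's diagonal block aij->A and B its off-diagonal block aij->B.)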
2891 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2892 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n). 2893 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2894 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2895 state, and needs to be "assembled" later by compressing B's column space. 2896 2897 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2898 Following this call, C->A & C->B have been created, even if empty. 2899 */ 2900 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2901 { 2902 /* If making this function public, change the error returned in this function away from _PLIB. */ 2903 PetscErrorCode ierr; 2904 Mat_MPIAIJ *aij; 2905 Mat_SeqAIJ *Baij; 2906 PetscBool seqaij,Bdisassembled; 2907 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2908 PetscScalar v; 2909 const PetscInt *rowindices,*colindices; 2910 2911 PetscFunctionBegin; 2912 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2913 if (A) { 2914 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2915 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2916 if (rowemb) { 2917 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2918 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2919 } else { 2920 if (C->rmap->n != A->rmap->n) { 2921 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2922 } 2923 } 2924 if (dcolemb) { 2925 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2926 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2927 } else { 2928 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2929 } 2930 } 2931 if (B) { 2932 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2933 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2934 if (rowemb) { 2935 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2936 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n); 2937 } else { 2938 if (C->rmap->n != B->rmap->n) { 2939 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2940 } 2941 } 2942 if (ocolemb) { 2943 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2944 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2945 } else { 2946 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2947 } 2948 } 2949 2950 aij = (Mat_MPIAIJ*)C->data; 2951 if (!aij->A) { 2952 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2953 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2954 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2955 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2956 ierr =
MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2957 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2958 } 2959 if (A) { 2960 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2961 } else { 2962 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2963 } 2964 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2965 /* 2966 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2967 need to "disassemble" B -- convert it to using C's global indices. 2968 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2969 2970 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2971 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2972 2973 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2974 At least avoid calling MatSetValues() and the implied searches? 2975 */ 2976 2977 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2978 #if defined(PETSC_USE_CTABLE) 2979 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2980 #else 2981 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2982 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2983 if (aij->B) { 2984 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2985 } 2986 #endif 2987 ngcol = 0; 2988 if (aij->lvec) { 2989 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2990 } 2991 if (aij->garray) { 2992 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2993 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2994 } 2995 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2996 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2997 } 2998 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2999 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 3000 } 3001 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3002 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3003 } 3004 } 3005 Bdisassembled = PETSC_FALSE; 3006 if (!aij->B) { 3007 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3008 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3009 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3010 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3011 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3012 Bdisassembled = PETSC_TRUE; 3013 } 3014 if (B) { 3015 Baij = (Mat_SeqAIJ*)B->data; 3016 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3017 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3018 for (i=0; i<B->rmap->n; i++) { 3019 nz[i] = Baij->i[i+1] - Baij->i[i]; 3020 } 3021 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3022 ierr = PetscFree(nz);CHKERRQ(ierr); 3023 } 3024 3025 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3026 shift = rend-rstart; 3027 count = 0; 3028 rowindices = NULL; 3029 colindices = NULL; 3030 if (rowemb) { 3031 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3032 } 3033 if (ocolemb) { 3034 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3035 } 3036 for (i=0; i<B->rmap->n; i++) { 3037 PetscInt row; 3038 row = i; 3039 if (rowindices) row = rowindices[i]; 3040 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3041 col = Baij->j[count]; 3042 if (colindices) col = colindices[col]; 3043 if (Bdisassembled && col>=rstart) col += shift; 3044 v = Baij->a[count]; 3045 ierr = 
MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3046 ++count; 3047 } 3048 } 3049 /* No assembly for aij->B is necessary. */ 3050 /* FIXME: set aij->B's nonzerostate correctly. */ 3051 } else { 3052 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3053 } 3054 C->preallocated = PETSC_TRUE; 3055 C->was_assembled = PETSC_FALSE; 3056 C->assembled = PETSC_FALSE; 3057 /* 3058 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3059 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3060 */ 3061 PetscFunctionReturn(0); 3062 } 3063 3064 /* 3065 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3066 */ 3067 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3068 { 3069 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3070 3071 PetscFunctionBegin; 3072 PetscValidPointer(A,2); 3073 PetscValidPointer(B,3); 3074 /* FIXME: make sure C is assembled */ 3075 *A = aij->A; 3076 *B = aij->B; 3077 /* Note that we don't incref *A and *B, so be careful! */ 3078 PetscFunctionReturn(0); 3079 } 3080 3081 /* 3082 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3083 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3084 */ 3085 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3086 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3087 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3088 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3089 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3090 { 3091 PetscErrorCode ierr; 3092 PetscMPIInt size,flag; 3093 PetscInt i,ii,cismax,ispar; 3094 Mat *A,*B; 3095 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3096 3097 PetscFunctionBegin; 3098 if (!ismax) PetscFunctionReturn(0); 3099 3100 for (i = 0, cismax = 0; i < ismax; ++i) { 3101 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr); 3102 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3103 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr); 3104 if (size > 1) ++cismax; 3105 } 3106 3107 /* 3108 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3109 ispar counts the number of parallel ISs across C's comm. 3110 */ 3111 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr); 3112 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3113 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3114 PetscFunctionReturn(0); 3115 } 3116 3117 /* if (ispar) */ 3118 /* 3119 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3120 These are used to extract the off-diag portion of the resulting parallel matrix. 3121 The row IS for the off-diag portion is the same as for the diag portion, 3122 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 
3123 */ 3124 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3125 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3126 for (i = 0, ii = 0; i < ismax; ++i) { 3127 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3128 if (size > 1) { 3129 /* 3130 TODO: This is the part that's ***NOT SCALABLE***. 3131 To fix this we need to extract just the indices of C's nonzero columns 3132 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3133 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3134 to be done without serializing on the IS list, so, most likely, it is best 3135 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3136 */ 3137 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3138 /* Now we have to 3139 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3140 were sorted on each rank, concatenated they might no longer be sorted; 3141 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3142 indices in the nondecreasing order to the original index positions. 3143 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3144 */ 3145 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3146 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3147 ++ii; 3148 } 3149 } 3150 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3151 for (i = 0, ii = 0; i < ismax; ++i) { 3152 PetscInt j,issize; 3153 const PetscInt *indices; 3154 3155 /* 3156 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3157 */ 3158 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3159 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3160 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3161 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3162 for (j = 1; j < issize; ++j) { 3163 if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3164 } 3165 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3166 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3167 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3168 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3169 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3170 for (j = 1; j < issize; ++j) { 3171 if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3172 } 3173 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3174 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3175 if (size > 1) { 3176 cisrow[ii] = isrow[i]; 3177 ++ii; 3178 } 3179 } 3180 /* 3181 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3182 array of sequential matrices underlying the resulting parallel matrices. 3183 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3184 contain duplicates. 3185 3186 There are as many diag matrices as there are original index sets. There are only as many parallel 3187 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    if (size > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
       values into A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
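
  /*
     Bookkeeping illustration (hypothetical data): with ismax = 3 index sets of which only isrow[1]
     lives on a communicator of size 1, cismax = 2, and the loop below pairs the pieces as
       i = 0 (parallel)   :  A[0] with B[0]  ->  (*submat)[0] is MPIAIJ
       i = 1 (sequential) :  A[1] alone      ->  (*submat)[1] stays sequential
       i = 2 (parallel)   :  A[2] with B[1]  ->  (*submat)[2] is MPIAIJ
     i indexes the original ISs while ii counts only the parallel ones.
  */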
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
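
/*
  Minimal usage sketch (illustrative only; not part of this file): these routines are reached through
  the public MatCreateSubMatricesMPI() entry point. When the index sets live on C's communicator
  (size > 1), each extracted submatrix is itself parallel. Here C is assumed to be an assembled
  MPIAIJ matrix created elsewhere, and nloc/first describe whatever local slice the caller wants.

    IS  isrow,iscol;
    Mat *submat;

    ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&isrow);CHKERRQ(ierr);
    ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nloc,first,1,&iscol);CHKERRQ(ierr);
    ierr = MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
    ierr = MatDestroySubMatrices(1,&submat);CHKERRQ(ierr);
    ierr = ISDestroy(&isrow);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol);CHKERRQ(ierr);
*/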