1 2 /* 3 Defines basic operations for the MATSEQAIJPERM matrix class. 4 This class is derived from the MATSEQAIJ class and retains the 5 compressed row storage (aka Yale sparse matrix format) but augments 6 it with some permutation information that enables some operations 7 to be more vectorizable. A physically rearranged copy of the matrix 8 may be stored if the user desires. 9 10 Eventually a variety of permutations may be supported. 11 */ 12 13 #include <../src/mat/impls/aij/seq/aij.h> 14 15 #define NDIM 512 16 /* NDIM specifies how many rows at a time we should work with when 17 * performing the vectorized mat-vec. This depends on various factors 18 * such as vector register length, etc., and I really need to add a 19 * way for the user (or the library) to tune this. I'm setting it to 20 * 512 for now since that is what Ed D'Azevedo was using in his Fortran 21 * routines. */ 22 23 typedef struct { 24 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 25 26 PetscInt ngroup; 27 PetscInt *xgroup; 28 /* Denotes where groups of rows with same number of nonzeros 29 * begin and end, i.e., xgroup[i] gives us the position in iperm[] 30 * where the ith group begins. */ 31 32 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 33 PetscInt *iperm; /* The permutation vector. */ 34 35 /* Some of this stuff is for Ed's recursive triangular solve. 36 * I'm not sure what I need yet. */ 37 PetscInt blocksize; 38 PetscInt nstep; 39 PetscInt *jstart_list; 40 PetscInt *jend_list; 41 PetscInt *action_list; 42 PetscInt *ngroup_list; 43 PetscInt **ipointer_list; 44 PetscInt **xgroup_list; 45 PetscInt **nzgroup_list; 46 PetscInt **iperm_list; 47 } Mat_SeqAIJPERM; 48 49 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 50 { 51 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 52 /* so we will ignore 'MatType type'. */ 53 PetscErrorCode ierr; 54 Mat B = *newmat; 55 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 56 57 PetscFunctionBegin; 58 if (reuse == MAT_INITIAL_MATRIX) { 59 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 60 aijperm=(Mat_SeqAIJPERM*)B->spptr; 61 } 62 63 /* Reset the original function pointers. */ 64 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 65 B->ops->destroy = MatDestroy_SeqAIJ; 66 B->ops->duplicate = MatDuplicate_SeqAIJ; 67 B->ops->mult = MatMult_SeqAIJ; 68 B->ops->multadd = MatMultAdd_SeqAIJ; 69 70 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr); 71 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 72 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 73 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 74 75 /* Free everything in the Mat_SeqAIJPERM data structure.*/ 76 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 77 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 78 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 79 ierr = PetscFree(B->spptr);CHKERRQ(ierr); 80 81 /* Change the type of B to MATSEQAIJ. */ 82 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 83 84 *newmat = B; 85 PetscFunctionReturn(0); 86 } 87 88 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 89 { 90 PetscErrorCode ierr; 91 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 92 93 PetscFunctionBegin; 94 if (aijperm) { 95 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 96 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 97 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 98 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 99 ierr = PetscFree(A->spptr);CHKERRQ(ierr); 100 } 101 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 102 * to destroy everything that remains. */ 103 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 104 /* Note that I don't call MatSetType(). I believe this is because that 105 * is only to be called when *building* a matrix. I could be wrong, but 106 * that is how things work for the SuperLU matrix class. */ 107 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 111 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 112 { 113 PetscErrorCode ierr; 114 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 115 Mat_SeqAIJPERM *aijperm_dest; 116 PetscBool perm; 117 118 PetscFunctionBegin; 119 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 120 ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr); 121 if (perm) { 122 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 123 ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr); 124 ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr); 125 ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr); 126 } else { 127 ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr); 128 (*M)->spptr = (void*) aijperm_dest; 129 ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr); 130 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 131 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 132 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 133 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 134 } 135 ierr = PetscMemcpy(aijperm_dest,aijperm,sizeof(Mat_SeqAIJPERM));CHKERRQ(ierr); 136 /* Allocate space for, and copy the grouping and permutation info. 137 * I note that when the groups are initially determined in 138 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 139 * necessary. But at this point, we know how large they need to be, and 140 * allocate only the necessary amount of memory. So the duplicated matrix 141 * may actually use slightly less storage than the original! */ 142 ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr); 143 ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr); 144 ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr); 145 ierr = PetscMemcpy(aijperm_dest->iperm,aijperm->iperm,sizeof(PetscInt)*A->rmap->n);CHKERRQ(ierr); 146 ierr = PetscMemcpy(aijperm_dest->xgroup,aijperm->xgroup,sizeof(PetscInt)*(aijperm->ngroup+1));CHKERRQ(ierr); 147 ierr = PetscMemcpy(aijperm_dest->nzgroup,aijperm->nzgroup,sizeof(PetscInt)*aijperm->ngroup);CHKERRQ(ierr); 148 PetscFunctionReturn(0); 149 } 150 151 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 152 { 153 PetscErrorCode ierr; 154 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 155 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 156 PetscInt m; /* Number of rows in the matrix. */ 157 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 158 PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 159 PetscInt *rows_in_bucket; 160 /* To construct the permutation, we sort each row into one of maxnz 161 * buckets based on how many nonzeros are in the row. */ 162 PetscInt nz; 163 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 164 PetscInt *ipnz; 165 /* When constructing the iperm permutation vector, 166 * ipnz[nz] is used to point to the next place in the permutation vector 167 * that a row with nz nonzero elements should be placed.*/ 168 PetscInt i, ngroup, istart, ipos; 169 170 PetscFunctionBegin; 171 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 172 aijperm->nonzerostate = A->nonzerostate; 173 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 174 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 175 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 176 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 177 178 m = A->rmap->n; 179 ia = a->i; 180 181 /* Allocate the arrays that will hold the permutation vector. */ 182 ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr); 183 184 /* Allocate some temporary work arrays that will be used in 185 * calculating the permuation vector and groupings. */ 186 ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr); 187 188 /* Now actually figure out the permutation and grouping. */ 189 190 /* First pass: Determine number of nonzeros in each row, maximum 191 * number of nonzeros in any row, and how many rows fall into each 192 * "bucket" of rows with same number of nonzeros. */ 193 maxnz = 0; 194 for (i=0; i<m; i++) { 195 nz_in_row[i] = ia[i+1]-ia[i]; 196 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 197 } 198 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr); 199 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr); 200 201 for (i=0; i<=maxnz; i++) { 202 rows_in_bucket[i] = 0; 203 } 204 for (i=0; i<m; i++) { 205 nz = nz_in_row[i]; 206 rows_in_bucket[nz]++; 207 } 208 209 /* Allocate space for the grouping info. There will be at most (maxnz + 1) 210 * groups. (It is maxnz + 1 instead of simply maxnz because there may be 211 * rows with no nonzero elements.) If there are (maxnz + 1) groups, 212 * then xgroup[] must consist of (maxnz + 2) elements, since the last 213 * element of xgroup will tell us where the (maxnz + 1)th group ends. 214 * We allocate space for the maximum number of groups; 215 * that is potentially a little wasteful, but not too much so. 216 * Perhaps I should fix it later. */ 217 ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr); 218 ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr); 219 220 /* Second pass. Look at what is in the buckets and create the groupings. 221 * Note that it is OK to have a group of rows with no non-zero values. */ 222 ngroup = 0; 223 istart = 0; 224 for (i=0; i<=maxnz; i++) { 225 if (rows_in_bucket[i] > 0) { 226 aijperm->nzgroup[ngroup] = i; 227 aijperm->xgroup[ngroup] = istart; 228 ngroup++; 229 istart += rows_in_bucket[i]; 230 } 231 } 232 233 aijperm->xgroup[ngroup] = istart; 234 aijperm->ngroup = ngroup; 235 236 /* Now fill in the permutation vector iperm. */ 237 ipnz[0] = 0; 238 for (i=0; i<maxnz; i++) { 239 ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 240 } 241 242 for (i=0; i<m; i++) { 243 nz = nz_in_row[i]; 244 ipos = ipnz[nz]; 245 aijperm->iperm[ipos] = i; 246 ipnz[nz]++; 247 } 248 249 /* Clean up temporary work arrays. */ 250 ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr); 251 ierr = PetscFree(ipnz);CHKERRQ(ierr); 252 ierr = PetscFree(nz_in_row);CHKERRQ(ierr); 253 PetscFunctionReturn(0); 254 } 255 256 257 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 258 { 259 PetscErrorCode ierr; 260 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 261 262 PetscFunctionBegin; 263 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 264 265 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 266 * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 267 * I'm not sure if this is the best way to do this, but it avoids 268 * a lot of code duplication. 269 * I also note that currently MATSEQAIJPERM doesn't know anything about 270 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 271 * are many zero rows. If the SeqAIJ assembly end routine decides to use 272 * this, this may break things. (Don't know... haven't looked at it.) */ 273 a->inode.use = PETSC_FALSE; 274 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 275 276 /* Now calculate the permutation and grouping information. */ 277 ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr); 278 PetscFunctionReturn(0); 279 } 280 281 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 282 { 283 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 284 const PetscScalar *x; 285 PetscScalar *y; 286 const MatScalar *aa; 287 PetscErrorCode ierr; 288 const PetscInt *aj,*ai; 289 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 290 PetscInt i,j; 291 #endif 292 293 /* Variables that don't appear in MatMult_SeqAIJ. */ 294 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 295 PetscInt *iperm; /* Points to the permutation vector. */ 296 PetscInt *xgroup; 297 /* Denotes where groups of rows with same number of nonzeros 298 * begin and end in iperm. */ 299 PetscInt *nzgroup; 300 PetscInt ngroup; 301 PetscInt igroup; 302 PetscInt jstart,jend; 303 /* jstart is used in loops to denote the position in iperm where a 304 * group starts; jend denotes the position where it ends. 305 * (jend + 1 is where the next group starts.) */ 306 PetscInt iold,nz; 307 PetscInt istart,iend,isize; 308 PetscInt ipos; 309 PetscScalar yp[NDIM]; 310 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 311 312 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 313 #pragma disjoint(*x,*y,*aa) 314 #endif 315 316 PetscFunctionBegin; 317 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 318 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 319 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 320 aa = a->a; /* Nonzero elements stored row-by-row. */ 321 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 322 323 /* Get the info we need about the permutations and groupings. */ 324 iperm = aijperm->iperm; 325 ngroup = aijperm->ngroup; 326 xgroup = aijperm->xgroup; 327 nzgroup = aijperm->nzgroup; 328 329 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 330 fortranmultaijperm_(&m,x,ii,aj,aa,y); 331 #else 332 333 for (igroup=0; igroup<ngroup; igroup++) { 334 jstart = xgroup[igroup]; 335 jend = xgroup[igroup+1] - 1; 336 nz = nzgroup[igroup]; 337 338 /* Handle the special cases where the number of nonzeros per row 339 * in the group is either 0 or 1. */ 340 if (nz == 0) { 341 for (i=jstart; i<=jend; i++) { 342 y[iperm[i]] = 0.0; 343 } 344 } else if (nz == 1) { 345 for (i=jstart; i<=jend; i++) { 346 iold = iperm[i]; 347 ipos = ai[iold]; 348 y[iold] = aa[ipos] * x[aj[ipos]]; 349 } 350 } else { 351 352 /* We work our way through the current group in chunks of NDIM rows 353 * at a time. */ 354 355 for (istart=jstart; istart<=jend; istart+=NDIM) { 356 /* Figure out where the chunk of 'isize' rows ends in iperm. 357 * 'isize may of course be less than NDIM for the last chunk. */ 358 iend = istart + (NDIM - 1); 359 360 if (iend > jend) iend = jend; 361 362 isize = iend - istart + 1; 363 364 /* Initialize the yp[] array that will be used to hold part of 365 * the permuted results vector, and figure out where in aa each 366 * row of the chunk will begin. */ 367 for (i=0; i<isize; i++) { 368 iold = iperm[istart + i]; 369 /* iold is a row number from the matrix A *before* reordering. */ 370 ip[i] = ai[iold]; 371 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 372 yp[i] = (PetscScalar) 0.0; 373 } 374 375 /* If the number of zeros per row exceeds the number of rows in 376 * the chunk, we should vectorize along nz, that is, perform the 377 * mat-vec one row at a time as in the usual CSR case. */ 378 if (nz > isize) { 379 #if defined(PETSC_HAVE_CRAY_VECTOR) 380 #pragma _CRI preferstream 381 #endif 382 for (i=0; i<isize; i++) { 383 #if defined(PETSC_HAVE_CRAY_VECTOR) 384 #pragma _CRI prefervector 385 #endif 386 for (j=0; j<nz; j++) { 387 ipos = ip[i] + j; 388 yp[i] += aa[ipos] * x[aj[ipos]]; 389 } 390 } 391 } else { 392 /* Otherwise, there are enough rows in the chunk to make it 393 * worthwhile to vectorize across the rows, that is, to do the 394 * matvec by operating with "columns" of the chunk. */ 395 for (j=0; j<nz; j++) { 396 for (i=0; i<isize; i++) { 397 ipos = ip[i] + j; 398 yp[i] += aa[ipos] * x[aj[ipos]]; 399 } 400 } 401 } 402 403 #if defined(PETSC_HAVE_CRAY_VECTOR) 404 #pragma _CRI ivdep 405 #endif 406 /* Put results from yp[] into non-permuted result vector y. */ 407 for (i=0; i<isize; i++) { 408 y[iperm[istart+i]] = yp[i]; 409 } 410 } /* End processing chunk of isize rows of a group. */ 411 } /* End handling matvec for chunk with nz > 1. */ 412 } /* End loop over igroup. */ 413 #endif 414 ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr); 415 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 416 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 417 PetscFunctionReturn(0); 418 } 419 420 421 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 422 * Note that the names I used to designate the vectors differs from that 423 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 424 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 425 /* 426 I hate having virtually identical code for the mult and the multadd!!! 427 */ 428 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 429 { 430 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 431 const PetscScalar *x; 432 PetscScalar *y,*w; 433 const MatScalar *aa; 434 PetscErrorCode ierr; 435 const PetscInt *aj,*ai; 436 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 437 PetscInt i,j; 438 #endif 439 440 /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 441 Mat_SeqAIJPERM * aijperm; 442 PetscInt *iperm; /* Points to the permutation vector. */ 443 PetscInt *xgroup; 444 /* Denotes where groups of rows with same number of nonzeros 445 * begin and end in iperm. */ 446 PetscInt *nzgroup; 447 PetscInt ngroup; 448 PetscInt igroup; 449 PetscInt jstart,jend; 450 /* jstart is used in loops to denote the position in iperm where a 451 * group starts; jend denotes the position where it ends. 452 * (jend + 1 is where the next group starts.) */ 453 PetscInt iold,nz; 454 PetscInt istart,iend,isize; 455 PetscInt ipos; 456 PetscScalar yp[NDIM]; 457 PetscInt ip[NDIM]; 458 /* yp[] and ip[] are treated as vector "registers" for performing 459 * the mat-vec. */ 460 461 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 462 #pragma disjoint(*x,*y,*aa) 463 #endif 464 465 PetscFunctionBegin; 466 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 467 ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 468 469 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 470 aa = a->a; /* Nonzero elements stored row-by-row. */ 471 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 472 473 /* Get the info we need about the permutations and groupings. */ 474 aijperm = (Mat_SeqAIJPERM*) A->spptr; 475 iperm = aijperm->iperm; 476 ngroup = aijperm->ngroup; 477 xgroup = aijperm->xgroup; 478 nzgroup = aijperm->nzgroup; 479 480 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 481 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 482 #else 483 484 for (igroup=0; igroup<ngroup; igroup++) { 485 jstart = xgroup[igroup]; 486 jend = xgroup[igroup+1] - 1; 487 488 nz = nzgroup[igroup]; 489 490 /* Handle the special cases where the number of nonzeros per row 491 * in the group is either 0 or 1. */ 492 if (nz == 0) { 493 for (i=jstart; i<=jend; i++) { 494 iold = iperm[i]; 495 y[iold] = w[iold]; 496 } 497 } 498 else if (nz == 1) { 499 for (i=jstart; i<=jend; i++) { 500 iold = iperm[i]; 501 ipos = ai[iold]; 502 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 503 } 504 } 505 /* For the general case: */ 506 else { 507 508 /* We work our way through the current group in chunks of NDIM rows 509 * at a time. */ 510 511 for (istart=jstart; istart<=jend; istart+=NDIM) { 512 /* Figure out where the chunk of 'isize' rows ends in iperm. 513 * 'isize may of course be less than NDIM for the last chunk. */ 514 iend = istart + (NDIM - 1); 515 if (iend > jend) iend = jend; 516 isize = iend - istart + 1; 517 518 /* Initialize the yp[] array that will be used to hold part of 519 * the permuted results vector, and figure out where in aa each 520 * row of the chunk will begin. */ 521 for (i=0; i<isize; i++) { 522 iold = iperm[istart + i]; 523 /* iold is a row number from the matrix A *before* reordering. */ 524 ip[i] = ai[iold]; 525 /* ip[i] tells us where the ith row of the chunk begins in aa. */ 526 yp[i] = w[iold]; 527 } 528 529 /* If the number of zeros per row exceeds the number of rows in 530 * the chunk, we should vectorize along nz, that is, perform the 531 * mat-vec one row at a time as in the usual CSR case. */ 532 if (nz > isize) { 533 #if defined(PETSC_HAVE_CRAY_VECTOR) 534 #pragma _CRI preferstream 535 #endif 536 for (i=0; i<isize; i++) { 537 #if defined(PETSC_HAVE_CRAY_VECTOR) 538 #pragma _CRI prefervector 539 #endif 540 for (j=0; j<nz; j++) { 541 ipos = ip[i] + j; 542 yp[i] += aa[ipos] * x[aj[ipos]]; 543 } 544 } 545 } 546 /* Otherwise, there are enough rows in the chunk to make it 547 * worthwhile to vectorize across the rows, that is, to do the 548 * matvec by operating with "columns" of the chunk. */ 549 else { 550 for (j=0; j<nz; j++) { 551 for (i=0; i<isize; i++) { 552 ipos = ip[i] + j; 553 yp[i] += aa[ipos] * x[aj[ipos]]; 554 } 555 } 556 } 557 558 #if defined(PETSC_HAVE_CRAY_VECTOR) 559 #pragma _CRI ivdep 560 #endif 561 /* Put results from yp[] into non-permuted result vector y. */ 562 for (i=0; i<isize; i++) { 563 y[iperm[istart+i]] = yp[i]; 564 } 565 } /* End processing chunk of isize rows of a group. */ 566 567 } /* End handling matvec for chunk with nz > 1. */ 568 } /* End loop over igroup. */ 569 570 #endif 571 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 572 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 573 ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 574 PetscFunctionReturn(0); 575 } 576 577 578 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 579 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 580 * routine, but can also be used to convert an assembled SeqAIJ matrix 581 * into a SeqAIJPERM one. */ 582 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 583 { 584 PetscErrorCode ierr; 585 Mat B = *newmat; 586 Mat_SeqAIJPERM *aijperm; 587 PetscBool sametype; 588 589 PetscFunctionBegin; 590 if (reuse == MAT_INITIAL_MATRIX) { 591 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 592 } 593 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 594 if (sametype) PetscFunctionReturn(0); 595 596 ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr); 597 B->spptr = (void*) aijperm; 598 599 /* Set function pointers for methods that we inherit from AIJ but override. */ 600 B->ops->duplicate = MatDuplicate_SeqAIJPERM; 601 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 602 B->ops->destroy = MatDestroy_SeqAIJPERM; 603 B->ops->mult = MatMult_SeqAIJPERM; 604 B->ops->multadd = MatMultAdd_SeqAIJPERM; 605 606 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 607 /* If A has already been assembled, compute the permutation. */ 608 if (A->assembled) { 609 ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr); 610 } 611 612 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 613 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 614 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 615 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 616 617 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr); 618 *newmat = B; 619 PetscFunctionReturn(0); 620 } 621 622 /*@C 623 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 624 This type inherits from AIJ, but calculates some additional permutation 625 information that is used to allow better vectorization of some 626 operations. At the cost of increased storage, the AIJ formatted 627 matrix can be copied to a format in which pieces of the matrix are 628 stored in ELLPACK format, allowing the vectorized matrix multiply 629 routine to use stride-1 memory accesses. As with the AIJ type, it is 630 important to preallocate matrix storage in order to get good assembly 631 performance. 632 633 Collective on MPI_Comm 634 635 Input Parameters: 636 + comm - MPI communicator, set to PETSC_COMM_SELF 637 . m - number of rows 638 . n - number of columns 639 . nz - number of nonzeros per row (same for all rows) 640 - nnz - array containing the number of nonzeros in the various rows 641 (possibly different for each row) or NULL 642 643 Output Parameter: 644 . A - the matrix 645 646 Notes: 647 If nnz is given then nz is ignored 648 649 Level: intermediate 650 651 .keywords: matrix, cray, sparse, parallel 652 653 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues() 654 @*/ 655 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 656 { 657 PetscErrorCode ierr; 658 659 PetscFunctionBegin; 660 ierr = MatCreate(comm,A);CHKERRQ(ierr); 661 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 662 ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr); 663 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 664 PetscFunctionReturn(0); 665 } 666 667 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 668 { 669 PetscErrorCode ierr; 670 671 PetscFunctionBegin; 672 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 673 ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 674 PetscFunctionReturn(0); 675 } 676 677