1 /* 2 Defines basic operations for the MATSEQAIJMKL matrix class. 3 This class is derived from the MATSEQAIJ class and retains the 4 compressed row storage (aka Yale sparse matrix format) but uses 5 sparse BLAS operations from the Intel Math Kernel Library (MKL) 6 wherever possible. 7 */ 8 9 #include <../src/mat/impls/aij/seq/aij.h> 10 #include <../src/mat/impls/aij/seq/aijmkl/aijmkl.h> 11 12 /* MKL include files. */ 13 #include <mkl_spblas.h> /* Sparse BLAS */ 14 15 typedef struct { 16 PetscBool no_SpMV2; /* If PETSC_TRUE, then don't use the MKL SpMV2 inspector-executor routines. */ 17 PetscBool sparse_optimized; /* If PETSC_TRUE, then mkl_sparse_optimize() has been called. */ 18 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 19 sparse_matrix_t csrA; /* "Handle" used by SpMV2 inspector-executor routines. */ 20 struct matrix_descr descr; 21 #endif 22 } Mat_SeqAIJMKL; 23 24 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 25 26 #undef __FUNCT__ 27 #define __FUNCT__ "MatConvert_SeqAIJMKL_SeqAIJ" 28 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJMKL_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 29 { 30 /* This routine is only called to convert a MATAIJMKL to its base PETSc type, */ 31 /* so we will ignore 'MatType type'. */ 32 PetscErrorCode ierr; 33 Mat B = *newmat; 34 Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr; 35 36 PetscFunctionBegin; 37 if (reuse == MAT_INITIAL_MATRIX) { 38 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 39 aijmkl = (Mat_SeqAIJMKL*)B->spptr; 40 } 41 42 /* Reset the original function pointers. */ 43 B->ops->duplicate = MatDuplicate_SeqAIJ; 44 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 45 B->ops->destroy = MatDestroy_SeqAIJ; 46 B->ops->mult = MatMult_SeqAIJ; 47 B->ops->multtranspose = MatMultTranspose_SeqAIJ; 48 B->ops->multadd = MatMultAdd_SeqAIJ; 49 B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 50 51 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",NULL);CHKERRQ(ierr); 52 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr); 53 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr); 54 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",NULL);CHKERRQ(ierr); 55 56 /* Free everything in the Mat_SeqAIJMKL data structure. Currently, this 57 * simply involves destroying the MKL sparse matrix handle and then freeing 58 * the spptr pointer. */ 59 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 60 if (aijmkl->sparse_optimized) { 61 sparse_status_t stat; 62 stat = mkl_sparse_destroy(aijmkl->csrA); 63 if (stat != SPARSE_STATUS_SUCCESS) { 64 PetscFunctionReturn(PETSC_ERR_LIB); 65 } 66 } 67 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */ 68 ierr = PetscFree(B->spptr);CHKERRQ(ierr); 69 70 /* Change the type of B to MATSEQAIJ. */ 71 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 72 73 *newmat = B; 74 PetscFunctionReturn(0); 75 } 76 77 #undef __FUNCT__ 78 #define __FUNCT__ "MatDestroy_SeqAIJMKL" 79 PetscErrorCode MatDestroy_SeqAIJMKL(Mat A) 80 { 81 PetscErrorCode ierr; 82 Mat_SeqAIJMKL *aijmkl = (Mat_SeqAIJMKL*) A->spptr; 83 84 PetscFunctionBegin; 85 86 /* If MatHeaderMerge() was used, then this SeqAIJMKL matrix will not have an 87 * spptr pointer. */ 88 if (aijmkl) { 89 /* Clean up everything in the Mat_SeqAIJMKL data structure, then free A->spptr. */ 90 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 91 if (aijmkl->sparse_optimized) { 92 sparse_status_t stat = SPARSE_STATUS_SUCCESS; 93 stat = mkl_sparse_destroy(aijmkl->csrA); 94 if (stat != SPARSE_STATUS_SUCCESS) { 95 PetscFunctionReturn(PETSC_ERR_LIB); 96 } 97 } 98 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */ 99 ierr = PetscFree(A->spptr);CHKERRQ(ierr); 100 } 101 102 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 103 * to destroy everything that remains. */ 104 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 105 /* Note that I don't call MatSetType(). I believe this is because that 106 * is only to be called when *building* a matrix. I could be wrong, but 107 * that is how things work for the SuperLU matrix class. */ 108 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 109 PetscFunctionReturn(0); 110 } 111 112 #undef __FUNCT__ 113 #define __FUNCT__ "MatDuplicate_SeqAIJMKL" 114 PetscErrorCode MatDuplicate_SeqAIJMKL(Mat A, MatDuplicateOption op, Mat *M) 115 { 116 PetscErrorCode ierr; 117 Mat_SeqAIJMKL *aijmkl; 118 Mat_SeqAIJMKL *aijmkl_dest; 119 120 PetscFunctionBegin; 121 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 122 aijmkl = (Mat_SeqAIJMKL*) A->spptr; 123 aijmkl_dest = (Mat_SeqAIJMKL*) (*M)->spptr; 124 ierr = PetscMemcpy(aijmkl_dest,aijmkl,sizeof(Mat_SeqAIJMKL));CHKERRQ(ierr); 125 aijmkl_dest->sparse_optimized = PETSC_FALSE; 126 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 127 aijmkl_dest->csrA = NULL; 128 if (!aijmkl->no_SpMV2 && A->cmap->n > 0) { 129 sparse_status_t stat; 130 stat = mkl_sparse_copy(aijmkl->csrA,aijmkl->descr,&aijmkl_dest->csrA); 131 if (stat != SPARSE_STATUS_SUCCESS) { 132 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_copy"); 133 PetscFunctionReturn(PETSC_ERR_LIB); 134 } 135 stat = mkl_sparse_optimize(aijmkl_dest->csrA); 136 if (stat != SPARSE_STATUS_SUCCESS) { 137 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to complete mkl_sparse_optimize"); 138 PetscFunctionReturn(PETSC_ERR_LIB); 139 } 140 aijmkl_dest->sparse_optimized = PETSC_TRUE; 141 } 142 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */ 143 PetscFunctionReturn(0); 144 } 145 146 #undef __FUNCT__ 147 #define __FUNCT__ "MatAssemblyEnd_SeqAIJMKL" 148 PetscErrorCode MatAssemblyEnd_SeqAIJMKL(Mat A, MatAssemblyType mode) 149 { 150 PetscErrorCode ierr; 151 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 152 Mat_SeqAIJMKL *aijmkl; 153 154 MatScalar *aa; 155 PetscInt m,n; 156 PetscInt *aj,*ai; 157 158 PetscFunctionBegin; 159 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 160 161 /* Since a MATSEQAIJMKL matrix is really just a MATSEQAIJ with some 162 * extra information and some different methods, call the AssemblyEnd 163 * routine for a MATSEQAIJ. 164 * I'm not sure if this is the best way to do this, but it avoids 165 * a lot of code duplication. 166 * I also note that currently MATSEQAIJMKL doesn't know anything about 167 * the Mat_CompressedRow data structure that SeqAIJ now uses when there 168 * are many zero rows. If the SeqAIJ assembly end routine decides to use 169 * this, this may break things. (Don't know... haven't looked at it. 170 * Do I need to disable this somehow?) */ 171 a->inode.use = PETSC_FALSE; /* Must disable: otherwise the MKL routines won't get used. */ 172 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 173 174 aijmkl = (Mat_SeqAIJMKL*) A->spptr; 175 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 176 if (!aijmkl->no_SpMV2) { 177 sparse_status_t stat; 178 if (aijmkl->sparse_optimized) { 179 /* Matrix has been previously assembled and optimized. Must destroy old 180 * matrix handle before running the optimization step again. */ 181 sparse_status_t stat; 182 stat = mkl_sparse_destroy(aijmkl->csrA); 183 if (stat != SPARSE_STATUS_SUCCESS) { 184 PetscFunctionReturn(PETSC_ERR_LIB); 185 } 186 } 187 /* Now perform the SpMV2 setup and matrix optimization. */ 188 aijmkl->descr.type = SPARSE_MATRIX_TYPE_GENERAL; 189 aijmkl->descr.mode = SPARSE_FILL_MODE_LOWER; 190 aijmkl->descr.diag = SPARSE_DIAG_NON_UNIT; 191 m = A->rmap->n; 192 n = A->cmap->n; 193 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 194 aa = a->a; /* Nonzero elements stored row-by-row. */ 195 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 196 if (n>0) { 197 stat = mkl_sparse_x_create_csr (&aijmkl->csrA,SPARSE_INDEX_BASE_ZERO,m,n,ai,ai+1,aj,aa); 198 stat = mkl_sparse_set_mv_hint(aijmkl->csrA,SPARSE_OPERATION_NON_TRANSPOSE,aijmkl->descr,1000); 199 stat = mkl_sparse_set_memory_hint(aijmkl->csrA,SPARSE_MEMORY_AGGRESSIVE); 200 stat = mkl_sparse_optimize(aijmkl->csrA); 201 if (stat != SPARSE_STATUS_SUCCESS) { 202 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Intel MKL error: unable to create matrix handle/complete mkl_sparse_optimize"); 203 PetscFunctionReturn(PETSC_ERR_LIB); 204 } 205 aijmkl->sparse_optimized = PETSC_TRUE; 206 } 207 } 208 #endif 209 210 PetscFunctionReturn(0); 211 } 212 213 #undef __FUNCT__ 214 #define __FUNCT__ "MatMult_SeqAIJMKL" 215 PetscErrorCode MatMult_SeqAIJMKL(Mat A,Vec xx,Vec yy) 216 { 217 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 218 const PetscScalar *x; 219 PetscScalar *y; 220 const MatScalar *aa; 221 PetscErrorCode ierr; 222 PetscInt m=A->rmap->n; 223 const PetscInt *aj,*ai; 224 225 /* Variables not in MatMult_SeqAIJ. */ 226 char transa = 'n'; /* Used to indicate to MKL that we are not computing the transpose product. */ 227 228 PetscFunctionBegin; 229 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 230 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 231 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 232 aa = a->a; /* Nonzero elements stored row-by-row. */ 233 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 234 235 /* Call MKL sparse BLAS routine to do the MatMult. */ 236 mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y); 237 238 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 239 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 240 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 245 #undef __FUNCT__ 246 #define __FUNCT__ "MatMult_SeqAIJMKL_SpMV2" 247 PetscErrorCode MatMult_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy) 248 { 249 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 250 Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr; 251 const PetscScalar *x; 252 PetscScalar *y; 253 PetscErrorCode ierr; 254 sparse_status_t stat = SPARSE_STATUS_SUCCESS; 255 256 PetscFunctionBegin; 257 258 /* If there are no rows, this is a no-op: return immediately. */ 259 if(A->cmap->n < 1) PetscFunctionReturn(0); 260 261 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 262 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 263 264 /* Call MKL SpMV2 executor routine to do the MatMult. */ 265 stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y); 266 267 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 268 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 269 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 270 if (stat != SPARSE_STATUS_SUCCESS) { 271 PetscFunctionReturn(PETSC_ERR_LIB); 272 } 273 PetscFunctionReturn(0); 274 } 275 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */ 276 277 #undef __FUNCT__ 278 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL" 279 PetscErrorCode MatMultTranspose_SeqAIJMKL(Mat A,Vec xx,Vec yy) 280 { 281 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 282 const PetscScalar *x; 283 PetscScalar *y; 284 const MatScalar *aa; 285 PetscErrorCode ierr; 286 PetscInt m=A->rmap->n; 287 const PetscInt *aj,*ai; 288 289 /* Variables not in MatMultTranspose_SeqAIJ. */ 290 char transa = 't'; /* Used to indicate to MKL that we are computing the transpose product. */ 291 292 PetscFunctionBegin; 293 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 294 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 295 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 296 aa = a->a; /* Nonzero elements stored row-by-row. */ 297 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 298 299 /* Call MKL sparse BLAS routine to do the MatMult. */ 300 mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,y); 301 302 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 303 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 304 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 305 PetscFunctionReturn(0); 306 } 307 308 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 309 #undef __FUNCT__ 310 #define __FUNCT__ "MatMultTranspose_SeqAIJMKL_SpMV2" 311 PetscErrorCode MatMultTranspose_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy) 312 { 313 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 314 Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr; 315 const PetscScalar *x; 316 PetscScalar *y; 317 PetscErrorCode ierr; 318 sparse_status_t stat; 319 320 PetscFunctionBegin; 321 322 /* If there are no rows, this is a no-op: return immediately. */ 323 if(A->cmap->n < 1) PetscFunctionReturn(0); 324 325 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 326 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 327 328 /* Call MKL SpMV2 executor routine to do the MatMultTranspose. */ 329 stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y); 330 331 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr); 332 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 333 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 334 if (stat != SPARSE_STATUS_SUCCESS) { 335 PetscFunctionReturn(PETSC_ERR_LIB); 336 } 337 PetscFunctionReturn(0); 338 } 339 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */ 340 341 #undef __FUNCT__ 342 #define __FUNCT__ "MatMultAdd_SeqAIJMKL" 343 PetscErrorCode MatMultAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz) 344 { 345 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 346 const PetscScalar *x; 347 PetscScalar *y,*z; 348 const MatScalar *aa; 349 PetscErrorCode ierr; 350 PetscInt m=A->rmap->n; 351 const PetscInt *aj,*ai; 352 PetscInt i; 353 354 /* Variables not in MatMultAdd_SeqAIJ. */ 355 char transa = 'n'; /* Used to indicate to MKL that we are not computing the transpose product. */ 356 PetscScalar alpha = 1.0; 357 PetscScalar beta = 1.0; 358 char matdescra[6]; 359 360 PetscFunctionBegin; 361 matdescra[0] = 'g'; /* Indicates to MKL that we using a general CSR matrix. */ 362 matdescra[3] = 'c'; /* Indicates to MKL that we use C-style (0-based) indexing. */ 363 364 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 365 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 366 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 367 aa = a->a; /* Nonzero elements stored row-by-row. */ 368 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 369 370 /* Call MKL sparse BLAS routine to do the MatMult. */ 371 if (zz == yy) { 372 /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */ 373 mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y); 374 } else { 375 /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then 376 * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */ 377 mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z); 378 for (i=0; i<m; i++) { 379 z[i] += y[i]; 380 } 381 } 382 383 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 384 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 385 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 386 PetscFunctionReturn(0); 387 } 388 389 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 390 #undef __FUNCT__ 391 #define __FUNCT__ "MatMultAdd_SeqAIJMKL_SpMV2" 392 PetscErrorCode MatMultAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz) 393 { 394 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 395 Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr; 396 const PetscScalar *x; 397 PetscScalar *y,*z; 398 PetscErrorCode ierr; 399 PetscInt m=A->rmap->n; 400 PetscInt i; 401 402 /* Variables not in MatMultAdd_SeqAIJ. */ 403 sparse_status_t stat = SPARSE_STATUS_SUCCESS; 404 405 PetscFunctionBegin; 406 407 /* If there are no rows, this is a no-op: return immediately. */ 408 if(A->cmap->n < 1) PetscFunctionReturn(0); 409 410 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 411 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 412 413 /* Call MKL sparse BLAS routine to do the MatMult. */ 414 if (zz == yy) { 415 /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y, 416 * with alpha and beta both set to 1.0. */ 417 stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y); 418 } else { 419 /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then 420 * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */ 421 stat = mkl_sparse_x_mv(SPARSE_OPERATION_NON_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y); 422 for (i=0; i<m; i++) { 423 z[i] += y[i]; 424 } 425 } 426 427 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 428 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 429 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 430 if (stat != SPARSE_STATUS_SUCCESS) { 431 PetscFunctionReturn(PETSC_ERR_LIB); 432 } 433 PetscFunctionReturn(0); 434 } 435 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */ 436 437 #undef __FUNCT__ 438 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL" 439 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL(Mat A,Vec xx,Vec yy,Vec zz) 440 { 441 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 442 const PetscScalar *x; 443 PetscScalar *y,*z; 444 const MatScalar *aa; 445 PetscErrorCode ierr; 446 PetscInt m=A->rmap->n; 447 const PetscInt *aj,*ai; 448 PetscInt i; 449 450 /* Variables not in MatMultTransposeAdd_SeqAIJ. */ 451 char transa = 't'; /* Used to indicate to MKL that we are computing the transpose product. */ 452 PetscScalar alpha = 1.0; 453 PetscScalar beta = 1.0; 454 char matdescra[6]; 455 456 PetscFunctionBegin; 457 matdescra[0] = 'g'; /* Indicates to MKL that we using a general CSR matrix. */ 458 matdescra[3] = 'c'; /* Indicates to MKL that we use C-style (0-based) indexing. */ 459 460 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 461 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 462 aj = a->j; /* aj[k] gives column index for element aa[k]. */ 463 aa = a->a; /* Nonzero elements stored row-by-row. */ 464 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 465 466 /* Call MKL sparse BLAS routine to do the MatMult. */ 467 if (zz == yy) { 468 /* If zz and yy are the same vector, we can use MKL's mkl_xcsrmv(), which calculates y = alpha*A*x + beta*y. */ 469 mkl_xcsrmv(&transa,&m,&m,&alpha,matdescra,aa,aj,ai,ai+1,x,&beta,y); 470 } else { 471 /* zz and yy are different vectors, so we call mkl_cspblas_xcsrgemv(), which calculates y = A*x, and then 472 * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */ 473 mkl_cspblas_xcsrgemv(&transa,&m,aa,ai,aj,x,z); 474 for (i=0; i<m; i++) { 475 z[i] += y[i]; 476 } 477 } 478 479 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 480 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 481 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 482 PetscFunctionReturn(0); 483 } 484 485 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 486 #undef __FUNCT__ 487 #define __FUNCT__ "MatMultTransposeAdd_SeqAIJMKL_SpMV2" 488 PetscErrorCode MatMultTransposeAdd_SeqAIJMKL_SpMV2(Mat A,Vec xx,Vec yy,Vec zz) 489 { 490 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 491 Mat_SeqAIJMKL *aijmkl=(Mat_SeqAIJMKL*)A->spptr; 492 const PetscScalar *x; 493 PetscScalar *y,*z; 494 PetscErrorCode ierr; 495 PetscInt m=A->rmap->n; 496 PetscInt i; 497 498 /* Variables not in MatMultTransposeAdd_SeqAIJ. */ 499 sparse_status_t stat = SPARSE_STATUS_SUCCESS; 500 501 PetscFunctionBegin; 502 503 /* If there are no rows, this is a no-op: return immediately. */ 504 if(A->cmap->n < 1) PetscFunctionReturn(0); 505 506 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 507 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 508 509 /* Call MKL sparse BLAS routine to do the MatMult. */ 510 if (zz == yy) { 511 /* If zz and yy are the same vector, we can use mkl_sparse_x_mv, which calculates y = alpha*A*x + beta*y, 512 * with alpha and beta both set to 1.0. */ 513 stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,1.0,y); 514 } else { 515 /* zz and yy are different vectors, so we call mkl_sparse_x_mv with alpha=1.0 and beta=0.0, and then 516 * we add the contents of vector yy to the result; MKL sparse BLAS does not have a MatMultAdd equivalent. */ 517 stat = mkl_sparse_x_mv(SPARSE_OPERATION_TRANSPOSE,1.0,aijmkl->csrA,aijmkl->descr,x,0.0,y); 518 for (i=0; i<m; i++) { 519 z[i] += y[i]; 520 } 521 } 522 523 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 524 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 525 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr); 526 if (stat != SPARSE_STATUS_SUCCESS) { 527 PetscFunctionReturn(PETSC_ERR_LIB); 528 } 529 PetscFunctionReturn(0); 530 } 531 #endif /* PETSC_HAVE_MKL_SPARSE_OPTIMIZE */ 532 533 534 /* MatConvert_SeqAIJ_SeqAIJMKL converts a SeqAIJ matrix into a 535 * SeqAIJMKL matrix. This routine is called by the MatCreate_SeqMKLAIJ() 536 * routine, but can also be used to convert an assembled SeqAIJ matrix 537 * into a SeqAIJMKL one. */ 538 #undef __FUNCT__ 539 #define __FUNCT__ "MatConvert_SeqAIJ_SeqAIJMKL" 540 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJMKL(Mat A,MatType type,MatReuse reuse,Mat *newmat) 541 { 542 PetscErrorCode ierr; 543 Mat B = *newmat; 544 Mat_SeqAIJMKL *aijmkl; 545 PetscBool set; 546 PetscBool sametype; 547 548 PetscFunctionBegin; 549 if (reuse == MAT_INITIAL_MATRIX) { 550 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 551 } 552 553 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 554 if (sametype) PetscFunctionReturn(0); 555 556 ierr = PetscNewLog(B,&aijmkl);CHKERRQ(ierr); 557 B->spptr = (void*) aijmkl; 558 559 /* Set function pointers for methods that we inherit from AIJ but override. 560 * We also parse some command line options below, since those determine some of the methods we point to. 561 * Note: Currently the transposed operations are not being set because I encounter memory corruption 562 * when these are enabled. Need to look at this with Valgrind or similar. --RTM */ 563 B->ops->duplicate = MatDuplicate_SeqAIJMKL; 564 B->ops->assemblyend = MatAssemblyEnd_SeqAIJMKL; 565 B->ops->destroy = MatDestroy_SeqAIJMKL; 566 567 aijmkl->sparse_optimized = PETSC_FALSE; 568 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 569 aijmkl->no_SpMV2 = PETSC_FALSE; /* Default to using the SpMV2 routines if our MKL supports them. */ 570 #elif 571 aijmkl->no_SpMV2 = PETSC_TRUE; 572 #endif 573 574 /* Parse command line options. */ 575 ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"AIJMKL Options","Mat");CHKERRQ(ierr); 576 ierr = PetscOptionsBool("-mat_aijmkl_no_spmv2","NoSPMV2","None",(PetscBool)aijmkl->no_SpMV2,(PetscBool*)&aijmkl->no_SpMV2,&set);CHKERRQ(ierr); 577 ierr = PetscOptionsEnd();CHKERRQ(ierr); 578 #ifndef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 579 if(!aijmkl->no_SpMV2) { 580 ierr = PetscInfo(B,"User requested use of MKL SpMV2 routines, but MKL version does not support mkl_sparse_optimize(); defaulting to non-SpMV2 routines.\n"); 581 aijmkl->no_SpMV2 = PETSC_TRUE; 582 } 583 #endif 584 585 if(!aijmkl->no_SpMV2) { 586 #ifdef PETSC_HAVE_MKL_SPARSE_OPTIMIZE 587 B->ops->mult = MatMult_SeqAIJMKL_SpMV2; 588 /* B->ops->multtranspose = MatMultTranspose_SeqAIJMKL_SpMV2; */ 589 B->ops->multadd = MatMultAdd_SeqAIJMKL_SpMV2; 590 /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL_SpMV2; */ 591 #endif 592 } else { 593 B->ops->mult = MatMult_SeqAIJMKL; 594 /* B->ops->multtranspose = MatMultTranspose_SeqAIJMKL; */ 595 B->ops->multadd = MatMultAdd_SeqAIJMKL; 596 /* B->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJMKL; */ 597 } 598 599 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijmkl_seqaij_C",MatConvert_SeqAIJMKL_SeqAIJ);CHKERRQ(ierr); 600 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijmkl_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 601 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijmkl_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 602 ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijmkl_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 603 604 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJMKL);CHKERRQ(ierr); 605 *newmat = B; 606 PetscFunctionReturn(0); 607 } 608 609 #undef __FUNCT__ 610 #define __FUNCT__ "MatCreateSeqAIJMKL" 611 /*@C 612 MatCreateSeqAIJMKL - Creates a sparse matrix of type SEQAIJMKL. 613 This type inherits from AIJ and is largely identical, but uses sparse BLAS 614 routines from Intel MKL whenever possible. 615 Collective on MPI_Comm 616 617 Input Parameters: 618 + comm - MPI communicator, set to PETSC_COMM_SELF 619 . m - number of rows 620 . n - number of columns 621 . nz - number of nonzeros per row (same for all rows) 622 - nnz - array containing the number of nonzeros in the various rows 623 (possibly different for each row) or NULL 624 625 Output Parameter: 626 . A - the matrix 627 628 Notes: 629 If nnz is given then nz is ignored 630 631 Level: intermediate 632 633 .keywords: matrix, cray, sparse, parallel 634 635 .seealso: MatCreate(), MatCreateMPIAIJMKL(), MatSetValues() 636 @*/ 637 PetscErrorCode MatCreateSeqAIJMKL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 638 { 639 PetscErrorCode ierr; 640 641 PetscFunctionBegin; 642 ierr = MatCreate(comm,A);CHKERRQ(ierr); 643 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 644 ierr = MatSetType(*A,MATSEQAIJMKL);CHKERRQ(ierr); 645 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 646 PetscFunctionReturn(0); 647 } 648 649 #undef __FUNCT__ 650 #define __FUNCT__ "MatCreate_SeqAIJMKL" 651 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJMKL(Mat A) 652 { 653 PetscErrorCode ierr; 654 655 PetscFunctionBegin; 656 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 657 ierr = MatConvert_SeqAIJ_SeqAIJMKL(A,MATSEQAIJMKL,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 658 PetscFunctionReturn(0); 659 } 660