1*a90d8e38SSatish Balay static char help[] = "Benchmarking MatMult() with AIJ and its subclass matrix types\n"; 2*a90d8e38SSatish Balay 3*a90d8e38SSatish Balay /* 4*a90d8e38SSatish Balay Usage: 5*a90d8e38SSatish Balay mpirun -n <np> ./ex5k 6*a90d8e38SSatish Balay -f <file> : input PETSc matrix binary file; one can convert a file from MatrixMarket using mat/tests/ex72.c 7*a90d8e38SSatish Balay -mat_type <type> : aij or its subclass. Default is aij. 8*a90d8e38SSatish Balay -n <num> : run MatMult() this many times and report average time. Default is 500. 9*a90d8e38SSatish Balay 10*a90d8e38SSatish Balay Notes: 11*a90d8e38SSatish Balay It uses CPU-timer to measure the time. 12*a90d8e38SSatish Balay 13*a90d8e38SSatish Balay Examples: 14*a90d8e38SSatish Balay On OLCF Summit (with GPU-aware MPI) 15*a90d8e38SSatish Balay # 6 MPI ranks: 16*a90d8e38SSatish Balay # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), and 1 GPU per RS (-g 1), 6 RSs per node (-r 6) 17*a90d8e38SSatish Balay jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex5k -f 1138_bus.aij -mat_type aijcusparse 18*a90d8e38SSatish Balay 19*a90d8e38SSatish Balay # 1 MPI rank 20*a90d8e38SSatish Balay jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex5k -f 1138_bus.aij -mat_type aijcusparse 21*a90d8e38SSatish Balay 22*a90d8e38SSatish Balay On OLCF Crusher: 23*a90d8e38SSatish Balay # 1 MPI rank 24*a90d8e38SSatish Balay # run with 1 node (-N1), 1 mpi rank (-n1), 2 hardware threads per rank (-c2) 25*a90d8e38SSatish Balay srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos 26*a90d8e38SSatish Balay 27*a90d8e38SSatish Balay # 8 MPI ranks 28*a90d8e38SSatish Balay srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos 29*a90d8e38SSatish Balay */ 30*a90d8e38SSatish Balay #include <petscmat.h> 31*a90d8e38SSatish Balay #include <petscdevice.h> 32*a90d8e38SSatish Balay 33*a90d8e38SSatish Balay #if defined(PETSC_HAVE_CUDA) 34*a90d8e38SSatish Balay #include <petscdevice_cuda.h> 35*a90d8e38SSatish Balay #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize()) 36*a90d8e38SSatish Balay #elif defined(PETSC_HAVE_HIP) 37*a90d8e38SSatish Balay #include <petscdevice_hip.h> 38*a90d8e38SSatish Balay #define SyncDevice() PetscCallHIP(hipDeviceSynchronize()) 39*a90d8e38SSatish Balay #elif defined(PETSC_HAVE_KOKKOS) 40*a90d8e38SSatish Balay #include <Kokkos_Core.hpp> 41*a90d8e38SSatish Balay #define SyncDevice() Kokkos::fence() 42*a90d8e38SSatish Balay #else 43*a90d8e38SSatish Balay #define SyncDevice() 44*a90d8e38SSatish Balay #endif 45*a90d8e38SSatish Balay 46*a90d8e38SSatish Balay int main(int argc, char **args) 47*a90d8e38SSatish Balay { 48*a90d8e38SSatish Balay Mat A, A2; 49*a90d8e38SSatish Balay Vec x, y, x2, y2; 50*a90d8e38SSatish Balay PetscViewer fd; 51*a90d8e38SSatish Balay char matfile[PETSC_MAX_PATH_LEN]; 52*a90d8e38SSatish Balay char mattype[64]; 53*a90d8e38SSatish Balay PetscBool flg; 54*a90d8e38SSatish Balay PetscLogStage stage; 55*a90d8e38SSatish Balay PetscInt i, n = 500, nskip = 5, M, N; 56*a90d8e38SSatish Balay MatInfo info; 57*a90d8e38SSatish Balay PetscLogDouble tstart = 0, tend = 0, avgTime; 58*a90d8e38SSatish Balay PetscRandom rctx; 59*a90d8e38SSatish Balay PetscReal norm; 60*a90d8e38SSatish Balay PetscMPIInt size; 61*a90d8e38SSatish Balay 62*a90d8e38SSatish Balay PetscFunctionBeginUser; 63*a90d8e38SSatish Balay PetscCall(PetscInitialize(&argc, &args, nullptr, help)); 64*a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); 65*a90d8e38SSatish Balay 66*a90d8e38SSatish Balay /* Read options -n */ 67*a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL)); 68*a90d8e38SSatish Balay 69*a90d8e38SSatish Balay /* Load the matrix from a binary file */ 70*a90d8e38SSatish Balay PetscCall(PetscOptionsGetString(NULL, NULL, "-f", matfile, PETSC_MAX_PATH_LEN, &flg)); 71*a90d8e38SSatish Balay PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate a PETSc matrix binary file with the -f option"); 72*a90d8e38SSatish Balay PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", mattype, sizeof(mattype), &flg)); 73*a90d8e38SSatish Balay if (!flg) PetscCall(PetscStrncpy(mattype, MATAIJ, sizeof(mattype))); 74*a90d8e38SSatish Balay 75*a90d8e38SSatish Balay /* Read the matrix file to A2 */ 76*a90d8e38SSatish Balay PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, matfile, FILE_MODE_READ, &fd)); 77*a90d8e38SSatish Balay PetscCall(MatCreate(PETSC_COMM_WORLD, &A2)); 78*a90d8e38SSatish Balay PetscCall(MatSetType(A2, MATAIJ)); 79*a90d8e38SSatish Balay PetscCall(MatLoad(A2, fd)); 80*a90d8e38SSatish Balay PetscCall(MatCreateVecs(A2, &x2, &y2)); 81*a90d8e38SSatish Balay PetscCall(PetscViewerDestroy(&fd)); 82*a90d8e38SSatish Balay 83*a90d8e38SSatish Balay PetscCall(MatGetSize(A2, &M, &N)); 84*a90d8e38SSatish Balay PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info)); 85*a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix %s: %" PetscInt_FMT " x %" PetscInt_FMT "; %lld nonzeros; %.1f per row\n", matfile, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M)); 86*a90d8e38SSatish Balay 87*a90d8e38SSatish Balay /* Copy A2 to A and convert A to the specified type */ 88*a90d8e38SSatish Balay PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A)); 89*a90d8e38SSatish Balay PetscCall(MatConvert(A, mattype, MAT_INPLACE_MATRIX, &A)); 90*a90d8e38SSatish Balay PetscCall(MatCreateVecs(A, &x, &y)); 91*a90d8e38SSatish Balay 92*a90d8e38SSatish Balay /* Init x, x2 with the same value */ 93*a90d8e38SSatish Balay PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx)); 94*a90d8e38SSatish Balay PetscCall(VecSetRandom(x2, rctx)); 95*a90d8e38SSatish Balay PetscCall(PetscRandomDestroy(&rctx)); 96*a90d8e38SSatish Balay PetscCall(VecCopy(x2, x)); 97*a90d8e38SSatish Balay 98*a90d8e38SSatish Balay /* Compute the reference y2 = A2 x2 */ 99*a90d8e38SSatish Balay PetscCall(MatMult(A2, x2, y2)); 100*a90d8e38SSatish Balay 101*a90d8e38SSatish Balay /* Measure y = Ax */ 102*a90d8e38SSatish Balay PetscCall(PetscLogStageRegister("MatMult", &stage)); 103*a90d8e38SSatish Balay for (i = 0; i < n + nskip; i++) { 104*a90d8e38SSatish Balay if (i == nskip) { 105*a90d8e38SSatish Balay SyncDevice(); 106*a90d8e38SSatish Balay PetscCall(PetscLogStagePush(stage)); 107*a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 108*a90d8e38SSatish Balay PetscCall(PetscTime(&tstart)); 109*a90d8e38SSatish Balay } 110*a90d8e38SSatish Balay PetscCall(MatMult(A, x, y)); 111*a90d8e38SSatish Balay } 112*a90d8e38SSatish Balay SyncDevice(); 113*a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 114*a90d8e38SSatish Balay PetscCall(PetscTime(&tend)); 115*a90d8e38SSatish Balay avgTime = (tend - tstart) * 1e6 / n; /* microseconds */ 116*a90d8e38SSatish Balay PetscCall(PetscLogStagePop()); 117*a90d8e38SSatish Balay 118*a90d8e38SSatish Balay /* Validate y against y2 */ 119*a90d8e38SSatish Balay PetscCall(VecAYPX(y2, -1, y)); 120*a90d8e38SSatish Balay PetscCall(VecNorm(y2, NORM_2, &norm)); 121*a90d8e38SSatish Balay PetscCheck(norm < 1e-6, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "MatMult() error with norm %g", (double)norm); 122*a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatMult() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime)); 123*a90d8e38SSatish Balay 124*a90d8e38SSatish Balay PetscCall(MatDestroy(&A)); 125*a90d8e38SSatish Balay PetscCall(VecDestroy(&x)); 126*a90d8e38SSatish Balay PetscCall(VecDestroy(&y)); 127*a90d8e38SSatish Balay PetscCall(MatDestroy(&A2)); 128*a90d8e38SSatish Balay PetscCall(VecDestroy(&x2)); 129*a90d8e38SSatish Balay PetscCall(VecDestroy(&y2)); 130*a90d8e38SSatish Balay PetscCall(PetscFinalize()); 131*a90d8e38SSatish Balay return 0; 132*a90d8e38SSatish Balay } 133*a90d8e38SSatish Balay 134*a90d8e38SSatish Balay /*TEST 135*a90d8e38SSatish Balay 136*a90d8e38SSatish Balay testset: 137*a90d8e38SSatish Balay args: -n 2 -f ${DATAFILESPATH}/matrices/small 138*a90d8e38SSatish Balay nsize: 1 139*a90d8e38SSatish Balay filter: grep "DOES_NOT_EXIST" 140*a90d8e38SSatish Balay output_file: output/empty.out 141*a90d8e38SSatish Balay requires: datafilespath !complex double !defined(PETSC_USE_64BIT_INDICES) kokkos_kernels 142*a90d8e38SSatish Balay 143*a90d8e38SSatish Balay test: 144*a90d8e38SSatish Balay suffix: 1 145*a90d8e38SSatish Balay requires: cuda 146*a90d8e38SSatish Balay args: -mat_type aijcusparse 147*a90d8e38SSatish Balay 148*a90d8e38SSatish Balay test: 149*a90d8e38SSatish Balay suffix: 2 150*a90d8e38SSatish Balay args: -mat_type aijkokkos 151*a90d8e38SSatish Balay 152*a90d8e38SSatish Balay test: 153*a90d8e38SSatish Balay suffix: 3 154*a90d8e38SSatish Balay requires: hip 155*a90d8e38SSatish Balay args: -mat_type aijhipsparse 156*a90d8e38SSatish Balay 157*a90d8e38SSatish Balay TEST*/ 158