xref: /petsc/src/mat/tests/ex5k.kokkos.cxx (revision a90d8e383a2827d476809587898a1fbbc9581506)
1*a90d8e38SSatish Balay static char help[] = "Benchmarking MatMult() with AIJ and its subclass matrix types\n";
2*a90d8e38SSatish Balay 
3*a90d8e38SSatish Balay /*
4*a90d8e38SSatish Balay Usage:
5*a90d8e38SSatish Balay   mpirun -n <np> ./ex5k
6*a90d8e38SSatish Balay     -f <file>        : input PETSc matrix binary file; one can convert a file from MatrixMarket using mat/tests/ex72.c
7*a90d8e38SSatish Balay     -mat_type <type> : aij or its subclass. Default is aij.
8*a90d8e38SSatish Balay     -n <num>         : run MatMult() this many times and report average time. Default is 500.
9*a90d8e38SSatish Balay 
10*a90d8e38SSatish Balay Notes:
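  It uses a CPU timer to measure the time; the device is synchronized before each timer reading.
  The first 5 MatMult() calls are warm-up and are excluded from the reported average.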
12*a90d8e38SSatish Balay 
13*a90d8e38SSatish Balay Examples:
14*a90d8e38SSatish Balay   On OLCF Summit (with GPU-aware MPI)
15*a90d8e38SSatish Balay     # 6 MPI ranks:
16*a90d8e38SSatish Balay     # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), and 1 GPU per RS (-g 1), 6 RSs per node (-r 6)
17*a90d8e38SSatish Balay     jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex5k -f 1138_bus.aij -mat_type aijcusparse
18*a90d8e38SSatish Balay 
19*a90d8e38SSatish Balay     # 1 MPI rank
20*a90d8e38SSatish Balay     jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex5k -f 1138_bus.aij -mat_type aijcusparse
21*a90d8e38SSatish Balay 
22*a90d8e38SSatish Balay   On OLCF Crusher:
23*a90d8e38SSatish Balay     # 1 MPI rank
24*a90d8e38SSatish Balay     # run with 1 node (-N1), 1 mpi rank (-n1), 2 hardware threads per rank (-c2)
25*a90d8e38SSatish Balay     srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos
26*a90d8e38SSatish Balay 
27*a90d8e38SSatish Balay     # 8 MPI ranks
28*a90d8e38SSatish Balay     srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex5k -f HV15R.aij -mat_type aijkokkos
29*a90d8e38SSatish Balay */
30*a90d8e38SSatish Balay #include <petscmat.h>
31*a90d8e38SSatish Balay #include <petscdevice.h>
32*a90d8e38SSatish Balay 
/*
  SyncDevice() - invoked right before the host timer is read in main().

  It expands to the device-wide synchronization call of the first available backend, checked
  in the order CUDA, HIP, Kokkos. In a build with none of them it expands to nothing.
*/
#if !defined(PETSC_HAVE_CUDA) && !defined(PETSC_HAVE_HIP) && !defined(PETSC_HAVE_KOKKOS)
  /* No device backend configured: nothing to synchronize with */
  #define SyncDevice()
#elif defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
  #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
#elif defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
  #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
#else
  /* Only PETSC_HAVE_KOKKOS is left at this point */
  #include <Kokkos_Core.hpp>
  #define SyncDevice() Kokkos::fence()
#endif
45*a90d8e38SSatish Balay 
46*a90d8e38SSatish Balay int main(int argc, char **args)
47*a90d8e38SSatish Balay {
48*a90d8e38SSatish Balay   Mat            A, A2;
49*a90d8e38SSatish Balay   Vec            x, y, x2, y2;
50*a90d8e38SSatish Balay   PetscViewer    fd;
51*a90d8e38SSatish Balay   char           matfile[PETSC_MAX_PATH_LEN];
52*a90d8e38SSatish Balay   char           mattype[64];
53*a90d8e38SSatish Balay   PetscBool      flg;
54*a90d8e38SSatish Balay   PetscLogStage  stage;
55*a90d8e38SSatish Balay   PetscInt       i, n = 500, nskip = 5, M, N;
56*a90d8e38SSatish Balay   MatInfo        info;
57*a90d8e38SSatish Balay   PetscLogDouble tstart = 0, tend = 0, avgTime;
58*a90d8e38SSatish Balay   PetscRandom    rctx;
59*a90d8e38SSatish Balay   PetscReal      norm;
60*a90d8e38SSatish Balay   PetscMPIInt    size;
61*a90d8e38SSatish Balay 
62*a90d8e38SSatish Balay   PetscFunctionBeginUser;
63*a90d8e38SSatish Balay   PetscCall(PetscInitialize(&argc, &args, nullptr, help));
64*a90d8e38SSatish Balay   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
65*a90d8e38SSatish Balay 
66*a90d8e38SSatish Balay   /* Read options -n */
67*a90d8e38SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
68*a90d8e38SSatish Balay 
69*a90d8e38SSatish Balay   /* Load the matrix from a binary file */
70*a90d8e38SSatish Balay   PetscCall(PetscOptionsGetString(NULL, NULL, "-f", matfile, PETSC_MAX_PATH_LEN, &flg));
71*a90d8e38SSatish Balay   PetscCheck(flg, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate a PETSc matrix binary file with the -f option");
72*a90d8e38SSatish Balay   PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", mattype, sizeof(mattype), &flg));
73*a90d8e38SSatish Balay   if (!flg) PetscCall(PetscStrncpy(mattype, MATAIJ, sizeof(mattype)));
74*a90d8e38SSatish Balay 
75*a90d8e38SSatish Balay   /* Read the matrix file to A2 */
76*a90d8e38SSatish Balay   PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, matfile, FILE_MODE_READ, &fd));
77*a90d8e38SSatish Balay   PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
78*a90d8e38SSatish Balay   PetscCall(MatSetType(A2, MATAIJ));
79*a90d8e38SSatish Balay   PetscCall(MatLoad(A2, fd));
80*a90d8e38SSatish Balay   PetscCall(MatCreateVecs(A2, &x2, &y2));
81*a90d8e38SSatish Balay   PetscCall(PetscViewerDestroy(&fd));
82*a90d8e38SSatish Balay 
83*a90d8e38SSatish Balay   PetscCall(MatGetSize(A2, &M, &N));
84*a90d8e38SSatish Balay   PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
85*a90d8e38SSatish Balay   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix %s: %" PetscInt_FMT " x %" PetscInt_FMT "; %lld nonzeros; %.1f per row\n", matfile, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));
86*a90d8e38SSatish Balay 
87*a90d8e38SSatish Balay   /* Copy A2 to A and convert A to the specified type */
88*a90d8e38SSatish Balay   PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
89*a90d8e38SSatish Balay   PetscCall(MatConvert(A, mattype, MAT_INPLACE_MATRIX, &A));
90*a90d8e38SSatish Balay   PetscCall(MatCreateVecs(A, &x, &y));
91*a90d8e38SSatish Balay 
92*a90d8e38SSatish Balay   /* Init x, x2 with the same value */
93*a90d8e38SSatish Balay   PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
94*a90d8e38SSatish Balay   PetscCall(VecSetRandom(x2, rctx));
95*a90d8e38SSatish Balay   PetscCall(PetscRandomDestroy(&rctx));
96*a90d8e38SSatish Balay   PetscCall(VecCopy(x2, x));
97*a90d8e38SSatish Balay 
98*a90d8e38SSatish Balay   /* Compute the reference y2 = A2 x2 */
99*a90d8e38SSatish Balay   PetscCall(MatMult(A2, x2, y2));
100*a90d8e38SSatish Balay 
101*a90d8e38SSatish Balay   /* Measure y = Ax */
102*a90d8e38SSatish Balay   PetscCall(PetscLogStageRegister("MatMult", &stage));
103*a90d8e38SSatish Balay   for (i = 0; i < n + nskip; i++) {
104*a90d8e38SSatish Balay     if (i == nskip) {
105*a90d8e38SSatish Balay       SyncDevice();
106*a90d8e38SSatish Balay       PetscCall(PetscLogStagePush(stage));
107*a90d8e38SSatish Balay       PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
108*a90d8e38SSatish Balay       PetscCall(PetscTime(&tstart));
109*a90d8e38SSatish Balay     }
110*a90d8e38SSatish Balay     PetscCall(MatMult(A, x, y));
111*a90d8e38SSatish Balay   }
112*a90d8e38SSatish Balay   SyncDevice();
113*a90d8e38SSatish Balay   PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
114*a90d8e38SSatish Balay   PetscCall(PetscTime(&tend));
115*a90d8e38SSatish Balay   avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
116*a90d8e38SSatish Balay   PetscCall(PetscLogStagePop());
117*a90d8e38SSatish Balay 
118*a90d8e38SSatish Balay   /* Validate y against y2 */
119*a90d8e38SSatish Balay   PetscCall(VecAYPX(y2, -1, y));
120*a90d8e38SSatish Balay   PetscCall(VecNorm(y2, NORM_2, &norm));
121*a90d8e38SSatish Balay   PetscCheck(norm < 1e-6, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "MatMult() error with norm %g", (double)norm);
122*a90d8e38SSatish Balay   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatMult() average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));
123*a90d8e38SSatish Balay 
124*a90d8e38SSatish Balay   PetscCall(MatDestroy(&A));
125*a90d8e38SSatish Balay   PetscCall(VecDestroy(&x));
126*a90d8e38SSatish Balay   PetscCall(VecDestroy(&y));
127*a90d8e38SSatish Balay   PetscCall(MatDestroy(&A2));
128*a90d8e38SSatish Balay   PetscCall(VecDestroy(&x2));
129*a90d8e38SSatish Balay   PetscCall(VecDestroy(&y2));
130*a90d8e38SSatish Balay   PetscCall(PetscFinalize());
131*a90d8e38SSatish Balay   return 0;
132*a90d8e38SSatish Balay }
133*a90d8e38SSatish Balay 
134*a90d8e38SSatish Balay /*TEST
135*a90d8e38SSatish Balay 
136*a90d8e38SSatish Balay   testset:
137*a90d8e38SSatish Balay     args: -n 2 -f ${DATAFILESPATH}/matrices/small
138*a90d8e38SSatish Balay     nsize: 1
139*a90d8e38SSatish Balay     filter: grep "DOES_NOT_EXIST"
140*a90d8e38SSatish Balay     output_file: output/empty.out
141*a90d8e38SSatish Balay     requires: datafilespath !complex double !defined(PETSC_USE_64BIT_INDICES) kokkos_kernels
142*a90d8e38SSatish Balay 
143*a90d8e38SSatish Balay     test:
144*a90d8e38SSatish Balay       suffix: 1
145*a90d8e38SSatish Balay       requires: cuda
146*a90d8e38SSatish Balay       args: -mat_type aijcusparse
147*a90d8e38SSatish Balay 
148*a90d8e38SSatish Balay     test:
149*a90d8e38SSatish Balay       suffix: 2
150*a90d8e38SSatish Balay       args: -mat_type aijkokkos
151*a90d8e38SSatish Balay 
152*a90d8e38SSatish Balay     test:
153*a90d8e38SSatish Balay       suffix: 3
154*a90d8e38SSatish Balay       requires: hip
155*a90d8e38SSatish Balay       args: -mat_type aijhipsparse
156*a90d8e38SSatish Balay 
157*a90d8e38SSatish Balay TEST*/
158