xref: /petsc/src/vec/is/sf/tests/ex2.c (revision b122ec5aa1bd4469eb4e0673542fb7de3f411254)
static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
  operations in the default stream and does not sync these operations, since it assumes the routines
  consuming the destination data are also on the default stream. However, when destination data is on CPU,
  SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd().
 */
84a314419SJunchao Zhang 
94a314419SJunchao Zhang #include <petscvec.h>
104a314419SJunchao Zhang int main(int argc,char **argv)
114a314419SJunchao Zhang {
124a314419SJunchao Zhang   PetscInt           i,n=100000; /* Big enough to make the asynchronous copy meaningful */
134a314419SJunchao Zhang   PetscScalar        *val;
144a314419SJunchao Zhang   const PetscScalar  *yval;
154a314419SJunchao Zhang   Vec                x,y;
164a314419SJunchao Zhang   PetscMPIInt        size;
174a314419SJunchao Zhang   IS                 ix,iy;
184a314419SJunchao Zhang   VecScatter         vscat;
194a314419SJunchao Zhang 
204a314419SJunchao Zhang   PetscFunctionBegin;
21*b122ec5aSJacob Faibussowitsch   CHKERRQ(PetscInitialize(&argc,&argv,(char*)0,help));
225f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
232c71b3e2SJacob Faibussowitsch   PetscCheckFalse(size != 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test");
244a314419SJunchao Zhang 
254a314419SJunchao Zhang   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
264a314419SJunchao Zhang      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
274a314419SJunchao Zhang      cudaMemcpyDeviceToHost.
284a314419SJunchao Zhang    */
295f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&x));
305f80ce2aSJacob Faibussowitsch   CHKERRQ(VecSetFromOptions(x));
315f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&y));
325f80ce2aSJacob Faibussowitsch   CHKERRQ(VecSetFromOptions(y));
334a314419SJunchao Zhang 
344a314419SJunchao Zhang   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
355f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetArray(x,&val));
364a314419SJunchao Zhang   for (i=0; i<n; i++) val[i] = i/2.0;
375f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreArray(x,&val));
385f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScale(x,2.0));
395f80ce2aSJacob Faibussowitsch   CHKERRQ(VecSet(y,314));
404a314419SJunchao Zhang 
414a314419SJunchao Zhang   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
425f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetArray(y,&val));
435f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreArray(y,&val));
444a314419SJunchao Zhang 
454a314419SJunchao Zhang   /* The vscat is simply a vector copy */
465f80ce2aSJacob Faibussowitsch   CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix));
475f80ce2aSJacob Faibussowitsch   CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy));
485f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterCreate(x,ix,y,iy,&vscat));
494a314419SJunchao Zhang 
504a314419SJunchao Zhang   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
514a314419SJunchao Zhang      cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
524a314419SJunchao Zhang    */
535f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
545f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
555f80ce2aSJacob Faibussowitsch   CHKERRQ(VecGetArrayRead(y,&yval));
564a314419SJunchao Zhang   /* Display the first and the last entries of y to see if it is valid on host */
575f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1])));
585f80ce2aSJacob Faibussowitsch   CHKERRQ(VecRestoreArrayRead(y,&yval));
594a314419SJunchao Zhang 
605f80ce2aSJacob Faibussowitsch   CHKERRQ(VecDestroy(&x));
615f80ce2aSJacob Faibussowitsch   CHKERRQ(VecDestroy(&y));
625f80ce2aSJacob Faibussowitsch   CHKERRQ(ISDestroy(&ix));
635f80ce2aSJacob Faibussowitsch   CHKERRQ(ISDestroy(&iy));
645f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterDestroy(&vscat));
65*b122ec5aSJacob Faibussowitsch   CHKERRQ(PetscFinalize());
66*b122ec5aSJacob Faibussowitsch   return 0;
674a314419SJunchao Zhang }
684a314419SJunchao Zhang 
/*TEST

   test:
    requires: cuda
    diff_args: -j
    #make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    #make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/
88