xref: /petsc/src/vec/is/sf/tests/ex2.c (revision 26e8e8846b1bebea5b632e4e81afdded642403c0)
/* Help text shown by -help; this test also covers HIP (see the TEST block at the bottom) */
static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, SF performs
  asynchronous operations in the default stream and does not synchronize them, since it assumes the
  routines consuming the destination data are also on the default stream. However, when the destination
  data is on the CPU, SF must guarantee the data is ready to use on the CPU after PetscSFXxxEnd().
 */
84a314419SJunchao Zhang 
94a314419SJunchao Zhang #include <petscvec.h>
104a314419SJunchao Zhang int main(int argc,char **argv)
114a314419SJunchao Zhang {
124a314419SJunchao Zhang   PetscErrorCode     ierr;
134a314419SJunchao Zhang   PetscInt           i,n=100000; /* Big enough to make the asynchronous copy meaningful */
144a314419SJunchao Zhang   PetscScalar        *val;
154a314419SJunchao Zhang   const PetscScalar  *yval;
164a314419SJunchao Zhang   Vec                x,y;
174a314419SJunchao Zhang   PetscMPIInt        size;
184a314419SJunchao Zhang   IS                 ix,iy;
194a314419SJunchao Zhang   VecScatter         vscat;
204a314419SJunchao Zhang 
214a314419SJunchao Zhang   PetscFunctionBegin;
224a314419SJunchao Zhang   ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
23ffc4695bSBarry Smith   ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr);
244a314419SJunchao Zhang   if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n");
254a314419SJunchao Zhang 
264a314419SJunchao Zhang   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
274a314419SJunchao Zhang      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
284a314419SJunchao Zhang      cudaMemcpyDeviceToHost.
294a314419SJunchao Zhang    */
30*26e8e884SScott Kruger   ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&x);CHKERRQ(ierr);
31*26e8e884SScott Kruger   ierr = VecSetFromOptions(x);CHKERRQ(ierr);
32*26e8e884SScott Kruger   ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&y);CHKERRQ(ierr);
33*26e8e884SScott Kruger   ierr = VecSetFromOptions(y);CHKERRQ(ierr);
344a314419SJunchao Zhang 
354a314419SJunchao Zhang   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
364a314419SJunchao Zhang   ierr = VecGetArray(x,&val);CHKERRQ(ierr);
374a314419SJunchao Zhang   for (i=0; i<n; i++) val[i] = i/2.0;
384a314419SJunchao Zhang   ierr = VecRestoreArray(x,&val);CHKERRQ(ierr);
394a314419SJunchao Zhang   ierr = VecScale(x,2.0);CHKERRQ(ierr);
404a314419SJunchao Zhang   ierr = VecSet(y,314);CHKERRQ(ierr);
414a314419SJunchao Zhang 
424a314419SJunchao Zhang   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
434a314419SJunchao Zhang   ierr = VecGetArray(y,&val);
444a314419SJunchao Zhang   ierr = VecRestoreArray(y,&val);CHKERRQ(ierr);
454a314419SJunchao Zhang 
464a314419SJunchao Zhang   /* The vscat is simply a vector copy */
474a314419SJunchao Zhang   ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);
484a314419SJunchao Zhang   ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);
494a314419SJunchao Zhang   ierr = VecScatterCreate(x,ix,y,iy,&vscat);CHKERRQ(ierr);
504a314419SJunchao Zhang 
514a314419SJunchao Zhang   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
524a314419SJunchao Zhang      cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
534a314419SJunchao Zhang    */
544a314419SJunchao Zhang   ierr = VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
554a314419SJunchao Zhang   ierr = VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
564a314419SJunchao Zhang   ierr = VecGetArrayRead(y,&yval);CHKERRQ(ierr);
574a314419SJunchao Zhang   /* Display the first and the last entries of y to see if it is valid on host */
584a314419SJunchao Zhang   ierr = PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));CHKERRQ(ierr);
594a314419SJunchao Zhang   ierr = VecRestoreArrayRead(y,&yval);CHKERRQ(ierr);
604a314419SJunchao Zhang 
614a314419SJunchao Zhang   ierr = VecDestroy(&x);CHKERRQ(ierr);
624a314419SJunchao Zhang   ierr = VecDestroy(&y);CHKERRQ(ierr);
634a314419SJunchao Zhang   ierr = ISDestroy(&ix);CHKERRQ(ierr);
644a314419SJunchao Zhang   ierr = ISDestroy(&iy);CHKERRQ(ierr);
654a314419SJunchao Zhang   ierr = VecScatterDestroy(&vscat);CHKERRQ(ierr);
664a314419SJunchao Zhang   ierr = PetscFinalize();
674a314419SJunchao Zhang   return ierr;
684a314419SJunchao Zhang }
694a314419SJunchao Zhang 
704a314419SJunchao Zhang /*TEST
714a314419SJunchao Zhang 
724a314419SJunchao Zhang    test:
734a314419SJunchao Zhang     requires: cuda
744a314419SJunchao Zhang     #make sure the host memory is pinned
75*26e8e884SScott Kruger     # sf_backend cuda is not needed if compiling only with cuda
76*26e8e884SScott Kruger     args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0
77*26e8e884SScott Kruger 
78*26e8e884SScott Kruger    test:
79*26e8e884SScott Kruger     suffix: hip
80*26e8e884SScott Kruger     requires: hip
81*26e8e884SScott Kruger     output_file: output/ex2_1.out
82*26e8e884SScott Kruger     #make sure the host memory is pinned
83*26e8e884SScott Kruger     # sf_backend hip is not needed if compiling only with hip
84*26e8e884SScott Kruger     args:  -vec_type hip -sf_backend hip -vec_pinned_memory_min 0
854a314419SJunchao Zhang 
864a314419SJunchao Zhang TEST*/
87