xref: /petsc/src/vec/is/sf/tests/ex2.c (revision 4a3144197e09a67fc286772231bf9f1535f83912)
/* Help string passed to PetscInitialize() in main() */
static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
  operations in the default stream and does not sync these operations since it assumes routines that consume
  the destination data are also on the default stream. However, when destination data is on CPU,
  SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd().
 */
8*4a314419SJunchao Zhang 
9*4a314419SJunchao Zhang #include <petscvec.h>
10*4a314419SJunchao Zhang int main(int argc,char **argv)
11*4a314419SJunchao Zhang {
12*4a314419SJunchao Zhang   PetscErrorCode     ierr;
13*4a314419SJunchao Zhang   PetscInt           i,n=100000; /* Big enough to make the asynchronous copy meaningful */
14*4a314419SJunchao Zhang   PetscScalar        *val;
15*4a314419SJunchao Zhang   const PetscScalar  *yval;
16*4a314419SJunchao Zhang   Vec                x,y;
17*4a314419SJunchao Zhang   PetscMPIInt        size;
18*4a314419SJunchao Zhang   IS                 ix,iy;
19*4a314419SJunchao Zhang   VecScatter         vscat;
20*4a314419SJunchao Zhang 
21*4a314419SJunchao Zhang   PetscFunctionBegin;
22*4a314419SJunchao Zhang   ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
23*4a314419SJunchao Zhang   ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
24*4a314419SJunchao Zhang   if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n");
25*4a314419SJunchao Zhang 
26*4a314419SJunchao Zhang   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
27*4a314419SJunchao Zhang      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
28*4a314419SJunchao Zhang      cudaMemcpyDeviceToHost.
29*4a314419SJunchao Zhang    */
30*4a314419SJunchao Zhang   ierr = VecCreateSeq(PETSC_COMM_SELF,n,&x);CHKERRQ(ierr);
31*4a314419SJunchao Zhang   ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);
32*4a314419SJunchao Zhang   ierr = VecCreateSeq(PETSC_COMM_SELF,n,&y);CHKERRQ(ierr);
33*4a314419SJunchao Zhang   ierr = VecSetType(y,VECSEQCUDA);CHKERRQ(ierr);
34*4a314419SJunchao Zhang 
35*4a314419SJunchao Zhang   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
36*4a314419SJunchao Zhang   ierr = VecGetArray(x,&val);CHKERRQ(ierr);
37*4a314419SJunchao Zhang   for (i=0; i<n; i++) val[i] = i/2.0;
38*4a314419SJunchao Zhang   ierr = VecRestoreArray(x,&val);CHKERRQ(ierr);
39*4a314419SJunchao Zhang   ierr = VecScale(x,2.0);CHKERRQ(ierr);
40*4a314419SJunchao Zhang   ierr = VecSet(y,314);CHKERRQ(ierr);
41*4a314419SJunchao Zhang 
42*4a314419SJunchao Zhang   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
43*4a314419SJunchao Zhang   ierr = VecGetArray(y,&val);
44*4a314419SJunchao Zhang   ierr = VecRestoreArray(y,&val);CHKERRQ(ierr);
45*4a314419SJunchao Zhang 
46*4a314419SJunchao Zhang   /* The vscat is simply a vector copy */
47*4a314419SJunchao Zhang   ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);
48*4a314419SJunchao Zhang   ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);
49*4a314419SJunchao Zhang   ierr = VecScatterCreate(x,ix,y,iy,&vscat);CHKERRQ(ierr);
50*4a314419SJunchao Zhang 
51*4a314419SJunchao Zhang   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
52*4a314419SJunchao Zhang      cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed.
53*4a314419SJunchao Zhang    */
54*4a314419SJunchao Zhang   ierr = VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
55*4a314419SJunchao Zhang   ierr = VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
56*4a314419SJunchao Zhang   ierr = VecGetArrayRead(y,&yval);CHKERRQ(ierr);
57*4a314419SJunchao Zhang   /* Display the first and the last entries of y to see if it is valid on host */
58*4a314419SJunchao Zhang   ierr = PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));CHKERRQ(ierr);
59*4a314419SJunchao Zhang   ierr = VecRestoreArrayRead(y,&yval);CHKERRQ(ierr);
60*4a314419SJunchao Zhang 
61*4a314419SJunchao Zhang   ierr = VecDestroy(&x);CHKERRQ(ierr);
62*4a314419SJunchao Zhang   ierr = VecDestroy(&y);CHKERRQ(ierr);
63*4a314419SJunchao Zhang   ierr = ISDestroy(&ix);CHKERRQ(ierr);
64*4a314419SJunchao Zhang   ierr = ISDestroy(&iy);CHKERRQ(ierr);
65*4a314419SJunchao Zhang   ierr = VecScatterDestroy(&vscat);CHKERRQ(ierr);
66*4a314419SJunchao Zhang   ierr = PetscFinalize();
67*4a314419SJunchao Zhang   return ierr;
68*4a314419SJunchao Zhang }
69*4a314419SJunchao Zhang 
70*4a314419SJunchao Zhang /*TEST
71*4a314419SJunchao Zhang 
72*4a314419SJunchao Zhang    test:
73*4a314419SJunchao Zhang     requires: cuda
74*4a314419SJunchao Zhang     #make sure the host memory is pinned
75*4a314419SJunchao Zhang     args: -vec_pinned_memory_min 0
76*4a314419SJunchao Zhang 
77*4a314419SJunchao Zhang TEST*/
78