14a314419SJunchao Zhang static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n"; 24a314419SJunchao Zhang /* 34a314419SJunchao Zhang SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous 44a314419SJunchao Zhang operations in the default stream and does not sync these operations since it assumes routines consume 54a314419SJunchao Zhang the destination data are also on the default stream. However, when destination data in on CPU, 64a314419SJunchao Zhang SF must guarentee the data is ready to use on CPU after PetscSFXxxEnd(). 74a314419SJunchao Zhang */ 84a314419SJunchao Zhang 94a314419SJunchao Zhang #include <petscvec.h> 104a314419SJunchao Zhang int main(int argc,char **argv) 114a314419SJunchao Zhang { 124a314419SJunchao Zhang PetscInt i,n=100000; /* Big enough to make the asynchronous copy meaningful */ 134a314419SJunchao Zhang PetscScalar *val; 144a314419SJunchao Zhang const PetscScalar *yval; 154a314419SJunchao Zhang Vec x,y; 164a314419SJunchao Zhang PetscMPIInt size; 174a314419SJunchao Zhang IS ix,iy; 184a314419SJunchao Zhang VecScatter vscat; 194a314419SJunchao Zhang 204a314419SJunchao Zhang PetscFunctionBegin; 219566063dSJacob Faibussowitsch PetscCall(PetscInitialize(&argc,&argv,(char*)0,help)); 229566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size)); 23*08401ef6SPierre Jolivet PetscCheck(size == 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test"); 244a314419SJunchao Zhang 254a314419SJunchao Zhang /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector, 264a314419SJunchao Zhang since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous 274a314419SJunchao Zhang cudaMemcpyDeviceToHost. 284a314419SJunchao Zhang */ 299566063dSJacob Faibussowitsch PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&x)); 309566063dSJacob Faibussowitsch PetscCall(VecSetFromOptions(x)); 319566063dSJacob Faibussowitsch PetscCall(VecCreateSeq(PETSC_COMM_WORLD,n,&y)); 329566063dSJacob Faibussowitsch PetscCall(VecSetFromOptions(y)); 334a314419SJunchao Zhang 344a314419SJunchao Zhang /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */ 359566063dSJacob Faibussowitsch PetscCall(VecGetArray(x,&val)); 364a314419SJunchao Zhang for (i=0; i<n; i++) val[i] = i/2.0; 379566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(x,&val)); 389566063dSJacob Faibussowitsch PetscCall(VecScale(x,2.0)); 399566063dSJacob Faibussowitsch PetscCall(VecSet(y,314)); 404a314419SJunchao Zhang 414a314419SJunchao Zhang /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */ 429566063dSJacob Faibussowitsch PetscCall(VecGetArray(y,&val)); 439566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(y,&val)); 444a314419SJunchao Zhang 454a314419SJunchao Zhang /* The vscat is simply a vector copy */ 469566063dSJacob Faibussowitsch PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix)); 479566063dSJacob Faibussowitsch PetscCall(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy)); 489566063dSJacob Faibussowitsch PetscCall(VecScatterCreate(x,ix,y,iy,&vscat)); 494a314419SJunchao Zhang 504a314419SJunchao Zhang /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous 514a314419SJunchao Zhang cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed. 524a314419SJunchao Zhang */ 539566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD)); 549566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD)); 559566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(y,&yval)); 564a314419SJunchao Zhang /* Display the first and the last entries of y to see if it is valid on host */ 579566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]))); 589566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(y,&yval)); 594a314419SJunchao Zhang 609566063dSJacob Faibussowitsch PetscCall(VecDestroy(&x)); 619566063dSJacob Faibussowitsch PetscCall(VecDestroy(&y)); 629566063dSJacob Faibussowitsch PetscCall(ISDestroy(&ix)); 639566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iy)); 649566063dSJacob Faibussowitsch PetscCall(VecScatterDestroy(&vscat)); 659566063dSJacob Faibussowitsch PetscCall(PetscFinalize()); 66b122ec5aSJacob Faibussowitsch return 0; 674a314419SJunchao Zhang } 684a314419SJunchao Zhang 694a314419SJunchao Zhang /*TEST 704a314419SJunchao Zhang 714a314419SJunchao Zhang test: 724a314419SJunchao Zhang requires: cuda 73328e583dSStefano Zampini diff_args: -j 744a314419SJunchao Zhang #make sure the host memory is pinned 7526e8e884SScott Kruger # sf_backend cuda is not needed if compiling only with cuda 7626e8e884SScott Kruger args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0 7726e8e884SScott Kruger 7826e8e884SScott Kruger test: 7926e8e884SScott Kruger suffix: hip 8026e8e884SScott Kruger requires: hip 81328e583dSStefano Zampini diff_args: -j 8226e8e884SScott Kruger output_file: output/ex2_1.out 8326e8e884SScott Kruger #make sure the host memory is pinned 8426e8e884SScott Kruger # sf_backend hip is not needed if compiling only with hip 8526e8e884SScott Kruger args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0 864a314419SJunchao Zhang 874a314419SJunchao Zhang TEST*/ 88