static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous
  operations in the default stream and does not sync these operations since it assumes the routines
  that consume the destination data are also on the default stream. However, when destination data
  is on CPU, SF must guarantee the data is ready to use on CPU after PetscSFXxxEnd().
 */

#include <petscvec.h>
int main(int argc,char **argv)
{
  PetscErrorCode    ierr;
  PetscInt          i,n=100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec               x,y;
  PetscMPIInt       size;
  IS                ix,iy;
  VecScatter        vscat;

  PetscFunctionBegin;
  ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
  CHKERRMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
  PetscCheckFalse(size != 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test");

  /* Create two CUDA vectors x, y. Though we only care about y's memory on host, we make y a CUDA
     vector, since we want to have y's memory on host pinned (i.e., non-pageable), to really trigger
     asynchronous cudaMemcpyDeviceToHost.
     NOTE(review): the vectors are created with VecCreateSeq and only become CUDA (or HIP) vectors
     via -vec_type passed in the TEST args below; VecSetFromOptions applies that option.
   */
  CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&x));
  CHKERRQ(VecSetFromOptions(x));
  CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&y));
  CHKERRQ(VecSetFromOptions(y));

  /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU).
     VecScale/VecSet on a device vector execute on the GPU, leaving the up-to-date copy there. */
  CHKERRQ(VecGetArray(x,&val));
  for (i=0; i<n; i++) val[i] = i/2.0;
  CHKERRQ(VecRestoreArray(x,&val));
  CHKERRQ(VecScale(x,2.0));
  CHKERRQ(VecSet(y,314));

  /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU).
     This Get/Restore pair is called purely for its side effect of migrating y's
     up-to-date data to host memory; the array contents are not touched. */
  CHKERRQ(VecGetArray(y,&val));
  CHKERRQ(VecRestoreArray(y,&val));

  /* The vscat is simply a vector copy: identity index sets on both source and destination */
  CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix));
  CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy));
  CHKERRQ(VecScatterCreate(x,ix,y,iy,&vscat));

  /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
     cudaMemcpy or kernels, but it must guarantee y is ready to use on host after VecScatterEnd().
     Otherwise, wrong (stale) data will be displayed.
   */
  CHKERRQ(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
  CHKERRQ(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
  CHKERRQ(VecGetArrayRead(y,&yval));
  /* Display the first and the last entries of y to see if it is valid on host */
  CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1])));
  CHKERRQ(VecRestoreArrayRead(y,&yval));

  CHKERRQ(VecDestroy(&x));
  CHKERRQ(VecDestroy(&y));
  CHKERRQ(ISDestroy(&ix));
  CHKERRQ(ISDestroy(&iy));
  CHKERRQ(VecScatterDestroy(&vscat));
  ierr = PetscFinalize();
  return ierr;
}

/*TEST

  test:
    requires: cuda
    diff_args: -j
    #make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

  test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    #make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/