14a314419SJunchao Zhang static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n"; 24a314419SJunchao Zhang /* 34a314419SJunchao Zhang SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous 44a314419SJunchao Zhang operations in the default stream and does not sync these operations since it assumes routines consume 54a314419SJunchao Zhang the destination data are also on the default stream. However, when destination data in on CPU, 64a314419SJunchao Zhang SF must guarentee the data is ready to use on CPU after PetscSFXxxEnd(). 74a314419SJunchao Zhang */ 84a314419SJunchao Zhang 94a314419SJunchao Zhang #include <petscvec.h> 104a314419SJunchao Zhang int main(int argc,char **argv) 114a314419SJunchao Zhang { 124a314419SJunchao Zhang PetscInt i,n=100000; /* Big enough to make the asynchronous copy meaningful */ 134a314419SJunchao Zhang PetscScalar *val; 144a314419SJunchao Zhang const PetscScalar *yval; 154a314419SJunchao Zhang Vec x,y; 164a314419SJunchao Zhang PetscMPIInt size; 174a314419SJunchao Zhang IS ix,iy; 184a314419SJunchao Zhang VecScatter vscat; 194a314419SJunchao Zhang 204a314419SJunchao Zhang PetscFunctionBegin; 21*b122ec5aSJacob Faibussowitsch CHKERRQ(PetscInitialize(&argc,&argv,(char*)0,help)); 225f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size)); 232c71b3e2SJacob Faibussowitsch PetscCheckFalse(size != 1,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test"); 244a314419SJunchao Zhang 254a314419SJunchao Zhang /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector, 264a314419SJunchao Zhang since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous 274a314419SJunchao Zhang cudaMemcpyDeviceToHost. 284a314419SJunchao Zhang */ 295f80ce2aSJacob Faibussowitsch CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&x)); 305f80ce2aSJacob Faibussowitsch CHKERRQ(VecSetFromOptions(x)); 315f80ce2aSJacob Faibussowitsch CHKERRQ(VecCreateSeq(PETSC_COMM_WORLD,n,&y)); 325f80ce2aSJacob Faibussowitsch CHKERRQ(VecSetFromOptions(y)); 334a314419SJunchao Zhang 344a314419SJunchao Zhang /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */ 355f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetArray(x,&val)); 364a314419SJunchao Zhang for (i=0; i<n; i++) val[i] = i/2.0; 375f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreArray(x,&val)); 385f80ce2aSJacob Faibussowitsch CHKERRQ(VecScale(x,2.0)); 395f80ce2aSJacob Faibussowitsch CHKERRQ(VecSet(y,314)); 404a314419SJunchao Zhang 414a314419SJunchao Zhang /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */ 425f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetArray(y,&val)); 435f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreArray(y,&val)); 444a314419SJunchao Zhang 454a314419SJunchao Zhang /* The vscat is simply a vector copy */ 465f80ce2aSJacob Faibussowitsch CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix)); 475f80ce2aSJacob Faibussowitsch CHKERRQ(ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy)); 485f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterCreate(x,ix,y,iy,&vscat)); 494a314419SJunchao Zhang 504a314419SJunchao Zhang /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous 514a314419SJunchao Zhang cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed. 524a314419SJunchao Zhang */ 535f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD)); 545f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD)); 555f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetArrayRead(y,&yval)); 564a314419SJunchao Zhang /* Display the first and the last entries of y to see if it is valid on host */ 575f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]))); 585f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreArrayRead(y,&yval)); 594a314419SJunchao Zhang 605f80ce2aSJacob Faibussowitsch CHKERRQ(VecDestroy(&x)); 615f80ce2aSJacob Faibussowitsch CHKERRQ(VecDestroy(&y)); 625f80ce2aSJacob Faibussowitsch CHKERRQ(ISDestroy(&ix)); 635f80ce2aSJacob Faibussowitsch CHKERRQ(ISDestroy(&iy)); 645f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterDestroy(&vscat)); 65*b122ec5aSJacob Faibussowitsch CHKERRQ(PetscFinalize()); 66*b122ec5aSJacob Faibussowitsch return 0; 674a314419SJunchao Zhang } 684a314419SJunchao Zhang 694a314419SJunchao Zhang /*TEST 704a314419SJunchao Zhang 714a314419SJunchao Zhang test: 724a314419SJunchao Zhang requires: cuda 73328e583dSStefano Zampini diff_args: -j 744a314419SJunchao Zhang #make sure the host memory is pinned 7526e8e884SScott Kruger # sf_backend cuda is not needed if compiling only with cuda 7626e8e884SScott Kruger args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0 7726e8e884SScott Kruger 7826e8e884SScott Kruger test: 7926e8e884SScott Kruger suffix: hip 8026e8e884SScott Kruger requires: hip 81328e583dSStefano Zampini diff_args: -j 8226e8e884SScott Kruger output_file: output/ex2_1.out 8326e8e884SScott Kruger #make sure the host memory is pinned 8426e8e884SScott Kruger # sf_backend hip is not needed if compiling only with hip 8526e8e884SScott Kruger args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0 864a314419SJunchao Zhang 874a314419SJunchao Zhang TEST*/ 88