static char help[]= "Test SF GPU stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, it performs these
  operations in the default stream and does not synchronize them, since it assumes the routines that
  consume the destination data are also on the default stream. However, when the destination data is on
  the CPU, SF must guarantee the data is ready to use on the CPU after PetscSFXxxEnd().
 */

#include <petscvec.h>
int main(int argc,char **argv)
{
  PetscErrorCode    ierr;
  PetscInt          i,n=100000; /* Big enough to make the asynchronous copy meaningful */
  PetscScalar       *val;
  const PetscScalar *yval;
  Vec               x,y;
  PetscMPIInt       size;
  IS                ix,iy;
  VecScatter        vscat;

  PetscFunctionBegin;
  ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
  ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr);
  if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n");

  /* Create two GPU vectors x, y (their type is set with -vec_type on the command line). Though we only
     care about y's memory on the host, we make y a GPU vector as well, since we want y's host memory to
     be pinned (i.e., non-pageable), to really trigger an asynchronous cudaMemcpyDeviceToHost.
   */
  ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&x);CHKERRQ(ierr);
  ierr = VecSetFromOptions(x);CHKERRQ(ierr);
  ierr = VecCreateSeq(PETSC_COMM_WORLD,n,&y);CHKERRQ(ierr);
  ierr = VecSetFromOptions(y);CHKERRQ(ierr);

  /* Init x, y, and push them to the GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
  ierr = VecGetArray(x,&val);CHKERRQ(ierr);
  for (i=0; i<n; i++) val[i] = i/2.0;
  ierr = VecRestoreArray(x,&val);CHKERRQ(ierr);
  ierr = VecScale(x,2.0);CHKERRQ(ierr);
  ierr = VecSet(y,314);CHKERRQ(ierr);

  /* Pull y to the CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
  ierr = VecGetArray(y,&val);CHKERRQ(ierr);
  ierr = VecRestoreArray(y,&val);CHKERRQ(ierr);

  /* The vscat is simply a vector copy */
  ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix);CHKERRQ(ierr);
  ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy);CHKERRQ(ierr);
  ierr = VecScatterCreate(x,ix,y,iy,&vscat);CHKERRQ(ierr);

  /* Do a device to host vecscatter and then immediately use y on the host. VecScatter/SF may use
     asynchronous cudaMemcpy or kernels, but it must guarantee y is ready to use on the host afterwards.
     Otherwise, wrong data will be displayed.
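
     For reference, a plausible sketch of the synchronization SF has to perform internally, assuming a
     CUDA backend (the exact calls are illustrative assumptions, not necessarily PETSc's implementation):
     it may launch the copy with cudaMemcpyAsync(hostbuf,devbuf,bytes,cudaMemcpyDeviceToHost,stream),
     but it must then call cudaStreamSynchronize(stream) before VecScatterEnd() returns, so that the
     host buffer is complete by the time we read it below.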
   */
  ierr = VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecGetArrayRead(y,&yval);CHKERRQ(ierr);
  /* Display the first and last entries of y to see whether it is valid on the host */
  ierr = PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(y,&yval);CHKERRQ(ierr);

  ierr = VecDestroy(&x);CHKERRQ(ierr);
  ierr = VecDestroy(&y);CHKERRQ(ierr);
  ierr = ISDestroy(&ix);CHKERRQ(ierr);
  ierr = ISDestroy(&iy);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&vscat);CHKERRQ(ierr);
  ierr = PetscFinalize();
  return ierr;
}

/*TEST

   test:
     requires: cuda
     # make sure the host memory is pinned
     # sf_backend cuda is not needed if compiling only with cuda
     args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
     suffix: hip
     requires: hip
     output_file: output/ex2_1.out
     # make sure the host memory is pinned
     # sf_backend hip is not needed if compiling only with hip
     args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/