1*4a314419SJunchao Zhang static char help[]= "Test SF cuda stream synchronization in device to host communication\n\n"; 2*4a314419SJunchao Zhang /* 3*4a314419SJunchao Zhang SF uses asynchronous operations internally. When destination data is on GPU, it does asynchronous 4*4a314419SJunchao Zhang operations in the default stream and does not sync these operations since it assumes routines consume 5*4a314419SJunchao Zhang the destination data are also on the default stream. However, when destination data in on CPU, 6*4a314419SJunchao Zhang SF must guarentee the data is ready to use on CPU after PetscSFXxxEnd(). 7*4a314419SJunchao Zhang */ 8*4a314419SJunchao Zhang 9*4a314419SJunchao Zhang #include <petscvec.h> 10*4a314419SJunchao Zhang int main(int argc,char **argv) 11*4a314419SJunchao Zhang { 12*4a314419SJunchao Zhang PetscErrorCode ierr; 13*4a314419SJunchao Zhang PetscInt i,n=100000; /* Big enough to make the asynchronous copy meaningful */ 14*4a314419SJunchao Zhang PetscScalar *val; 15*4a314419SJunchao Zhang const PetscScalar *yval; 16*4a314419SJunchao Zhang Vec x,y; 17*4a314419SJunchao Zhang PetscMPIInt size; 18*4a314419SJunchao Zhang IS ix,iy; 19*4a314419SJunchao Zhang VecScatter vscat; 20*4a314419SJunchao Zhang 21*4a314419SJunchao Zhang PetscFunctionBegin; 22*4a314419SJunchao Zhang ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr; 23*4a314419SJunchao Zhang ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); 24*4a314419SJunchao Zhang if (size != 1) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This is a uni-processor test\n"); 25*4a314419SJunchao Zhang 26*4a314419SJunchao Zhang /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector, 27*4a314419SJunchao Zhang since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous 28*4a314419SJunchao Zhang cudaMemcpyDeviceToHost. 29*4a314419SJunchao Zhang */ 30*4a314419SJunchao Zhang ierr = VecCreateSeq(PETSC_COMM_SELF,n,&x);CHKERRQ(ierr); 31*4a314419SJunchao Zhang ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr); 32*4a314419SJunchao Zhang ierr = VecCreateSeq(PETSC_COMM_SELF,n,&y);CHKERRQ(ierr); 33*4a314419SJunchao Zhang ierr = VecSetType(y,VECSEQCUDA);CHKERRQ(ierr); 34*4a314419SJunchao Zhang 35*4a314419SJunchao Zhang /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */ 36*4a314419SJunchao Zhang ierr = VecGetArray(x,&val);CHKERRQ(ierr); 37*4a314419SJunchao Zhang for (i=0; i<n; i++) val[i] = i/2.0; 38*4a314419SJunchao Zhang ierr = VecRestoreArray(x,&val);CHKERRQ(ierr); 39*4a314419SJunchao Zhang ierr = VecScale(x,2.0);CHKERRQ(ierr); 40*4a314419SJunchao Zhang ierr = VecSet(y,314);CHKERRQ(ierr); 41*4a314419SJunchao Zhang 42*4a314419SJunchao Zhang /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */ 43*4a314419SJunchao Zhang ierr = VecGetArray(y,&val); 44*4a314419SJunchao Zhang ierr = VecRestoreArray(y,&val);CHKERRQ(ierr); 45*4a314419SJunchao Zhang 46*4a314419SJunchao Zhang /* The vscat is simply a vector copy */ 47*4a314419SJunchao Zhang ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&ix); 48*4a314419SJunchao Zhang ierr = ISCreateStride(PETSC_COMM_SELF,n,0,1,&iy); 49*4a314419SJunchao Zhang ierr = VecScatterCreate(x,ix,y,iy,&vscat);CHKERRQ(ierr); 50*4a314419SJunchao Zhang 51*4a314419SJunchao Zhang /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous 52*4a314419SJunchao Zhang cudaMemcpy or kernels, but it must guarentee y is ready to use on host. Otherwise, wrong data will be displayed. 53*4a314419SJunchao Zhang */ 54*4a314419SJunchao Zhang ierr = VecScatterBegin(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 55*4a314419SJunchao Zhang ierr = VecScatterEnd(vscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 56*4a314419SJunchao Zhang ierr = VecGetArrayRead(y,&yval);CHKERRQ(ierr); 57*4a314419SJunchao Zhang /* Display the first and the last entries of y to see if it is valid on host */ 58*4a314419SJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"y[0]=%.f, y[%D] = %.f\n",(float)PetscRealPart(yval[0]),n-1,(float)PetscRealPart(yval[n-1]));CHKERRQ(ierr); 59*4a314419SJunchao Zhang ierr = VecRestoreArrayRead(y,&yval);CHKERRQ(ierr); 60*4a314419SJunchao Zhang 61*4a314419SJunchao Zhang ierr = VecDestroy(&x);CHKERRQ(ierr); 62*4a314419SJunchao Zhang ierr = VecDestroy(&y);CHKERRQ(ierr); 63*4a314419SJunchao Zhang ierr = ISDestroy(&ix);CHKERRQ(ierr); 64*4a314419SJunchao Zhang ierr = ISDestroy(&iy);CHKERRQ(ierr); 65*4a314419SJunchao Zhang ierr = VecScatterDestroy(&vscat);CHKERRQ(ierr); 66*4a314419SJunchao Zhang ierr = PetscFinalize(); 67*4a314419SJunchao Zhang return ierr; 68*4a314419SJunchao Zhang } 69*4a314419SJunchao Zhang 70*4a314419SJunchao Zhang /*TEST 71*4a314419SJunchao Zhang 72*4a314419SJunchao Zhang test: 73*4a314419SJunchao Zhang requires: cuda 74*4a314419SJunchao Zhang #make sure the host memory is pinned 75*4a314419SJunchao Zhang args: -vec_pinned_memory_min 0 76*4a314419SJunchao Zhang 77*4a314419SJunchao Zhang TEST*/ 78