12c733ed4SBarry Smith #include <../src/mat/impls/baij/seq/baij.h> 22c733ed4SBarry Smith #include <petsc/private/kernels/blockinvert.h> 32c733ed4SBarry Smith 42c733ed4SBarry Smith /* ----------------------------------------------------------- */ 52c733ed4SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 62c733ed4SBarry Smith { 72c733ed4SBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 82c733ed4SBarry Smith IS iscol=a->col,isrow=a->row; 92c733ed4SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 102c733ed4SBarry Smith PetscInt i,nz,j; 112c733ed4SBarry Smith const PetscInt n =a->mbs,bs=A->rmap->bs,bs2=a->bs2; 122c733ed4SBarry Smith const MatScalar *aa=a->a,*v; 132c733ed4SBarry Smith PetscScalar *x,*t,*ls; 142c733ed4SBarry Smith const PetscScalar *b; 152c733ed4SBarry Smith 162c733ed4SBarry Smith PetscFunctionBegin; 17*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 18*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 192c733ed4SBarry Smith t = a->solve_work; 202c733ed4SBarry Smith 21*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 22*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 232c733ed4SBarry Smith 242c733ed4SBarry Smith /* copy the b into temp work space according to permutation */ 252c733ed4SBarry Smith for (i=0; i<n; i++) { 262c733ed4SBarry Smith for (j=0; j<bs; j++) { 272c733ed4SBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 282c733ed4SBarry Smith } 292c733ed4SBarry Smith } 302c733ed4SBarry Smith 312c733ed4SBarry Smith /* forward solve the upper triangular transpose */ 322c733ed4SBarry Smith ls = a->solve_work + A->cmap->n; 332c733ed4SBarry Smith for (i=0; i<n; i++) { 34*9566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls,t+i*bs,bs)); 352c733ed4SBarry Smith PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 362c733ed4SBarry Smith v = aa + bs2*(a->diag[i] + 1); 372c733ed4SBarry Smith vi = aj + a->diag[i] + 1; 382c733ed4SBarry Smith nz = ai[i+1] - a->diag[i] - 1; 392c733ed4SBarry Smith while (nz--) { 402c733ed4SBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 412c733ed4SBarry Smith v += bs2; 422c733ed4SBarry Smith } 432c733ed4SBarry Smith } 442c733ed4SBarry Smith 452c733ed4SBarry Smith /* backward solve the lower triangular transpose */ 462c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 472c733ed4SBarry Smith v = aa + bs2*ai[i]; 482c733ed4SBarry Smith vi = aj + ai[i]; 492c733ed4SBarry Smith nz = a->diag[i] - ai[i]; 502c733ed4SBarry Smith while (nz--) { 512c733ed4SBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 522c733ed4SBarry Smith v += bs2; 532c733ed4SBarry Smith } 542c733ed4SBarry Smith } 552c733ed4SBarry Smith 562c733ed4SBarry Smith /* copy t into x according to permutation */ 572c733ed4SBarry Smith for (i=0; i<n; i++) { 582c733ed4SBarry Smith for (j=0; j<bs; j++) { 592c733ed4SBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 602c733ed4SBarry Smith } 612c733ed4SBarry Smith } 622c733ed4SBarry Smith 63*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 64*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 65*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 66*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 67*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n)); 682c733ed4SBarry Smith PetscFunctionReturn(0); 692c733ed4SBarry Smith } 702c733ed4SBarry Smith 712c733ed4SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 722c733ed4SBarry Smith { 732c733ed4SBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 742c733ed4SBarry Smith IS iscol=a->col,isrow=a->row; 752c733ed4SBarry Smith const PetscInt *r,*c,*rout,*cout; 762c733ed4SBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 772c733ed4SBarry Smith PetscInt i,j,nz; 782c733ed4SBarry Smith const PetscInt bs =A->rmap->bs,bs2=a->bs2; 792c733ed4SBarry Smith const MatScalar *aa=a->a,*v; 802c733ed4SBarry Smith PetscScalar *x,*t,*ls; 812c733ed4SBarry Smith const PetscScalar *b; 822c733ed4SBarry Smith 832c733ed4SBarry Smith PetscFunctionBegin; 84*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 85*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 862c733ed4SBarry Smith t = a->solve_work; 872c733ed4SBarry Smith 88*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 89*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 902c733ed4SBarry Smith 912c733ed4SBarry Smith /* copy the b into temp work space according to permutation */ 922c733ed4SBarry Smith for (i=0; i<n; i++) { 932c733ed4SBarry Smith for (j=0; j<bs; j++) { 942c733ed4SBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 952c733ed4SBarry Smith } 962c733ed4SBarry Smith } 972c733ed4SBarry Smith 982c733ed4SBarry Smith /* forward solve the upper triangular transpose */ 992c733ed4SBarry Smith ls = a->solve_work + A->cmap->n; 1002c733ed4SBarry Smith for (i=0; i<n; i++) { 101*9566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls,t+i*bs,bs)); 1022c733ed4SBarry Smith PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 1032c733ed4SBarry Smith v = aa + bs2*(diag[i] - 1); 1042c733ed4SBarry Smith vi = aj + diag[i] - 1; 1052c733ed4SBarry Smith nz = diag[i] - diag[i+1] - 1; 1062c733ed4SBarry Smith for (j=0; j>-nz; j--) { 1072c733ed4SBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 1082c733ed4SBarry Smith v -= bs2; 1092c733ed4SBarry Smith } 1102c733ed4SBarry Smith } 1112c733ed4SBarry Smith 1122c733ed4SBarry Smith /* backward solve the lower triangular transpose */ 1132c733ed4SBarry Smith for (i=n-1; i>=0; i--) { 1142c733ed4SBarry Smith v = aa + bs2*ai[i]; 1152c733ed4SBarry Smith vi = aj + ai[i]; 1162c733ed4SBarry Smith nz = ai[i+1] - ai[i]; 1172c733ed4SBarry Smith for (j=0; j<nz; j++) { 1182c733ed4SBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 1192c733ed4SBarry Smith v += bs2; 1202c733ed4SBarry Smith } 1212c733ed4SBarry Smith } 1222c733ed4SBarry Smith 1232c733ed4SBarry Smith /* copy t into x according to permutation */ 1242c733ed4SBarry Smith for (i=0; i<n; i++) { 1252c733ed4SBarry Smith for (j=0; j<bs; j++) { 1262c733ed4SBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 1272c733ed4SBarry Smith } 1282c733ed4SBarry Smith } 1292c733ed4SBarry Smith 130*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 131*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 132*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 133*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 134*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n)); 1352c733ed4SBarry Smith PetscFunctionReturn(0); 1362c733ed4SBarry Smith } 137