128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h> 2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 328e30874SSatish Balay 428e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 528e30874SSatish Balay { 628e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 728e30874SSatish Balay IS iscol=a->col,isrow=a->row; 828e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 928e30874SSatish Balay const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 1028e30874SSatish Balay PetscInt i,nz; 1128e30874SSatish Balay const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1228e30874SSatish Balay const MatScalar *aa=a->a,*v; 1328e30874SSatish Balay PetscScalar *x,*s,*t,*ls; 1428e30874SSatish Balay const PetscScalar *b; 1528e30874SSatish Balay 1628e30874SSatish Balay PetscFunctionBegin; 17*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 18*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 1928e30874SSatish Balay t = a->solve_work; 2028e30874SSatish Balay 21*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 22*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 2328e30874SSatish Balay 2428e30874SSatish Balay /* forward solve the lower triangular */ 25*9566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t,b+bs*(*r++),bs)); 2628e30874SSatish Balay for (i=1; i<n; i++) { 2728e30874SSatish Balay v = aa + bs2*ai[i]; 2828e30874SSatish Balay vi = aj + ai[i]; 2928e30874SSatish Balay nz = a->diag[i] - ai[i]; 3028e30874SSatish Balay s = t + bs*i; 31*9566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(s,b+bs*(*r++),bs)); 3228e30874SSatish Balay while (nz--) { 3328e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 3428e30874SSatish Balay v += bs2; 3528e30874SSatish Balay } 3628e30874SSatish Balay } 3728e30874SSatish Balay /* backward solve the upper triangular */ 3828e30874SSatish Balay ls = a->solve_work + A->cmap->n; 3928e30874SSatish Balay for (i=n-1; i>=0; i--) { 4028e30874SSatish Balay v = aa + bs2*(a->diag[i] + 1); 4128e30874SSatish Balay vi = aj + a->diag[i] + 1; 4228e30874SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 43*9566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls,t+i*bs,bs)); 4428e30874SSatish Balay while (nz--) { 4528e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 4628e30874SSatish Balay v += bs2; 4728e30874SSatish Balay } 4828e30874SSatish Balay PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 49*9566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(x + bs*(*c--),t+i*bs,bs)); 5028e30874SSatish Balay } 5128e30874SSatish Balay 52*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 53*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 54*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 55*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 56*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n)); 5728e30874SSatish Balay PetscFunctionReturn(0); 5828e30874SSatish Balay } 5928e30874SSatish Balay 6028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 6128e30874SSatish Balay { 6228e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 6328e30874SSatish Balay IS iscol=a->col,isrow=a->row; 6428e30874SSatish Balay const PetscInt *r,*c,*ai=a->i,*aj=a->j; 6528e30874SSatish Balay const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 6628e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 6728e30874SSatish Balay const MatScalar *aa=a->a,*v; 6828e30874SSatish Balay PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 6928e30874SSatish Balay const PetscScalar *b; 7028e30874SSatish Balay 7128e30874SSatish Balay PetscFunctionBegin; 72*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 73*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 7428e30874SSatish Balay t = a->solve_work; 7528e30874SSatish Balay 76*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 77*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 7828e30874SSatish Balay 7928e30874SSatish Balay /* forward solve the lower triangular */ 8028e30874SSatish Balay idx = 7*(*r++); 8128e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 8228e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 8328e30874SSatish Balay t[5] = b[5+idx]; t[6] = b[6+idx]; 8428e30874SSatish Balay 8528e30874SSatish Balay for (i=1; i<n; i++) { 8628e30874SSatish Balay v = aa + 49*ai[i]; 8728e30874SSatish Balay vi = aj + ai[i]; 8828e30874SSatish Balay nz = diag[i] - ai[i]; 8928e30874SSatish Balay idx = 7*(*r++); 9028e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 9128e30874SSatish Balay s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 9228e30874SSatish Balay while (nz--) { 9328e30874SSatish Balay idx = 7*(*vi++); 9428e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 9528e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 9628e30874SSatish Balay x6 = t[5+idx];x7 = t[6+idx]; 9728e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 9828e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 9928e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 10028e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 10128e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 10228e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 10328e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 10428e30874SSatish Balay v += 49; 10528e30874SSatish Balay } 10628e30874SSatish Balay idx = 7*i; 10728e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 10828e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 10928e30874SSatish Balay t[5+idx] = s6;t[6+idx] = s7; 11028e30874SSatish Balay } 11128e30874SSatish Balay /* backward solve the upper triangular */ 11228e30874SSatish Balay for (i=n-1; i>=0; i--) { 11328e30874SSatish Balay v = aa + 49*diag[i] + 49; 11428e30874SSatish Balay vi = aj + diag[i] + 1; 11528e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 11628e30874SSatish Balay idt = 7*i; 11728e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 11828e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 11928e30874SSatish Balay s6 = t[5+idt];s7 = t[6+idt]; 12028e30874SSatish Balay while (nz--) { 12128e30874SSatish Balay idx = 7*(*vi++); 12228e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 12328e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 12428e30874SSatish Balay x6 = t[5+idx]; x7 = t[6+idx]; 12528e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 12628e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 12728e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 12828e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 12928e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 13028e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 13128e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13228e30874SSatish Balay v += 49; 13328e30874SSatish Balay } 13428e30874SSatish Balay idc = 7*(*c--); 13528e30874SSatish Balay v = aa + 49*diag[i]; 13628e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 13728e30874SSatish Balay v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 13828e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 13928e30874SSatish Balay v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 14028e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 14128e30874SSatish Balay v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 14228e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 14328e30874SSatish Balay v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 14428e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 14528e30874SSatish Balay v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 14628e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 14728e30874SSatish Balay v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 14828e30874SSatish Balay x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 14928e30874SSatish Balay v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 15028e30874SSatish Balay } 15128e30874SSatish Balay 152*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 153*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 154*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 155*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 156*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n)); 15728e30874SSatish Balay PetscFunctionReturn(0); 15828e30874SSatish Balay } 15928e30874SSatish Balay 16028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 16128e30874SSatish Balay { 16228e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 16328e30874SSatish Balay IS iscol=a->col,isrow=a->row; 16428e30874SSatish Balay const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 16528e30874SSatish Balay const PetscInt n=a->mbs,*rout,*cout,*vi; 16628e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 16728e30874SSatish Balay const MatScalar *aa=a->a,*v; 16828e30874SSatish Balay PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 16928e30874SSatish Balay const PetscScalar *b; 17028e30874SSatish Balay 17128e30874SSatish Balay PetscFunctionBegin; 172*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 173*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 17428e30874SSatish Balay t = a->solve_work; 17528e30874SSatish Balay 176*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 177*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 17828e30874SSatish Balay 17928e30874SSatish Balay /* forward solve the lower triangular */ 18028e30874SSatish Balay idx = 7*r[0]; 18128e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 18228e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 18328e30874SSatish Balay t[5] = b[5+idx]; t[6] = b[6+idx]; 18428e30874SSatish Balay 18528e30874SSatish Balay for (i=1; i<n; i++) { 18628e30874SSatish Balay v = aa + 49*ai[i]; 18728e30874SSatish Balay vi = aj + ai[i]; 18828e30874SSatish Balay nz = ai[i+1] - ai[i]; 18928e30874SSatish Balay idx = 7*r[i]; 19028e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 19128e30874SSatish Balay s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 19228e30874SSatish Balay for (m=0; m<nz; m++) { 19328e30874SSatish Balay idx = 7*vi[m]; 19428e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 19528e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 19628e30874SSatish Balay x6 = t[5+idx];x7 = t[6+idx]; 19728e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 19828e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 19928e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 20028e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 20128e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 20228e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 20328e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 20428e30874SSatish Balay v += 49; 20528e30874SSatish Balay } 20628e30874SSatish Balay idx = 7*i; 20728e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 20828e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 20928e30874SSatish Balay t[5+idx] = s6;t[6+idx] = s7; 21028e30874SSatish Balay } 21128e30874SSatish Balay /* backward solve the upper triangular */ 21228e30874SSatish Balay for (i=n-1; i>=0; i--) { 21328e30874SSatish Balay v = aa + 49*(adiag[i+1]+1); 21428e30874SSatish Balay vi = aj + adiag[i+1]+1; 21528e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 21628e30874SSatish Balay idt = 7*i; 21728e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 21828e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 21928e30874SSatish Balay s6 = t[5+idt];s7 = t[6+idt]; 22028e30874SSatish Balay for (m=0; m<nz; m++) { 22128e30874SSatish Balay idx = 7*vi[m]; 22228e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 22328e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 22428e30874SSatish Balay x6 = t[5+idx]; x7 = t[6+idx]; 22528e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 22628e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 22728e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 22828e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 22928e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 23028e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 23128e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 23228e30874SSatish Balay v += 49; 23328e30874SSatish Balay } 23428e30874SSatish Balay idc = 7*c[i]; 23528e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 23628e30874SSatish Balay v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 23728e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 23828e30874SSatish Balay v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 23928e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 24028e30874SSatish Balay v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 24128e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 24228e30874SSatish Balay v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 24328e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 24428e30874SSatish Balay v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 24528e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 24628e30874SSatish Balay v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 24728e30874SSatish Balay x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 24828e30874SSatish Balay v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 24928e30874SSatish Balay } 25028e30874SSatish Balay 251*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 252*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 253*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 254*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 255*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n)); 25628e30874SSatish Balay PetscFunctionReturn(0); 25728e30874SSatish Balay } 25828e30874SSatish Balay 25928e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 26028e30874SSatish Balay { 26128e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 26228e30874SSatish Balay IS iscol=a->col,isrow=a->row; 26328e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 26428e30874SSatish Balay const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 26528e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 26628e30874SSatish Balay const MatScalar *aa=a->a,*v; 26728e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 26828e30874SSatish Balay const PetscScalar *b; 26928e30874SSatish Balay 27028e30874SSatish Balay PetscFunctionBegin; 271*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 272*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 27328e30874SSatish Balay t = a->solve_work; 27428e30874SSatish Balay 275*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 276*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 27728e30874SSatish Balay 27828e30874SSatish Balay /* forward solve the lower triangular */ 27928e30874SSatish Balay idx = 6*(*r++); 28028e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 28128e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 28228e30874SSatish Balay t[4] = b[4+idx]; t[5] = b[5+idx]; 28328e30874SSatish Balay for (i=1; i<n; i++) { 28428e30874SSatish Balay v = aa + 36*ai[i]; 28528e30874SSatish Balay vi = aj + ai[i]; 28628e30874SSatish Balay nz = diag[i] - ai[i]; 28728e30874SSatish Balay idx = 6*(*r++); 28828e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 28928e30874SSatish Balay s5 = b[4+idx]; s6 = b[5+idx]; 29028e30874SSatish Balay while (nz--) { 29128e30874SSatish Balay idx = 6*(*vi++); 29228e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 29328e30874SSatish Balay x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 29428e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 29528e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 29628e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 29728e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 29828e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 29928e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 30028e30874SSatish Balay v += 36; 30128e30874SSatish Balay } 30228e30874SSatish Balay idx = 6*i; 30328e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 30428e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 30528e30874SSatish Balay t[4+idx] = s5;t[5+idx] = s6; 30628e30874SSatish Balay } 30728e30874SSatish Balay /* backward solve the upper triangular */ 30828e30874SSatish Balay for (i=n-1; i>=0; i--) { 30928e30874SSatish Balay v = aa + 36*diag[i] + 36; 31028e30874SSatish Balay vi = aj + diag[i] + 1; 31128e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 31228e30874SSatish Balay idt = 6*i; 31328e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 31428e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 31528e30874SSatish Balay s5 = t[4+idt];s6 = t[5+idt]; 31628e30874SSatish Balay while (nz--) { 31728e30874SSatish Balay idx = 6*(*vi++); 31828e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 31928e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 32028e30874SSatish Balay x5 = t[4+idx]; x6 = t[5+idx]; 32128e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 32228e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 32328e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 32428e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 32528e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 32628e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 32728e30874SSatish Balay v += 36; 32828e30874SSatish Balay } 32928e30874SSatish Balay idc = 6*(*c--); 33028e30874SSatish Balay v = aa + 36*diag[i]; 33128e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 33228e30874SSatish Balay v[18]*s4+v[24]*s5+v[30]*s6; 33328e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 33428e30874SSatish Balay v[19]*s4+v[25]*s5+v[31]*s6; 33528e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 33628e30874SSatish Balay v[20]*s4+v[26]*s5+v[32]*s6; 33728e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 33828e30874SSatish Balay v[21]*s4+v[27]*s5+v[33]*s6; 33928e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 34028e30874SSatish Balay v[22]*s4+v[28]*s5+v[34]*s6; 34128e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 34228e30874SSatish Balay v[23]*s4+v[29]*s5+v[35]*s6; 34328e30874SSatish Balay } 34428e30874SSatish Balay 345*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 346*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 347*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 348*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 349*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n)); 35028e30874SSatish Balay PetscFunctionReturn(0); 35128e30874SSatish Balay } 35228e30874SSatish Balay 35328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 35428e30874SSatish Balay { 35528e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 35628e30874SSatish Balay IS iscol=a->col,isrow=a->row; 35728e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 35828e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 35928e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 36028e30874SSatish Balay const MatScalar *aa=a->a,*v; 36128e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 36228e30874SSatish Balay const PetscScalar *b; 36328e30874SSatish Balay 36428e30874SSatish Balay PetscFunctionBegin; 365*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 366*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 36728e30874SSatish Balay t = a->solve_work; 36828e30874SSatish Balay 369*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 370*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 37128e30874SSatish Balay 37228e30874SSatish Balay /* forward solve the lower triangular */ 37328e30874SSatish Balay idx = 6*r[0]; 37428e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 37528e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 37628e30874SSatish Balay t[4] = b[4+idx]; t[5] = b[5+idx]; 37728e30874SSatish Balay for (i=1; i<n; i++) { 37828e30874SSatish Balay v = aa + 36*ai[i]; 37928e30874SSatish Balay vi = aj + ai[i]; 38028e30874SSatish Balay nz = ai[i+1] - ai[i]; 38128e30874SSatish Balay idx = 6*r[i]; 38228e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 38328e30874SSatish Balay s5 = b[4+idx]; s6 = b[5+idx]; 38428e30874SSatish Balay for (m=0; m<nz; m++) { 38528e30874SSatish Balay idx = 6*vi[m]; 38628e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 38728e30874SSatish Balay x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 38828e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 38928e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 39028e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 39128e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 39228e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 39328e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 39428e30874SSatish Balay v += 36; 39528e30874SSatish Balay } 39628e30874SSatish Balay idx = 6*i; 39728e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 39828e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 39928e30874SSatish Balay t[4+idx] = s5;t[5+idx] = s6; 40028e30874SSatish Balay } 40128e30874SSatish Balay /* backward solve the upper triangular */ 40228e30874SSatish Balay for (i=n-1; i>=0; i--) { 40328e30874SSatish Balay v = aa + 36*(adiag[i+1]+1); 40428e30874SSatish Balay vi = aj + adiag[i+1]+1; 40528e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 40628e30874SSatish Balay idt = 6*i; 40728e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 40828e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 40928e30874SSatish Balay s5 = t[4+idt];s6 = t[5+idt]; 41028e30874SSatish Balay for (m=0; m<nz; m++) { 41128e30874SSatish Balay idx = 6*vi[m]; 41228e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 41328e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 41428e30874SSatish Balay x5 = t[4+idx]; x6 = t[5+idx]; 41528e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 41628e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 41728e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 41828e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 41928e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 42028e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 42128e30874SSatish Balay v += 36; 42228e30874SSatish Balay } 42328e30874SSatish Balay idc = 6*c[i]; 42428e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 42528e30874SSatish Balay v[18]*s4+v[24]*s5+v[30]*s6; 42628e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 42728e30874SSatish Balay v[19]*s4+v[25]*s5+v[31]*s6; 42828e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 42928e30874SSatish Balay v[20]*s4+v[26]*s5+v[32]*s6; 43028e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 43128e30874SSatish Balay v[21]*s4+v[27]*s5+v[33]*s6; 43228e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 43328e30874SSatish Balay v[22]*s4+v[28]*s5+v[34]*s6; 43428e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 43528e30874SSatish Balay v[23]*s4+v[29]*s5+v[35]*s6; 43628e30874SSatish Balay } 43728e30874SSatish Balay 438*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 439*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 440*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 441*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 442*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n)); 44328e30874SSatish Balay PetscFunctionReturn(0); 44428e30874SSatish Balay } 44528e30874SSatish Balay 44628e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 44728e30874SSatish Balay { 44828e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 44928e30874SSatish Balay IS iscol=a->col,isrow=a->row; 45028e30874SSatish Balay const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 45128e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 45228e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 45328e30874SSatish Balay const MatScalar *aa=a->a,*v; 45428e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 45528e30874SSatish Balay const PetscScalar *b; 45628e30874SSatish Balay 45728e30874SSatish Balay PetscFunctionBegin; 458*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 459*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 46028e30874SSatish Balay t = a->solve_work; 46128e30874SSatish Balay 462*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 463*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 46428e30874SSatish Balay 46528e30874SSatish Balay /* forward solve the lower triangular */ 46628e30874SSatish Balay idx = 5*(*r++); 46728e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 46828e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 46928e30874SSatish Balay for (i=1; i<n; i++) { 47028e30874SSatish Balay v = aa + 25*ai[i]; 47128e30874SSatish Balay vi = aj + ai[i]; 47228e30874SSatish Balay nz = diag[i] - ai[i]; 47328e30874SSatish Balay idx = 5*(*r++); 47428e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 47528e30874SSatish Balay s5 = b[4+idx]; 47628e30874SSatish Balay while (nz--) { 47728e30874SSatish Balay idx = 5*(*vi++); 47828e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 47928e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 48028e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 48128e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 48228e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 48328e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 48428e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 48528e30874SSatish Balay v += 25; 48628e30874SSatish Balay } 48728e30874SSatish Balay idx = 5*i; 48828e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 48928e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 49028e30874SSatish Balay } 49128e30874SSatish Balay /* backward solve the upper triangular */ 49228e30874SSatish Balay for (i=n-1; i>=0; i--) { 49328e30874SSatish Balay v = aa + 25*diag[i] + 25; 49428e30874SSatish Balay vi = aj + diag[i] + 1; 49528e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 49628e30874SSatish Balay idt = 5*i; 49728e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 49828e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 49928e30874SSatish Balay while (nz--) { 50028e30874SSatish Balay idx = 5*(*vi++); 50128e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 50228e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 50328e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 50428e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 50528e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 50628e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 50728e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 50828e30874SSatish Balay v += 25; 50928e30874SSatish Balay } 51028e30874SSatish Balay idc = 5*(*c--); 51128e30874SSatish Balay v = aa + 25*diag[i]; 51228e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 51328e30874SSatish Balay v[15]*s4+v[20]*s5; 51428e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 51528e30874SSatish Balay v[16]*s4+v[21]*s5; 51628e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 51728e30874SSatish Balay v[17]*s4+v[22]*s5; 51828e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 51928e30874SSatish Balay v[18]*s4+v[23]*s5; 52028e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 52128e30874SSatish Balay v[19]*s4+v[24]*s5; 52228e30874SSatish Balay } 52328e30874SSatish Balay 524*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 525*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 526*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 527*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 528*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n)); 52928e30874SSatish Balay PetscFunctionReturn(0); 53028e30874SSatish Balay } 53128e30874SSatish Balay 53228e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 53328e30874SSatish Balay { 53428e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 53528e30874SSatish Balay IS iscol=a->col,isrow=a->row; 53628e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 53728e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 53828e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 53928e30874SSatish Balay const MatScalar *aa=a->a,*v; 54028e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 54128e30874SSatish Balay const PetscScalar *b; 54228e30874SSatish Balay 54328e30874SSatish Balay PetscFunctionBegin; 544*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 545*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 54628e30874SSatish Balay t = a->solve_work; 54728e30874SSatish Balay 548*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 549*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 55028e30874SSatish Balay 55128e30874SSatish Balay /* forward solve the lower triangular */ 55228e30874SSatish Balay idx = 5*r[0]; 55328e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 55428e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 55528e30874SSatish Balay for (i=1; i<n; i++) { 55628e30874SSatish Balay v = aa + 25*ai[i]; 55728e30874SSatish Balay vi = aj + ai[i]; 55828e30874SSatish Balay nz = ai[i+1] - ai[i]; 55928e30874SSatish Balay idx = 5*r[i]; 56028e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 56128e30874SSatish Balay s5 = b[4+idx]; 56228e30874SSatish Balay for (m=0; m<nz; m++) { 56328e30874SSatish Balay idx = 5*vi[m]; 56428e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 56528e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 56628e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 56728e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 56828e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 56928e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 57028e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 57128e30874SSatish Balay v += 25; 57228e30874SSatish Balay } 57328e30874SSatish Balay idx = 5*i; 57428e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 57528e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 57628e30874SSatish Balay } 57728e30874SSatish Balay /* backward solve the upper triangular */ 57828e30874SSatish Balay for (i=n-1; i>=0; i--) { 57928e30874SSatish Balay v = aa + 25*(adiag[i+1]+1); 58028e30874SSatish Balay vi = aj + adiag[i+1]+1; 58128e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 58228e30874SSatish Balay idt = 5*i; 58328e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 58428e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 58528e30874SSatish Balay for (m=0; m<nz; m++) { 58628e30874SSatish Balay idx = 5*vi[m]; 58728e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 58828e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 58928e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 59028e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 59128e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 59228e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 59328e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 59428e30874SSatish Balay v += 25; 59528e30874SSatish Balay } 59628e30874SSatish Balay idc = 5*c[i]; 59728e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 59828e30874SSatish Balay v[15]*s4+v[20]*s5; 59928e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 60028e30874SSatish Balay v[16]*s4+v[21]*s5; 60128e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 60228e30874SSatish Balay v[17]*s4+v[22]*s5; 60328e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 60428e30874SSatish Balay v[18]*s4+v[23]*s5; 60528e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 60628e30874SSatish Balay v[19]*s4+v[24]*s5; 60728e30874SSatish Balay } 60828e30874SSatish Balay 609*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 610*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 611*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 612*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 613*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n)); 61428e30874SSatish Balay PetscFunctionReturn(0); 61528e30874SSatish Balay } 61628e30874SSatish Balay 61728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 61828e30874SSatish Balay { 61928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 62028e30874SSatish Balay IS iscol=a->col,isrow=a->row; 62128e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 62228e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 62328e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 62428e30874SSatish Balay const MatScalar *aa=a->a,*v; 62528e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 62628e30874SSatish Balay const PetscScalar *b; 62728e30874SSatish Balay 62828e30874SSatish Balay PetscFunctionBegin; 629*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 630*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 63128e30874SSatish Balay t = a->solve_work; 63228e30874SSatish Balay 633*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 634*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 63528e30874SSatish Balay 63628e30874SSatish Balay /* forward solve the lower triangular */ 63728e30874SSatish Balay idx = 4*(*r++); 63828e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 63928e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 64028e30874SSatish Balay for (i=1; i<n; i++) { 64128e30874SSatish Balay v = aa + 16*ai[i]; 64228e30874SSatish Balay vi = aj + ai[i]; 64328e30874SSatish Balay nz = diag[i] - ai[i]; 64428e30874SSatish Balay idx = 4*(*r++); 64528e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 64628e30874SSatish Balay while (nz--) { 64728e30874SSatish Balay idx = 4*(*vi++); 64828e30874SSatish Balay x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 64928e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 65028e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 65128e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 65228e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 65328e30874SSatish Balay v += 16; 65428e30874SSatish Balay } 65528e30874SSatish Balay idx = 4*i; 65628e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 65728e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 65828e30874SSatish Balay } 65928e30874SSatish Balay /* backward solve the upper triangular */ 66028e30874SSatish Balay for (i=n-1; i>=0; i--) { 66128e30874SSatish Balay v = aa + 16*diag[i] + 16; 66228e30874SSatish Balay vi = aj + diag[i] + 1; 66328e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 66428e30874SSatish Balay idt = 4*i; 66528e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 66628e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 66728e30874SSatish Balay while (nz--) { 66828e30874SSatish Balay idx = 4*(*vi++); 66928e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 67028e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 67128e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 67228e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 67328e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 67428e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 67528e30874SSatish Balay v += 16; 67628e30874SSatish Balay } 67728e30874SSatish Balay idc = 4*(*c--); 67828e30874SSatish Balay v = aa + 16*diag[i]; 67928e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 68028e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 68128e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 68228e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 68328e30874SSatish Balay } 68428e30874SSatish Balay 685*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 686*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 687*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 688*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 689*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 69028e30874SSatish Balay PetscFunctionReturn(0); 69128e30874SSatish Balay } 69228e30874SSatish Balay 69328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 69428e30874SSatish Balay { 69528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 69628e30874SSatish Balay IS iscol=a->col,isrow=a->row; 69728e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 69828e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 69928e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 70028e30874SSatish Balay const MatScalar *aa=a->a,*v; 70128e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 70228e30874SSatish Balay const PetscScalar *b; 70328e30874SSatish Balay 70428e30874SSatish Balay PetscFunctionBegin; 705*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 706*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 70728e30874SSatish Balay t = a->solve_work; 70828e30874SSatish Balay 709*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 710*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 71128e30874SSatish Balay 71228e30874SSatish Balay /* forward solve the lower triangular */ 71328e30874SSatish Balay idx = 4*r[0]; 71428e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 71528e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 71628e30874SSatish Balay for (i=1; i<n; i++) { 71728e30874SSatish Balay v = aa + 16*ai[i]; 71828e30874SSatish Balay vi = aj + ai[i]; 71928e30874SSatish Balay nz = ai[i+1] - ai[i]; 72028e30874SSatish Balay idx = 4*r[i]; 72128e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 72228e30874SSatish Balay for (m=0; m<nz; m++) { 72328e30874SSatish Balay idx = 4*vi[m]; 72428e30874SSatish Balay x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 72528e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 72628e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 72728e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 72828e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 72928e30874SSatish Balay v += 16; 73028e30874SSatish Balay } 73128e30874SSatish Balay idx = 4*i; 73228e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 73328e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 73428e30874SSatish Balay } 73528e30874SSatish Balay /* backward solve the upper triangular */ 73628e30874SSatish Balay for (i=n-1; i>=0; i--) { 73728e30874SSatish Balay v = aa + 16*(adiag[i+1]+1); 73828e30874SSatish Balay vi = aj + adiag[i+1]+1; 73928e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 74028e30874SSatish Balay idt = 4*i; 74128e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 74228e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 74328e30874SSatish Balay for (m=0; m<nz; m++) { 74428e30874SSatish Balay idx = 4*vi[m]; 74528e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 74628e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 74728e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 74828e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 74928e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 75028e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 75128e30874SSatish Balay v += 16; 75228e30874SSatish Balay } 75328e30874SSatish Balay idc = 4*c[i]; 75428e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 75528e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 75628e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 75728e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 75828e30874SSatish Balay } 75928e30874SSatish Balay 760*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 761*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 762*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 763*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 764*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 76528e30874SSatish Balay PetscFunctionReturn(0); 76628e30874SSatish Balay } 76728e30874SSatish Balay 76828e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 76928e30874SSatish Balay { 77028e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 77128e30874SSatish Balay IS iscol=a->col,isrow=a->row; 77228e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 77328e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 77428e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 77528e30874SSatish Balay const MatScalar *aa=a->a,*v; 77628e30874SSatish Balay MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 77728e30874SSatish Balay PetscScalar *x; 77828e30874SSatish Balay const PetscScalar *b; 77928e30874SSatish Balay 78028e30874SSatish Balay PetscFunctionBegin; 781*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 782*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 78328e30874SSatish Balay t = (MatScalar*)a->solve_work; 78428e30874SSatish Balay 785*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 786*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 78728e30874SSatish Balay 78828e30874SSatish Balay /* forward solve the lower triangular */ 78928e30874SSatish Balay idx = 4*(*r++); 79028e30874SSatish Balay t[0] = (MatScalar)b[idx]; 79128e30874SSatish Balay t[1] = (MatScalar)b[1+idx]; 79228e30874SSatish Balay t[2] = (MatScalar)b[2+idx]; 79328e30874SSatish Balay t[3] = (MatScalar)b[3+idx]; 79428e30874SSatish Balay for (i=1; i<n; i++) { 79528e30874SSatish Balay v = aa + 16*ai[i]; 79628e30874SSatish Balay vi = aj + ai[i]; 79728e30874SSatish Balay nz = diag[i] - ai[i]; 79828e30874SSatish Balay idx = 4*(*r++); 79928e30874SSatish Balay s1 = (MatScalar)b[idx]; 80028e30874SSatish Balay s2 = (MatScalar)b[1+idx]; 80128e30874SSatish Balay s3 = (MatScalar)b[2+idx]; 80228e30874SSatish Balay s4 = (MatScalar)b[3+idx]; 80328e30874SSatish Balay while (nz--) { 80428e30874SSatish Balay idx = 4*(*vi++); 80528e30874SSatish Balay x1 = t[idx]; 80628e30874SSatish Balay x2 = t[1+idx]; 80728e30874SSatish Balay x3 = t[2+idx]; 80828e30874SSatish Balay x4 = t[3+idx]; 80928e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 81028e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 81128e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 81228e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 81328e30874SSatish Balay v += 16; 81428e30874SSatish Balay } 81528e30874SSatish Balay idx = 4*i; 81628e30874SSatish Balay t[idx] = s1; 81728e30874SSatish Balay t[1+idx] = s2; 81828e30874SSatish Balay t[2+idx] = s3; 81928e30874SSatish Balay t[3+idx] = s4; 82028e30874SSatish Balay } 82128e30874SSatish Balay /* backward solve the upper triangular */ 82228e30874SSatish Balay for (i=n-1; i>=0; i--) { 82328e30874SSatish Balay v = aa + 16*diag[i] + 16; 82428e30874SSatish Balay vi = aj + diag[i] + 1; 82528e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 82628e30874SSatish Balay idt = 4*i; 82728e30874SSatish Balay s1 = t[idt]; 82828e30874SSatish Balay s2 = t[1+idt]; 82928e30874SSatish Balay s3 = t[2+idt]; 83028e30874SSatish Balay s4 = t[3+idt]; 83128e30874SSatish Balay while (nz--) { 83228e30874SSatish Balay idx = 4*(*vi++); 83328e30874SSatish Balay x1 = t[idx]; 83428e30874SSatish Balay x2 = t[1+idx]; 83528e30874SSatish Balay x3 = t[2+idx]; 83628e30874SSatish Balay x4 = t[3+idx]; 83728e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 83828e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 83928e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 84028e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 84128e30874SSatish Balay v += 16; 84228e30874SSatish Balay } 84328e30874SSatish Balay idc = 4*(*c--); 84428e30874SSatish Balay v = aa + 16*diag[i]; 84528e30874SSatish Balay t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 84628e30874SSatish Balay t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 84728e30874SSatish Balay t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 84828e30874SSatish Balay t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 84928e30874SSatish Balay x[idc] = (PetscScalar)t[idt]; 85028e30874SSatish Balay x[1+idc] = (PetscScalar)t[1+idt]; 85128e30874SSatish Balay x[2+idc] = (PetscScalar)t[2+idt]; 85228e30874SSatish Balay x[3+idc] = (PetscScalar)t[3+idt]; 85328e30874SSatish Balay } 85428e30874SSatish Balay 855*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 856*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 857*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 858*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 859*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 86028e30874SSatish Balay PetscFunctionReturn(0); 86128e30874SSatish Balay } 86228e30874SSatish Balay 86328e30874SSatish Balay #if defined(PETSC_HAVE_SSE) 86428e30874SSatish Balay 86528e30874SSatish Balay #include PETSC_HAVE_SSE 86628e30874SSatish Balay 86728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 86828e30874SSatish Balay { 86928e30874SSatish Balay /* 87028e30874SSatish Balay Note: This code uses demotion of double 87128e30874SSatish Balay to float when performing the mixed-mode computation. 87228e30874SSatish Balay This may not be numerically reasonable for all applications. 87328e30874SSatish Balay */ 87428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 87528e30874SSatish Balay IS iscol=a->col,isrow=a->row; 87628e30874SSatish Balay PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 87728e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 87828e30874SSatish Balay MatScalar *aa=a->a,*v; 87928e30874SSatish Balay PetscScalar *x,*b,*t; 88028e30874SSatish Balay 88128e30874SSatish Balay /* Make space in temp stack for 16 Byte Aligned arrays */ 88228e30874SSatish Balay float ssealignedspace[11],*tmps,*tmpx; 88328e30874SSatish Balay unsigned long offset; 88428e30874SSatish Balay 88528e30874SSatish Balay PetscFunctionBegin; 88628e30874SSatish Balay SSE_SCOPE_BEGIN; 88728e30874SSatish Balay 88828e30874SSatish Balay offset = (unsigned long)ssealignedspace % 16; 88928e30874SSatish Balay if (offset) offset = (16 - offset)/4; 89028e30874SSatish Balay tmps = &ssealignedspace[offset]; 89128e30874SSatish Balay tmpx = &ssealignedspace[offset+4]; 89228e30874SSatish Balay PREFETCH_NTA(aa+16*ai[1]); 89328e30874SSatish Balay 894*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(bb,&b)); 895*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 89628e30874SSatish Balay t = a->solve_work; 89728e30874SSatish Balay 898*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 899*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 90028e30874SSatish Balay 90128e30874SSatish Balay /* forward solve the lower triangular */ 90228e30874SSatish Balay idx = 4*(*r++); 90328e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 90428e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 90528e30874SSatish Balay v = aa + 16*ai[1]; 90628e30874SSatish Balay 90728e30874SSatish Balay for (i=1; i<n;) { 90828e30874SSatish Balay PREFETCH_NTA(&v[8]); 90928e30874SSatish Balay vi = aj + ai[i]; 91028e30874SSatish Balay nz = diag[i] - ai[i]; 91128e30874SSatish Balay idx = 4*(*r++); 91228e30874SSatish Balay 91328e30874SSatish Balay /* Demote sum from double to float */ 91428e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 91528e30874SSatish Balay LOAD_PS(tmps,XMM7); 91628e30874SSatish Balay 91728e30874SSatish Balay while (nz--) { 91828e30874SSatish Balay PREFETCH_NTA(&v[16]); 91928e30874SSatish Balay idx = 4*(*vi++); 92028e30874SSatish Balay 92128e30874SSatish Balay /* Demote solution (so far) from double to float */ 92228e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 92328e30874SSatish Balay 92428e30874SSatish Balay /* 4x4 Matrix-Vector product with negative accumulation: */ 92528e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx,v) 92628e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 92728e30874SSatish Balay 92828e30874SSatish Balay /* First Column */ 92928e30874SSatish Balay SSE_COPY_PS(XMM0,XMM6) 93028e30874SSatish Balay SSE_SHUFFLE(XMM0,XMM0,0x00) 93128e30874SSatish Balay SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 93228e30874SSatish Balay SSE_SUB_PS(XMM7,XMM0) 93328e30874SSatish Balay 93428e30874SSatish Balay /* Second Column */ 93528e30874SSatish Balay SSE_COPY_PS(XMM1,XMM6) 93628e30874SSatish Balay SSE_SHUFFLE(XMM1,XMM1,0x55) 93728e30874SSatish Balay SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 93828e30874SSatish Balay SSE_SUB_PS(XMM7,XMM1) 93928e30874SSatish Balay 94028e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 94128e30874SSatish Balay 94228e30874SSatish Balay /* Third Column */ 94328e30874SSatish Balay SSE_COPY_PS(XMM2,XMM6) 94428e30874SSatish Balay SSE_SHUFFLE(XMM2,XMM2,0xAA) 94528e30874SSatish Balay SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 94628e30874SSatish Balay SSE_SUB_PS(XMM7,XMM2) 94728e30874SSatish Balay 94828e30874SSatish Balay /* Fourth Column */ 94928e30874SSatish Balay SSE_COPY_PS(XMM3,XMM6) 95028e30874SSatish Balay SSE_SHUFFLE(XMM3,XMM3,0xFF) 95128e30874SSatish Balay SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 95228e30874SSatish Balay SSE_SUB_PS(XMM7,XMM3) 95328e30874SSatish Balay SSE_INLINE_END_2 95428e30874SSatish Balay 95528e30874SSatish Balay v += 16; 95628e30874SSatish Balay } 95728e30874SSatish Balay idx = 4*i; 95828e30874SSatish Balay v = aa + 16*ai[++i]; 95928e30874SSatish Balay PREFETCH_NTA(v); 96028e30874SSatish Balay STORE_PS(tmps,XMM7); 96128e30874SSatish Balay 96228e30874SSatish Balay /* Promote result from float to double */ 96328e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 96428e30874SSatish Balay } 96528e30874SSatish Balay /* backward solve the upper triangular */ 96628e30874SSatish Balay idt = 4*(n-1); 96728e30874SSatish Balay ai16 = 16*diag[n-1]; 96828e30874SSatish Balay v = aa + ai16 + 16; 96928e30874SSatish Balay for (i=n-1; i>=0;) { 97028e30874SSatish Balay PREFETCH_NTA(&v[8]); 97128e30874SSatish Balay vi = aj + diag[i] + 1; 97228e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 97328e30874SSatish Balay 97428e30874SSatish Balay /* Demote accumulator from double to float */ 97528e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 97628e30874SSatish Balay LOAD_PS(tmps,XMM7); 97728e30874SSatish Balay 97828e30874SSatish Balay while (nz--) { 97928e30874SSatish Balay PREFETCH_NTA(&v[16]); 98028e30874SSatish Balay idx = 4*(*vi++); 98128e30874SSatish Balay 98228e30874SSatish Balay /* Demote solution (so far) from double to float */ 98328e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 98428e30874SSatish Balay 98528e30874SSatish Balay /* 4x4 Matrix-Vector Product with negative accumulation: */ 98628e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx,v) 98728e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 98828e30874SSatish Balay 98928e30874SSatish Balay /* First Column */ 99028e30874SSatish Balay SSE_COPY_PS(XMM0,XMM6) 99128e30874SSatish Balay SSE_SHUFFLE(XMM0,XMM0,0x00) 99228e30874SSatish Balay SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 99328e30874SSatish Balay SSE_SUB_PS(XMM7,XMM0) 99428e30874SSatish Balay 99528e30874SSatish Balay /* Second Column */ 99628e30874SSatish Balay SSE_COPY_PS(XMM1,XMM6) 99728e30874SSatish Balay SSE_SHUFFLE(XMM1,XMM1,0x55) 99828e30874SSatish Balay SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 99928e30874SSatish Balay SSE_SUB_PS(XMM7,XMM1) 100028e30874SSatish Balay 100128e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 100228e30874SSatish Balay 100328e30874SSatish Balay /* Third Column */ 100428e30874SSatish Balay SSE_COPY_PS(XMM2,XMM6) 100528e30874SSatish Balay SSE_SHUFFLE(XMM2,XMM2,0xAA) 100628e30874SSatish Balay SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 100728e30874SSatish Balay SSE_SUB_PS(XMM7,XMM2) 100828e30874SSatish Balay 100928e30874SSatish Balay /* Fourth Column */ 101028e30874SSatish Balay SSE_COPY_PS(XMM3,XMM6) 101128e30874SSatish Balay SSE_SHUFFLE(XMM3,XMM3,0xFF) 101228e30874SSatish Balay SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 101328e30874SSatish Balay SSE_SUB_PS(XMM7,XMM3) 101428e30874SSatish Balay SSE_INLINE_END_2 101528e30874SSatish Balay v += 16; 101628e30874SSatish Balay } 101728e30874SSatish Balay v = aa + ai16; 101828e30874SSatish Balay ai16 = 16*diag[--i]; 101928e30874SSatish Balay PREFETCH_NTA(aa+ai16+16); 102028e30874SSatish Balay /* 102128e30874SSatish Balay Scale the result by the diagonal 4x4 block, 102228e30874SSatish Balay which was inverted as part of the factorization 102328e30874SSatish Balay */ 102428e30874SSatish Balay SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 102528e30874SSatish Balay /* First Column */ 102628e30874SSatish Balay SSE_COPY_PS(XMM0,XMM7) 102728e30874SSatish Balay SSE_SHUFFLE(XMM0,XMM0,0x00) 102828e30874SSatish Balay SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 102928e30874SSatish Balay 103028e30874SSatish Balay /* Second Column */ 103128e30874SSatish Balay SSE_COPY_PS(XMM1,XMM7) 103228e30874SSatish Balay SSE_SHUFFLE(XMM1,XMM1,0x55) 103328e30874SSatish Balay SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 103428e30874SSatish Balay SSE_ADD_PS(XMM0,XMM1) 103528e30874SSatish Balay 103628e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 103728e30874SSatish Balay 103828e30874SSatish Balay /* Third Column */ 103928e30874SSatish Balay SSE_COPY_PS(XMM2,XMM7) 104028e30874SSatish Balay SSE_SHUFFLE(XMM2,XMM2,0xAA) 104128e30874SSatish Balay SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 104228e30874SSatish Balay SSE_ADD_PS(XMM0,XMM2) 104328e30874SSatish Balay 104428e30874SSatish Balay /* Fourth Column */ 104528e30874SSatish Balay SSE_COPY_PS(XMM3,XMM7) 104628e30874SSatish Balay SSE_SHUFFLE(XMM3,XMM3,0xFF) 104728e30874SSatish Balay SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 104828e30874SSatish Balay SSE_ADD_PS(XMM0,XMM3) 104928e30874SSatish Balay 105028e30874SSatish Balay SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 105128e30874SSatish Balay SSE_INLINE_END_3 105228e30874SSatish Balay 105328e30874SSatish Balay /* Promote solution from float to double */ 105428e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 105528e30874SSatish Balay 105628e30874SSatish Balay /* Apply reordering to t and stream into x. */ 105728e30874SSatish Balay /* This way, x doesn't pollute the cache. */ 105828e30874SSatish Balay /* Be careful with size: 2 doubles = 4 floats! */ 105928e30874SSatish Balay idc = 4*(*c--); 106028e30874SSatish Balay SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 106128e30874SSatish Balay /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 106228e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 106328e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 106428e30874SSatish Balay /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 106528e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 106628e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 106728e30874SSatish Balay SSE_INLINE_END_2 106828e30874SSatish Balay v = aa + ai16 + 16; 106928e30874SSatish Balay idt -= 4; 107028e30874SSatish Balay } 107128e30874SSatish Balay 1072*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 1073*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 1074*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(bb,&b)); 1075*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 1076*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 107728e30874SSatish Balay SSE_SCOPE_END; 107828e30874SSatish Balay PetscFunctionReturn(0); 107928e30874SSatish Balay } 108028e30874SSatish Balay 108128e30874SSatish Balay #endif 108228e30874SSatish Balay 108328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 108428e30874SSatish Balay { 108528e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 108628e30874SSatish Balay IS iscol=a->col,isrow=a->row; 108728e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 108828e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 108928e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 109028e30874SSatish Balay const MatScalar *aa=a->a,*v; 109128e30874SSatish Balay PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 109228e30874SSatish Balay const PetscScalar *b; 109328e30874SSatish Balay 109428e30874SSatish Balay PetscFunctionBegin; 1095*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 1096*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 109728e30874SSatish Balay t = a->solve_work; 109828e30874SSatish Balay 1099*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1100*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 110128e30874SSatish Balay 110228e30874SSatish Balay /* forward solve the lower triangular */ 110328e30874SSatish Balay idx = 3*(*r++); 110428e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 110528e30874SSatish Balay for (i=1; i<n; i++) { 110628e30874SSatish Balay v = aa + 9*ai[i]; 110728e30874SSatish Balay vi = aj + ai[i]; 110828e30874SSatish Balay nz = diag[i] - ai[i]; 110928e30874SSatish Balay idx = 3*(*r++); 111028e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 111128e30874SSatish Balay while (nz--) { 111228e30874SSatish Balay idx = 3*(*vi++); 111328e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 111428e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 111528e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 111628e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 111728e30874SSatish Balay v += 9; 111828e30874SSatish Balay } 111928e30874SSatish Balay idx = 3*i; 112028e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 112128e30874SSatish Balay } 112228e30874SSatish Balay /* backward solve the upper triangular */ 112328e30874SSatish Balay for (i=n-1; i>=0; i--) { 112428e30874SSatish Balay v = aa + 9*diag[i] + 9; 112528e30874SSatish Balay vi = aj + diag[i] + 1; 112628e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 112728e30874SSatish Balay idt = 3*i; 112828e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 112928e30874SSatish Balay while (nz--) { 113028e30874SSatish Balay idx = 3*(*vi++); 113128e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 113228e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 113328e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 113428e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 113528e30874SSatish Balay v += 9; 113628e30874SSatish Balay } 113728e30874SSatish Balay idc = 3*(*c--); 113828e30874SSatish Balay v = aa + 9*diag[i]; 113928e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 114028e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 114128e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 114228e30874SSatish Balay } 1143*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 1144*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 1145*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 1146*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 1147*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n)); 114828e30874SSatish Balay PetscFunctionReturn(0); 114928e30874SSatish Balay } 115028e30874SSatish Balay 115128e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 115228e30874SSatish Balay { 115328e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 115428e30874SSatish Balay IS iscol=a->col,isrow=a->row; 115528e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 115628e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 115728e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 115828e30874SSatish Balay const MatScalar *aa=a->a,*v; 115928e30874SSatish Balay PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 116028e30874SSatish Balay const PetscScalar *b; 116128e30874SSatish Balay 116228e30874SSatish Balay PetscFunctionBegin; 1163*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 1164*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 116528e30874SSatish Balay t = a->solve_work; 116628e30874SSatish Balay 1167*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1168*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 116928e30874SSatish Balay 117028e30874SSatish Balay /* forward solve the lower triangular */ 117128e30874SSatish Balay idx = 3*r[0]; 117228e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 117328e30874SSatish Balay for (i=1; i<n; i++) { 117428e30874SSatish Balay v = aa + 9*ai[i]; 117528e30874SSatish Balay vi = aj + ai[i]; 117628e30874SSatish Balay nz = ai[i+1] - ai[i]; 117728e30874SSatish Balay idx = 3*r[i]; 117828e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 117928e30874SSatish Balay for (m=0; m<nz; m++) { 118028e30874SSatish Balay idx = 3*vi[m]; 118128e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 118228e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 118328e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 118428e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 118528e30874SSatish Balay v += 9; 118628e30874SSatish Balay } 118728e30874SSatish Balay idx = 3*i; 118828e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 118928e30874SSatish Balay } 119028e30874SSatish Balay /* backward solve the upper triangular */ 119128e30874SSatish Balay for (i=n-1; i>=0; i--) { 119228e30874SSatish Balay v = aa + 9*(adiag[i+1]+1); 119328e30874SSatish Balay vi = aj + adiag[i+1]+1; 119428e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 119528e30874SSatish Balay idt = 3*i; 119628e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 119728e30874SSatish Balay for (m=0; m<nz; m++) { 119828e30874SSatish Balay idx = 3*vi[m]; 119928e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 120028e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 120128e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 120228e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 120328e30874SSatish Balay v += 9; 120428e30874SSatish Balay } 120528e30874SSatish Balay idc = 3*c[i]; 120628e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 120728e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 120828e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 120928e30874SSatish Balay } 1210*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 1211*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 1212*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 1213*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 1214*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n)); 121528e30874SSatish Balay PetscFunctionReturn(0); 121628e30874SSatish Balay } 121728e30874SSatish Balay 121828e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 121928e30874SSatish Balay { 122028e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 122128e30874SSatish Balay IS iscol=a->col,isrow=a->row; 122228e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 122328e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 122428e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 122528e30874SSatish Balay const MatScalar *aa=a->a,*v; 122628e30874SSatish Balay PetscScalar *x,s1,s2,x1,x2,*t; 122728e30874SSatish Balay const PetscScalar *b; 122828e30874SSatish Balay 122928e30874SSatish Balay PetscFunctionBegin; 1230*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 1231*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 123228e30874SSatish Balay t = a->solve_work; 123328e30874SSatish Balay 1234*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1235*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 123628e30874SSatish Balay 123728e30874SSatish Balay /* forward solve the lower triangular */ 123828e30874SSatish Balay idx = 2*(*r++); 123928e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 124028e30874SSatish Balay for (i=1; i<n; i++) { 124128e30874SSatish Balay v = aa + 4*ai[i]; 124228e30874SSatish Balay vi = aj + ai[i]; 124328e30874SSatish Balay nz = diag[i] - ai[i]; 124428e30874SSatish Balay idx = 2*(*r++); 124528e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; 124628e30874SSatish Balay while (nz--) { 124728e30874SSatish Balay idx = 2*(*vi++); 124828e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 124928e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 125028e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 125128e30874SSatish Balay v += 4; 125228e30874SSatish Balay } 125328e30874SSatish Balay idx = 2*i; 125428e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; 125528e30874SSatish Balay } 125628e30874SSatish Balay /* backward solve the upper triangular */ 125728e30874SSatish Balay for (i=n-1; i>=0; i--) { 125828e30874SSatish Balay v = aa + 4*diag[i] + 4; 125928e30874SSatish Balay vi = aj + diag[i] + 1; 126028e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 126128e30874SSatish Balay idt = 2*i; 126228e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 126328e30874SSatish Balay while (nz--) { 126428e30874SSatish Balay idx = 2*(*vi++); 126528e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 126628e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 126728e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 126828e30874SSatish Balay v += 4; 126928e30874SSatish Balay } 127028e30874SSatish Balay idc = 2*(*c--); 127128e30874SSatish Balay v = aa + 4*diag[i]; 127228e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 127328e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 127428e30874SSatish Balay } 1275*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 1276*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 1277*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 1278*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 1279*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n)); 128028e30874SSatish Balay PetscFunctionReturn(0); 128128e30874SSatish Balay } 128228e30874SSatish Balay 128328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 128428e30874SSatish Balay { 128528e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 128628e30874SSatish Balay IS iscol=a->col,isrow=a->row; 128728e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 128828e30874SSatish Balay PetscInt i,nz,idx,jdx,idt,idc,m; 128928e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 129028e30874SSatish Balay const MatScalar *aa=a->a,*v; 129128e30874SSatish Balay PetscScalar *x,s1,s2,x1,x2,*t; 129228e30874SSatish Balay const PetscScalar *b; 129328e30874SSatish Balay 129428e30874SSatish Balay PetscFunctionBegin; 1295*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 1296*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 129728e30874SSatish Balay t = a->solve_work; 129828e30874SSatish Balay 1299*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1300*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 130128e30874SSatish Balay 130228e30874SSatish Balay /* forward solve the lower triangular */ 130328e30874SSatish Balay idx = 2*r[0]; 130428e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 130528e30874SSatish Balay for (i=1; i<n; i++) { 130628e30874SSatish Balay v = aa + 4*ai[i]; 130728e30874SSatish Balay vi = aj + ai[i]; 130828e30874SSatish Balay nz = ai[i+1] - ai[i]; 130928e30874SSatish Balay idx = 2*r[i]; 131028e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; 131128e30874SSatish Balay for (m=0; m<nz; m++) { 131228e30874SSatish Balay jdx = 2*vi[m]; 131328e30874SSatish Balay x1 = t[jdx]; x2 = t[1+jdx]; 131428e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 131528e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 131628e30874SSatish Balay v += 4; 131728e30874SSatish Balay } 131828e30874SSatish Balay idx = 2*i; 131928e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; 132028e30874SSatish Balay } 132128e30874SSatish Balay /* backward solve the upper triangular */ 132228e30874SSatish Balay for (i=n-1; i>=0; i--) { 132328e30874SSatish Balay v = aa + 4*(adiag[i+1]+1); 132428e30874SSatish Balay vi = aj + adiag[i+1]+1; 132528e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 132628e30874SSatish Balay idt = 2*i; 132728e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 132828e30874SSatish Balay for (m=0; m<nz; m++) { 132928e30874SSatish Balay idx = 2*vi[m]; 133028e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 133128e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 133228e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 133328e30874SSatish Balay v += 4; 133428e30874SSatish Balay } 133528e30874SSatish Balay idc = 2*c[i]; 133628e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 133728e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 133828e30874SSatish Balay } 1339*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 1340*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 1341*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 1342*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 1343*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n)); 134428e30874SSatish Balay PetscFunctionReturn(0); 134528e30874SSatish Balay } 134628e30874SSatish Balay 134728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 134828e30874SSatish Balay { 134928e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 135028e30874SSatish Balay IS iscol=a->col,isrow=a->row; 135128e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 135228e30874SSatish Balay PetscInt i,nz; 135328e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 135428e30874SSatish Balay const MatScalar *aa=a->a,*v; 135528e30874SSatish Balay PetscScalar *x,s1,*t; 135628e30874SSatish Balay const PetscScalar *b; 135728e30874SSatish Balay 135828e30874SSatish Balay PetscFunctionBegin; 135928e30874SSatish Balay if (!n) PetscFunctionReturn(0); 136028e30874SSatish Balay 1361*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 1362*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 136328e30874SSatish Balay t = a->solve_work; 136428e30874SSatish Balay 1365*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1366*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 136728e30874SSatish Balay 136828e30874SSatish Balay /* forward solve the lower triangular */ 136928e30874SSatish Balay t[0] = b[*r++]; 137028e30874SSatish Balay for (i=1; i<n; i++) { 137128e30874SSatish Balay v = aa + ai[i]; 137228e30874SSatish Balay vi = aj + ai[i]; 137328e30874SSatish Balay nz = diag[i] - ai[i]; 137428e30874SSatish Balay s1 = b[*r++]; 137528e30874SSatish Balay while (nz--) { 137628e30874SSatish Balay s1 -= (*v++)*t[*vi++]; 137728e30874SSatish Balay } 137828e30874SSatish Balay t[i] = s1; 137928e30874SSatish Balay } 138028e30874SSatish Balay /* backward solve the upper triangular */ 138128e30874SSatish Balay for (i=n-1; i>=0; i--) { 138228e30874SSatish Balay v = aa + diag[i] + 1; 138328e30874SSatish Balay vi = aj + diag[i] + 1; 138428e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 138528e30874SSatish Balay s1 = t[i]; 138628e30874SSatish Balay while (nz--) { 138728e30874SSatish Balay s1 -= (*v++)*t[*vi++]; 138828e30874SSatish Balay } 138928e30874SSatish Balay x[*c--] = t[i] = aa[diag[i]]*s1; 139028e30874SSatish Balay } 139128e30874SSatish Balay 1392*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 1393*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 1394*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 1395*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 1396*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*1*(a->nz) - A->cmap->n)); 139728e30874SSatish Balay PetscFunctionReturn(0); 139828e30874SSatish Balay } 139928e30874SSatish Balay 140028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 140128e30874SSatish Balay { 140228e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 140328e30874SSatish Balay IS iscol = a->col,isrow = a->row; 140428e30874SSatish Balay PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 140528e30874SSatish Balay const PetscInt *rout,*cout,*r,*c; 140628e30874SSatish Balay PetscScalar *x,*tmp,sum; 140728e30874SSatish Balay const PetscScalar *b; 140828e30874SSatish Balay const MatScalar *aa = a->a,*v; 140928e30874SSatish Balay 141028e30874SSatish Balay PetscFunctionBegin; 141128e30874SSatish Balay if (!n) PetscFunctionReturn(0); 141228e30874SSatish Balay 1413*9566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb,&b)); 1414*9566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx,&x)); 141528e30874SSatish Balay tmp = a->solve_work; 141628e30874SSatish Balay 1417*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1418*9566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&cout)); c = cout; 141928e30874SSatish Balay 142028e30874SSatish Balay /* forward solve the lower triangular */ 142128e30874SSatish Balay tmp[0] = b[r[0]]; 142228e30874SSatish Balay v = aa; 142328e30874SSatish Balay vi = aj; 142428e30874SSatish Balay for (i=1; i<n; i++) { 142528e30874SSatish Balay nz = ai[i+1] - ai[i]; 142628e30874SSatish Balay sum = b[r[i]]; 142728e30874SSatish Balay PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 142828e30874SSatish Balay tmp[i] = sum; 142928e30874SSatish Balay v += nz; vi += nz; 143028e30874SSatish Balay } 143128e30874SSatish Balay 143228e30874SSatish Balay /* backward solve the upper triangular */ 143328e30874SSatish Balay for (i=n-1; i>=0; i--) { 143428e30874SSatish Balay v = aa + adiag[i+1]+1; 143528e30874SSatish Balay vi = aj + adiag[i+1]+1; 143628e30874SSatish Balay nz = adiag[i]-adiag[i+1]-1; 143728e30874SSatish Balay sum = tmp[i]; 143828e30874SSatish Balay PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 143928e30874SSatish Balay x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 144028e30874SSatish Balay } 144128e30874SSatish Balay 1442*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&rout)); 1443*9566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&cout)); 1444*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb,&b)); 1445*9566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx,&x)); 1446*9566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0*a->nz - A->cmap->n)); 144728e30874SSatish Balay PetscFunctionReturn(0); 144828e30874SSatish Balay } 1449