1*28e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h> 2*28e30874SSatish Balay #include <petsc-private/kernels/blockinvert.h> 3*28e30874SSatish Balay 4*28e30874SSatish Balay #undef __FUNCT__ 5*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 6*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 7*28e30874SSatish Balay { 8*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 9*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 10*28e30874SSatish Balay PetscErrorCode ierr; 11*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 12*28e30874SSatish Balay const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 13*28e30874SSatish Balay PetscInt i,nz; 14*28e30874SSatish Balay const PetscInt bs =A->rmap->bs,bs2=a->bs2; 15*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 16*28e30874SSatish Balay PetscScalar *x,*s,*t,*ls; 17*28e30874SSatish Balay const PetscScalar *b; 18*28e30874SSatish Balay 19*28e30874SSatish Balay PetscFunctionBegin; 20*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22*28e30874SSatish Balay t = a->solve_work; 23*28e30874SSatish Balay 24*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 25*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26*28e30874SSatish Balay 27*28e30874SSatish Balay /* forward solve the lower triangular */ 28*28e30874SSatish Balay ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 29*28e30874SSatish Balay for (i=1; i<n; i++) { 30*28e30874SSatish Balay v = aa + bs2*ai[i]; 31*28e30874SSatish Balay vi = aj + ai[i]; 32*28e30874SSatish Balay nz = a->diag[i] - ai[i]; 33*28e30874SSatish Balay s = t + bs*i; 34*28e30874SSatish Balay ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 35*28e30874SSatish Balay while (nz--) { 36*28e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 37*28e30874SSatish Balay v += bs2; 38*28e30874SSatish Balay } 39*28e30874SSatish Balay } 40*28e30874SSatish Balay /* backward solve the upper triangular */ 41*28e30874SSatish Balay ls = a->solve_work + A->cmap->n; 42*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 43*28e30874SSatish Balay v = aa + bs2*(a->diag[i] + 1); 44*28e30874SSatish Balay vi = aj + a->diag[i] + 1; 45*28e30874SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 46*28e30874SSatish Balay ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 47*28e30874SSatish Balay while (nz--) { 48*28e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 49*28e30874SSatish Balay v += bs2; 50*28e30874SSatish Balay } 51*28e30874SSatish Balay PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 52*28e30874SSatish Balay ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 53*28e30874SSatish Balay } 54*28e30874SSatish Balay 55*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 56*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 57*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 58*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 59*28e30874SSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 60*28e30874SSatish Balay PetscFunctionReturn(0); 61*28e30874SSatish Balay } 62*28e30874SSatish Balay 63*28e30874SSatish Balay #undef __FUNCT__ 64*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 65*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 66*28e30874SSatish Balay { 67*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 68*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 69*28e30874SSatish Balay PetscErrorCode ierr; 70*28e30874SSatish Balay const PetscInt *r,*c,*ai=a->i,*aj=a->j; 71*28e30874SSatish Balay const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 72*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 73*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 74*28e30874SSatish Balay PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 75*28e30874SSatish Balay const PetscScalar *b; 76*28e30874SSatish Balay 77*28e30874SSatish Balay PetscFunctionBegin; 78*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 79*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 80*28e30874SSatish Balay t = a->solve_work; 81*28e30874SSatish Balay 82*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 83*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 84*28e30874SSatish Balay 85*28e30874SSatish Balay /* forward solve the lower triangular */ 86*28e30874SSatish Balay idx = 7*(*r++); 87*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 88*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 89*28e30874SSatish Balay t[5] = b[5+idx]; t[6] = b[6+idx]; 90*28e30874SSatish Balay 91*28e30874SSatish Balay for (i=1; i<n; i++) { 92*28e30874SSatish Balay v = aa + 49*ai[i]; 93*28e30874SSatish Balay vi = aj + ai[i]; 94*28e30874SSatish Balay nz = diag[i] - ai[i]; 95*28e30874SSatish Balay idx = 7*(*r++); 96*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 97*28e30874SSatish Balay s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 98*28e30874SSatish Balay while (nz--) { 99*28e30874SSatish Balay idx = 7*(*vi++); 100*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 101*28e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 102*28e30874SSatish Balay x6 = t[5+idx];x7 = t[6+idx]; 103*28e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 104*28e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 105*28e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 106*28e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 107*28e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 108*28e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 109*28e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 110*28e30874SSatish Balay v += 49; 111*28e30874SSatish Balay } 112*28e30874SSatish Balay idx = 7*i; 113*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 114*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 115*28e30874SSatish Balay t[5+idx] = s6;t[6+idx] = s7; 116*28e30874SSatish Balay } 117*28e30874SSatish Balay /* backward solve the upper triangular */ 118*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 119*28e30874SSatish Balay v = aa + 49*diag[i] + 49; 120*28e30874SSatish Balay vi = aj + diag[i] + 1; 121*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 122*28e30874SSatish Balay idt = 7*i; 123*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 124*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 125*28e30874SSatish Balay s6 = t[5+idt];s7 = t[6+idt]; 126*28e30874SSatish Balay while (nz--) { 127*28e30874SSatish Balay idx = 7*(*vi++); 128*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 129*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 130*28e30874SSatish Balay x6 = t[5+idx]; x7 = t[6+idx]; 131*28e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 132*28e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 133*28e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 134*28e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 135*28e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 136*28e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 137*28e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 138*28e30874SSatish Balay v += 49; 139*28e30874SSatish Balay } 140*28e30874SSatish Balay idc = 7*(*c--); 141*28e30874SSatish Balay v = aa + 49*diag[i]; 142*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 143*28e30874SSatish Balay v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 144*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 145*28e30874SSatish Balay v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 146*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 147*28e30874SSatish Balay v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 148*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 149*28e30874SSatish Balay v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 150*28e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 151*28e30874SSatish Balay v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 152*28e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 153*28e30874SSatish Balay v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 154*28e30874SSatish Balay x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 155*28e30874SSatish Balay v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 156*28e30874SSatish Balay } 157*28e30874SSatish Balay 158*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 159*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 160*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 161*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 162*28e30874SSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 163*28e30874SSatish Balay PetscFunctionReturn(0); 164*28e30874SSatish Balay } 165*28e30874SSatish Balay 166*28e30874SSatish Balay #undef __FUNCT__ 167*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 168*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 169*28e30874SSatish Balay { 170*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 171*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 172*28e30874SSatish Balay PetscErrorCode ierr; 173*28e30874SSatish Balay const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 174*28e30874SSatish Balay const PetscInt n=a->mbs,*rout,*cout,*vi; 175*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 176*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 177*28e30874SSatish Balay PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 178*28e30874SSatish Balay const PetscScalar *b; 179*28e30874SSatish Balay 180*28e30874SSatish Balay PetscFunctionBegin; 181*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 182*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 183*28e30874SSatish Balay t = a->solve_work; 184*28e30874SSatish Balay 185*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 186*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 187*28e30874SSatish Balay 188*28e30874SSatish Balay /* forward solve the lower triangular */ 189*28e30874SSatish Balay idx = 7*r[0]; 190*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 191*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 192*28e30874SSatish Balay t[5] = b[5+idx]; t[6] = b[6+idx]; 193*28e30874SSatish Balay 194*28e30874SSatish Balay for (i=1; i<n; i++) { 195*28e30874SSatish Balay v = aa + 49*ai[i]; 196*28e30874SSatish Balay vi = aj + ai[i]; 197*28e30874SSatish Balay nz = ai[i+1] - ai[i]; 198*28e30874SSatish Balay idx = 7*r[i]; 199*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 200*28e30874SSatish Balay s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 201*28e30874SSatish Balay for (m=0; m<nz; m++) { 202*28e30874SSatish Balay idx = 7*vi[m]; 203*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 204*28e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 205*28e30874SSatish Balay x6 = t[5+idx];x7 = t[6+idx]; 206*28e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 207*28e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 208*28e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 209*28e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 210*28e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 211*28e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 212*28e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 213*28e30874SSatish Balay v += 49; 214*28e30874SSatish Balay } 215*28e30874SSatish Balay idx = 7*i; 216*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 217*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 218*28e30874SSatish Balay t[5+idx] = s6;t[6+idx] = s7; 219*28e30874SSatish Balay } 220*28e30874SSatish Balay /* backward solve the upper triangular */ 221*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 222*28e30874SSatish Balay v = aa + 49*(adiag[i+1]+1); 223*28e30874SSatish Balay vi = aj + adiag[i+1]+1; 224*28e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 225*28e30874SSatish Balay idt = 7*i; 226*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 227*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 228*28e30874SSatish Balay s6 = t[5+idt];s7 = t[6+idt]; 229*28e30874SSatish Balay for (m=0; m<nz; m++) { 230*28e30874SSatish Balay idx = 7*vi[m]; 231*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 232*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 233*28e30874SSatish Balay x6 = t[5+idx]; x7 = t[6+idx]; 234*28e30874SSatish Balay s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 235*28e30874SSatish Balay s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 236*28e30874SSatish Balay s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 237*28e30874SSatish Balay s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 238*28e30874SSatish Balay s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 239*28e30874SSatish Balay s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 240*28e30874SSatish Balay s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 241*28e30874SSatish Balay v += 49; 242*28e30874SSatish Balay } 243*28e30874SSatish Balay idc = 7*c[i]; 244*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 245*28e30874SSatish Balay v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 246*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 247*28e30874SSatish Balay v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 248*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 249*28e30874SSatish Balay v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 250*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 251*28e30874SSatish Balay v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 252*28e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 253*28e30874SSatish Balay v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 254*28e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 255*28e30874SSatish Balay v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 256*28e30874SSatish Balay x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 257*28e30874SSatish Balay v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 258*28e30874SSatish Balay } 259*28e30874SSatish Balay 260*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 261*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 262*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 263*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 264*28e30874SSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 265*28e30874SSatish Balay PetscFunctionReturn(0); 266*28e30874SSatish Balay } 267*28e30874SSatish Balay 268*28e30874SSatish Balay #undef __FUNCT__ 269*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 270*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 271*28e30874SSatish Balay { 272*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 273*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 274*28e30874SSatish Balay PetscErrorCode ierr; 275*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 276*28e30874SSatish Balay const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 277*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 278*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 279*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 280*28e30874SSatish Balay const PetscScalar *b; 281*28e30874SSatish Balay 282*28e30874SSatish Balay PetscFunctionBegin; 283*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 284*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 285*28e30874SSatish Balay t = a->solve_work; 286*28e30874SSatish Balay 287*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 288*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 289*28e30874SSatish Balay 290*28e30874SSatish Balay /* forward solve the lower triangular */ 291*28e30874SSatish Balay idx = 6*(*r++); 292*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 293*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 294*28e30874SSatish Balay t[4] = b[4+idx]; t[5] = b[5+idx]; 295*28e30874SSatish Balay for (i=1; i<n; i++) { 296*28e30874SSatish Balay v = aa + 36*ai[i]; 297*28e30874SSatish Balay vi = aj + ai[i]; 298*28e30874SSatish Balay nz = diag[i] - ai[i]; 299*28e30874SSatish Balay idx = 6*(*r++); 300*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 301*28e30874SSatish Balay s5 = b[4+idx]; s6 = b[5+idx]; 302*28e30874SSatish Balay while (nz--) { 303*28e30874SSatish Balay idx = 6*(*vi++); 304*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 305*28e30874SSatish Balay x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 306*28e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 307*28e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 308*28e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 309*28e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 310*28e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 311*28e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 312*28e30874SSatish Balay v += 36; 313*28e30874SSatish Balay } 314*28e30874SSatish Balay idx = 6*i; 315*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 316*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 317*28e30874SSatish Balay t[4+idx] = s5;t[5+idx] = s6; 318*28e30874SSatish Balay } 319*28e30874SSatish Balay /* backward solve the upper triangular */ 320*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 321*28e30874SSatish Balay v = aa + 36*diag[i] + 36; 322*28e30874SSatish Balay vi = aj + diag[i] + 1; 323*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 324*28e30874SSatish Balay idt = 6*i; 325*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 326*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 327*28e30874SSatish Balay s5 = t[4+idt];s6 = t[5+idt]; 328*28e30874SSatish Balay while (nz--) { 329*28e30874SSatish Balay idx = 6*(*vi++); 330*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 331*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 332*28e30874SSatish Balay x5 = t[4+idx]; x6 = t[5+idx]; 333*28e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 334*28e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 335*28e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 336*28e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 337*28e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 338*28e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 339*28e30874SSatish Balay v += 36; 340*28e30874SSatish Balay } 341*28e30874SSatish Balay idc = 6*(*c--); 342*28e30874SSatish Balay v = aa + 36*diag[i]; 343*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 344*28e30874SSatish Balay v[18]*s4+v[24]*s5+v[30]*s6; 345*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 346*28e30874SSatish Balay v[19]*s4+v[25]*s5+v[31]*s6; 347*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 348*28e30874SSatish Balay v[20]*s4+v[26]*s5+v[32]*s6; 349*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 350*28e30874SSatish Balay v[21]*s4+v[27]*s5+v[33]*s6; 351*28e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 352*28e30874SSatish Balay v[22]*s4+v[28]*s5+v[34]*s6; 353*28e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 354*28e30874SSatish Balay v[23]*s4+v[29]*s5+v[35]*s6; 355*28e30874SSatish Balay } 356*28e30874SSatish Balay 357*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 358*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 359*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 360*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361*28e30874SSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 362*28e30874SSatish Balay PetscFunctionReturn(0); 363*28e30874SSatish Balay } 364*28e30874SSatish Balay 365*28e30874SSatish Balay #undef __FUNCT__ 366*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 367*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 368*28e30874SSatish Balay { 369*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 370*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 371*28e30874SSatish Balay PetscErrorCode ierr; 372*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 373*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 374*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 375*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 376*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 377*28e30874SSatish Balay const PetscScalar *b; 378*28e30874SSatish Balay 379*28e30874SSatish Balay PetscFunctionBegin; 380*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 381*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382*28e30874SSatish Balay t = a->solve_work; 383*28e30874SSatish Balay 384*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 385*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 386*28e30874SSatish Balay 387*28e30874SSatish Balay /* forward solve the lower triangular */ 388*28e30874SSatish Balay idx = 6*r[0]; 389*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 390*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 391*28e30874SSatish Balay t[4] = b[4+idx]; t[5] = b[5+idx]; 392*28e30874SSatish Balay for (i=1; i<n; i++) { 393*28e30874SSatish Balay v = aa + 36*ai[i]; 394*28e30874SSatish Balay vi = aj + ai[i]; 395*28e30874SSatish Balay nz = ai[i+1] - ai[i]; 396*28e30874SSatish Balay idx = 6*r[i]; 397*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 398*28e30874SSatish Balay s5 = b[4+idx]; s6 = b[5+idx]; 399*28e30874SSatish Balay for (m=0; m<nz; m++) { 400*28e30874SSatish Balay idx = 6*vi[m]; 401*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 402*28e30874SSatish Balay x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 403*28e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 404*28e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 405*28e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 406*28e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 407*28e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 408*28e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 409*28e30874SSatish Balay v += 36; 410*28e30874SSatish Balay } 411*28e30874SSatish Balay idx = 6*i; 412*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 413*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 414*28e30874SSatish Balay t[4+idx] = s5;t[5+idx] = s6; 415*28e30874SSatish Balay } 416*28e30874SSatish Balay /* backward solve the upper triangular */ 417*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 418*28e30874SSatish Balay v = aa + 36*(adiag[i+1]+1); 419*28e30874SSatish Balay vi = aj + adiag[i+1]+1; 420*28e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 421*28e30874SSatish Balay idt = 6*i; 422*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 423*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 424*28e30874SSatish Balay s5 = t[4+idt];s6 = t[5+idt]; 425*28e30874SSatish Balay for (m=0; m<nz; m++) { 426*28e30874SSatish Balay idx = 6*vi[m]; 427*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 428*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 429*28e30874SSatish Balay x5 = t[4+idx]; x6 = t[5+idx]; 430*28e30874SSatish Balay s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 431*28e30874SSatish Balay s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 432*28e30874SSatish Balay s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 433*28e30874SSatish Balay s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 434*28e30874SSatish Balay s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 435*28e30874SSatish Balay s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 436*28e30874SSatish Balay v += 36; 437*28e30874SSatish Balay } 438*28e30874SSatish Balay idc = 6*c[i]; 439*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 440*28e30874SSatish Balay v[18]*s4+v[24]*s5+v[30]*s6; 441*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 442*28e30874SSatish Balay v[19]*s4+v[25]*s5+v[31]*s6; 443*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 444*28e30874SSatish Balay v[20]*s4+v[26]*s5+v[32]*s6; 445*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 446*28e30874SSatish Balay v[21]*s4+v[27]*s5+v[33]*s6; 447*28e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 448*28e30874SSatish Balay v[22]*s4+v[28]*s5+v[34]*s6; 449*28e30874SSatish Balay x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 450*28e30874SSatish Balay v[23]*s4+v[29]*s5+v[35]*s6; 451*28e30874SSatish Balay } 452*28e30874SSatish Balay 453*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 454*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 455*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 456*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457*28e30874SSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 458*28e30874SSatish Balay PetscFunctionReturn(0); 459*28e30874SSatish Balay } 460*28e30874SSatish Balay 461*28e30874SSatish Balay #undef __FUNCT__ 462*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 463*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 464*28e30874SSatish Balay { 465*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 466*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 467*28e30874SSatish Balay PetscErrorCode ierr; 468*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 469*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 470*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 471*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 472*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 473*28e30874SSatish Balay const PetscScalar *b; 474*28e30874SSatish Balay 475*28e30874SSatish Balay PetscFunctionBegin; 476*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 477*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478*28e30874SSatish Balay t = a->solve_work; 479*28e30874SSatish Balay 480*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 481*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 482*28e30874SSatish Balay 483*28e30874SSatish Balay /* forward solve the lower triangular */ 484*28e30874SSatish Balay idx = 5*(*r++); 485*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 486*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 487*28e30874SSatish Balay for (i=1; i<n; i++) { 488*28e30874SSatish Balay v = aa + 25*ai[i]; 489*28e30874SSatish Balay vi = aj + ai[i]; 490*28e30874SSatish Balay nz = diag[i] - ai[i]; 491*28e30874SSatish Balay idx = 5*(*r++); 492*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 493*28e30874SSatish Balay s5 = b[4+idx]; 494*28e30874SSatish Balay while (nz--) { 495*28e30874SSatish Balay idx = 5*(*vi++); 496*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 497*28e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 498*28e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 499*28e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 500*28e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 501*28e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 502*28e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 503*28e30874SSatish Balay v += 25; 504*28e30874SSatish Balay } 505*28e30874SSatish Balay idx = 5*i; 506*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 507*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 508*28e30874SSatish Balay } 509*28e30874SSatish Balay /* backward solve the upper triangular */ 510*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 511*28e30874SSatish Balay v = aa + 25*diag[i] + 25; 512*28e30874SSatish Balay vi = aj + diag[i] + 1; 513*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 514*28e30874SSatish Balay idt = 5*i; 515*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 516*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 517*28e30874SSatish Balay while (nz--) { 518*28e30874SSatish Balay idx = 5*(*vi++); 519*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 520*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 521*28e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 522*28e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 523*28e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 524*28e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 525*28e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 526*28e30874SSatish Balay v += 25; 527*28e30874SSatish Balay } 528*28e30874SSatish Balay idc = 5*(*c--); 529*28e30874SSatish Balay v = aa + 25*diag[i]; 530*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 531*28e30874SSatish Balay v[15]*s4+v[20]*s5; 532*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 533*28e30874SSatish Balay v[16]*s4+v[21]*s5; 534*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 535*28e30874SSatish Balay v[17]*s4+v[22]*s5; 536*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 537*28e30874SSatish Balay v[18]*s4+v[23]*s5; 538*28e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 539*28e30874SSatish Balay v[19]*s4+v[24]*s5; 540*28e30874SSatish Balay } 541*28e30874SSatish Balay 542*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 543*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 544*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 545*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 546*28e30874SSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 547*28e30874SSatish Balay PetscFunctionReturn(0); 548*28e30874SSatish Balay } 549*28e30874SSatish Balay 550*28e30874SSatish Balay #undef __FUNCT__ 551*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 552*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 553*28e30874SSatish Balay { 554*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 555*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 556*28e30874SSatish Balay PetscErrorCode ierr; 557*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 558*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 559*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 560*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 561*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 562*28e30874SSatish Balay const PetscScalar *b; 563*28e30874SSatish Balay 564*28e30874SSatish Balay PetscFunctionBegin; 565*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 566*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 567*28e30874SSatish Balay t = a->solve_work; 568*28e30874SSatish Balay 569*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 570*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 571*28e30874SSatish Balay 572*28e30874SSatish Balay /* forward solve the lower triangular */ 573*28e30874SSatish Balay idx = 5*r[0]; 574*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 575*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 576*28e30874SSatish Balay for (i=1; i<n; i++) { 577*28e30874SSatish Balay v = aa + 25*ai[i]; 578*28e30874SSatish Balay vi = aj + ai[i]; 579*28e30874SSatish Balay nz = ai[i+1] - ai[i]; 580*28e30874SSatish Balay idx = 5*r[i]; 581*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 582*28e30874SSatish Balay s5 = b[4+idx]; 583*28e30874SSatish Balay for (m=0; m<nz; m++) { 584*28e30874SSatish Balay idx = 5*vi[m]; 585*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 586*28e30874SSatish Balay x4 = t[3+idx];x5 = t[4+idx]; 587*28e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 588*28e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 589*28e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 590*28e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 591*28e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 592*28e30874SSatish Balay v += 25; 593*28e30874SSatish Balay } 594*28e30874SSatish Balay idx = 5*i; 595*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 596*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 597*28e30874SSatish Balay } 598*28e30874SSatish Balay /* backward solve the upper triangular */ 599*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 600*28e30874SSatish Balay v = aa + 25*(adiag[i+1]+1); 601*28e30874SSatish Balay vi = aj + adiag[i+1]+1; 602*28e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 603*28e30874SSatish Balay idt = 5*i; 604*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 605*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 606*28e30874SSatish Balay for (m=0; m<nz; m++) { 607*28e30874SSatish Balay idx = 5*vi[m]; 608*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 609*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 610*28e30874SSatish Balay s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 611*28e30874SSatish Balay s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 612*28e30874SSatish Balay s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 613*28e30874SSatish Balay s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 614*28e30874SSatish Balay s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 615*28e30874SSatish Balay v += 25; 616*28e30874SSatish Balay } 617*28e30874SSatish Balay idc = 5*c[i]; 618*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 619*28e30874SSatish Balay v[15]*s4+v[20]*s5; 620*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 621*28e30874SSatish Balay v[16]*s4+v[21]*s5; 622*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 623*28e30874SSatish Balay v[17]*s4+v[22]*s5; 624*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 625*28e30874SSatish Balay v[18]*s4+v[23]*s5; 626*28e30874SSatish Balay x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 627*28e30874SSatish Balay v[19]*s4+v[24]*s5; 628*28e30874SSatish Balay } 629*28e30874SSatish Balay 630*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 631*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 632*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 633*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 634*28e30874SSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 635*28e30874SSatish Balay PetscFunctionReturn(0); 636*28e30874SSatish Balay } 637*28e30874SSatish Balay 638*28e30874SSatish Balay #undef __FUNCT__ 639*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 640*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 641*28e30874SSatish Balay { 642*28e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 643*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 644*28e30874SSatish Balay PetscErrorCode ierr; 645*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 646*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 647*28e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 648*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 649*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 650*28e30874SSatish Balay const PetscScalar *b; 651*28e30874SSatish Balay 652*28e30874SSatish Balay PetscFunctionBegin; 653*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 654*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 655*28e30874SSatish Balay t = a->solve_work; 656*28e30874SSatish Balay 657*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 658*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 659*28e30874SSatish Balay 660*28e30874SSatish Balay /* forward solve the lower triangular */ 661*28e30874SSatish Balay idx = 4*(*r++); 662*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 663*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 664*28e30874SSatish Balay for (i=1; i<n; i++) { 665*28e30874SSatish Balay v = aa + 16*ai[i]; 666*28e30874SSatish Balay vi = aj + ai[i]; 667*28e30874SSatish Balay nz = diag[i] - ai[i]; 668*28e30874SSatish Balay idx = 4*(*r++); 669*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 670*28e30874SSatish Balay while (nz--) { 671*28e30874SSatish Balay idx = 4*(*vi++); 672*28e30874SSatish Balay x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 673*28e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 674*28e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 675*28e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 676*28e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 677*28e30874SSatish Balay v += 16; 678*28e30874SSatish Balay } 679*28e30874SSatish Balay idx = 4*i; 680*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 681*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 682*28e30874SSatish Balay } 683*28e30874SSatish Balay /* backward solve the upper triangular */ 684*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 685*28e30874SSatish Balay v = aa + 16*diag[i] + 16; 686*28e30874SSatish Balay vi = aj + diag[i] + 1; 687*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 688*28e30874SSatish Balay idt = 4*i; 689*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 690*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 691*28e30874SSatish Balay while (nz--) { 692*28e30874SSatish Balay idx = 4*(*vi++); 693*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 694*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 695*28e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 696*28e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 697*28e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 698*28e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 699*28e30874SSatish Balay v += 16; 700*28e30874SSatish Balay } 701*28e30874SSatish Balay idc = 4*(*c--); 702*28e30874SSatish Balay v = aa + 16*diag[i]; 703*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 704*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 705*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 706*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 707*28e30874SSatish Balay } 708*28e30874SSatish Balay 709*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 710*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 711*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 712*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 713*28e30874SSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 714*28e30874SSatish Balay PetscFunctionReturn(0); 715*28e30874SSatish Balay } 716*28e30874SSatish Balay 717*28e30874SSatish Balay #undef __FUNCT__ 718*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 719*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 720*28e30874SSatish Balay { 721*28e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 722*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 723*28e30874SSatish Balay PetscErrorCode ierr; 724*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 725*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 726*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 727*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 728*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 729*28e30874SSatish Balay const PetscScalar *b; 730*28e30874SSatish Balay 731*28e30874SSatish Balay PetscFunctionBegin; 732*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 733*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 734*28e30874SSatish Balay t = a->solve_work; 735*28e30874SSatish Balay 736*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 737*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 738*28e30874SSatish Balay 739*28e30874SSatish Balay /* forward solve the lower triangular */ 740*28e30874SSatish Balay idx = 4*r[0]; 741*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 742*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 743*28e30874SSatish Balay for (i=1; i<n; i++) { 744*28e30874SSatish Balay v = aa + 16*ai[i]; 745*28e30874SSatish Balay vi = aj + ai[i]; 746*28e30874SSatish Balay nz = ai[i+1] - ai[i]; 747*28e30874SSatish Balay idx = 4*r[i]; 748*28e30874SSatish Balay s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 749*28e30874SSatish Balay for (m=0; m<nz; m++) { 750*28e30874SSatish Balay idx = 4*vi[m]; 751*28e30874SSatish Balay x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 752*28e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 753*28e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 754*28e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 755*28e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 756*28e30874SSatish Balay v += 16; 757*28e30874SSatish Balay } 758*28e30874SSatish Balay idx = 4*i; 759*28e30874SSatish Balay t[idx] = s1;t[1+idx] = s2; 760*28e30874SSatish Balay t[2+idx] = s3;t[3+idx] = s4; 761*28e30874SSatish Balay } 762*28e30874SSatish Balay /* backward solve the upper triangular */ 763*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 764*28e30874SSatish Balay v = aa + 16*(adiag[i+1]+1); 765*28e30874SSatish Balay vi = aj + adiag[i+1]+1; 766*28e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 767*28e30874SSatish Balay idt = 4*i; 768*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 769*28e30874SSatish Balay s3 = t[2+idt];s4 = t[3+idt]; 770*28e30874SSatish Balay for (m=0; m<nz; m++) { 771*28e30874SSatish Balay idx = 4*vi[m]; 772*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 773*28e30874SSatish Balay x3 = t[2+idx]; x4 = t[3+idx]; 774*28e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 775*28e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 776*28e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 777*28e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 778*28e30874SSatish Balay v += 16; 779*28e30874SSatish Balay } 780*28e30874SSatish Balay idc = 4*c[i]; 781*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 782*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 783*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 784*28e30874SSatish Balay x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 785*28e30874SSatish Balay } 786*28e30874SSatish Balay 787*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 788*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 789*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 790*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 791*28e30874SSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 792*28e30874SSatish Balay PetscFunctionReturn(0); 793*28e30874SSatish Balay } 794*28e30874SSatish Balay 795*28e30874SSatish Balay #undef __FUNCT__ 796*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 797*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 798*28e30874SSatish Balay { 799*28e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 800*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 801*28e30874SSatish Balay PetscErrorCode ierr; 802*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 803*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 804*28e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 805*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 806*28e30874SSatish Balay MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 807*28e30874SSatish Balay PetscScalar *x; 808*28e30874SSatish Balay const PetscScalar *b; 809*28e30874SSatish Balay 810*28e30874SSatish Balay PetscFunctionBegin; 811*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 812*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 813*28e30874SSatish Balay t = (MatScalar*)a->solve_work; 814*28e30874SSatish Balay 815*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 816*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 817*28e30874SSatish Balay 818*28e30874SSatish Balay /* forward solve the lower triangular */ 819*28e30874SSatish Balay idx = 4*(*r++); 820*28e30874SSatish Balay t[0] = (MatScalar)b[idx]; 821*28e30874SSatish Balay t[1] = (MatScalar)b[1+idx]; 822*28e30874SSatish Balay t[2] = (MatScalar)b[2+idx]; 823*28e30874SSatish Balay t[3] = (MatScalar)b[3+idx]; 824*28e30874SSatish Balay for (i=1; i<n; i++) { 825*28e30874SSatish Balay v = aa + 16*ai[i]; 826*28e30874SSatish Balay vi = aj + ai[i]; 827*28e30874SSatish Balay nz = diag[i] - ai[i]; 828*28e30874SSatish Balay idx = 4*(*r++); 829*28e30874SSatish Balay s1 = (MatScalar)b[idx]; 830*28e30874SSatish Balay s2 = (MatScalar)b[1+idx]; 831*28e30874SSatish Balay s3 = (MatScalar)b[2+idx]; 832*28e30874SSatish Balay s4 = (MatScalar)b[3+idx]; 833*28e30874SSatish Balay while (nz--) { 834*28e30874SSatish Balay idx = 4*(*vi++); 835*28e30874SSatish Balay x1 = t[idx]; 836*28e30874SSatish Balay x2 = t[1+idx]; 837*28e30874SSatish Balay x3 = t[2+idx]; 838*28e30874SSatish Balay x4 = t[3+idx]; 839*28e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 840*28e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 841*28e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 842*28e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 843*28e30874SSatish Balay v += 16; 844*28e30874SSatish Balay } 845*28e30874SSatish Balay idx = 4*i; 846*28e30874SSatish Balay t[idx] = s1; 847*28e30874SSatish Balay t[1+idx] = s2; 848*28e30874SSatish Balay t[2+idx] = s3; 849*28e30874SSatish Balay t[3+idx] = s4; 850*28e30874SSatish Balay } 851*28e30874SSatish Balay /* backward solve the upper triangular */ 852*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 853*28e30874SSatish Balay v = aa + 16*diag[i] + 16; 854*28e30874SSatish Balay vi = aj + diag[i] + 1; 855*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 856*28e30874SSatish Balay idt = 4*i; 857*28e30874SSatish Balay s1 = t[idt]; 858*28e30874SSatish Balay s2 = t[1+idt]; 859*28e30874SSatish Balay s3 = t[2+idt]; 860*28e30874SSatish Balay s4 = t[3+idt]; 861*28e30874SSatish Balay while (nz--) { 862*28e30874SSatish Balay idx = 4*(*vi++); 863*28e30874SSatish Balay x1 = t[idx]; 864*28e30874SSatish Balay x2 = t[1+idx]; 865*28e30874SSatish Balay x3 = t[2+idx]; 866*28e30874SSatish Balay x4 = t[3+idx]; 867*28e30874SSatish Balay s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 868*28e30874SSatish Balay s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 869*28e30874SSatish Balay s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 870*28e30874SSatish Balay s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 871*28e30874SSatish Balay v += 16; 872*28e30874SSatish Balay } 873*28e30874SSatish Balay idc = 4*(*c--); 874*28e30874SSatish Balay v = aa + 16*diag[i]; 875*28e30874SSatish Balay t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 876*28e30874SSatish Balay t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 877*28e30874SSatish Balay t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 878*28e30874SSatish Balay t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 879*28e30874SSatish Balay x[idc] = (PetscScalar)t[idt]; 880*28e30874SSatish Balay x[1+idc] = (PetscScalar)t[1+idt]; 881*28e30874SSatish Balay x[2+idc] = (PetscScalar)t[2+idt]; 882*28e30874SSatish Balay x[3+idc] = (PetscScalar)t[3+idt]; 883*28e30874SSatish Balay } 884*28e30874SSatish Balay 885*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 886*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 887*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 888*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 889*28e30874SSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 890*28e30874SSatish Balay PetscFunctionReturn(0); 891*28e30874SSatish Balay } 892*28e30874SSatish Balay 893*28e30874SSatish Balay #if defined(PETSC_HAVE_SSE) 894*28e30874SSatish Balay 895*28e30874SSatish Balay #include PETSC_HAVE_SSE 896*28e30874SSatish Balay 897*28e30874SSatish Balay #undef __FUNCT__ 898*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 899*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 900*28e30874SSatish Balay { 901*28e30874SSatish Balay /* 902*28e30874SSatish Balay Note: This code uses demotion of double 903*28e30874SSatish Balay to float when performing the mixed-mode computation. 904*28e30874SSatish Balay This may not be numerically reasonable for all applications. 905*28e30874SSatish Balay */ 906*28e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 907*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 908*28e30874SSatish Balay PetscErrorCode ierr; 909*28e30874SSatish Balay PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 910*28e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 911*28e30874SSatish Balay MatScalar *aa=a->a,*v; 912*28e30874SSatish Balay PetscScalar *x,*b,*t; 913*28e30874SSatish Balay 914*28e30874SSatish Balay /* Make space in temp stack for 16 Byte Aligned arrays */ 915*28e30874SSatish Balay float ssealignedspace[11],*tmps,*tmpx; 916*28e30874SSatish Balay unsigned long offset; 917*28e30874SSatish Balay 918*28e30874SSatish Balay PetscFunctionBegin; 919*28e30874SSatish Balay SSE_SCOPE_BEGIN; 920*28e30874SSatish Balay 921*28e30874SSatish Balay offset = (unsigned long)ssealignedspace % 16; 922*28e30874SSatish Balay if (offset) offset = (16 - offset)/4; 923*28e30874SSatish Balay tmps = &ssealignedspace[offset]; 924*28e30874SSatish Balay tmpx = &ssealignedspace[offset+4]; 925*28e30874SSatish Balay PREFETCH_NTA(aa+16*ai[1]); 926*28e30874SSatish Balay 927*28e30874SSatish Balay ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 928*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 929*28e30874SSatish Balay t = a->solve_work; 930*28e30874SSatish Balay 931*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 932*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 933*28e30874SSatish Balay 934*28e30874SSatish Balay /* forward solve the lower triangular */ 935*28e30874SSatish Balay idx = 4*(*r++); 936*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 937*28e30874SSatish Balay t[2] = b[2+idx]; t[3] = b[3+idx]; 938*28e30874SSatish Balay v = aa + 16*ai[1]; 939*28e30874SSatish Balay 940*28e30874SSatish Balay for (i=1; i<n; ) { 941*28e30874SSatish Balay PREFETCH_NTA(&v[8]); 942*28e30874SSatish Balay vi = aj + ai[i]; 943*28e30874SSatish Balay nz = diag[i] - ai[i]; 944*28e30874SSatish Balay idx = 4*(*r++); 945*28e30874SSatish Balay 946*28e30874SSatish Balay /* Demote sum from double to float */ 947*28e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 948*28e30874SSatish Balay LOAD_PS(tmps,XMM7); 949*28e30874SSatish Balay 950*28e30874SSatish Balay while (nz--) { 951*28e30874SSatish Balay PREFETCH_NTA(&v[16]); 952*28e30874SSatish Balay idx = 4*(*vi++); 953*28e30874SSatish Balay 954*28e30874SSatish Balay /* Demote solution (so far) from double to float */ 955*28e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 956*28e30874SSatish Balay 957*28e30874SSatish Balay /* 4x4 Matrix-Vector product with negative accumulation: */ 958*28e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx,v) 959*28e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 960*28e30874SSatish Balay 961*28e30874SSatish Balay /* First Column */ 962*28e30874SSatish Balay SSE_COPY_PS(XMM0,XMM6) 963*28e30874SSatish Balay SSE_SHUFFLE(XMM0,XMM0,0x00) 964*28e30874SSatish Balay SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 965*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM0) 966*28e30874SSatish Balay 967*28e30874SSatish Balay /* Second Column */ 968*28e30874SSatish Balay SSE_COPY_PS(XMM1,XMM6) 969*28e30874SSatish Balay SSE_SHUFFLE(XMM1,XMM1,0x55) 970*28e30874SSatish Balay SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 971*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM1) 972*28e30874SSatish Balay 973*28e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 974*28e30874SSatish Balay 975*28e30874SSatish Balay /* Third Column */ 976*28e30874SSatish Balay SSE_COPY_PS(XMM2,XMM6) 977*28e30874SSatish Balay SSE_SHUFFLE(XMM2,XMM2,0xAA) 978*28e30874SSatish Balay SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 979*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM2) 980*28e30874SSatish Balay 981*28e30874SSatish Balay /* Fourth Column */ 982*28e30874SSatish Balay SSE_COPY_PS(XMM3,XMM6) 983*28e30874SSatish Balay SSE_SHUFFLE(XMM3,XMM3,0xFF) 984*28e30874SSatish Balay SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 985*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM3) 986*28e30874SSatish Balay SSE_INLINE_END_2 987*28e30874SSatish Balay 988*28e30874SSatish Balay v += 16; 989*28e30874SSatish Balay } 990*28e30874SSatish Balay idx = 4*i; 991*28e30874SSatish Balay v = aa + 16*ai[++i]; 992*28e30874SSatish Balay PREFETCH_NTA(v); 993*28e30874SSatish Balay STORE_PS(tmps,XMM7); 994*28e30874SSatish Balay 995*28e30874SSatish Balay /* Promote result from float to double */ 996*28e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 997*28e30874SSatish Balay } 998*28e30874SSatish Balay /* backward solve the upper triangular */ 999*28e30874SSatish Balay idt = 4*(n-1); 1000*28e30874SSatish Balay ai16 = 16*diag[n-1]; 1001*28e30874SSatish Balay v = aa + ai16 + 16; 1002*28e30874SSatish Balay for (i=n-1; i>=0; ) { 1003*28e30874SSatish Balay PREFETCH_NTA(&v[8]); 1004*28e30874SSatish Balay vi = aj + diag[i] + 1; 1005*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 1006*28e30874SSatish Balay 1007*28e30874SSatish Balay /* Demote accumulator from double to float */ 1008*28e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1009*28e30874SSatish Balay LOAD_PS(tmps,XMM7); 1010*28e30874SSatish Balay 1011*28e30874SSatish Balay while (nz--) { 1012*28e30874SSatish Balay PREFETCH_NTA(&v[16]); 1013*28e30874SSatish Balay idx = 4*(*vi++); 1014*28e30874SSatish Balay 1015*28e30874SSatish Balay /* Demote solution (so far) from double to float */ 1016*28e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1017*28e30874SSatish Balay 1018*28e30874SSatish Balay /* 4x4 Matrix-Vector Product with negative accumulation: */ 1019*28e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx,v) 1020*28e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1021*28e30874SSatish Balay 1022*28e30874SSatish Balay /* First Column */ 1023*28e30874SSatish Balay SSE_COPY_PS(XMM0,XMM6) 1024*28e30874SSatish Balay SSE_SHUFFLE(XMM0,XMM0,0x00) 1025*28e30874SSatish Balay SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1026*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM0) 1027*28e30874SSatish Balay 1028*28e30874SSatish Balay /* Second Column */ 1029*28e30874SSatish Balay SSE_COPY_PS(XMM1,XMM6) 1030*28e30874SSatish Balay SSE_SHUFFLE(XMM1,XMM1,0x55) 1031*28e30874SSatish Balay SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1032*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM1) 1033*28e30874SSatish Balay 1034*28e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1035*28e30874SSatish Balay 1036*28e30874SSatish Balay /* Third Column */ 1037*28e30874SSatish Balay SSE_COPY_PS(XMM2,XMM6) 1038*28e30874SSatish Balay SSE_SHUFFLE(XMM2,XMM2,0xAA) 1039*28e30874SSatish Balay SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1040*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM2) 1041*28e30874SSatish Balay 1042*28e30874SSatish Balay /* Fourth Column */ 1043*28e30874SSatish Balay SSE_COPY_PS(XMM3,XMM6) 1044*28e30874SSatish Balay SSE_SHUFFLE(XMM3,XMM3,0xFF) 1045*28e30874SSatish Balay SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1046*28e30874SSatish Balay SSE_SUB_PS(XMM7,XMM3) 1047*28e30874SSatish Balay SSE_INLINE_END_2 1048*28e30874SSatish Balay v += 16; 1049*28e30874SSatish Balay } 1050*28e30874SSatish Balay v = aa + ai16; 1051*28e30874SSatish Balay ai16 = 16*diag[--i]; 1052*28e30874SSatish Balay PREFETCH_NTA(aa+ai16+16); 1053*28e30874SSatish Balay /* 1054*28e30874SSatish Balay Scale the result by the diagonal 4x4 block, 1055*28e30874SSatish Balay which was inverted as part of the factorization 1056*28e30874SSatish Balay */ 1057*28e30874SSatish Balay SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 1058*28e30874SSatish Balay /* First Column */ 1059*28e30874SSatish Balay SSE_COPY_PS(XMM0,XMM7) 1060*28e30874SSatish Balay SSE_SHUFFLE(XMM0,XMM0,0x00) 1061*28e30874SSatish Balay SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 1062*28e30874SSatish Balay 1063*28e30874SSatish Balay /* Second Column */ 1064*28e30874SSatish Balay SSE_COPY_PS(XMM1,XMM7) 1065*28e30874SSatish Balay SSE_SHUFFLE(XMM1,XMM1,0x55) 1066*28e30874SSatish Balay SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 1067*28e30874SSatish Balay SSE_ADD_PS(XMM0,XMM1) 1068*28e30874SSatish Balay 1069*28e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 1070*28e30874SSatish Balay 1071*28e30874SSatish Balay /* Third Column */ 1072*28e30874SSatish Balay SSE_COPY_PS(XMM2,XMM7) 1073*28e30874SSatish Balay SSE_SHUFFLE(XMM2,XMM2,0xAA) 1074*28e30874SSatish Balay SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 1075*28e30874SSatish Balay SSE_ADD_PS(XMM0,XMM2) 1076*28e30874SSatish Balay 1077*28e30874SSatish Balay /* Fourth Column */ 1078*28e30874SSatish Balay SSE_COPY_PS(XMM3,XMM7) 1079*28e30874SSatish Balay SSE_SHUFFLE(XMM3,XMM3,0xFF) 1080*28e30874SSatish Balay SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 1081*28e30874SSatish Balay SSE_ADD_PS(XMM0,XMM3) 1082*28e30874SSatish Balay 1083*28e30874SSatish Balay SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 1084*28e30874SSatish Balay SSE_INLINE_END_3 1085*28e30874SSatish Balay 1086*28e30874SSatish Balay /* Promote solution from float to double */ 1087*28e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 1088*28e30874SSatish Balay 1089*28e30874SSatish Balay /* Apply reordering to t and stream into x. */ 1090*28e30874SSatish Balay /* This way, x doesn't pollute the cache. */ 1091*28e30874SSatish Balay /* Be careful with size: 2 doubles = 4 floats! */ 1092*28e30874SSatish Balay idc = 4*(*c--); 1093*28e30874SSatish Balay SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 1094*28e30874SSatish Balay /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1095*28e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 1096*28e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 1097*28e30874SSatish Balay /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1098*28e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 1099*28e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 1100*28e30874SSatish Balay SSE_INLINE_END_2 1101*28e30874SSatish Balay v = aa + ai16 + 16; 1102*28e30874SSatish Balay idt -= 4; 1103*28e30874SSatish Balay } 1104*28e30874SSatish Balay 1105*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1106*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1107*28e30874SSatish Balay ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1108*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1109*28e30874SSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1110*28e30874SSatish Balay SSE_SCOPE_END; 1111*28e30874SSatish Balay PetscFunctionReturn(0); 1112*28e30874SSatish Balay } 1113*28e30874SSatish Balay 1114*28e30874SSatish Balay #endif 1115*28e30874SSatish Balay 1116*28e30874SSatish Balay #undef __FUNCT__ 1117*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 1118*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1119*28e30874SSatish Balay { 1120*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1121*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 1122*28e30874SSatish Balay PetscErrorCode ierr; 1123*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1124*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 1125*28e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1126*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 1127*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1128*28e30874SSatish Balay const PetscScalar *b; 1129*28e30874SSatish Balay 1130*28e30874SSatish Balay PetscFunctionBegin; 1131*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1132*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133*28e30874SSatish Balay t = a->solve_work; 1134*28e30874SSatish Balay 1135*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1136*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1137*28e30874SSatish Balay 1138*28e30874SSatish Balay /* forward solve the lower triangular */ 1139*28e30874SSatish Balay idx = 3*(*r++); 1140*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1141*28e30874SSatish Balay for (i=1; i<n; i++) { 1142*28e30874SSatish Balay v = aa + 9*ai[i]; 1143*28e30874SSatish Balay vi = aj + ai[i]; 1144*28e30874SSatish Balay nz = diag[i] - ai[i]; 1145*28e30874SSatish Balay idx = 3*(*r++); 1146*28e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1147*28e30874SSatish Balay while (nz--) { 1148*28e30874SSatish Balay idx = 3*(*vi++); 1149*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1150*28e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1151*28e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1152*28e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1153*28e30874SSatish Balay v += 9; 1154*28e30874SSatish Balay } 1155*28e30874SSatish Balay idx = 3*i; 1156*28e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1157*28e30874SSatish Balay } 1158*28e30874SSatish Balay /* backward solve the upper triangular */ 1159*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 1160*28e30874SSatish Balay v = aa + 9*diag[i] + 9; 1161*28e30874SSatish Balay vi = aj + diag[i] + 1; 1162*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 1163*28e30874SSatish Balay idt = 3*i; 1164*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1165*28e30874SSatish Balay while (nz--) { 1166*28e30874SSatish Balay idx = 3*(*vi++); 1167*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1168*28e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1169*28e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1170*28e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1171*28e30874SSatish Balay v += 9; 1172*28e30874SSatish Balay } 1173*28e30874SSatish Balay idc = 3*(*c--); 1174*28e30874SSatish Balay v = aa + 9*diag[i]; 1175*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1176*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1177*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1178*28e30874SSatish Balay } 1179*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1180*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1181*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1182*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1183*28e30874SSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1184*28e30874SSatish Balay PetscFunctionReturn(0); 1185*28e30874SSatish Balay } 1186*28e30874SSatish Balay 1187*28e30874SSatish Balay #undef __FUNCT__ 1188*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 1189*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1190*28e30874SSatish Balay { 1191*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1192*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 1193*28e30874SSatish Balay PetscErrorCode ierr; 1194*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1195*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc,m; 1196*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 1197*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 1198*28e30874SSatish Balay PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1199*28e30874SSatish Balay const PetscScalar *b; 1200*28e30874SSatish Balay 1201*28e30874SSatish Balay PetscFunctionBegin; 1202*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1203*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1204*28e30874SSatish Balay t = a->solve_work; 1205*28e30874SSatish Balay 1206*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1207*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1208*28e30874SSatish Balay 1209*28e30874SSatish Balay /* forward solve the lower triangular */ 1210*28e30874SSatish Balay idx = 3*r[0]; 1211*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1212*28e30874SSatish Balay for (i=1; i<n; i++) { 1213*28e30874SSatish Balay v = aa + 9*ai[i]; 1214*28e30874SSatish Balay vi = aj + ai[i]; 1215*28e30874SSatish Balay nz = ai[i+1] - ai[i]; 1216*28e30874SSatish Balay idx = 3*r[i]; 1217*28e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1218*28e30874SSatish Balay for (m=0; m<nz; m++) { 1219*28e30874SSatish Balay idx = 3*vi[m]; 1220*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1221*28e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1222*28e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1223*28e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1224*28e30874SSatish Balay v += 9; 1225*28e30874SSatish Balay } 1226*28e30874SSatish Balay idx = 3*i; 1227*28e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1228*28e30874SSatish Balay } 1229*28e30874SSatish Balay /* backward solve the upper triangular */ 1230*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 1231*28e30874SSatish Balay v = aa + 9*(adiag[i+1]+1); 1232*28e30874SSatish Balay vi = aj + adiag[i+1]+1; 1233*28e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 1234*28e30874SSatish Balay idt = 3*i; 1235*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1236*28e30874SSatish Balay for (m=0; m<nz; m++) { 1237*28e30874SSatish Balay idx = 3*vi[m]; 1238*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1239*28e30874SSatish Balay s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1240*28e30874SSatish Balay s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1241*28e30874SSatish Balay s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1242*28e30874SSatish Balay v += 9; 1243*28e30874SSatish Balay } 1244*28e30874SSatish Balay idc = 3*c[i]; 1245*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1246*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1247*28e30874SSatish Balay x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1248*28e30874SSatish Balay } 1249*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1250*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1251*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1252*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1253*28e30874SSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1254*28e30874SSatish Balay PetscFunctionReturn(0); 1255*28e30874SSatish Balay } 1256*28e30874SSatish Balay 1257*28e30874SSatish Balay #undef __FUNCT__ 1258*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 1259*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1260*28e30874SSatish Balay { 1261*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1262*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 1263*28e30874SSatish Balay PetscErrorCode ierr; 1264*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1265*28e30874SSatish Balay PetscInt i,nz,idx,idt,idc; 1266*28e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1267*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 1268*28e30874SSatish Balay PetscScalar *x,s1,s2,x1,x2,*t; 1269*28e30874SSatish Balay const PetscScalar *b; 1270*28e30874SSatish Balay 1271*28e30874SSatish Balay PetscFunctionBegin; 1272*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1273*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1274*28e30874SSatish Balay t = a->solve_work; 1275*28e30874SSatish Balay 1276*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1277*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1278*28e30874SSatish Balay 1279*28e30874SSatish Balay /* forward solve the lower triangular */ 1280*28e30874SSatish Balay idx = 2*(*r++); 1281*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 1282*28e30874SSatish Balay for (i=1; i<n; i++) { 1283*28e30874SSatish Balay v = aa + 4*ai[i]; 1284*28e30874SSatish Balay vi = aj + ai[i]; 1285*28e30874SSatish Balay nz = diag[i] - ai[i]; 1286*28e30874SSatish Balay idx = 2*(*r++); 1287*28e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; 1288*28e30874SSatish Balay while (nz--) { 1289*28e30874SSatish Balay idx = 2*(*vi++); 1290*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 1291*28e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 1292*28e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 1293*28e30874SSatish Balay v += 4; 1294*28e30874SSatish Balay } 1295*28e30874SSatish Balay idx = 2*i; 1296*28e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; 1297*28e30874SSatish Balay } 1298*28e30874SSatish Balay /* backward solve the upper triangular */ 1299*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 1300*28e30874SSatish Balay v = aa + 4*diag[i] + 4; 1301*28e30874SSatish Balay vi = aj + diag[i] + 1; 1302*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 1303*28e30874SSatish Balay idt = 2*i; 1304*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 1305*28e30874SSatish Balay while (nz--) { 1306*28e30874SSatish Balay idx = 2*(*vi++); 1307*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 1308*28e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 1309*28e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 1310*28e30874SSatish Balay v += 4; 1311*28e30874SSatish Balay } 1312*28e30874SSatish Balay idc = 2*(*c--); 1313*28e30874SSatish Balay v = aa + 4*diag[i]; 1314*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1315*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1316*28e30874SSatish Balay } 1317*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1318*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1319*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1320*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1321*28e30874SSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1322*28e30874SSatish Balay PetscFunctionReturn(0); 1323*28e30874SSatish Balay } 1324*28e30874SSatish Balay 1325*28e30874SSatish Balay #undef __FUNCT__ 1326*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 1327*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1328*28e30874SSatish Balay { 1329*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1330*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 1331*28e30874SSatish Balay PetscErrorCode ierr; 1332*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1333*28e30874SSatish Balay PetscInt i,nz,idx,jdx,idt,idc,m; 1334*28e30874SSatish Balay const PetscInt *r,*c,*rout,*cout; 1335*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 1336*28e30874SSatish Balay PetscScalar *x,s1,s2,x1,x2,*t; 1337*28e30874SSatish Balay const PetscScalar *b; 1338*28e30874SSatish Balay 1339*28e30874SSatish Balay PetscFunctionBegin; 1340*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1341*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1342*28e30874SSatish Balay t = a->solve_work; 1343*28e30874SSatish Balay 1344*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1345*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1346*28e30874SSatish Balay 1347*28e30874SSatish Balay /* forward solve the lower triangular */ 1348*28e30874SSatish Balay idx = 2*r[0]; 1349*28e30874SSatish Balay t[0] = b[idx]; t[1] = b[1+idx]; 1350*28e30874SSatish Balay for (i=1; i<n; i++) { 1351*28e30874SSatish Balay v = aa + 4*ai[i]; 1352*28e30874SSatish Balay vi = aj + ai[i]; 1353*28e30874SSatish Balay nz = ai[i+1] - ai[i]; 1354*28e30874SSatish Balay idx = 2*r[i]; 1355*28e30874SSatish Balay s1 = b[idx]; s2 = b[1+idx]; 1356*28e30874SSatish Balay for (m=0; m<nz; m++) { 1357*28e30874SSatish Balay jdx = 2*vi[m]; 1358*28e30874SSatish Balay x1 = t[jdx]; x2 = t[1+jdx]; 1359*28e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 1360*28e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 1361*28e30874SSatish Balay v += 4; 1362*28e30874SSatish Balay } 1363*28e30874SSatish Balay idx = 2*i; 1364*28e30874SSatish Balay t[idx] = s1; t[1+idx] = s2; 1365*28e30874SSatish Balay } 1366*28e30874SSatish Balay /* backward solve the upper triangular */ 1367*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 1368*28e30874SSatish Balay v = aa + 4*(adiag[i+1]+1); 1369*28e30874SSatish Balay vi = aj + adiag[i+1]+1; 1370*28e30874SSatish Balay nz = adiag[i] - adiag[i+1] - 1; 1371*28e30874SSatish Balay idt = 2*i; 1372*28e30874SSatish Balay s1 = t[idt]; s2 = t[1+idt]; 1373*28e30874SSatish Balay for (m=0; m<nz; m++) { 1374*28e30874SSatish Balay idx = 2*vi[m]; 1375*28e30874SSatish Balay x1 = t[idx]; x2 = t[1+idx]; 1376*28e30874SSatish Balay s1 -= v[0]*x1 + v[2]*x2; 1377*28e30874SSatish Balay s2 -= v[1]*x1 + v[3]*x2; 1378*28e30874SSatish Balay v += 4; 1379*28e30874SSatish Balay } 1380*28e30874SSatish Balay idc = 2*c[i]; 1381*28e30874SSatish Balay x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1382*28e30874SSatish Balay x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1383*28e30874SSatish Balay } 1384*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1385*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1386*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1387*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1388*28e30874SSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1389*28e30874SSatish Balay PetscFunctionReturn(0); 1390*28e30874SSatish Balay } 1391*28e30874SSatish Balay 1392*28e30874SSatish Balay #undef __FUNCT__ 1393*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 1394*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1395*28e30874SSatish Balay { 1396*28e30874SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1397*28e30874SSatish Balay IS iscol=a->col,isrow=a->row; 1398*28e30874SSatish Balay PetscErrorCode ierr; 1399*28e30874SSatish Balay const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1400*28e30874SSatish Balay PetscInt i,nz; 1401*28e30874SSatish Balay const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1402*28e30874SSatish Balay const MatScalar *aa=a->a,*v; 1403*28e30874SSatish Balay PetscScalar *x,s1,*t; 1404*28e30874SSatish Balay const PetscScalar *b; 1405*28e30874SSatish Balay 1406*28e30874SSatish Balay PetscFunctionBegin; 1407*28e30874SSatish Balay if (!n) PetscFunctionReturn(0); 1408*28e30874SSatish Balay 1409*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1410*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1411*28e30874SSatish Balay t = a->solve_work; 1412*28e30874SSatish Balay 1413*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1414*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1415*28e30874SSatish Balay 1416*28e30874SSatish Balay /* forward solve the lower triangular */ 1417*28e30874SSatish Balay t[0] = b[*r++]; 1418*28e30874SSatish Balay for (i=1; i<n; i++) { 1419*28e30874SSatish Balay v = aa + ai[i]; 1420*28e30874SSatish Balay vi = aj + ai[i]; 1421*28e30874SSatish Balay nz = diag[i] - ai[i]; 1422*28e30874SSatish Balay s1 = b[*r++]; 1423*28e30874SSatish Balay while (nz--) { 1424*28e30874SSatish Balay s1 -= (*v++)*t[*vi++]; 1425*28e30874SSatish Balay } 1426*28e30874SSatish Balay t[i] = s1; 1427*28e30874SSatish Balay } 1428*28e30874SSatish Balay /* backward solve the upper triangular */ 1429*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 1430*28e30874SSatish Balay v = aa + diag[i] + 1; 1431*28e30874SSatish Balay vi = aj + diag[i] + 1; 1432*28e30874SSatish Balay nz = ai[i+1] - diag[i] - 1; 1433*28e30874SSatish Balay s1 = t[i]; 1434*28e30874SSatish Balay while (nz--) { 1435*28e30874SSatish Balay s1 -= (*v++)*t[*vi++]; 1436*28e30874SSatish Balay } 1437*28e30874SSatish Balay x[*c--] = t[i] = aa[diag[i]]*s1; 1438*28e30874SSatish Balay } 1439*28e30874SSatish Balay 1440*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1441*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1442*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1443*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1444*28e30874SSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 1445*28e30874SSatish Balay PetscFunctionReturn(0); 1446*28e30874SSatish Balay } 1447*28e30874SSatish Balay 1448*28e30874SSatish Balay #undef __FUNCT__ 1449*28e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 1450*28e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 1451*28e30874SSatish Balay { 1452*28e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1453*28e30874SSatish Balay IS iscol = a->col,isrow = a->row; 1454*28e30874SSatish Balay PetscErrorCode ierr; 1455*28e30874SSatish Balay PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 1456*28e30874SSatish Balay const PetscInt *rout,*cout,*r,*c; 1457*28e30874SSatish Balay PetscScalar *x,*tmp,sum; 1458*28e30874SSatish Balay const PetscScalar *b; 1459*28e30874SSatish Balay const MatScalar *aa = a->a,*v; 1460*28e30874SSatish Balay 1461*28e30874SSatish Balay PetscFunctionBegin; 1462*28e30874SSatish Balay if (!n) PetscFunctionReturn(0); 1463*28e30874SSatish Balay 1464*28e30874SSatish Balay ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1465*28e30874SSatish Balay ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1466*28e30874SSatish Balay tmp = a->solve_work; 1467*28e30874SSatish Balay 1468*28e30874SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1469*28e30874SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1470*28e30874SSatish Balay 1471*28e30874SSatish Balay /* forward solve the lower triangular */ 1472*28e30874SSatish Balay tmp[0] = b[r[0]]; 1473*28e30874SSatish Balay v = aa; 1474*28e30874SSatish Balay vi = aj; 1475*28e30874SSatish Balay for (i=1; i<n; i++) { 1476*28e30874SSatish Balay nz = ai[i+1] - ai[i]; 1477*28e30874SSatish Balay sum = b[r[i]]; 1478*28e30874SSatish Balay PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1479*28e30874SSatish Balay tmp[i] = sum; 1480*28e30874SSatish Balay v += nz; vi += nz; 1481*28e30874SSatish Balay } 1482*28e30874SSatish Balay 1483*28e30874SSatish Balay /* backward solve the upper triangular */ 1484*28e30874SSatish Balay for (i=n-1; i>=0; i--) { 1485*28e30874SSatish Balay v = aa + adiag[i+1]+1; 1486*28e30874SSatish Balay vi = aj + adiag[i+1]+1; 1487*28e30874SSatish Balay nz = adiag[i]-adiag[i+1]-1; 1488*28e30874SSatish Balay sum = tmp[i]; 1489*28e30874SSatish Balay PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1490*28e30874SSatish Balay x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 1491*28e30874SSatish Balay } 1492*28e30874SSatish Balay 1493*28e30874SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1494*28e30874SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1495*28e30874SSatish Balay ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1496*28e30874SSatish Balay ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1497*28e30874SSatish Balay ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 1498*28e30874SSatish Balay PetscFunctionReturn(0); 1499*28e30874SSatish Balay } 1500*28e30874SSatish Balay 1501