xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ecf371e4f11dc92d760a8a34d4f20e65e993c904)
14e2b4712SSatish Balay #ifdef PETSC_RCS_HEADER
2*ecf371e4SBarry Smith static char vcid[] = "$Id: baijfact2.c,v 1.4 1998/03/12 23:19:14 bsmith Exp bsmith $";
34e2b4712SSatish Balay #endif
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
84e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
94e2b4712SSatish Balay #include "src/vec/vecimpl.h"
104e2b4712SSatish Balay #include "src/inline/ilu.h"
114e2b4712SSatish Balay 
124e2b4712SSatish Balay /* ----------------------------------------------------------- */
134e2b4712SSatish Balay #undef __FUNC__
144e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_N"
154e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
164e2b4712SSatish Balay {
174e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
184e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
194e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
204e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
214e2b4712SSatish Balay   Scalar          *aa=a->a,*sum;
224e2b4712SSatish Balay   register Scalar *x,*b,*lsum,*tmp,*v;
234e2b4712SSatish Balay 
244e2b4712SSatish Balay   PetscFunctionBegin;
254e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
264e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
274e2b4712SSatish Balay   tmp  = a->solve_work;
284e2b4712SSatish Balay 
294e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
304e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
314e2b4712SSatish Balay 
324e2b4712SSatish Balay   /* forward solve the lower triangular */
334e2b4712SSatish Balay   PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar));
344e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
354e2b4712SSatish Balay     v   = aa + bs2*ai[i];
364e2b4712SSatish Balay     vi  = aj + ai[i];
374e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
384e2b4712SSatish Balay     sum = tmp + bs*i;
394e2b4712SSatish Balay     PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));
404e2b4712SSatish Balay     while (nz--) {
414e2b4712SSatish Balay       Kernel_v_gets_v_minus_A_times_w(bs,sum,v,tmp+bs*(*vi++));
424e2b4712SSatish Balay       v += bs2;
434e2b4712SSatish Balay     }
444e2b4712SSatish Balay   }
454e2b4712SSatish Balay   /* backward solve the upper triangular */
464e2b4712SSatish Balay   lsum = a->solve_work + a->n;
474e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
484e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
494e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
504e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
514e2b4712SSatish Balay     PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));
524e2b4712SSatish Balay     while (nz--) {
534e2b4712SSatish Balay       Kernel_v_gets_v_minus_A_times_w(bs,lsum,v,tmp+bs*(*vi++));
544e2b4712SSatish Balay       v += bs2;
554e2b4712SSatish Balay     }
564e2b4712SSatish Balay     Kernel_w_gets_A_times_v(bs,lsum,aa+bs2*a->diag[i],tmp+i*bs);
574e2b4712SSatish Balay     PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));
584e2b4712SSatish Balay   }
594e2b4712SSatish Balay 
604e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
614e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
624e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
634e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
644e2b4712SSatish Balay   PLogFlops(2*(a->bs2)*(a->nz) - a->n);
654e2b4712SSatish Balay   PetscFunctionReturn(0);
664e2b4712SSatish Balay }
674e2b4712SSatish Balay 
684e2b4712SSatish Balay #undef __FUNC__
694e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_7"
704e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
714e2b4712SSatish Balay {
724e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
734e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
744e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
754e2b4712SSatish Balay   int             *diag = a->diag;
764e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,sum5,sum6,sum7,x1,x2,x3,x4,x5,x6,x7;
774e2b4712SSatish Balay   register Scalar *x,*b,*tmp,*v;
784e2b4712SSatish Balay 
794e2b4712SSatish Balay   PetscFunctionBegin;
804e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
814e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
824e2b4712SSatish Balay   tmp  = a->solve_work;
834e2b4712SSatish Balay 
844e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
854e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
864e2b4712SSatish Balay 
874e2b4712SSatish Balay   /* forward solve the lower triangular */
884e2b4712SSatish Balay   idx    = 7*(*r++);
894e2b4712SSatish Balay   tmp[0] = b[idx];   tmp[1] = b[1+idx];
904e2b4712SSatish Balay   tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
914e2b4712SSatish Balay   tmp[5] = b[5+idx]; tmp[6] = b[6+idx];
924e2b4712SSatish Balay 
934e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
944e2b4712SSatish Balay     v     = aa + 49*ai[i];
954e2b4712SSatish Balay     vi    = aj + ai[i];
964e2b4712SSatish Balay     nz    = diag[i] - ai[i];
974e2b4712SSatish Balay     idx   = 7*(*r++);
984e2b4712SSatish Balay     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
994e2b4712SSatish Balay     sum5  = b[4+idx];sum6 = b[5+idx];sum7 = b[6+idx];
1004e2b4712SSatish Balay     while (nz--) {
1014e2b4712SSatish Balay       idx   = 7*(*vi++);
1024e2b4712SSatish Balay       x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
1034e2b4712SSatish Balay       x4    = tmp[3+idx];x5 = tmp[4+idx];
1044e2b4712SSatish Balay       x6    = tmp[5+idx];x7 = tmp[6+idx];
1054e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1064e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1074e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1084e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1094e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1104e2b4712SSatish Balay       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1114e2b4712SSatish Balay       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1124e2b4712SSatish Balay       v += 49;
1134e2b4712SSatish Balay     }
1144e2b4712SSatish Balay     idx = 7*i;
1154e2b4712SSatish Balay     tmp[idx]   = sum1;tmp[1+idx] = sum2;
1164e2b4712SSatish Balay     tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
1174e2b4712SSatish Balay     tmp[5+idx] = sum6;tmp[6+idx] = sum7;
1184e2b4712SSatish Balay   }
1194e2b4712SSatish Balay   /* backward solve the upper triangular */
1204e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
1214e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
1224e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
1234e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
1244e2b4712SSatish Balay     idt  = 7*i;
1254e2b4712SSatish Balay     sum1 = tmp[idt];  sum2 = tmp[1+idt];
1264e2b4712SSatish Balay     sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
1274e2b4712SSatish Balay     sum6 = tmp[5+idt];sum7 = tmp[6+idt];
1284e2b4712SSatish Balay     while (nz--) {
1294e2b4712SSatish Balay       idx   = 7*(*vi++);
1304e2b4712SSatish Balay       x1    = tmp[idx];   x2 = tmp[1+idx];
1314e2b4712SSatish Balay       x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
1324e2b4712SSatish Balay       x6    = tmp[5+idx]; x7 = tmp[6+idx];
1334e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1344e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1354e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1364e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1374e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1384e2b4712SSatish Balay       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1394e2b4712SSatish Balay       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1404e2b4712SSatish Balay       v += 49;
1414e2b4712SSatish Balay     }
1424e2b4712SSatish Balay     idc = 7*(*c--);
1434e2b4712SSatish Balay     v   = aa + 49*diag[i];
1444e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1+v[7]*sum2+v[14]*sum3+
1454e2b4712SSatish Balay                                  v[21]*sum4+v[28]*sum5+v[35]*sum6+v[42]*sum7;
1464e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[8]*sum2+v[15]*sum3+
1474e2b4712SSatish Balay                                  v[22]*sum4+v[29]*sum5+v[36]*sum6+v[43]*sum7;
1484e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[9]*sum2+v[16]*sum3+
1494e2b4712SSatish Balay                                  v[23]*sum4+v[30]*sum5+v[37]*sum6+v[44]*sum7;
1504e2b4712SSatish Balay     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[10]*sum2+v[17]*sum3+
1514e2b4712SSatish Balay                                  v[24]*sum4+v[31]*sum5+v[38]*sum6+v[45]*sum7;
1524e2b4712SSatish Balay     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[11]*sum2+v[18]*sum3+
1534e2b4712SSatish Balay                                  v[25]*sum4+v[32]*sum5+v[39]*sum6+v[46]*sum7;
1544e2b4712SSatish Balay     x[5+idc] = tmp[5+idt] = v[5]*sum1+v[12]*sum2+v[19]*sum3+
1554e2b4712SSatish Balay                                  v[26]*sum4+v[33]*sum5+v[40]*sum6+v[47]*sum7;
1564e2b4712SSatish Balay     x[6+idc] = tmp[6+idt] = v[6]*sum1+v[13]*sum2+v[20]*sum3+
1574e2b4712SSatish Balay                                  v[27]*sum4+v[34]*sum5+v[41]*sum6+v[48]*sum7;
1584e2b4712SSatish Balay   }
1594e2b4712SSatish Balay 
1604e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
1614e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
1624e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
1634e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
1644e2b4712SSatish Balay   PLogFlops(2*49*(a->nz) - a->n);
1654e2b4712SSatish Balay   PetscFunctionReturn(0);
1664e2b4712SSatish Balay }
1674e2b4712SSatish Balay 
1684e2b4712SSatish Balay #undef __FUNC__
1694e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_5"
1704e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1714e2b4712SSatish Balay {
1724e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1734e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
1744e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1754e2b4712SSatish Balay   int             *diag = a->diag;
1764e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1774e2b4712SSatish Balay   register Scalar *x,*b,*tmp,*v;
1784e2b4712SSatish Balay 
1794e2b4712SSatish Balay   PetscFunctionBegin;
1804e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
1814e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
1824e2b4712SSatish Balay   tmp  = a->solve_work;
1834e2b4712SSatish Balay 
1844e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1854e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1864e2b4712SSatish Balay 
1874e2b4712SSatish Balay   /* forward solve the lower triangular */
1884e2b4712SSatish Balay   idx    = 5*(*r++);
1894e2b4712SSatish Balay   tmp[0] = b[idx];   tmp[1] = b[1+idx];
1904e2b4712SSatish Balay   tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
1914e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
1924e2b4712SSatish Balay     v     = aa + 25*ai[i];
1934e2b4712SSatish Balay     vi    = aj + ai[i];
1944e2b4712SSatish Balay     nz    = diag[i] - ai[i];
1954e2b4712SSatish Balay     idx   = 5*(*r++);
1964e2b4712SSatish Balay     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
1974e2b4712SSatish Balay     sum5  = b[4+idx];
1984e2b4712SSatish Balay     while (nz--) {
1994e2b4712SSatish Balay       idx   = 5*(*vi++);
2004e2b4712SSatish Balay       x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
2014e2b4712SSatish Balay       x4    = tmp[3+idx];x5 = tmp[4+idx];
2024e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2034e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2044e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2054e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2064e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2074e2b4712SSatish Balay       v += 25;
2084e2b4712SSatish Balay     }
2094e2b4712SSatish Balay     idx = 5*i;
2104e2b4712SSatish Balay     tmp[idx]   = sum1;tmp[1+idx] = sum2;
2114e2b4712SSatish Balay     tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
2124e2b4712SSatish Balay   }
2134e2b4712SSatish Balay   /* backward solve the upper triangular */
2144e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
2154e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
2164e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
2174e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2184e2b4712SSatish Balay     idt  = 5*i;
2194e2b4712SSatish Balay     sum1 = tmp[idt];  sum2 = tmp[1+idt];
2204e2b4712SSatish Balay     sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
2214e2b4712SSatish Balay     while (nz--) {
2224e2b4712SSatish Balay       idx   = 5*(*vi++);
2234e2b4712SSatish Balay       x1    = tmp[idx];   x2 = tmp[1+idx];
2244e2b4712SSatish Balay       x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
2254e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2264e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2274e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2284e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2294e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2304e2b4712SSatish Balay       v += 25;
2314e2b4712SSatish Balay     }
2324e2b4712SSatish Balay     idc = 5*(*c--);
2334e2b4712SSatish Balay     v   = aa + 25*diag[i];
2344e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
2354e2b4712SSatish Balay                                  v[15]*sum4+v[20]*sum5;
2364e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
2374e2b4712SSatish Balay                                  v[16]*sum4+v[21]*sum5;
2384e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
2394e2b4712SSatish Balay                                  v[17]*sum4+v[22]*sum5;
2404e2b4712SSatish Balay     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
2414e2b4712SSatish Balay                                  v[18]*sum4+v[23]*sum5;
2424e2b4712SSatish Balay     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
2434e2b4712SSatish Balay                                  v[19]*sum4+v[24]*sum5;
2444e2b4712SSatish Balay   }
2454e2b4712SSatish Balay 
2464e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
2474e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
2484e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
2494e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
2504e2b4712SSatish Balay   PLogFlops(2*25*(a->nz) - a->n);
2514e2b4712SSatish Balay   PetscFunctionReturn(0);
2524e2b4712SSatish Balay }
2534e2b4712SSatish Balay 
2544e2b4712SSatish Balay #undef __FUNC__
2554e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_4"
2564e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2574e2b4712SSatish Balay {
2584e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2594e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
2604e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2614e2b4712SSatish Balay   int             *diag = a->diag;
2624e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
2634e2b4712SSatish Balay   register Scalar *x,*b,*tmp,*v;
2644e2b4712SSatish Balay 
2654e2b4712SSatish Balay   PetscFunctionBegin;
2664e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
2674e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
2684e2b4712SSatish Balay   tmp  = a->solve_work;
2694e2b4712SSatish Balay 
2704e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2714e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2724e2b4712SSatish Balay 
2734e2b4712SSatish Balay   /* forward solve the lower triangular */
2744e2b4712SSatish Balay   idx    = 4*(*r++);
2754e2b4712SSatish Balay   tmp[0] = b[idx];   tmp[1] = b[1+idx];
2764e2b4712SSatish Balay   tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
2774e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
2784e2b4712SSatish Balay     v     = aa + 16*ai[i];
2794e2b4712SSatish Balay     vi    = aj + ai[i];
2804e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2814e2b4712SSatish Balay     idx   = 4*(*r++);
2824e2b4712SSatish Balay     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
2834e2b4712SSatish Balay     while (nz--) {
2844e2b4712SSatish Balay       idx   = 4*(*vi++);
2854e2b4712SSatish Balay       x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
2864e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2874e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2884e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2894e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2904e2b4712SSatish Balay       v    += 16;
2914e2b4712SSatish Balay     }
2924e2b4712SSatish Balay     idx        = 4*i;
2934e2b4712SSatish Balay     tmp[idx]   = sum1;tmp[1+idx] = sum2;
2944e2b4712SSatish Balay     tmp[2+idx] = sum3;tmp[3+idx] = sum4;
2954e2b4712SSatish Balay   }
2964e2b4712SSatish Balay   /* backward solve the upper triangular */
2974e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
2984e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
2994e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
3004e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3014e2b4712SSatish Balay     idt  = 4*i;
3024e2b4712SSatish Balay     sum1 = tmp[idt];  sum2 = tmp[1+idt];
3034e2b4712SSatish Balay     sum3 = tmp[2+idt];sum4 = tmp[3+idt];
3044e2b4712SSatish Balay     while (nz--) {
3054e2b4712SSatish Balay       idx   = 4*(*vi++);
3064e2b4712SSatish Balay       x1    = tmp[idx];   x2 = tmp[1+idx];
3074e2b4712SSatish Balay       x3    = tmp[2+idx]; x4 = tmp[3+idx];
3084e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3094e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3104e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3114e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3124e2b4712SSatish Balay       v += 16;
3134e2b4712SSatish Balay     }
3144e2b4712SSatish Balay     idc      = 4*(*c--);
3154e2b4712SSatish Balay     v        = aa + 16*diag[i];
3164e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
3174e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
3184e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
3194e2b4712SSatish Balay     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
3204e2b4712SSatish Balay   }
3214e2b4712SSatish Balay 
3224e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
3234e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
3244e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
3254e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
3264e2b4712SSatish Balay   PLogFlops(2*16*(a->nz) - a->n);
3274e2b4712SSatish Balay   PetscFunctionReturn(0);
3284e2b4712SSatish Balay }
3294e2b4712SSatish Balay 
3304e2b4712SSatish Balay /*
3314e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
3324e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
3334e2b4712SSatish Balay */
3344e2b4712SSatish Balay #undef __FUNC__
3354e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3364e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3374e2b4712SSatish Balay {
3384e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
3394e2b4712SSatish Balay   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3404e2b4712SSatish Balay   int             *diag = a->diag,jdx;
3414e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
3424e2b4712SSatish Balay   register Scalar *x,*b,*v;
3434e2b4712SSatish Balay 
3444e2b4712SSatish Balay   PetscFunctionBegin;
3454e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
3464e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
3474e2b4712SSatish Balay 
3484e2b4712SSatish Balay   /* forward solve the lower triangular */
3494e2b4712SSatish Balay   idx    = 0;
3504e2b4712SSatish Balay   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];
3514e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
3524e2b4712SSatish Balay     v     =  aa + 16*ai[i];
3534e2b4712SSatish Balay     vi    =  aj + ai[i];
3544e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3554e2b4712SSatish Balay     idx   =  4*i;
3564e2b4712SSatish Balay     sum1  =  b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
3574e2b4712SSatish Balay     while (nz--) {
3584e2b4712SSatish Balay       jdx   = 4*(*vi++);
3594e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3604e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3614e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3624e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3634e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3644e2b4712SSatish Balay       v    += 16;
3654e2b4712SSatish Balay     }
3664e2b4712SSatish Balay     x[idx]   = sum1;
3674e2b4712SSatish Balay     x[1+idx] = sum2;
3684e2b4712SSatish Balay     x[2+idx] = sum3;
3694e2b4712SSatish Balay     x[3+idx] = sum4;
3704e2b4712SSatish Balay   }
3714e2b4712SSatish Balay   /* backward solve the upper triangular */
3724e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
3734e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
3744e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
3754e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3764e2b4712SSatish Balay     idt  = 4*i;
3774e2b4712SSatish Balay     sum1 = x[idt];  sum2 = x[1+idt];
3784e2b4712SSatish Balay     sum3 = x[2+idt];sum4 = x[3+idt];
3794e2b4712SSatish Balay     while (nz--) {
3804e2b4712SSatish Balay       idx   = 4*(*vi++);
3814e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3824e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3834e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3844e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3854e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3864e2b4712SSatish Balay       v += 16;
3874e2b4712SSatish Balay     }
3884e2b4712SSatish Balay     v        = aa + 16*diag[i];
3894e2b4712SSatish Balay     x[idt]   = v[0]*sum1 + v[4]*sum2 + v[8]*sum3  + v[12]*sum4;
3904e2b4712SSatish Balay     x[1+idt] = v[1]*sum1 + v[5]*sum2 + v[9]*sum3  + v[13]*sum4;
3914e2b4712SSatish Balay     x[2+idt] = v[2]*sum1 + v[6]*sum2 + v[10]*sum3 + v[14]*sum4;
3924e2b4712SSatish Balay     x[3+idt] = v[3]*sum1 + v[7]*sum2 + v[11]*sum3 + v[15]*sum4;
3934e2b4712SSatish Balay   }
3944e2b4712SSatish Balay 
3954e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
3964e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
3974e2b4712SSatish Balay   PLogFlops(2*16*(a->nz) - a->n);
3984e2b4712SSatish Balay   PetscFunctionReturn(0);
3994e2b4712SSatish Balay }
4004e2b4712SSatish Balay 
4014e2b4712SSatish Balay 
4024e2b4712SSatish Balay #undef __FUNC__
4034e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_3"
4044e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4054e2b4712SSatish Balay {
4064e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
4074e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
4084e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
4094e2b4712SSatish Balay   int             *diag = a->diag;
4104e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,x1,x2,x3;
4114e2b4712SSatish Balay   register Scalar *x,*b,*tmp,*v;
4124e2b4712SSatish Balay 
4134e2b4712SSatish Balay   PetscFunctionBegin;
4144e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
4154e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
4164e2b4712SSatish Balay   tmp  = a->solve_work;
4174e2b4712SSatish Balay 
4184e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4194e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4204e2b4712SSatish Balay 
4214e2b4712SSatish Balay   /* forward solve the lower triangular */
4224e2b4712SSatish Balay   idx    = 3*(*r++);
4234e2b4712SSatish Balay   tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
4244e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
4254e2b4712SSatish Balay     v     = aa + 9*ai[i];
4264e2b4712SSatish Balay     vi    = aj + ai[i];
4274e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4284e2b4712SSatish Balay     idx   = 3*(*r++);
4294e2b4712SSatish Balay     sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
4304e2b4712SSatish Balay     while (nz--) {
4314e2b4712SSatish Balay       idx   = 3*(*vi++);
4324e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
4334e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4344e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4354e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4364e2b4712SSatish Balay       v += 9;
4374e2b4712SSatish Balay     }
4384e2b4712SSatish Balay     idx = 3*i;
4394e2b4712SSatish Balay     tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
4404e2b4712SSatish Balay   }
4414e2b4712SSatish Balay   /* backward solve the upper triangular */
4424e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
4434e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
4444e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
4454e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4464e2b4712SSatish Balay     idt  = 3*i;
4474e2b4712SSatish Balay     sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
4484e2b4712SSatish Balay     while (nz--) {
4494e2b4712SSatish Balay       idx   = 3*(*vi++);
4504e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
4514e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4524e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4534e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4544e2b4712SSatish Balay       v += 9;
4554e2b4712SSatish Balay     }
4564e2b4712SSatish Balay     idc = 3*(*c--);
4574e2b4712SSatish Balay     v   = aa + 9*diag[i];
4584e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
4594e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
4604e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
4614e2b4712SSatish Balay   }
4624e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
4634e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
4644e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
4654e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
4664e2b4712SSatish Balay   PLogFlops(2*9*(a->nz) - a->n);
4674e2b4712SSatish Balay   PetscFunctionReturn(0);
4684e2b4712SSatish Balay }
4694e2b4712SSatish Balay 
4704e2b4712SSatish Balay #undef __FUNC__
4714e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_2"
4724e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4734e2b4712SSatish Balay {
4744e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
4754e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
4764e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
4774e2b4712SSatish Balay   int             *diag = a->diag;
4784e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,x1,x2;
4794e2b4712SSatish Balay   register Scalar *x,*b,*tmp,*v;
4804e2b4712SSatish Balay 
4814e2b4712SSatish Balay   PetscFunctionBegin;
4824e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
4834e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
4844e2b4712SSatish Balay   tmp  = a->solve_work;
4854e2b4712SSatish Balay 
4864e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4874e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4884e2b4712SSatish Balay 
4894e2b4712SSatish Balay   /* forward solve the lower triangular */
4904e2b4712SSatish Balay   idx    = 2*(*r++);
4914e2b4712SSatish Balay   tmp[0] = b[idx]; tmp[1] = b[1+idx];
4924e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
4934e2b4712SSatish Balay     v     = aa + 4*ai[i];
4944e2b4712SSatish Balay     vi    = aj + ai[i];
4954e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4964e2b4712SSatish Balay     idx   = 2*(*r++);
4974e2b4712SSatish Balay     sum1  = b[idx]; sum2 = b[1+idx];
4984e2b4712SSatish Balay     while (nz--) {
4994e2b4712SSatish Balay       idx   = 2*(*vi++);
5004e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx];
5014e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[2]*x2;
5024e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[3]*x2;
5034e2b4712SSatish Balay       v += 4;
5044e2b4712SSatish Balay     }
5054e2b4712SSatish Balay     idx = 2*i;
5064e2b4712SSatish Balay     tmp[idx] = sum1; tmp[1+idx] = sum2;
5074e2b4712SSatish Balay   }
5084e2b4712SSatish Balay   /* backward solve the upper triangular */
5094e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
5104e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
5114e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
5124e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5134e2b4712SSatish Balay     idt  = 2*i;
5144e2b4712SSatish Balay     sum1 = tmp[idt]; sum2 = tmp[1+idt];
5154e2b4712SSatish Balay     while (nz--) {
5164e2b4712SSatish Balay       idx   = 2*(*vi++);
5174e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx];
5184e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[2]*x2;
5194e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[3]*x2;
5204e2b4712SSatish Balay       v += 4;
5214e2b4712SSatish Balay     }
5224e2b4712SSatish Balay     idc = 2*(*c--);
5234e2b4712SSatish Balay     v   = aa + 4*diag[i];
5244e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
5254e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
5264e2b4712SSatish Balay   }
5274e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
5284e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
5294e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
5304e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
5314e2b4712SSatish Balay   PLogFlops(2*4*(a->nz) - a->n);
5324e2b4712SSatish Balay   PetscFunctionReturn(0);
5334e2b4712SSatish Balay }
5344e2b4712SSatish Balay 
5354e2b4712SSatish Balay 
5364e2b4712SSatish Balay #undef __FUNC__
5374e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_1"
5384e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5394e2b4712SSatish Balay {
5404e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
5414e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
5424e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
5434e2b4712SSatish Balay   int             *diag = a->diag;
5444e2b4712SSatish Balay   Scalar          *aa=a->a,sum1;
5454e2b4712SSatish Balay   register Scalar *x,*b,*tmp,*v;
5464e2b4712SSatish Balay 
5474e2b4712SSatish Balay   PetscFunctionBegin;
5484e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
5494e2b4712SSatish Balay 
5504e2b4712SSatish Balay   VecGetArray_Fast(bb,b);
5514e2b4712SSatish Balay   VecGetArray_Fast(xx,x);
5524e2b4712SSatish Balay   tmp  = a->solve_work;
5534e2b4712SSatish Balay 
5544e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5554e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5564e2b4712SSatish Balay 
5574e2b4712SSatish Balay   /* forward solve the lower triangular */
5584e2b4712SSatish Balay   tmp[0] = b[*r++];
5594e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
5604e2b4712SSatish Balay     v     = aa + ai[i];
5614e2b4712SSatish Balay     vi    = aj + ai[i];
5624e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5634e2b4712SSatish Balay     sum1  = b[*r++];
5644e2b4712SSatish Balay     while (nz--) {
5654e2b4712SSatish Balay       sum1 -= (*v++)*tmp[*vi++];
5664e2b4712SSatish Balay     }
5674e2b4712SSatish Balay     tmp[i] = sum1;
5684e2b4712SSatish Balay   }
5694e2b4712SSatish Balay   /* backward solve the upper triangular */
5704e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
5714e2b4712SSatish Balay     v    = aa + diag[i] + 1;
5724e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
5734e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5744e2b4712SSatish Balay     sum1 = tmp[i];
5754e2b4712SSatish Balay     while (nz--) {
5764e2b4712SSatish Balay       sum1 -= (*v++)*tmp[*vi++];
5774e2b4712SSatish Balay     }
5784e2b4712SSatish Balay     x[*c--] = tmp[i] = aa[diag[i]]*sum1;
5794e2b4712SSatish Balay   }
5804e2b4712SSatish Balay 
5814e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
5824e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
5834e2b4712SSatish Balay   VecRestoreArray_Fast(bb,b);
5844e2b4712SSatish Balay   VecRestoreArray_Fast(xx,x);
5854e2b4712SSatish Balay   PLogFlops(2*1*(a->nz) - a->n);
5864e2b4712SSatish Balay   PetscFunctionReturn(0);
5874e2b4712SSatish Balay }
5884e2b4712SSatish Balay 
5894e2b4712SSatish Balay extern int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat,Mat*);
5904e2b4712SSatish Balay extern int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec);
5914e2b4712SSatish Balay /* ----------------------------------------------------------------*/
5924e2b4712SSatish Balay /*
5934e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5944e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
5954e2b4712SSatish Balay    Not a good example of code reuse.
5964e2b4712SSatish Balay */
5974e2b4712SSatish Balay #undef __FUNC__
5984e2b4712SSatish Balay #define __FUNC__ "MatILUFactorSymbolic_SeqBAIJ"
5994e2b4712SSatish Balay int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
6004e2b4712SSatish Balay                                  Mat *fact)
6014e2b4712SSatish Balay {
6024e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
6034e2b4712SSatish Balay   IS          isicol;
6044e2b4712SSatish Balay   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
6054e2b4712SSatish Balay   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
6064e2b4712SSatish Balay   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
6074e2b4712SSatish Balay   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2;
6084e2b4712SSatish Balay   PetscTruth  col_identity, row_identity;
6094e2b4712SSatish Balay 
6104e2b4712SSatish Balay   PetscFunctionBegin;
6114e2b4712SSatish Balay   /* special case that simply copies fill pattern */
6124e2b4712SSatish Balay   PetscValidHeaderSpecific(isrow,IS_COOKIE);
6134e2b4712SSatish Balay   PetscValidHeaderSpecific(iscol,IS_COOKIE);
6144e2b4712SSatish Balay   ISIdentity(isrow,&row_identity); ISIdentity(iscol,&col_identity);
6154e2b4712SSatish Balay   if (levels == 0 && row_identity && col_identity) {
6164e2b4712SSatish Balay     ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr);
6174e2b4712SSatish Balay     (*fact)->factor = FACTOR_LU;
6184e2b4712SSatish Balay     b               = (Mat_SeqBAIJ *) (*fact)->data;
6194e2b4712SSatish Balay     if (!b->diag) {
6204e2b4712SSatish Balay       ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
6214e2b4712SSatish Balay     }
6224e2b4712SSatish Balay     b->row        = isrow;
6234e2b4712SSatish Balay     b->col        = iscol;
6244e2b4712SSatish Balay     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
6254e2b4712SSatish Balay     /*
6264e2b4712SSatish Balay         Blocksize 4 has a special faster solver for ILU(0) factorization
6274e2b4712SSatish Balay         with natural ordering
6284e2b4712SSatish Balay     */
6294e2b4712SSatish Balay     if (b->bs == 4) {
630f830108cSBarry Smith       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
631f830108cSBarry Smith       (*fact)->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
6324e2b4712SSatish Balay     }
6334e2b4712SSatish Balay     PetscFunctionReturn(0);
6344e2b4712SSatish Balay   }
6354e2b4712SSatish Balay 
6364e2b4712SSatish Balay   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
6374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
6384e2b4712SSatish Balay   ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
6394e2b4712SSatish Balay 
6404e2b4712SSatish Balay   /* get new row pointers */
6414e2b4712SSatish Balay   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
6424e2b4712SSatish Balay   ainew[0] = 0;
6434e2b4712SSatish Balay   /* don't know how many column pointers are needed so estimate */
6444e2b4712SSatish Balay   jmax = (int) (f*ai[n] + 1);
6454e2b4712SSatish Balay   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
6464e2b4712SSatish Balay   /* ajfill is level of fill for each fill entry */
6474e2b4712SSatish Balay   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
6484e2b4712SSatish Balay   /* fill is a linked list of nonzeros in active row */
6494e2b4712SSatish Balay   fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
6504e2b4712SSatish Balay   /* im is level for each filled value */
6514e2b4712SSatish Balay   im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
6524e2b4712SSatish Balay   /* dloc is location of diagonal in factor */
6534e2b4712SSatish Balay   dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
6544e2b4712SSatish Balay   dloc[0]  = 0;
6554e2b4712SSatish Balay   for ( prow=0; prow<n; prow++ ) {
6564e2b4712SSatish Balay     /* first copy previous fill into linked list */
6574e2b4712SSatish Balay     nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
6584e2b4712SSatish Balay     if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,1,"Empty row in matrix");
6594e2b4712SSatish Balay     xi      = aj + ai[r[prow]];
6604e2b4712SSatish Balay     fill[n] = n;
6614e2b4712SSatish Balay     while (nz--) {
6624e2b4712SSatish Balay       fm  = n;
6634e2b4712SSatish Balay       idx = ic[*xi++];
6644e2b4712SSatish Balay       do {
6654e2b4712SSatish Balay         m  = fm;
6664e2b4712SSatish Balay         fm = fill[m];
6674e2b4712SSatish Balay       } while (fm < idx);
6684e2b4712SSatish Balay       fill[m]   = idx;
6694e2b4712SSatish Balay       fill[idx] = fm;
6704e2b4712SSatish Balay       im[idx]   = 0;
6714e2b4712SSatish Balay     }
6724e2b4712SSatish Balay     nzi = 0;
6734e2b4712SSatish Balay     row = fill[n];
6744e2b4712SSatish Balay     while ( row < prow ) {
6754e2b4712SSatish Balay       incrlev = im[row] + 1;
6764e2b4712SSatish Balay       nz      = dloc[row];
6774e2b4712SSatish Balay       xi      = ajnew  + ainew[row] + nz;
6784e2b4712SSatish Balay       flev    = ajfill + ainew[row] + nz + 1;
6794e2b4712SSatish Balay       nnz     = ainew[row+1] - ainew[row] - nz - 1;
6804e2b4712SSatish Balay       if (*xi++ != row) {
6814e2b4712SSatish Balay         SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,0,"Zero pivot: try running with -pc_ilu_nonzeros_along_diagonal");
6824e2b4712SSatish Balay       }
6834e2b4712SSatish Balay       fm      = row;
6844e2b4712SSatish Balay       while (nnz-- > 0) {
6854e2b4712SSatish Balay         idx = *xi++;
6864e2b4712SSatish Balay         if (*flev + incrlev > levels) {
6874e2b4712SSatish Balay           flev++;
6884e2b4712SSatish Balay           continue;
6894e2b4712SSatish Balay         }
6904e2b4712SSatish Balay         do {
6914e2b4712SSatish Balay           m  = fm;
6924e2b4712SSatish Balay           fm = fill[m];
6934e2b4712SSatish Balay         } while (fm < idx);
6944e2b4712SSatish Balay         if (fm != idx) {
6954e2b4712SSatish Balay           im[idx]   = *flev + incrlev;
6964e2b4712SSatish Balay           fill[m]   = idx;
6974e2b4712SSatish Balay           fill[idx] = fm;
6984e2b4712SSatish Balay           fm        = idx;
6994e2b4712SSatish Balay           nzf++;
700*ecf371e4SBarry Smith         } else {
7014e2b4712SSatish Balay           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
7024e2b4712SSatish Balay         }
7034e2b4712SSatish Balay         flev++;
7044e2b4712SSatish Balay       }
7054e2b4712SSatish Balay       row = fill[row];
7064e2b4712SSatish Balay       nzi++;
7074e2b4712SSatish Balay     }
7084e2b4712SSatish Balay     /* copy new filled row into permanent storage */
7094e2b4712SSatish Balay     ainew[prow+1] = ainew[prow] + nzf;
7104e2b4712SSatish Balay     if (ainew[prow+1] > jmax) {
711*ecf371e4SBarry Smith 
712*ecf371e4SBarry Smith       /* estimate how much additional space we will need */
713*ecf371e4SBarry Smith       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
714*ecf371e4SBarry Smith       /* just double the memory each time */
715*ecf371e4SBarry Smith       int maxadd = jmax;
716*ecf371e4SBarry Smith       /* maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); */
7174e2b4712SSatish Balay       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
7184e2b4712SSatish Balay       jmax += maxadd;
719*ecf371e4SBarry Smith 
720*ecf371e4SBarry Smith       /* allocate a longer ajnew and ajfill */
7214e2b4712SSatish Balay       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
7224e2b4712SSatish Balay       PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
7234e2b4712SSatish Balay       PetscFree(ajnew);
7244e2b4712SSatish Balay       ajnew = xi;
7254e2b4712SSatish Balay       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
7264e2b4712SSatish Balay       PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
7274e2b4712SSatish Balay       PetscFree(ajfill);
7284e2b4712SSatish Balay       ajfill = xi;
729*ecf371e4SBarry Smith       realloc++; /* count how many reallocations are needed */
7304e2b4712SSatish Balay     }
7314e2b4712SSatish Balay     xi          = ajnew + ainew[prow];
7324e2b4712SSatish Balay     flev        = ajfill + ainew[prow];
7334e2b4712SSatish Balay     dloc[prow]  = nzi;
7344e2b4712SSatish Balay     fm          = fill[n];
7354e2b4712SSatish Balay     while (nzf--) {
7364e2b4712SSatish Balay       *xi++   = fm;
7374e2b4712SSatish Balay       *flev++ = im[fm];
7384e2b4712SSatish Balay       fm      = fill[fm];
7394e2b4712SSatish Balay     }
7404e2b4712SSatish Balay   }
7414e2b4712SSatish Balay   PetscFree(ajfill);
7424e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
7434e2b4712SSatish Balay   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
7444e2b4712SSatish Balay   ierr = ISDestroy(isicol); CHKERRQ(ierr);
7454e2b4712SSatish Balay   PetscFree(fill); PetscFree(im);
7464e2b4712SSatish Balay 
7474e2b4712SSatish Balay   {
7484e2b4712SSatish Balay     double af = ((double)ainew[n])/((double)ai[n]);
749981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
7504e2b4712SSatish Balay              realloc,f,af);
751981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
752981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
753981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
7544e2b4712SSatish Balay   }
7554e2b4712SSatish Balay 
7564e2b4712SSatish Balay   /* put together the new matrix */
7574e2b4712SSatish Balay   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
7584e2b4712SSatish Balay   b = (Mat_SeqBAIJ *) (*fact)->data;
7594e2b4712SSatish Balay   PetscFree(b->imax);
7604e2b4712SSatish Balay   b->singlemalloc = 0;
7614e2b4712SSatish Balay   len = bs2*ainew[n]*sizeof(Scalar);
7624e2b4712SSatish Balay   /* the next line frees the default space generated by the Create() */
7634e2b4712SSatish Balay   PetscFree(b->a); PetscFree(b->ilen);
7644e2b4712SSatish Balay   b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
7654e2b4712SSatish Balay   b->j          = ajnew;
7664e2b4712SSatish Balay   b->i          = ainew;
7674e2b4712SSatish Balay   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
7684e2b4712SSatish Balay   b->diag       = dloc;
7694e2b4712SSatish Balay   b->ilen       = 0;
7704e2b4712SSatish Balay   b->imax       = 0;
7714e2b4712SSatish Balay   b->row        = isrow;
7724e2b4712SSatish Balay   b->col        = iscol;
7734e2b4712SSatish Balay   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
7744e2b4712SSatish Balay   /* In b structure:  Free imax, ilen, old a, old j.
7754e2b4712SSatish Balay      Allocate dloc, solve_work, new a, new j */
7764e2b4712SSatish Balay   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
7774e2b4712SSatish Balay   b->maxnz          = b->nz = ainew[n];
7784e2b4712SSatish Balay   (*fact)->factor   = FACTOR_LU;
7794e2b4712SSatish Balay 
7804e2b4712SSatish Balay   (*fact)->info.factor_mallocs    = realloc;
7814e2b4712SSatish Balay   (*fact)->info.fill_ratio_given  = f;
7824e2b4712SSatish Balay   (*fact)->info.fill_ratio_needed = ((double)ainew[n])/((double)ai[prow]);
7834e2b4712SSatish Balay 
7844e2b4712SSatish Balay   PetscFunctionReturn(0);
7854e2b4712SSatish Balay }
7864e2b4712SSatish Balay 
7874e2b4712SSatish Balay 
7884e2b4712SSatish Balay 
7894e2b4712SSatish Balay 
790