xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 74c49fae86232ee8d4cedab0aa791f2687f7bef6)
14e2b4712SSatish Balay #ifdef PETSC_RCS_HEADER
2*74c49faeSBarry Smith static char vcid[] = "$Id: baijfact2.c,v 1.14 1998/10/13 02:11:28 bsmith Exp bsmith $";
34e2b4712SSatish Balay #endif
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
84e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
94e2b4712SSatish Balay #include "src/vec/vecimpl.h"
104e2b4712SSatish Balay #include "src/inline/ilu.h"
11*74c49faeSBarry Smith #include "src/inline/dot.h"
124e2b4712SSatish Balay 
134e2b4712SSatish Balay /* ----------------------------------------------------------- */
144e2b4712SSatish Balay #undef __FUNC__
154e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_N"
164e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
174e2b4712SSatish Balay {
184e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
194e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
204e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
214e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
224e2b4712SSatish Balay   Scalar          *aa=a->a,*sum;
23e1311b90SBarry Smith   Scalar *x,*b,*lsum,*tmp,*v;
244e2b4712SSatish Balay 
254e2b4712SSatish Balay   PetscFunctionBegin;
26e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27e1311b90SBarry Smith   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
284e2b4712SSatish Balay   tmp  = a->solve_work;
294e2b4712SSatish Balay 
304e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
314e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
324e2b4712SSatish Balay 
334e2b4712SSatish Balay   /* forward solve the lower triangular */
344e2b4712SSatish Balay   PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar));
354e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
364e2b4712SSatish Balay     v   = aa + bs2*ai[i];
374e2b4712SSatish Balay     vi  = aj + ai[i];
384e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
394e2b4712SSatish Balay     sum = tmp + bs*i;
404e2b4712SSatish Balay     PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));
414e2b4712SSatish Balay     while (nz--) {
424e2b4712SSatish Balay       Kernel_v_gets_v_minus_A_times_w(bs,sum,v,tmp+bs*(*vi++));
434e2b4712SSatish Balay       v += bs2;
444e2b4712SSatish Balay     }
454e2b4712SSatish Balay   }
464e2b4712SSatish Balay   /* backward solve the upper triangular */
474e2b4712SSatish Balay   lsum = a->solve_work + a->n;
484e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
494e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
504e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
514e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
524e2b4712SSatish Balay     PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));
534e2b4712SSatish Balay     while (nz--) {
544e2b4712SSatish Balay       Kernel_v_gets_v_minus_A_times_w(bs,lsum,v,tmp+bs*(*vi++));
554e2b4712SSatish Balay       v += bs2;
564e2b4712SSatish Balay     }
574e2b4712SSatish Balay     Kernel_w_gets_A_times_v(bs,lsum,aa+bs2*a->diag[i],tmp+i*bs);
584e2b4712SSatish Balay     PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));
594e2b4712SSatish Balay   }
604e2b4712SSatish Balay 
614e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
624e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
63e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
64e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
654e2b4712SSatish Balay   PLogFlops(2*(a->bs2)*(a->nz) - a->n);
664e2b4712SSatish Balay   PetscFunctionReturn(0);
674e2b4712SSatish Balay }
684e2b4712SSatish Balay 
694e2b4712SSatish Balay #undef __FUNC__
704e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_7"
714e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
724e2b4712SSatish Balay {
734e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
744e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
754e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
764e2b4712SSatish Balay   int             *diag = a->diag;
774e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,sum5,sum6,sum7,x1,x2,x3,x4,x5,x6,x7;
78e1311b90SBarry Smith   Scalar *x,*b,*tmp,*v;
794e2b4712SSatish Balay 
804e2b4712SSatish Balay   PetscFunctionBegin;
81e1311b90SBarry Smith   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
82e1311b90SBarry Smith   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
834e2b4712SSatish Balay   tmp  = a->solve_work;
844e2b4712SSatish Balay 
854e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
864e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
874e2b4712SSatish Balay 
884e2b4712SSatish Balay   /* forward solve the lower triangular */
894e2b4712SSatish Balay   idx    = 7*(*r++);
904e2b4712SSatish Balay   tmp[0] = b[idx];   tmp[1] = b[1+idx];
914e2b4712SSatish Balay   tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
924e2b4712SSatish Balay   tmp[5] = b[5+idx]; tmp[6] = b[6+idx];
934e2b4712SSatish Balay 
944e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
954e2b4712SSatish Balay     v     = aa + 49*ai[i];
964e2b4712SSatish Balay     vi    = aj + ai[i];
974e2b4712SSatish Balay     nz    = diag[i] - ai[i];
984e2b4712SSatish Balay     idx   = 7*(*r++);
994e2b4712SSatish Balay     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
1004e2b4712SSatish Balay     sum5  = b[4+idx];sum6 = b[5+idx];sum7 = b[6+idx];
1014e2b4712SSatish Balay     while (nz--) {
1024e2b4712SSatish Balay       idx   = 7*(*vi++);
1034e2b4712SSatish Balay       x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
1044e2b4712SSatish Balay       x4    = tmp[3+idx];x5 = tmp[4+idx];
1054e2b4712SSatish Balay       x6    = tmp[5+idx];x7 = tmp[6+idx];
1064e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1074e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1084e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1094e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1104e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1114e2b4712SSatish Balay       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1124e2b4712SSatish Balay       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1134e2b4712SSatish Balay       v += 49;
1144e2b4712SSatish Balay     }
1154e2b4712SSatish Balay     idx = 7*i;
1164e2b4712SSatish Balay     tmp[idx]   = sum1;tmp[1+idx] = sum2;
1174e2b4712SSatish Balay     tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
1184e2b4712SSatish Balay     tmp[5+idx] = sum6;tmp[6+idx] = sum7;
1194e2b4712SSatish Balay   }
1204e2b4712SSatish Balay   /* backward solve the upper triangular */
1214e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
1224e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
1234e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
1244e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
1254e2b4712SSatish Balay     idt  = 7*i;
1264e2b4712SSatish Balay     sum1 = tmp[idt];  sum2 = tmp[1+idt];
1274e2b4712SSatish Balay     sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
1284e2b4712SSatish Balay     sum6 = tmp[5+idt];sum7 = tmp[6+idt];
1294e2b4712SSatish Balay     while (nz--) {
1304e2b4712SSatish Balay       idx   = 7*(*vi++);
1314e2b4712SSatish Balay       x1    = tmp[idx];   x2 = tmp[1+idx];
1324e2b4712SSatish Balay       x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
1334e2b4712SSatish Balay       x6    = tmp[5+idx]; x7 = tmp[6+idx];
1344e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1354e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1364e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1374e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1384e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1394e2b4712SSatish Balay       sum6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1404e2b4712SSatish Balay       sum7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1414e2b4712SSatish Balay       v += 49;
1424e2b4712SSatish Balay     }
1434e2b4712SSatish Balay     idc = 7*(*c--);
1444e2b4712SSatish Balay     v   = aa + 49*diag[i];
1454e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1+v[7]*sum2+v[14]*sum3+
1464e2b4712SSatish Balay                                  v[21]*sum4+v[28]*sum5+v[35]*sum6+v[42]*sum7;
1474e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[8]*sum2+v[15]*sum3+
1484e2b4712SSatish Balay                                  v[22]*sum4+v[29]*sum5+v[36]*sum6+v[43]*sum7;
1494e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[9]*sum2+v[16]*sum3+
1504e2b4712SSatish Balay                                  v[23]*sum4+v[30]*sum5+v[37]*sum6+v[44]*sum7;
1514e2b4712SSatish Balay     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[10]*sum2+v[17]*sum3+
1524e2b4712SSatish Balay                                  v[24]*sum4+v[31]*sum5+v[38]*sum6+v[45]*sum7;
1534e2b4712SSatish Balay     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[11]*sum2+v[18]*sum3+
1544e2b4712SSatish Balay                                  v[25]*sum4+v[32]*sum5+v[39]*sum6+v[46]*sum7;
1554e2b4712SSatish Balay     x[5+idc] = tmp[5+idt] = v[5]*sum1+v[12]*sum2+v[19]*sum3+
1564e2b4712SSatish Balay                                  v[26]*sum4+v[33]*sum5+v[40]*sum6+v[47]*sum7;
1574e2b4712SSatish Balay     x[6+idc] = tmp[6+idt] = v[6]*sum1+v[13]*sum2+v[20]*sum3+
1584e2b4712SSatish Balay                                  v[27]*sum4+v[34]*sum5+v[41]*sum6+v[48]*sum7;
1594e2b4712SSatish Balay   }
1604e2b4712SSatish Balay 
1614e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
1624e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
163e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
164e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
1654e2b4712SSatish Balay   PLogFlops(2*49*(a->nz) - a->n);
1664e2b4712SSatish Balay   PetscFunctionReturn(0);
1674e2b4712SSatish Balay }
1684e2b4712SSatish Balay 
1694e2b4712SSatish Balay #undef __FUNC__
1704e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_5"
1714e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1724e2b4712SSatish Balay {
1734e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1744e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
1754e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1764e2b4712SSatish Balay   int             *diag = a->diag;
1774e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
178e1311b90SBarry Smith   Scalar *x,*b,*tmp,*v;
1794e2b4712SSatish Balay 
1804e2b4712SSatish Balay   PetscFunctionBegin;
181e1311b90SBarry Smith   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
182e1311b90SBarry Smith   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
1834e2b4712SSatish Balay   tmp  = a->solve_work;
1844e2b4712SSatish Balay 
1854e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1864e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1874e2b4712SSatish Balay 
1884e2b4712SSatish Balay   /* forward solve the lower triangular */
1894e2b4712SSatish Balay   idx    = 5*(*r++);
1904e2b4712SSatish Balay   tmp[0] = b[idx];   tmp[1] = b[1+idx];
1914e2b4712SSatish Balay   tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
1924e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
1934e2b4712SSatish Balay     v     = aa + 25*ai[i];
1944e2b4712SSatish Balay     vi    = aj + ai[i];
1954e2b4712SSatish Balay     nz    = diag[i] - ai[i];
1964e2b4712SSatish Balay     idx   = 5*(*r++);
1974e2b4712SSatish Balay     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
1984e2b4712SSatish Balay     sum5  = b[4+idx];
1994e2b4712SSatish Balay     while (nz--) {
2004e2b4712SSatish Balay       idx   = 5*(*vi++);
2014e2b4712SSatish Balay       x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
2024e2b4712SSatish Balay       x4    = tmp[3+idx];x5 = tmp[4+idx];
2034e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2044e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2054e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2064e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2074e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2084e2b4712SSatish Balay       v += 25;
2094e2b4712SSatish Balay     }
2104e2b4712SSatish Balay     idx = 5*i;
2114e2b4712SSatish Balay     tmp[idx]   = sum1;tmp[1+idx] = sum2;
2124e2b4712SSatish Balay     tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
2134e2b4712SSatish Balay   }
2144e2b4712SSatish Balay   /* backward solve the upper triangular */
2154e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
2164e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
2174e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
2184e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2194e2b4712SSatish Balay     idt  = 5*i;
2204e2b4712SSatish Balay     sum1 = tmp[idt];  sum2 = tmp[1+idt];
2214e2b4712SSatish Balay     sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
2224e2b4712SSatish Balay     while (nz--) {
2234e2b4712SSatish Balay       idx   = 5*(*vi++);
2244e2b4712SSatish Balay       x1    = tmp[idx];   x2 = tmp[1+idx];
2254e2b4712SSatish Balay       x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
2264e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2274e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2284e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2294e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2304e2b4712SSatish Balay       sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2314e2b4712SSatish Balay       v += 25;
2324e2b4712SSatish Balay     }
2334e2b4712SSatish Balay     idc = 5*(*c--);
2344e2b4712SSatish Balay     v   = aa + 25*diag[i];
2354e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
2364e2b4712SSatish Balay                                  v[15]*sum4+v[20]*sum5;
2374e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
2384e2b4712SSatish Balay                                  v[16]*sum4+v[21]*sum5;
2394e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
2404e2b4712SSatish Balay                                  v[17]*sum4+v[22]*sum5;
2414e2b4712SSatish Balay     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
2424e2b4712SSatish Balay                                  v[18]*sum4+v[23]*sum5;
2434e2b4712SSatish Balay     x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
2444e2b4712SSatish Balay                                  v[19]*sum4+v[24]*sum5;
2454e2b4712SSatish Balay   }
2464e2b4712SSatish Balay 
2474e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
2484e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
249e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
250e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
2514e2b4712SSatish Balay   PLogFlops(2*25*(a->nz) - a->n);
2524e2b4712SSatish Balay   PetscFunctionReturn(0);
2534e2b4712SSatish Balay }
2544e2b4712SSatish Balay 
2554e2b4712SSatish Balay #undef __FUNC__
2564e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_4"
2574e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
2584e2b4712SSatish Balay {
2594e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2604e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
2614e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2624e2b4712SSatish Balay   int             *diag = a->diag;
2634e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,sum4,x1,x2,x3,x4;
264e1311b90SBarry Smith   Scalar          *x,*b,*tmp,*v;
2654e2b4712SSatish Balay 
2664e2b4712SSatish Balay   PetscFunctionBegin;
267e1311b90SBarry Smith   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
268e1311b90SBarry Smith   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
2694e2b4712SSatish Balay   tmp  = a->solve_work;
2704e2b4712SSatish Balay 
2714e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2724e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2734e2b4712SSatish Balay 
2744e2b4712SSatish Balay   /* forward solve the lower triangular */
2754e2b4712SSatish Balay   idx    = 4*(*r++);
2764e2b4712SSatish Balay   tmp[0] = b[idx];   tmp[1] = b[1+idx];
2774e2b4712SSatish Balay   tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
2784e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
2794e2b4712SSatish Balay     v     = aa + 16*ai[i];
2804e2b4712SSatish Balay     vi    = aj + ai[i];
2814e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2824e2b4712SSatish Balay     idx   = 4*(*r++);
2834e2b4712SSatish Balay     sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
2844e2b4712SSatish Balay     while (nz--) {
2854e2b4712SSatish Balay       idx   = 4*(*vi++);
2864e2b4712SSatish Balay       x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
2874e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2884e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2894e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2904e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2914e2b4712SSatish Balay       v    += 16;
2924e2b4712SSatish Balay     }
2934e2b4712SSatish Balay     idx        = 4*i;
2944e2b4712SSatish Balay     tmp[idx]   = sum1;tmp[1+idx] = sum2;
2954e2b4712SSatish Balay     tmp[2+idx] = sum3;tmp[3+idx] = sum4;
2964e2b4712SSatish Balay   }
2974e2b4712SSatish Balay   /* backward solve the upper triangular */
2984e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
2994e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
3004e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
3014e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3024e2b4712SSatish Balay     idt  = 4*i;
3034e2b4712SSatish Balay     sum1 = tmp[idt];  sum2 = tmp[1+idt];
3044e2b4712SSatish Balay     sum3 = tmp[2+idt];sum4 = tmp[3+idt];
3054e2b4712SSatish Balay     while (nz--) {
3064e2b4712SSatish Balay       idx   = 4*(*vi++);
3074e2b4712SSatish Balay       x1    = tmp[idx];   x2 = tmp[1+idx];
3084e2b4712SSatish Balay       x3    = tmp[2+idx]; x4 = tmp[3+idx];
3094e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3104e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3114e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3124e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3134e2b4712SSatish Balay       v += 16;
3144e2b4712SSatish Balay     }
3154e2b4712SSatish Balay     idc      = 4*(*c--);
3164e2b4712SSatish Balay     v        = aa + 16*diag[i];
3174e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
3184e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
3194e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
3204e2b4712SSatish Balay     x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
3214e2b4712SSatish Balay   }
3224e2b4712SSatish Balay 
3234e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
3244e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
325e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
326e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
3274e2b4712SSatish Balay   PLogFlops(2*16*(a->nz) - a->n);
3284e2b4712SSatish Balay   PetscFunctionReturn(0);
3294e2b4712SSatish Balay }
3304e2b4712SSatish Balay 
3314e2b4712SSatish Balay /*
3324e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
3334e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
3344e2b4712SSatish Balay */
3354e2b4712SSatish Balay #undef __FUNC__
3364e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3374e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3384e2b4712SSatish Balay {
3394e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
34030d4dcafSBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
34130d4dcafSBarry Smith   int             ierr,*diag = a->diag;
34230d4dcafSBarry Smith   Scalar          *aa=a->a;
34330d4dcafSBarry Smith   Scalar          *x,*b;
3444e2b4712SSatish Balay 
3454e2b4712SSatish Balay   PetscFunctionBegin;
346e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
347e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3484e2b4712SSatish Balay 
3492853dc0eSBarry Smith #if defined(USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3502853dc0eSBarry Smith   {
3512853dc0eSBarry Smith     static Scalar w[2000]; /* very BAD need to fix */
3522853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3532853dc0eSBarry Smith   }
3542853dc0eSBarry Smith #elif defined(USE_FORTRAN_KERNEL_SOLVEBAIJ)
3552853dc0eSBarry Smith   {
3562853dc0eSBarry Smith     static Scalar w[2000]; /* very BAD need to fix */
3572853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3582853dc0eSBarry Smith   }
3592853dc0eSBarry Smith #elif defined(USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3602853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
361e1293385SBarry Smith #else
36230d4dcafSBarry Smith   {
36330d4dcafSBarry Smith     Scalar sum1,sum2,sum3,sum4,x1,x2,x3,x4,*v;
36430d4dcafSBarry Smith     int    jdx,idt,idx,nz,*vi,i;
365e1293385SBarry Smith 
3664e2b4712SSatish Balay   /* forward solve the lower triangular */
3674e2b4712SSatish Balay   idx    = 0;
368e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3694e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
3704e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
3714e2b4712SSatish Balay     vi    =  aj      + ai[i];
3724e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
373e1293385SBarry Smith     idx   +=  4;
3744e2b4712SSatish Balay     sum1  =  b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
3754e2b4712SSatish Balay     while (nz--) {
3764e2b4712SSatish Balay       jdx   = 4*(*vi++);
3774e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3784e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3794e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3804e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3814e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3824e2b4712SSatish Balay       v    += 16;
3834e2b4712SSatish Balay     }
3844e2b4712SSatish Balay     x[idx]   = sum1;
3854e2b4712SSatish Balay     x[1+idx] = sum2;
3864e2b4712SSatish Balay     x[2+idx] = sum3;
3874e2b4712SSatish Balay     x[3+idx] = sum4;
3884e2b4712SSatish Balay   }
3894e2b4712SSatish Balay   /* backward solve the upper triangular */
3904e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
3914e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
3924e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
3934e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3944e2b4712SSatish Balay     idt  = 4*i;
3954e2b4712SSatish Balay     sum1 = x[idt];  sum2 = x[1+idt];
3964e2b4712SSatish Balay     sum3 = x[2+idt];sum4 = x[3+idt];
3974e2b4712SSatish Balay     while (nz--) {
3984e2b4712SSatish Balay       idx   = 4*(*vi++);
3994e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4004e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4014e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4024e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4034e2b4712SSatish Balay       sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
4044e2b4712SSatish Balay       v    += 16;
4054e2b4712SSatish Balay     }
4064e2b4712SSatish Balay     v        = aa + 16*diag[i];
4074e2b4712SSatish Balay     x[idt]   = v[0]*sum1 + v[4]*sum2 + v[8]*sum3  + v[12]*sum4;
4084e2b4712SSatish Balay     x[1+idt] = v[1]*sum1 + v[5]*sum2 + v[9]*sum3  + v[13]*sum4;
4094e2b4712SSatish Balay     x[2+idt] = v[2]*sum1 + v[6]*sum2 + v[10]*sum3 + v[14]*sum4;
4104e2b4712SSatish Balay     x[3+idt] = v[3]*sum1 + v[7]*sum2 + v[11]*sum3 + v[15]*sum4;
4114e2b4712SSatish Balay   }
41230d4dcafSBarry Smith   }
413e1293385SBarry Smith #endif
4144e2b4712SSatish Balay 
415e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
416e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4174e2b4712SSatish Balay   PLogFlops(2*16*(a->nz) - a->n);
4184e2b4712SSatish Balay   PetscFunctionReturn(0);
4194e2b4712SSatish Balay }
4204e2b4712SSatish Balay 
4214e2b4712SSatish Balay 
4224e2b4712SSatish Balay #undef __FUNC__
4234e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_3"
4244e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4254e2b4712SSatish Balay {
4264e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
4274e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
4284e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
4294e2b4712SSatish Balay   int             *diag = a->diag;
4304e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,sum3,x1,x2,x3;
431e1311b90SBarry Smith   Scalar *x,*b,*tmp,*v;
4324e2b4712SSatish Balay 
4334e2b4712SSatish Balay   PetscFunctionBegin;
434e1311b90SBarry Smith   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
435e1311b90SBarry Smith   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
4364e2b4712SSatish Balay   tmp  = a->solve_work;
4374e2b4712SSatish Balay 
4384e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4394e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4404e2b4712SSatish Balay 
4414e2b4712SSatish Balay   /* forward solve the lower triangular */
4424e2b4712SSatish Balay   idx    = 3*(*r++);
4434e2b4712SSatish Balay   tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
4444e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
4454e2b4712SSatish Balay     v     = aa + 9*ai[i];
4464e2b4712SSatish Balay     vi    = aj + ai[i];
4474e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4484e2b4712SSatish Balay     idx   = 3*(*r++);
4494e2b4712SSatish Balay     sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
4504e2b4712SSatish Balay     while (nz--) {
4514e2b4712SSatish Balay       idx   = 3*(*vi++);
4524e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
4534e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4544e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4554e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4564e2b4712SSatish Balay       v += 9;
4574e2b4712SSatish Balay     }
4584e2b4712SSatish Balay     idx = 3*i;
4594e2b4712SSatish Balay     tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
4604e2b4712SSatish Balay   }
4614e2b4712SSatish Balay   /* backward solve the upper triangular */
4624e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
4634e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
4644e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
4654e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4664e2b4712SSatish Balay     idt  = 3*i;
4674e2b4712SSatish Balay     sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
4684e2b4712SSatish Balay     while (nz--) {
4694e2b4712SSatish Balay       idx   = 3*(*vi++);
4704e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
4714e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4724e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4734e2b4712SSatish Balay       sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4744e2b4712SSatish Balay       v += 9;
4754e2b4712SSatish Balay     }
4764e2b4712SSatish Balay     idc = 3*(*c--);
4774e2b4712SSatish Balay     v   = aa + 9*diag[i];
4784e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
4794e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
4804e2b4712SSatish Balay     x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
4814e2b4712SSatish Balay   }
4824e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
4834e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
484e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
485e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4864e2b4712SSatish Balay   PLogFlops(2*9*(a->nz) - a->n);
4874e2b4712SSatish Balay   PetscFunctionReturn(0);
4884e2b4712SSatish Balay }
4894e2b4712SSatish Balay 
4904e2b4712SSatish Balay #undef __FUNC__
4914e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_2"
4924e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4934e2b4712SSatish Balay {
4944e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
4954e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
4964e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
4974e2b4712SSatish Balay   int             *diag = a->diag;
4984e2b4712SSatish Balay   Scalar          *aa=a->a,sum1,sum2,x1,x2;
499e1311b90SBarry Smith   Scalar *x,*b,*tmp,*v;
5004e2b4712SSatish Balay 
5014e2b4712SSatish Balay   PetscFunctionBegin;
502e1311b90SBarry Smith   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
503e1311b90SBarry Smith   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
5044e2b4712SSatish Balay   tmp  = a->solve_work;
5054e2b4712SSatish Balay 
5064e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5074e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5084e2b4712SSatish Balay 
5094e2b4712SSatish Balay   /* forward solve the lower triangular */
5104e2b4712SSatish Balay   idx    = 2*(*r++);
5114e2b4712SSatish Balay   tmp[0] = b[idx]; tmp[1] = b[1+idx];
5124e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
5134e2b4712SSatish Balay     v     = aa + 4*ai[i];
5144e2b4712SSatish Balay     vi    = aj + ai[i];
5154e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5164e2b4712SSatish Balay     idx   = 2*(*r++);
5174e2b4712SSatish Balay     sum1  = b[idx]; sum2 = b[1+idx];
5184e2b4712SSatish Balay     while (nz--) {
5194e2b4712SSatish Balay       idx   = 2*(*vi++);
5204e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx];
5214e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[2]*x2;
5224e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[3]*x2;
5234e2b4712SSatish Balay       v += 4;
5244e2b4712SSatish Balay     }
5254e2b4712SSatish Balay     idx = 2*i;
5264e2b4712SSatish Balay     tmp[idx] = sum1; tmp[1+idx] = sum2;
5274e2b4712SSatish Balay   }
5284e2b4712SSatish Balay   /* backward solve the upper triangular */
5294e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
5304e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
5314e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
5324e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5334e2b4712SSatish Balay     idt  = 2*i;
5344e2b4712SSatish Balay     sum1 = tmp[idt]; sum2 = tmp[1+idt];
5354e2b4712SSatish Balay     while (nz--) {
5364e2b4712SSatish Balay       idx   = 2*(*vi++);
5374e2b4712SSatish Balay       x1    = tmp[idx]; x2 = tmp[1+idx];
5384e2b4712SSatish Balay       sum1 -= v[0]*x1 + v[2]*x2;
5394e2b4712SSatish Balay       sum2 -= v[1]*x1 + v[3]*x2;
5404e2b4712SSatish Balay       v += 4;
5414e2b4712SSatish Balay     }
5424e2b4712SSatish Balay     idc = 2*(*c--);
5434e2b4712SSatish Balay     v   = aa + 4*diag[i];
5444e2b4712SSatish Balay     x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
5454e2b4712SSatish Balay     x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
5464e2b4712SSatish Balay   }
5474e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
5484e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
549e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
550e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5514e2b4712SSatish Balay   PLogFlops(2*4*(a->nz) - a->n);
5524e2b4712SSatish Balay   PetscFunctionReturn(0);
5534e2b4712SSatish Balay }
5544e2b4712SSatish Balay 
5554e2b4712SSatish Balay 
5564e2b4712SSatish Balay #undef __FUNC__
5574e2b4712SSatish Balay #define __FUNC__ "MatSolve_SeqBAIJ_1"
5584e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5594e2b4712SSatish Balay {
5604e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
5614e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
5624e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
5634e2b4712SSatish Balay   int             *diag = a->diag;
5644e2b4712SSatish Balay   Scalar          *aa=a->a,sum1;
565e1311b90SBarry Smith   Scalar *x,*b,*tmp,*v;
5664e2b4712SSatish Balay 
5674e2b4712SSatish Balay   PetscFunctionBegin;
5684e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
5694e2b4712SSatish Balay 
570e1311b90SBarry Smith   ierr = VecGetArray(bb,&b); CHKERRQ(ierr);
571e1311b90SBarry Smith   ierr = VecGetArray(xx,&x); CHKERRQ(ierr);
5724e2b4712SSatish Balay   tmp  = a->solve_work;
5734e2b4712SSatish Balay 
5744e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5754e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5764e2b4712SSatish Balay 
5774e2b4712SSatish Balay   /* forward solve the lower triangular */
5784e2b4712SSatish Balay   tmp[0] = b[*r++];
5794e2b4712SSatish Balay   for ( i=1; i<n; i++ ) {
5804e2b4712SSatish Balay     v     = aa + ai[i];
5814e2b4712SSatish Balay     vi    = aj + ai[i];
5824e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5834e2b4712SSatish Balay     sum1  = b[*r++];
5844e2b4712SSatish Balay     while (nz--) {
5854e2b4712SSatish Balay       sum1 -= (*v++)*tmp[*vi++];
5864e2b4712SSatish Balay     }
5874e2b4712SSatish Balay     tmp[i] = sum1;
5884e2b4712SSatish Balay   }
5894e2b4712SSatish Balay   /* backward solve the upper triangular */
5904e2b4712SSatish Balay   for ( i=n-1; i>=0; i-- ){
5914e2b4712SSatish Balay     v    = aa + diag[i] + 1;
5924e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
5934e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5944e2b4712SSatish Balay     sum1 = tmp[i];
5954e2b4712SSatish Balay     while (nz--) {
5964e2b4712SSatish Balay       sum1 -= (*v++)*tmp[*vi++];
5974e2b4712SSatish Balay     }
5984e2b4712SSatish Balay     x[*c--] = tmp[i] = aa[diag[i]]*sum1;
5994e2b4712SSatish Balay   }
6004e2b4712SSatish Balay 
6014e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout); CHKERRQ(ierr);
6024e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout); CHKERRQ(ierr);
603e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b); CHKERRQ(ierr);
604e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x); CHKERRQ(ierr);
6054e2b4712SSatish Balay   PLogFlops(2*1*(a->nz) - a->n);
6064e2b4712SSatish Balay   PetscFunctionReturn(0);
6074e2b4712SSatish Balay }
6084e2b4712SSatish Balay 
6094e2b4712SSatish Balay extern int MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat,Mat*);
6104e2b4712SSatish Balay extern int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec);
6114e2b4712SSatish Balay /* ----------------------------------------------------------------*/
6124e2b4712SSatish Balay /*
6134e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
6144e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
6154e2b4712SSatish Balay    Not a good example of code reuse.
6164e2b4712SSatish Balay */
6174e2b4712SSatish Balay #undef __FUNC__
6184e2b4712SSatish Balay #define __FUNC__ "MatILUFactorSymbolic_SeqBAIJ"
6194e2b4712SSatish Balay int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
6204e2b4712SSatish Balay                                  Mat *fact)
6214e2b4712SSatish Balay {
6224e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
6234e2b4712SSatish Balay   IS          isicol;
6244e2b4712SSatish Balay   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
6254e2b4712SSatish Balay   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
6264e2b4712SSatish Balay   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
6274e2b4712SSatish Balay   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2;
6284e2b4712SSatish Balay   PetscTruth  col_identity, row_identity;
6294e2b4712SSatish Balay 
6304e2b4712SSatish Balay   PetscFunctionBegin;
631e51c0b9cSSatish Balay   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
632e51c0b9cSSatish Balay 
6334e2b4712SSatish Balay   /* special case that simply copies fill pattern */
6344e2b4712SSatish Balay   PetscValidHeaderSpecific(isrow,IS_COOKIE);
6354e2b4712SSatish Balay   PetscValidHeaderSpecific(iscol,IS_COOKIE);
6364e2b4712SSatish Balay   ISIdentity(isrow,&row_identity); ISIdentity(iscol,&col_identity);
6374e2b4712SSatish Balay   if (levels == 0 && row_identity && col_identity) {
638e1293385SBarry Smith     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact); CHKERRQ(ierr);
6394e2b4712SSatish Balay     (*fact)->factor = FACTOR_LU;
6404e2b4712SSatish Balay     b               = (Mat_SeqBAIJ *) (*fact)->data;
6414e2b4712SSatish Balay     if (!b->diag) {
6424e2b4712SSatish Balay       ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
6434e2b4712SSatish Balay     }
6444e2b4712SSatish Balay     b->row        = isrow;
6454e2b4712SSatish Balay     b->col        = iscol;
646e51c0b9cSSatish Balay     b->icol       = isicol;
6474e2b4712SSatish Balay     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
6484e2b4712SSatish Balay     /*
6494e2b4712SSatish Balay         Blocksize 4 has a special faster solver for ILU(0) factorization
6504e2b4712SSatish Balay         with natural ordering
6514e2b4712SSatish Balay     */
6524e2b4712SSatish Balay     if (b->bs == 4) {
653f830108cSBarry Smith       (*fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
654f830108cSBarry Smith       (*fact)->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
6554e2b4712SSatish Balay     }
6564e2b4712SSatish Balay     PetscFunctionReturn(0);
6574e2b4712SSatish Balay   }
6584e2b4712SSatish Balay 
6594e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
6604e2b4712SSatish Balay   ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
6614e2b4712SSatish Balay 
6624e2b4712SSatish Balay   /* get new row pointers */
6634e2b4712SSatish Balay   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
6644e2b4712SSatish Balay   ainew[0] = 0;
6654e2b4712SSatish Balay   /* don't know how many column pointers are needed so estimate */
6664e2b4712SSatish Balay   jmax = (int) (f*ai[n] + 1);
6674e2b4712SSatish Balay   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
6684e2b4712SSatish Balay   /* ajfill is level of fill for each fill entry */
6694e2b4712SSatish Balay   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
6704e2b4712SSatish Balay   /* fill is a linked list of nonzeros in active row */
6714e2b4712SSatish Balay   fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
6724e2b4712SSatish Balay   /* im is level for each filled value */
6734e2b4712SSatish Balay   im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
6744e2b4712SSatish Balay   /* dloc is location of diagonal in factor */
6754e2b4712SSatish Balay   dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
6764e2b4712SSatish Balay   dloc[0]  = 0;
6774e2b4712SSatish Balay   for ( prow=0; prow<n; prow++ ) {
6784e2b4712SSatish Balay     /* first copy previous fill into linked list */
6794e2b4712SSatish Balay     nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
6804e2b4712SSatish Balay     if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,1,"Empty row in matrix");
6814e2b4712SSatish Balay     xi      = aj + ai[r[prow]];
6824e2b4712SSatish Balay     fill[n] = n;
6834e2b4712SSatish Balay     while (nz--) {
6844e2b4712SSatish Balay       fm  = n;
6854e2b4712SSatish Balay       idx = ic[*xi++];
6864e2b4712SSatish Balay       do {
6874e2b4712SSatish Balay         m  = fm;
6884e2b4712SSatish Balay         fm = fill[m];
6894e2b4712SSatish Balay       } while (fm < idx);
6904e2b4712SSatish Balay       fill[m]   = idx;
6914e2b4712SSatish Balay       fill[idx] = fm;
6924e2b4712SSatish Balay       im[idx]   = 0;
6934e2b4712SSatish Balay     }
6944e2b4712SSatish Balay     nzi = 0;
6954e2b4712SSatish Balay     row = fill[n];
6964e2b4712SSatish Balay     while ( row < prow ) {
6974e2b4712SSatish Balay       incrlev = im[row] + 1;
6984e2b4712SSatish Balay       nz      = dloc[row];
6994e2b4712SSatish Balay       xi      = ajnew  + ainew[row] + nz;
7004e2b4712SSatish Balay       flev    = ajfill + ainew[row] + nz + 1;
7014e2b4712SSatish Balay       nnz     = ainew[row+1] - ainew[row] - nz - 1;
7024e2b4712SSatish Balay       if (*xi++ != row) {
7034e2b4712SSatish Balay         SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,0,"Zero pivot: try running with -pc_ilu_nonzeros_along_diagonal");
7044e2b4712SSatish Balay       }
7054e2b4712SSatish Balay       fm      = row;
7064e2b4712SSatish Balay       while (nnz-- > 0) {
7074e2b4712SSatish Balay         idx = *xi++;
7084e2b4712SSatish Balay         if (*flev + incrlev > levels) {
7094e2b4712SSatish Balay           flev++;
7104e2b4712SSatish Balay           continue;
7114e2b4712SSatish Balay         }
7124e2b4712SSatish Balay         do {
7134e2b4712SSatish Balay           m  = fm;
7144e2b4712SSatish Balay           fm = fill[m];
7154e2b4712SSatish Balay         } while (fm < idx);
7164e2b4712SSatish Balay         if (fm != idx) {
7174e2b4712SSatish Balay           im[idx]   = *flev + incrlev;
7184e2b4712SSatish Balay           fill[m]   = idx;
7194e2b4712SSatish Balay           fill[idx] = fm;
7204e2b4712SSatish Balay           fm        = idx;
7214e2b4712SSatish Balay           nzf++;
722ecf371e4SBarry Smith         } else {
7234e2b4712SSatish Balay           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
7244e2b4712SSatish Balay         }
7254e2b4712SSatish Balay         flev++;
7264e2b4712SSatish Balay       }
7274e2b4712SSatish Balay       row = fill[row];
7284e2b4712SSatish Balay       nzi++;
7294e2b4712SSatish Balay     }
7304e2b4712SSatish Balay     /* copy new filled row into permanent storage */
7314e2b4712SSatish Balay     ainew[prow+1] = ainew[prow] + nzf;
7324e2b4712SSatish Balay     if (ainew[prow+1] > jmax) {
733ecf371e4SBarry Smith 
734ecf371e4SBarry Smith       /* estimate how much additional space we will need */
735ecf371e4SBarry Smith       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
736ecf371e4SBarry Smith       /* just double the memory each time */
737ecf371e4SBarry Smith       int maxadd = jmax;
738ecf371e4SBarry Smith       /* maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n); */
7394e2b4712SSatish Balay       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
7404e2b4712SSatish Balay       jmax += maxadd;
741ecf371e4SBarry Smith 
742ecf371e4SBarry Smith       /* allocate a longer ajnew and ajfill */
7434e2b4712SSatish Balay       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
7444e2b4712SSatish Balay       PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
7454e2b4712SSatish Balay       PetscFree(ajnew);
7464e2b4712SSatish Balay       ajnew = xi;
7474e2b4712SSatish Balay       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
7484e2b4712SSatish Balay       PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
7494e2b4712SSatish Balay       PetscFree(ajfill);
7504e2b4712SSatish Balay       ajfill = xi;
751ecf371e4SBarry Smith       realloc++; /* count how many reallocations are needed */
7524e2b4712SSatish Balay     }
7534e2b4712SSatish Balay     xi          = ajnew + ainew[prow];
7544e2b4712SSatish Balay     flev        = ajfill + ainew[prow];
7554e2b4712SSatish Balay     dloc[prow]  = nzi;
7564e2b4712SSatish Balay     fm          = fill[n];
7574e2b4712SSatish Balay     while (nzf--) {
7584e2b4712SSatish Balay       *xi++   = fm;
7594e2b4712SSatish Balay       *flev++ = im[fm];
7604e2b4712SSatish Balay       fm      = fill[fm];
7614e2b4712SSatish Balay     }
7624e2b4712SSatish Balay   }
7634e2b4712SSatish Balay   PetscFree(ajfill);
7644e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
7654e2b4712SSatish Balay   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
7664e2b4712SSatish Balay   PetscFree(fill); PetscFree(im);
7674e2b4712SSatish Balay 
7684e2b4712SSatish Balay   {
7694e2b4712SSatish Balay     double af = ((double)ainew[n])/((double)ai[n]);
770981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
7714e2b4712SSatish Balay              realloc,f,af);
772981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
773981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
774981c4779SBarry Smith     PLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
7754e2b4712SSatish Balay   }
7764e2b4712SSatish Balay 
7774e2b4712SSatish Balay   /* put together the new matrix */
7784e2b4712SSatish Balay   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
779fa6007c9SSatish Balay   PLogObjectParent(*fact,isicol);
7804e2b4712SSatish Balay   b = (Mat_SeqBAIJ *) (*fact)->data;
7814e2b4712SSatish Balay   PetscFree(b->imax);
7824e2b4712SSatish Balay   b->singlemalloc = 0;
7834e2b4712SSatish Balay   len = bs2*ainew[n]*sizeof(Scalar);
7844e2b4712SSatish Balay   /* the next line frees the default space generated by the Create() */
7854e2b4712SSatish Balay   PetscFree(b->a); PetscFree(b->ilen);
7864e2b4712SSatish Balay   b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
7874e2b4712SSatish Balay   b->j          = ajnew;
7884e2b4712SSatish Balay   b->i          = ainew;
7894e2b4712SSatish Balay   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
7904e2b4712SSatish Balay   b->diag       = dloc;
7914e2b4712SSatish Balay   b->ilen       = 0;
7924e2b4712SSatish Balay   b->imax       = 0;
7934e2b4712SSatish Balay   b->row        = isrow;
7944e2b4712SSatish Balay   b->col        = iscol;
795e51c0b9cSSatish Balay   b->icol       = isicol;
7964e2b4712SSatish Balay   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
7974e2b4712SSatish Balay   /* In b structure:  Free imax, ilen, old a, old j.
7984e2b4712SSatish Balay      Allocate dloc, solve_work, new a, new j */
7994e2b4712SSatish Balay   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
8004e2b4712SSatish Balay   b->maxnz          = b->nz = ainew[n];
8014e2b4712SSatish Balay   (*fact)->factor   = FACTOR_LU;
8024e2b4712SSatish Balay 
8034e2b4712SSatish Balay   (*fact)->info.factor_mallocs    = realloc;
8044e2b4712SSatish Balay   (*fact)->info.fill_ratio_given  = f;
8054e2b4712SSatish Balay   (*fact)->info.fill_ratio_needed = ((double)ainew[n])/((double)ai[prow]);
8064e2b4712SSatish Balay 
8074e2b4712SSatish Balay   PetscFunctionReturn(0);
8084e2b4712SSatish Balay }
8094e2b4712SSatish Balay 
8104e2b4712SSatish Balay 
8114e2b4712SSatish Balay 
8124e2b4712SSatish Balay 
813