xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 4c0dbd8d9ffab79a3b6bd0a1a69f9c87d25f6e92)
1be1d678aSKris Buschelman 
24e2b4712SSatish Balay /*
34e2b4712SSatish Balay     Factorization code for BAIJ format.
44e2b4712SSatish Balay */
54e2b4712SSatish Balay 
6c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
7c6db04a5SJed Brown #include <../src/mat/blockinvert.h>
8c6db04a5SJed Brown #include <petscbt.h>
9c6db04a5SJed Brown #include <../src/mat/utils/freespace.h>
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
1293fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
1393fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1493fd935bSShri Abhyankar {
1593fd935bSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1693fd935bSShri Abhyankar   PetscErrorCode    ierr;
1793fd935bSShri Abhyankar   const PetscInt    *adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
1893fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
1993fd935bSShri Abhyankar   PetscInt          nz;
2093fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
2193fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
2293fd935bSShri Abhyankar   const PetscScalar *b;
2393fd935bSShri Abhyankar 
2493fd935bSShri Abhyankar   PetscFunctionBegin;
253649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2693fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2793fd935bSShri Abhyankar   tmp  = a->solve_work;
2893fd935bSShri Abhyankar 
2993fd935bSShri Abhyankar 
3093fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
3193fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[i];
3293fd935bSShri Abhyankar 
3393fd935bSShri Abhyankar   /* forward solve the U^T */
3493fd935bSShri Abhyankar   for (i=0; i<n; i++) {
3593fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
3693fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
3793fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
3893fd935bSShri Abhyankar     s1  = tmp[i];
3993fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
4093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
4193fd935bSShri Abhyankar     tmp[i] = s1;
4293fd935bSShri Abhyankar   }
4393fd935bSShri Abhyankar 
4493fd935bSShri Abhyankar   /* backward solve the L^T */
4593fd935bSShri Abhyankar   for (i=n-1; i>=0; i--){
4693fd935bSShri Abhyankar     v   = aa + ai[i];
4793fd935bSShri Abhyankar     vi  = aj + ai[i];
4893fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
4993fd935bSShri Abhyankar     s1  = tmp[i];
5093fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
5193fd935bSShri Abhyankar   }
5293fd935bSShri Abhyankar 
5393fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
5493fd935bSShri Abhyankar   for (i=0; i<n; i++) x[i] = tmp[i];
5593fd935bSShri Abhyankar 
563649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5793fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5893fd935bSShri Abhyankar 
5993fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
6093fd935bSShri Abhyankar   PetscFunctionReturn(0);
6193fd935bSShri Abhyankar }
6293fd935bSShri Abhyankar 
6393fd935bSShri Abhyankar #undef __FUNCT__
6406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
6506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
66f1af5d2fSBarry Smith {
67f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
68dfbe8321SBarry Smith   PetscErrorCode    ierr;
690b68f018SBarry Smith   PetscInt          i,nz;
700b68f018SBarry Smith   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
710b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
720b68f018SBarry Smith   PetscScalar       s1,*x;
730b68f018SBarry Smith   const PetscScalar *b;
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   PetscFunctionBegin;
76ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
773649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
781ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
79f1af5d2fSBarry Smith 
80f1af5d2fSBarry Smith   /* forward solve the U^T */
81f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
82f1af5d2fSBarry Smith 
83f1af5d2fSBarry Smith     v     = aa + diag[i];
84f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
85ef66eb69SBarry Smith     s1    = (*v++)*x[i];
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
90f1af5d2fSBarry Smith     }
91f1af5d2fSBarry Smith     x[i]   = s1;
92f1af5d2fSBarry Smith   }
93f1af5d2fSBarry Smith   /* backward solve the L^T */
94f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
95f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
96f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
97f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
98f1af5d2fSBarry Smith     s1   = x[i];
99f1af5d2fSBarry Smith     while (nz--) {
100f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
101f1af5d2fSBarry Smith     }
102f1af5d2fSBarry Smith   }
1033649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
105dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
106f1af5d2fSBarry Smith   PetscFunctionReturn(0);
107f1af5d2fSBarry Smith }
108f1af5d2fSBarry Smith 
1094a2ae208SSatish Balay #undef __FUNCT__
11006e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
11106e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
112f1af5d2fSBarry Smith {
113f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
114dfbe8321SBarry Smith   PetscErrorCode    ierr;
115b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
116b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
117b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
118b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
119b3260449SShri Abhyankar   const PetscScalar *b;
120f1af5d2fSBarry Smith 
121f1af5d2fSBarry Smith   PetscFunctionBegin;
122ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1233649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1241ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125f1af5d2fSBarry Smith 
126f1af5d2fSBarry Smith   /* forward solve the U^T */
127f1af5d2fSBarry Smith   idx = 0;
128f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
129f1af5d2fSBarry Smith 
130f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
131f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
132ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
133f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
134f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
135f1af5d2fSBarry Smith     v += 4;
136f1af5d2fSBarry Smith 
137f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
138f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
139f1af5d2fSBarry Smith     while (nz--) {
140f1af5d2fSBarry Smith       oidx = 2*(*vi++);
141f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
142f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
143f1af5d2fSBarry Smith       v  += 4;
144f1af5d2fSBarry Smith     }
145f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
146f1af5d2fSBarry Smith     idx += 2;
147f1af5d2fSBarry Smith   }
148f1af5d2fSBarry Smith   /* backward solve the L^T */
149f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
150f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
151f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
152f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
153f1af5d2fSBarry Smith     idt  = 2*i;
154f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
155f1af5d2fSBarry Smith     while (nz--) {
156f1af5d2fSBarry Smith       idx   = 2*(*vi--);
157f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
158f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
159f1af5d2fSBarry Smith       v -= 4;
160f1af5d2fSBarry Smith     }
161f1af5d2fSBarry Smith   }
1623649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
164dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
165f1af5d2fSBarry Smith   PetscFunctionReturn(0);
166f1af5d2fSBarry Smith }
167f1af5d2fSBarry Smith 
1684a2ae208SSatish Balay #undef __FUNCT__
1694dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
1704dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1716929473cSShri Abhyankar {
1726929473cSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1736929473cSShri Abhyankar   PetscErrorCode    ierr;
174b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1756929473cSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
176b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
177b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
178b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
179b3260449SShri Abhyankar   const PetscScalar *b;
1806929473cSShri Abhyankar 
1816929473cSShri Abhyankar   PetscFunctionBegin;
1826929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1833649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1846929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1856929473cSShri Abhyankar 
1866929473cSShri Abhyankar   /* forward solve the U^T */
1876929473cSShri Abhyankar   idx = 0;
1886929473cSShri Abhyankar   for (i=0; i<n; i++) {
1896929473cSShri Abhyankar     v     = aa + bs2*diag[i];
1906929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1916929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1926929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1936929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1946929473cSShri Abhyankar     v -= bs2;
1956929473cSShri Abhyankar 
1966929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
1976929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
1986929473cSShri Abhyankar     for(j=0;j>-nz;j--){
1996929473cSShri Abhyankar       oidx = bs*vi[j];
2006929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
2016929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
2026929473cSShri Abhyankar       v  -= bs2;
2036929473cSShri Abhyankar     }
2046929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
2056929473cSShri Abhyankar     idx += bs;
2066929473cSShri Abhyankar   }
2076929473cSShri Abhyankar   /* backward solve the L^T */
2086929473cSShri Abhyankar   for (i=n-1; i>=0; i--){
2096929473cSShri Abhyankar     v    = aa + bs2*ai[i];
2106929473cSShri Abhyankar     vi   = aj + ai[i];
2116929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
2126929473cSShri Abhyankar     idt  = bs*i;
2136929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
2146929473cSShri Abhyankar     for(j=0;j<nz;j++){
2156929473cSShri Abhyankar       idx   = bs*vi[j];
2166929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
2176929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
2186929473cSShri Abhyankar       v += bs2;
2196929473cSShri Abhyankar     }
2206929473cSShri Abhyankar   }
2213649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2226929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2236929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2246929473cSShri Abhyankar   PetscFunctionReturn(0);
2256929473cSShri Abhyankar }
2266929473cSShri Abhyankar 
2276929473cSShri Abhyankar #undef __FUNCT__
22806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
22906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
230f1af5d2fSBarry Smith {
231f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
232dfbe8321SBarry Smith   PetscErrorCode    ierr;
233b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
234b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
235b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
236b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
237b3260449SShri Abhyankar   const PetscScalar *b;
238f1af5d2fSBarry Smith 
239f1af5d2fSBarry Smith   PetscFunctionBegin;
240ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2413649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2421ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
243f1af5d2fSBarry Smith 
244f1af5d2fSBarry Smith   /* forward solve the U^T */
245f1af5d2fSBarry Smith   idx = 0;
246f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
247f1af5d2fSBarry Smith 
248f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
249f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
250ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
251f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
252f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
253f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
254f1af5d2fSBarry Smith     v += 9;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
257f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
258f1af5d2fSBarry Smith     while (nz--) {
259f1af5d2fSBarry Smith       oidx = 3*(*vi++);
260f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
261f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
262f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
263f1af5d2fSBarry Smith       v  += 9;
264f1af5d2fSBarry Smith     }
265f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
266f1af5d2fSBarry Smith     idx += 3;
267f1af5d2fSBarry Smith   }
268f1af5d2fSBarry Smith   /* backward solve the L^T */
269f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
270f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
271f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
272f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
273f1af5d2fSBarry Smith     idt  = 3*i;
274f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
275f1af5d2fSBarry Smith     while (nz--) {
276f1af5d2fSBarry Smith       idx   = 3*(*vi--);
277f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
278f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
279f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
280f1af5d2fSBarry Smith       v -= 9;
281f1af5d2fSBarry Smith     }
282f1af5d2fSBarry Smith   }
2833649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2841ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
285dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
286f1af5d2fSBarry Smith   PetscFunctionReturn(0);
287f1af5d2fSBarry Smith }
288f1af5d2fSBarry Smith 
2894a2ae208SSatish Balay #undef __FUNCT__
2904dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
2914dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2928499736aSShri Abhyankar {
2938499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2948499736aSShri Abhyankar   PetscErrorCode    ierr;
295b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2968499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
297b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
298b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
299b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
300b3260449SShri Abhyankar   const PetscScalar *b;
3018499736aSShri Abhyankar 
3028499736aSShri Abhyankar   PetscFunctionBegin;
3038499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3043649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3058499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3068499736aSShri Abhyankar 
3078499736aSShri Abhyankar   /* forward solve the U^T */
3088499736aSShri Abhyankar   idx = 0;
3098499736aSShri Abhyankar   for (i=0; i<n; i++) {
3108499736aSShri Abhyankar     v     = aa + bs2*diag[i];
3118499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
3128499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
3138499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
3148499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
3158499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
3168499736aSShri Abhyankar     v -= bs2;
3178499736aSShri Abhyankar 
3188499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
3198499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
3208499736aSShri Abhyankar     for(j=0;j>-nz;j--){
3218499736aSShri Abhyankar       oidx = bs*vi[j];
3228499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3238499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3248499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3258499736aSShri Abhyankar       v  -= bs2;
3268499736aSShri Abhyankar     }
3278499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
3288499736aSShri Abhyankar     idx += bs;
3298499736aSShri Abhyankar   }
3308499736aSShri Abhyankar   /* backward solve the L^T */
3318499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
3328499736aSShri Abhyankar     v    = aa + bs2*ai[i];
3338499736aSShri Abhyankar     vi   = aj + ai[i];
3348499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
3358499736aSShri Abhyankar     idt  = bs*i;
3368499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
3378499736aSShri Abhyankar     for(j=0;j<nz;j++){
3388499736aSShri Abhyankar       idx   = bs*vi[j];
3398499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
3408499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
3418499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
3428499736aSShri Abhyankar       v += bs2;
3438499736aSShri Abhyankar     }
3448499736aSShri Abhyankar   }
3453649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3468499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3478499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3488499736aSShri Abhyankar   PetscFunctionReturn(0);
3498499736aSShri Abhyankar }
3508499736aSShri Abhyankar 
3518499736aSShri Abhyankar #undef __FUNCT__
35206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
35306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
354f1af5d2fSBarry Smith {
355f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
356dfbe8321SBarry Smith   PetscErrorCode    ierr;
357b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
358b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
359b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
360b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
361b3260449SShri Abhyankar   const PetscScalar *b;
362f1af5d2fSBarry Smith 
363f1af5d2fSBarry Smith   PetscFunctionBegin;
364ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3653649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3661ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
367f1af5d2fSBarry Smith 
368f1af5d2fSBarry Smith   /* forward solve the U^T */
369f1af5d2fSBarry Smith   idx = 0;
370f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
371f1af5d2fSBarry Smith 
372f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
373f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
374ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
375f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
376f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
377f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
378f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
379f1af5d2fSBarry Smith     v += 16;
380f1af5d2fSBarry Smith 
381f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
382f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
383f1af5d2fSBarry Smith     while (nz--) {
384f1af5d2fSBarry Smith       oidx = 4*(*vi++);
385f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
386f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
387f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
388f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
389f1af5d2fSBarry Smith       v  += 16;
390f1af5d2fSBarry Smith     }
391f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
392f1af5d2fSBarry Smith     idx += 4;
393f1af5d2fSBarry Smith   }
394f1af5d2fSBarry Smith   /* backward solve the L^T */
395f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
396f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
397f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
398f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
399f1af5d2fSBarry Smith     idt  = 4*i;
400f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
401f1af5d2fSBarry Smith     while (nz--) {
402f1af5d2fSBarry Smith       idx   = 4*(*vi--);
403f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
404f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
405f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
406f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
407f1af5d2fSBarry Smith       v -= 16;
408f1af5d2fSBarry Smith     }
409f1af5d2fSBarry Smith   }
4103649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
412dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
413f1af5d2fSBarry Smith   PetscFunctionReturn(0);
414f1af5d2fSBarry Smith }
415f1af5d2fSBarry Smith 
4164a2ae208SSatish Balay #undef __FUNCT__
4174dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
4184dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4198499736aSShri Abhyankar {
4208499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4218499736aSShri Abhyankar   PetscErrorCode    ierr;
422b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
4238499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
424b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
425b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
426b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
427b3260449SShri Abhyankar   const PetscScalar *b;
4288499736aSShri Abhyankar 
4298499736aSShri Abhyankar   PetscFunctionBegin;
4308499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4313649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4328499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4338499736aSShri Abhyankar 
4348499736aSShri Abhyankar   /* forward solve the U^T */
4358499736aSShri Abhyankar   idx = 0;
4368499736aSShri Abhyankar   for (i=0; i<n; i++) {
4378499736aSShri Abhyankar     v     = aa + bs2*diag[i];
4388499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
4398499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
4408499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
4418499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
4428499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
4438499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
4448499736aSShri Abhyankar     v -= bs2;
4458499736aSShri Abhyankar 
4468499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
4478499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
4488499736aSShri Abhyankar     for(j=0;j>-nz;j--){
4498499736aSShri Abhyankar       oidx = bs*vi[j];
4508499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4518499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4528499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4538499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4548499736aSShri Abhyankar       v  -= bs2;
4558499736aSShri Abhyankar     }
4568499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
4578499736aSShri Abhyankar     idx += bs;
4588499736aSShri Abhyankar   }
4598499736aSShri Abhyankar   /* backward solve the L^T */
4608499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
4618499736aSShri Abhyankar     v    = aa + bs2*ai[i];
4628499736aSShri Abhyankar     vi   = aj + ai[i];
4638499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
4648499736aSShri Abhyankar     idt  = bs*i;
4658499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
4668499736aSShri Abhyankar     for(j=0;j<nz;j++){
4678499736aSShri Abhyankar       idx   = bs*vi[j];
4688499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4698499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4708499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4718499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4728499736aSShri Abhyankar       v += bs2;
4738499736aSShri Abhyankar     }
4748499736aSShri Abhyankar   }
4753649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4768499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4778499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4788499736aSShri Abhyankar   PetscFunctionReturn(0);
4798499736aSShri Abhyankar }
4808499736aSShri Abhyankar 
4818499736aSShri Abhyankar #undef __FUNCT__
48206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
48306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
484f1af5d2fSBarry Smith {
485f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
486dfbe8321SBarry Smith   PetscErrorCode    ierr;
487b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
488b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
489b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
490b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
491b3260449SShri Abhyankar   const PetscScalar *b;
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith   PetscFunctionBegin;
494ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4953649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4961ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
497f1af5d2fSBarry Smith 
498f1af5d2fSBarry Smith   /* forward solve the U^T */
499f1af5d2fSBarry Smith   idx = 0;
500f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
501f1af5d2fSBarry Smith 
502f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
503f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
504ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
505f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
506f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
507f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
508f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
509f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
510f1af5d2fSBarry Smith     v += 25;
511f1af5d2fSBarry Smith 
512f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
513f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
514f1af5d2fSBarry Smith     while (nz--) {
515f1af5d2fSBarry Smith       oidx = 5*(*vi++);
516f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
517f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
518f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
519f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
520f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
521f1af5d2fSBarry Smith       v  += 25;
522f1af5d2fSBarry Smith     }
523f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
524f1af5d2fSBarry Smith     idx += 5;
525f1af5d2fSBarry Smith   }
526f1af5d2fSBarry Smith   /* backward solve the L^T */
527f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
528f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
529f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
530f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
531f1af5d2fSBarry Smith     idt  = 5*i;
532f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
533f1af5d2fSBarry Smith     while (nz--) {
534f1af5d2fSBarry Smith       idx   = 5*(*vi--);
535f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
536f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
537f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
538f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
539f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
540f1af5d2fSBarry Smith       v -= 25;
541f1af5d2fSBarry Smith     }
542f1af5d2fSBarry Smith   }
5433649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
545dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
546f1af5d2fSBarry Smith   PetscFunctionReturn(0);
547f1af5d2fSBarry Smith }
548f1af5d2fSBarry Smith 
5494a2ae208SSatish Balay #undef __FUNCT__
5504dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
5514dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
5528499736aSShri Abhyankar {
5538499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5548499736aSShri Abhyankar   PetscErrorCode ierr;
555b3260449SShri Abhyankar   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5568499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
557b3260449SShri Abhyankar   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
558b3260449SShri Abhyankar   const MatScalar      *aa=a->a,*v;
559b3260449SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
560b3260449SShri Abhyankar   const PetscScalar    *b;
5618499736aSShri Abhyankar 
5628499736aSShri Abhyankar   PetscFunctionBegin;
5638499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5643649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5658499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5668499736aSShri Abhyankar 
5678499736aSShri Abhyankar   /* forward solve the U^T */
5688499736aSShri Abhyankar   idx = 0;
5698499736aSShri Abhyankar   for (i=0; i<n; i++) {
5708499736aSShri Abhyankar     v     = aa + bs2*diag[i];
5718499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
5728499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
5738499736aSShri Abhyankar     x5 = x[4+idx];
5748499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
5758499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
5768499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
5778499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
5788499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
5798499736aSShri Abhyankar     v -= bs2;
5808499736aSShri Abhyankar 
5818499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
5828499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
5838499736aSShri Abhyankar     for(j=0;j>-nz;j--){
5848499736aSShri Abhyankar       oidx = bs*vi[j];
5858499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5868499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5878499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5888499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5898499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5908499736aSShri Abhyankar       v  -= bs2;
5918499736aSShri Abhyankar     }
5928499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
5938499736aSShri Abhyankar     idx += bs;
5948499736aSShri Abhyankar   }
5958499736aSShri Abhyankar   /* backward solve the L^T */
5968499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
5978499736aSShri Abhyankar     v    = aa + bs2*ai[i];
5988499736aSShri Abhyankar     vi   = aj + ai[i];
5998499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
6008499736aSShri Abhyankar     idt  = bs*i;
6018499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
6028499736aSShri Abhyankar     for(j=0;j<nz;j++){
6038499736aSShri Abhyankar       idx   = bs*vi[j];
6048499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
6058499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
6068499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
6078499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
6088499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
6098499736aSShri Abhyankar       v += bs2;
6108499736aSShri Abhyankar     }
6118499736aSShri Abhyankar   }
6123649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
6138499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
6148499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
6158499736aSShri Abhyankar   PetscFunctionReturn(0);
6168499736aSShri Abhyankar }
6178499736aSShri Abhyankar 
6188499736aSShri Abhyankar #undef __FUNCT__
61906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
62006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
621f1af5d2fSBarry Smith {
622f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
623dfbe8321SBarry Smith   PetscErrorCode    ierr;
624b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
625b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
626b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
627b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
628b3260449SShri Abhyankar   const PetscScalar *b;
629f1af5d2fSBarry Smith 
630f1af5d2fSBarry Smith   PetscFunctionBegin;
631ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6323649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
6331ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
634f1af5d2fSBarry Smith 
635f1af5d2fSBarry Smith   /* forward solve the U^T */
636f1af5d2fSBarry Smith   idx = 0;
637f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
638f1af5d2fSBarry Smith 
639f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
640f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
641ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
642ef66eb69SBarry Smith     x6    = x[5+idx];
643f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
644f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
645f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
646f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
647f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
648f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
649f1af5d2fSBarry Smith     v += 36;
650f1af5d2fSBarry Smith 
651f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
652f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
653f1af5d2fSBarry Smith     while (nz--) {
654f1af5d2fSBarry Smith       oidx = 6*(*vi++);
655f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
656f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
657f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
658f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
659f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
660f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
661f1af5d2fSBarry Smith       v  += 36;
662f1af5d2fSBarry Smith     }
663f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
664f1af5d2fSBarry Smith     x[5+idx] = s6;
665f1af5d2fSBarry Smith     idx += 6;
666f1af5d2fSBarry Smith   }
667f1af5d2fSBarry Smith   /* backward solve the L^T */
668f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
669f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
670f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
671f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
672f1af5d2fSBarry Smith     idt  = 6*i;
673f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
674f1af5d2fSBarry Smith     s6 = x[5+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 6*(*vi--);
677f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
678f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
679f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
680f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
681f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
682f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
683f1af5d2fSBarry Smith       v -= 36;
684f1af5d2fSBarry Smith     }
685f1af5d2fSBarry Smith   }
6863649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
6871ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
688dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
689f1af5d2fSBarry Smith   PetscFunctionReturn(0);
690f1af5d2fSBarry Smith }
691f1af5d2fSBarry Smith 
6924a2ae208SSatish Balay #undef __FUNCT__
6934dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
6944dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
6958499736aSShri Abhyankar {
6968499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
6978499736aSShri Abhyankar   PetscErrorCode    ierr;
698b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
6998499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
700b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
701b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
702b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
703b3260449SShri Abhyankar   const PetscScalar *b;
7048499736aSShri Abhyankar 
7058499736aSShri Abhyankar   PetscFunctionBegin;
7068499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7073649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
7088499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
7098499736aSShri Abhyankar 
7108499736aSShri Abhyankar   /* forward solve the U^T */
7118499736aSShri Abhyankar   idx = 0;
7128499736aSShri Abhyankar   for (i=0; i<n; i++) {
7138499736aSShri Abhyankar     v     = aa + bs2*diag[i];
7148499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
7158499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
7168499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
7178499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
7188499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
7198499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
7208499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
7218499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
7228499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
7238499736aSShri Abhyankar     v -= bs2;
7248499736aSShri Abhyankar 
7258499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
7268499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
7278499736aSShri Abhyankar     for(j=0;j>-nz;j--){
7288499736aSShri Abhyankar       oidx = bs*vi[j];
7298499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7308499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7318499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7328499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7338499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7348499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7358499736aSShri Abhyankar       v  -= bs2;
7368499736aSShri Abhyankar     }
7378499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
7388499736aSShri Abhyankar     x[5+idx] = s6;
7398499736aSShri Abhyankar     idx += bs;
7408499736aSShri Abhyankar   }
7418499736aSShri Abhyankar   /* backward solve the L^T */
7428499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
7438499736aSShri Abhyankar     v    = aa + bs2*ai[i];
7448499736aSShri Abhyankar     vi   = aj + ai[i];
7458499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
7468499736aSShri Abhyankar     idt  = bs*i;
7478499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
7488499736aSShri Abhyankar     s6   = x[5+idt];
7498499736aSShri Abhyankar     for(j=0;j<nz;j++){
7508499736aSShri Abhyankar       idx   = bs*vi[j];
7518499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7528499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7538499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7548499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7558499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7568499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7578499736aSShri Abhyankar       v += bs2;
7588499736aSShri Abhyankar     }
7598499736aSShri Abhyankar   }
7603649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
7618499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
7628499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
7638499736aSShri Abhyankar   PetscFunctionReturn(0);
7648499736aSShri Abhyankar }
7658499736aSShri Abhyankar 
7668499736aSShri Abhyankar #undef __FUNCT__
76706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
76806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
769f1af5d2fSBarry Smith {
770f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
771dfbe8321SBarry Smith   PetscErrorCode    ierr;
772b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
773b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
774b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
775b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
776b3260449SShri Abhyankar   const PetscScalar *b;
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   PetscFunctionBegin;
779ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7803649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
7811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
782f1af5d2fSBarry Smith 
783f1af5d2fSBarry Smith   /* forward solve the U^T */
784f1af5d2fSBarry Smith   idx = 0;
785f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
786f1af5d2fSBarry Smith 
787f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
788f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
789ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
790ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
791f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
792f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
793f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
794f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
795f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
796f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
797f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
798f1af5d2fSBarry Smith     v += 49;
799f1af5d2fSBarry Smith 
800f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
801f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
802f1af5d2fSBarry Smith     while (nz--) {
803f1af5d2fSBarry Smith       oidx = 7*(*vi++);
804f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
805f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
806f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
807f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
808f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
809f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
810f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
811f1af5d2fSBarry Smith       v  += 49;
812f1af5d2fSBarry Smith     }
813f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
814f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
815f1af5d2fSBarry Smith     idx += 7;
816f1af5d2fSBarry Smith   }
817f1af5d2fSBarry Smith   /* backward solve the L^T */
818f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
819f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
820f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
821f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
822f1af5d2fSBarry Smith     idt  = 7*i;
823f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
824f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
825f1af5d2fSBarry Smith     while (nz--) {
826f1af5d2fSBarry Smith       idx   = 7*(*vi--);
827f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
828f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
829f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
830f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
831f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
832f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
833f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
834f1af5d2fSBarry Smith       v -= 49;
835f1af5d2fSBarry Smith     }
836f1af5d2fSBarry Smith   }
8373649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
8381ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
839dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
840f1af5d2fSBarry Smith   PetscFunctionReturn(0);
841f1af5d2fSBarry Smith }
8428499736aSShri Abhyankar #undef __FUNCT__
8434dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
8444dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
8458499736aSShri Abhyankar {
8468499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
8478499736aSShri Abhyankar   PetscErrorCode    ierr;
848b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
8498499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
850b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
851b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
852b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
853b3260449SShri Abhyankar   const PetscScalar *b;
8548499736aSShri Abhyankar 
8558499736aSShri Abhyankar   PetscFunctionBegin;
8568499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
8573649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
8588499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8598499736aSShri Abhyankar 
8608499736aSShri Abhyankar   /* forward solve the U^T */
8618499736aSShri Abhyankar   idx = 0;
8628499736aSShri Abhyankar   for (i=0; i<n; i++) {
8638499736aSShri Abhyankar     v     = aa + bs2*diag[i];
8648499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
8658499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
8668499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
8678499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
8688499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
8698499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
8708499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
8718499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
8728499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
8738499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
8748499736aSShri Abhyankar     v -= bs2;
8758499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
8768499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
8778499736aSShri Abhyankar     for(j=0;j>-nz;j--){
8788499736aSShri Abhyankar       oidx = bs*vi[j];
8798499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8808499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8818499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8828499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8838499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8848499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8858499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8868499736aSShri Abhyankar       v  -= bs2;
8878499736aSShri Abhyankar     }
8888499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
8898499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
8908499736aSShri Abhyankar     idx += bs;
8918499736aSShri Abhyankar   }
8928499736aSShri Abhyankar   /* backward solve the L^T */
8938499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
8948499736aSShri Abhyankar     v    = aa + bs2*ai[i];
8958499736aSShri Abhyankar     vi   = aj + ai[i];
8968499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
8978499736aSShri Abhyankar     idt  = bs*i;
8988499736aSShri Abhyankar     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
8998499736aSShri Abhyankar     s6   = x[5+idt];  s7 = x[6+idt];
9008499736aSShri Abhyankar     for(j=0;j<nz;j++){
9018499736aSShri Abhyankar       idx   = bs*vi[j];
9028499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
9038499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
9048499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
9058499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
9068499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
9078499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
9088499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
9098499736aSShri Abhyankar       v += bs2;
9108499736aSShri Abhyankar     }
9118499736aSShri Abhyankar   }
9123649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
9138499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
9148499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
9158499736aSShri Abhyankar   PetscFunctionReturn(0);
9168499736aSShri Abhyankar }
917f1af5d2fSBarry Smith 
918f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
9194a2ae208SSatish Balay #undef __FUNCT__
92093fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
92193fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
92293fd935bSShri Abhyankar {
92393fd935bSShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
92493fd935bSShri Abhyankar   IS                iscol = a->col,isrow = a->row;
92593fd935bSShri Abhyankar   PetscErrorCode    ierr;
92693fd935bSShri Abhyankar   const PetscInt    *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi;
92793fd935bSShri Abhyankar   PetscInt          i,n = a->mbs,j;
92893fd935bSShri Abhyankar   PetscInt          nz;
92993fd935bSShri Abhyankar   PetscScalar       *x,*tmp,s1;
93093fd935bSShri Abhyankar   const MatScalar   *aa = a->a,*v;
93193fd935bSShri Abhyankar   const PetscScalar *b;
93293fd935bSShri Abhyankar 
93393fd935bSShri Abhyankar   PetscFunctionBegin;
9343649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
93593fd935bSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
93693fd935bSShri Abhyankar   tmp  = a->solve_work;
93793fd935bSShri Abhyankar 
93893fd935bSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
93993fd935bSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
94093fd935bSShri Abhyankar 
94193fd935bSShri Abhyankar   /* copy the b into temp work space according to permutation */
94293fd935bSShri Abhyankar   for (i=0; i<n; i++) tmp[i] = b[c[i]];
94393fd935bSShri Abhyankar 
94493fd935bSShri Abhyankar   /* forward solve the U^T */
94593fd935bSShri Abhyankar   for (i=0; i<n; i++) {
94693fd935bSShri Abhyankar     v   = aa + adiag[i+1] + 1;
94793fd935bSShri Abhyankar     vi  = aj + adiag[i+1] + 1;
94893fd935bSShri Abhyankar     nz  = adiag[i] - adiag[i+1] - 1;
94993fd935bSShri Abhyankar     s1  = tmp[i];
95093fd935bSShri Abhyankar     s1 *= v[nz];  /* multiply by inverse of diagonal entry */
95193fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
95293fd935bSShri Abhyankar     tmp[i] = s1;
95393fd935bSShri Abhyankar   }
95493fd935bSShri Abhyankar 
95593fd935bSShri Abhyankar   /* backward solve the L^T */
95693fd935bSShri Abhyankar   for (i=n-1; i>=0; i--){
95793fd935bSShri Abhyankar     v   = aa + ai[i];
95893fd935bSShri Abhyankar     vi  = aj + ai[i];
95993fd935bSShri Abhyankar     nz  = ai[i+1] - ai[i];
96093fd935bSShri Abhyankar     s1  = tmp[i];
96193fd935bSShri Abhyankar     for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j];
96293fd935bSShri Abhyankar   }
96393fd935bSShri Abhyankar 
96493fd935bSShri Abhyankar   /* copy tmp into x according to permutation */
96593fd935bSShri Abhyankar   for (i=0; i<n; i++) x[r[i]] = tmp[i];
96693fd935bSShri Abhyankar 
96793fd935bSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
96893fd935bSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9693649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
97093fd935bSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
97193fd935bSShri Abhyankar 
97293fd935bSShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr);
97393fd935bSShri Abhyankar   PetscFunctionReturn(0);
97493fd935bSShri Abhyankar }
97593fd935bSShri Abhyankar 
97693fd935bSShri Abhyankar #undef __FUNCT__
97706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
97806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
979f1af5d2fSBarry Smith {
980f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
981f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
9826849ba73SBarry Smith   PetscErrorCode    ierr;
9835d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
984b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
985b3260449SShri Abhyankar   PetscInt          i,nz;
986b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
987b3260449SShri Abhyankar   PetscScalar       s1,*x,*t;
988b3260449SShri Abhyankar   const PetscScalar *b;
989f1af5d2fSBarry Smith 
990f1af5d2fSBarry Smith   PetscFunctionBegin;
9913649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
9921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
993f1af5d2fSBarry Smith   t  = a->solve_work;
994f1af5d2fSBarry Smith 
995f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
996f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
997f1af5d2fSBarry Smith 
998f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
999f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1000f1af5d2fSBarry Smith     t[i] = b[c[i]];
1001f1af5d2fSBarry Smith   }
1002f1af5d2fSBarry Smith 
1003f1af5d2fSBarry Smith   /* forward solve the U^T */
1004f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1005f1af5d2fSBarry Smith 
1006f1af5d2fSBarry Smith     v     = aa + diag[i];
1007f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1008f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
1009f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1010f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1011f1af5d2fSBarry Smith     while (nz--) {
1012f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
1013f1af5d2fSBarry Smith     }
1014f1af5d2fSBarry Smith     t[i]   = s1;
1015f1af5d2fSBarry Smith   }
1016f1af5d2fSBarry Smith   /* backward solve the L^T */
1017f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1018f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
1019f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1020f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1021f1af5d2fSBarry Smith     s1   = t[i];
1022f1af5d2fSBarry Smith     while (nz--) {
1023f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
1024f1af5d2fSBarry Smith     }
1025f1af5d2fSBarry Smith   }
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1028f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1029f1af5d2fSBarry Smith     x[r[i]]   = t[i];
1030f1af5d2fSBarry Smith   }
1031f1af5d2fSBarry Smith 
1032f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1033f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10343649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
10351ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1036dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
1037f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1038f1af5d2fSBarry Smith }
1039f1af5d2fSBarry Smith 
10404a2ae208SSatish Balay #undef __FUNCT__
104106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
104206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1043f1af5d2fSBarry Smith {
1044f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1045f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
10466849ba73SBarry Smith   PetscErrorCode    ierr;
10475d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1048b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1049b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1050b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1051b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1052b3260449SShri Abhyankar   const PetscScalar *b;
1053f1af5d2fSBarry Smith 
1054f1af5d2fSBarry Smith   PetscFunctionBegin;
10553649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
10561ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1057f1af5d2fSBarry Smith   t  = a->solve_work;
1058f1af5d2fSBarry Smith 
1059f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1060f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1061f1af5d2fSBarry Smith 
1062f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1063f1af5d2fSBarry Smith   ii = 0;
1064f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1065f1af5d2fSBarry Smith     ic      = 2*c[i];
1066f1af5d2fSBarry Smith     t[ii]   = b[ic];
1067f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1068f1af5d2fSBarry Smith     ii += 2;
1069f1af5d2fSBarry Smith   }
1070f1af5d2fSBarry Smith 
1071f1af5d2fSBarry Smith   /* forward solve the U^T */
1072f1af5d2fSBarry Smith   idx = 0;
1073f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1074f1af5d2fSBarry Smith 
1075f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
1076f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1077f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
1078f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
1079f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
1080f1af5d2fSBarry Smith     v += 4;
1081f1af5d2fSBarry Smith 
1082f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1083f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1084f1af5d2fSBarry Smith     while (nz--) {
1085f1af5d2fSBarry Smith       oidx = 2*(*vi++);
1086f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1087f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1088f1af5d2fSBarry Smith       v  += 4;
1089f1af5d2fSBarry Smith     }
1090f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1091f1af5d2fSBarry Smith     idx += 2;
1092f1af5d2fSBarry Smith   }
1093f1af5d2fSBarry Smith   /* backward solve the L^T */
1094f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1095f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
1096f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1097f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1098f1af5d2fSBarry Smith     idt  = 2*i;
1099f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1100f1af5d2fSBarry Smith     while (nz--) {
1101f1af5d2fSBarry Smith       idx   = 2*(*vi--);
1102f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1103f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1104f1af5d2fSBarry Smith       v -= 4;
1105f1af5d2fSBarry Smith     }
1106f1af5d2fSBarry Smith   }
1107f1af5d2fSBarry Smith 
1108f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1109f1af5d2fSBarry Smith   ii = 0;
1110f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1111f1af5d2fSBarry Smith     ir      = 2*r[i];
1112f1af5d2fSBarry Smith     x[ir]   = t[ii];
1113f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1114f1af5d2fSBarry Smith     ii += 2;
1115f1af5d2fSBarry Smith   }
1116f1af5d2fSBarry Smith 
1117f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1118f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11193649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
11201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1121dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1122f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1123f1af5d2fSBarry Smith }
1124f1af5d2fSBarry Smith 
11254a2ae208SSatish Balay #undef __FUNCT__
11264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
11274dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
112832121132SShri Abhyankar {
112932121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
113032121132SShri Abhyankar   PetscErrorCode    ierr;
113132121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1132b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
113332121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
113432121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1135b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1136b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1137b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1138b3260449SShri Abhyankar   const PetscScalar *b;
113932121132SShri Abhyankar 
114032121132SShri Abhyankar   PetscFunctionBegin;
11413649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
114232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
114332121132SShri Abhyankar   t = a->solve_work;
114432121132SShri Abhyankar 
114532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
114632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
114732121132SShri Abhyankar 
114832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
114932121132SShri Abhyankar   for(i=0;i<n;i++){
115032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
115132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1];
115232121132SShri Abhyankar   }
115332121132SShri Abhyankar 
115432121132SShri Abhyankar   /* forward solve the U^T */
115532121132SShri Abhyankar   idx = 0;
115632121132SShri Abhyankar   for (i=0; i<n; i++) {
115732121132SShri Abhyankar     v     = aa + bs2*diag[i];
115832121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
115932121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx];
116032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
116132121132SShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
116232121132SShri Abhyankar     v -= bs2;
116332121132SShri Abhyankar 
116432121132SShri Abhyankar     vi    = aj + diag[i] - 1;
116532121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
116632121132SShri Abhyankar     for(j=0;j>-nz;j--){
116732121132SShri Abhyankar       oidx = bs*vi[j];
116832121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2;
116932121132SShri Abhyankar       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
117032121132SShri Abhyankar       v  -= bs2;
117132121132SShri Abhyankar     }
117232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
117332121132SShri Abhyankar     idx += bs;
117432121132SShri Abhyankar   }
117532121132SShri Abhyankar   /* backward solve the L^T */
117632121132SShri Abhyankar   for (i=n-1; i>=0; i--){
117732121132SShri Abhyankar     v    = aa + bs2*ai[i];
117832121132SShri Abhyankar     vi   = aj + ai[i];
117932121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
118032121132SShri Abhyankar     idt  = bs*i;
118132121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];
118232121132SShri Abhyankar     for(j=0;j<nz;j++){
118332121132SShri Abhyankar       idx   = bs*vi[j];
118432121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2;
118532121132SShri Abhyankar       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
118632121132SShri Abhyankar       v += bs2;
118732121132SShri Abhyankar     }
118832121132SShri Abhyankar   }
118932121132SShri Abhyankar 
119032121132SShri Abhyankar   /* copy t into x according to permutation */
119132121132SShri Abhyankar   for(i=0;i<n;i++){
119232121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
119332121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1];
119432121132SShri Abhyankar   }
119532121132SShri Abhyankar 
119632121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
119732121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11983649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
119932121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
120032121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
120132121132SShri Abhyankar   PetscFunctionReturn(0);
120232121132SShri Abhyankar }
120332121132SShri Abhyankar 
120432121132SShri Abhyankar #undef __FUNCT__
120506e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
120606e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1207f1af5d2fSBarry Smith {
1208f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1209f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
12106849ba73SBarry Smith   PetscErrorCode    ierr;
12115d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1212b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1213b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1214b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1215b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1216b3260449SShri Abhyankar   const PetscScalar *b;
1217f1af5d2fSBarry Smith 
1218f1af5d2fSBarry Smith   PetscFunctionBegin;
12193649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
12201ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1221f1af5d2fSBarry Smith   t  = a->solve_work;
1222f1af5d2fSBarry Smith 
1223f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1224f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1225f1af5d2fSBarry Smith 
1226f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1227f1af5d2fSBarry Smith   ii = 0;
1228f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1229f1af5d2fSBarry Smith     ic      = 3*c[i];
1230f1af5d2fSBarry Smith     t[ii]   = b[ic];
1231f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1232f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1233f1af5d2fSBarry Smith     ii += 3;
1234f1af5d2fSBarry Smith   }
1235f1af5d2fSBarry Smith 
1236f1af5d2fSBarry Smith   /* forward solve the U^T */
1237f1af5d2fSBarry Smith   idx = 0;
1238f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1239f1af5d2fSBarry Smith 
1240f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
1241f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1242f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1243f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1244f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1245f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1246f1af5d2fSBarry Smith     v += 9;
1247f1af5d2fSBarry Smith 
1248f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1249f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1250f1af5d2fSBarry Smith     while (nz--) {
1251f1af5d2fSBarry Smith       oidx = 3*(*vi++);
1252f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1253f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1254f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1255f1af5d2fSBarry Smith       v  += 9;
1256f1af5d2fSBarry Smith     }
1257f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1258f1af5d2fSBarry Smith     idx += 3;
1259f1af5d2fSBarry Smith   }
1260f1af5d2fSBarry Smith   /* backward solve the L^T */
1261f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1262f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
1263f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1264f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1265f1af5d2fSBarry Smith     idt  = 3*i;
1266f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1267f1af5d2fSBarry Smith     while (nz--) {
1268f1af5d2fSBarry Smith       idx   = 3*(*vi--);
1269f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1270f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1271f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1272f1af5d2fSBarry Smith       v -= 9;
1273f1af5d2fSBarry Smith     }
1274f1af5d2fSBarry Smith   }
1275f1af5d2fSBarry Smith 
1276f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1277f1af5d2fSBarry Smith   ii = 0;
1278f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1279f1af5d2fSBarry Smith     ir      = 3*r[i];
1280f1af5d2fSBarry Smith     x[ir]   = t[ii];
1281f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1282f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1283f1af5d2fSBarry Smith     ii += 3;
1284f1af5d2fSBarry Smith   }
1285f1af5d2fSBarry Smith 
1286f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1287f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12883649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
12891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1290dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1291f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1292f1af5d2fSBarry Smith }
1293f1af5d2fSBarry Smith 
12944a2ae208SSatish Balay #undef __FUNCT__
12954dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
12964dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
129732121132SShri Abhyankar {
129832121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
129932121132SShri Abhyankar   PetscErrorCode    ierr;
130032121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1301b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
130232121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
130332121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1304b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1305b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1306b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1307b3260449SShri Abhyankar   const PetscScalar *b;
130832121132SShri Abhyankar 
130932121132SShri Abhyankar   PetscFunctionBegin;
13103649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
131132121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131232121132SShri Abhyankar   t = a->solve_work;
131332121132SShri Abhyankar 
131432121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
131532121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
131632121132SShri Abhyankar 
131732121132SShri Abhyankar   /* copy b into temp work space according to permutation */
131832121132SShri Abhyankar   for(i=0;i<n;i++){
131932121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
132032121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
132132121132SShri Abhyankar   }
132232121132SShri Abhyankar 
132332121132SShri Abhyankar   /* forward solve the U^T */
132432121132SShri Abhyankar   idx = 0;
132532121132SShri Abhyankar   for (i=0; i<n; i++) {
132632121132SShri Abhyankar     v     = aa + bs2*diag[i];
132732121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
132832121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
132932121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
133032121132SShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
133132121132SShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
133232121132SShri Abhyankar     v -= bs2;
133332121132SShri Abhyankar 
133432121132SShri Abhyankar     vi    = aj + diag[i] - 1;
133532121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
133632121132SShri Abhyankar     for(j=0;j>-nz;j--){
133732121132SShri Abhyankar       oidx = bs*vi[j];
133832121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
133932121132SShri Abhyankar       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
134032121132SShri Abhyankar       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
134132121132SShri Abhyankar       v  -= bs2;
134232121132SShri Abhyankar     }
134332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
134432121132SShri Abhyankar     idx += bs;
134532121132SShri Abhyankar   }
134632121132SShri Abhyankar   /* backward solve the L^T */
134732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
134832121132SShri Abhyankar     v    = aa + bs2*ai[i];
134932121132SShri Abhyankar     vi   = aj + ai[i];
135032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
135132121132SShri Abhyankar     idt  = bs*i;
135232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
135332121132SShri Abhyankar     for(j=0;j<nz;j++){
135432121132SShri Abhyankar       idx   = bs*vi[j];
135532121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
135632121132SShri Abhyankar       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
135732121132SShri Abhyankar       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
135832121132SShri Abhyankar       v += bs2;
135932121132SShri Abhyankar     }
136032121132SShri Abhyankar   }
136132121132SShri Abhyankar 
136232121132SShri Abhyankar   /* copy t into x according to permutation */
136332121132SShri Abhyankar   for(i=0;i<n;i++){
136432121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
136532121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
136632121132SShri Abhyankar   }
136732121132SShri Abhyankar 
136832121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
136932121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13703649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
137132121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
137232121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
137332121132SShri Abhyankar   PetscFunctionReturn(0);
137432121132SShri Abhyankar }
137532121132SShri Abhyankar 
137632121132SShri Abhyankar #undef __FUNCT__
137706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
137806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1379f1af5d2fSBarry Smith {
1380f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1381f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
13826849ba73SBarry Smith   PetscErrorCode    ierr;
13835d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1384b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1385b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1386b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1387b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1388b3260449SShri Abhyankar   const PetscScalar *b;
1389f1af5d2fSBarry Smith 
1390f1af5d2fSBarry Smith   PetscFunctionBegin;
13913649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
13921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1393f1af5d2fSBarry Smith   t  = a->solve_work;
1394f1af5d2fSBarry Smith 
1395f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1396f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1397f1af5d2fSBarry Smith 
1398f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1399f1af5d2fSBarry Smith   ii = 0;
1400f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1401f1af5d2fSBarry Smith     ic      = 4*c[i];
1402f1af5d2fSBarry Smith     t[ii]   = b[ic];
1403f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1404f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1405f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1406f1af5d2fSBarry Smith     ii += 4;
1407f1af5d2fSBarry Smith   }
1408f1af5d2fSBarry Smith 
1409f1af5d2fSBarry Smith   /* forward solve the U^T */
1410f1af5d2fSBarry Smith   idx = 0;
1411f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1412f1af5d2fSBarry Smith 
1413f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
1414f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1415f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1416f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1417f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1418f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1419f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1420f1af5d2fSBarry Smith     v += 16;
1421f1af5d2fSBarry Smith 
1422f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1423f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1424f1af5d2fSBarry Smith     while (nz--) {
1425f1af5d2fSBarry Smith       oidx = 4*(*vi++);
1426f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1427f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1428f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1429f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1430f1af5d2fSBarry Smith       v  += 16;
1431f1af5d2fSBarry Smith     }
1432f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1433f1af5d2fSBarry Smith     idx += 4;
1434f1af5d2fSBarry Smith   }
1435f1af5d2fSBarry Smith   /* backward solve the L^T */
1436f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1437f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
1438f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1439f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1440f1af5d2fSBarry Smith     idt  = 4*i;
1441f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1442f1af5d2fSBarry Smith     while (nz--) {
1443f1af5d2fSBarry Smith       idx   = 4*(*vi--);
1444f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1445f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1446f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1447f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1448f1af5d2fSBarry Smith       v -= 16;
1449f1af5d2fSBarry Smith     }
1450f1af5d2fSBarry Smith   }
1451f1af5d2fSBarry Smith 
1452f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1453f1af5d2fSBarry Smith   ii = 0;
1454f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1455f1af5d2fSBarry Smith     ir      = 4*r[i];
1456f1af5d2fSBarry Smith     x[ir]   = t[ii];
1457f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1458f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1459f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1460f1af5d2fSBarry Smith     ii += 4;
1461f1af5d2fSBarry Smith   }
1462f1af5d2fSBarry Smith 
1463f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1464f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14653649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
14661ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1467dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1468f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1469f1af5d2fSBarry Smith }
1470f1af5d2fSBarry Smith 
14714a2ae208SSatish Balay #undef __FUNCT__
14724dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
14734dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
147432121132SShri Abhyankar {
147532121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
147632121132SShri Abhyankar   PetscErrorCode    ierr;
147732121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1478b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
147932121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
148032121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1481b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1482b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1483b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1484b3260449SShri Abhyankar   const PetscScalar *b;
148532121132SShri Abhyankar 
148632121132SShri Abhyankar   PetscFunctionBegin;
14873649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
148832121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
148932121132SShri Abhyankar   t = a->solve_work;
149032121132SShri Abhyankar 
149132121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
149232121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
149332121132SShri Abhyankar 
149432121132SShri Abhyankar   /* copy b into temp work space according to permutation */
149532121132SShri Abhyankar   for(i=0;i<n;i++){
149632121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
149732121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
149832121132SShri Abhyankar   }
149932121132SShri Abhyankar 
150032121132SShri Abhyankar   /* forward solve the U^T */
150132121132SShri Abhyankar   idx = 0;
150232121132SShri Abhyankar   for (i=0; i<n; i++) {
150332121132SShri Abhyankar     v     = aa + bs2*diag[i];
150432121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
150532121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
150632121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
150732121132SShri Abhyankar     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
150832121132SShri Abhyankar     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
150932121132SShri Abhyankar     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
151032121132SShri Abhyankar     v -= bs2;
151132121132SShri Abhyankar 
151232121132SShri Abhyankar     vi    = aj + diag[i] - 1;
151332121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
151432121132SShri Abhyankar     for(j=0;j>-nz;j--){
151532121132SShri Abhyankar       oidx = bs*vi[j];
151632121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
151732121132SShri Abhyankar       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
151832121132SShri Abhyankar       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
151932121132SShri Abhyankar       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
152032121132SShri Abhyankar       v  -= bs2;
152132121132SShri Abhyankar     }
152232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
152332121132SShri Abhyankar     idx += bs;
152432121132SShri Abhyankar   }
152532121132SShri Abhyankar   /* backward solve the L^T */
152632121132SShri Abhyankar   for (i=n-1; i>=0; i--){
152732121132SShri Abhyankar     v    = aa + bs2*ai[i];
152832121132SShri Abhyankar     vi   = aj + ai[i];
152932121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
153032121132SShri Abhyankar     idt  = bs*i;
153132121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
153232121132SShri Abhyankar     for(j=0;j<nz;j++){
153332121132SShri Abhyankar       idx   = bs*vi[j];
153432121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
153532121132SShri Abhyankar       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
153632121132SShri Abhyankar       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
153732121132SShri Abhyankar       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
153832121132SShri Abhyankar       v += bs2;
153932121132SShri Abhyankar     }
154032121132SShri Abhyankar   }
154132121132SShri Abhyankar 
154232121132SShri Abhyankar   /* copy t into x according to permutation */
154332121132SShri Abhyankar   for(i=0;i<n;i++){
154432121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
154532121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
154632121132SShri Abhyankar   }
154732121132SShri Abhyankar 
154832121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
154932121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15503649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
155132121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
155232121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
155332121132SShri Abhyankar   PetscFunctionReturn(0);
155432121132SShri Abhyankar }
155532121132SShri Abhyankar 
155632121132SShri Abhyankar #undef __FUNCT__
155706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
155806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1559f1af5d2fSBarry Smith {
1560f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1561f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
15626849ba73SBarry Smith   PetscErrorCode    ierr;
15635d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1564b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1565b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1566b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1567b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1568b3260449SShri Abhyankar   const PetscScalar *b;
1569f1af5d2fSBarry Smith 
1570f1af5d2fSBarry Smith   PetscFunctionBegin;
15713649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
15721ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1573f1af5d2fSBarry Smith   t  = a->solve_work;
1574f1af5d2fSBarry Smith 
1575f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1576f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1577f1af5d2fSBarry Smith 
1578f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1579f1af5d2fSBarry Smith   ii = 0;
1580f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1581f1af5d2fSBarry Smith     ic      = 5*c[i];
1582f1af5d2fSBarry Smith     t[ii]   = b[ic];
1583f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1584f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1585f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1586f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1587f1af5d2fSBarry Smith     ii += 5;
1588f1af5d2fSBarry Smith   }
1589f1af5d2fSBarry Smith 
1590f1af5d2fSBarry Smith   /* forward solve the U^T */
1591f1af5d2fSBarry Smith   idx = 0;
1592f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1593f1af5d2fSBarry Smith 
1594f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
1595f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1596f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1597f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1598f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1599f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1600f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1601f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1602f1af5d2fSBarry Smith     v += 25;
1603f1af5d2fSBarry Smith 
1604f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1605f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1606f1af5d2fSBarry Smith     while (nz--) {
1607f1af5d2fSBarry Smith       oidx = 5*(*vi++);
1608f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1609f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1610f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1611f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1612f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1613f1af5d2fSBarry Smith       v  += 25;
1614f1af5d2fSBarry Smith     }
1615f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1616f1af5d2fSBarry Smith     idx += 5;
1617f1af5d2fSBarry Smith   }
1618f1af5d2fSBarry Smith   /* backward solve the L^T */
1619f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1620f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
1621f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1622f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1623f1af5d2fSBarry Smith     idt  = 5*i;
1624f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1625f1af5d2fSBarry Smith     while (nz--) {
1626f1af5d2fSBarry Smith       idx   = 5*(*vi--);
1627f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1628f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1629f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1630f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1631f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1632f1af5d2fSBarry Smith       v -= 25;
1633f1af5d2fSBarry Smith     }
1634f1af5d2fSBarry Smith   }
1635f1af5d2fSBarry Smith 
1636f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1637f1af5d2fSBarry Smith   ii = 0;
1638f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1639f1af5d2fSBarry Smith     ir      = 5*r[i];
1640f1af5d2fSBarry Smith     x[ir]   = t[ii];
1641f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1642f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1643f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1644f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1645f1af5d2fSBarry Smith     ii += 5;
1646f1af5d2fSBarry Smith   }
1647f1af5d2fSBarry Smith 
1648f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1649f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16503649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
16511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1652dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1653f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1654f1af5d2fSBarry Smith }
1655f1af5d2fSBarry Smith 
16564a2ae208SSatish Balay #undef __FUNCT__
16574dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
16584dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
165932121132SShri Abhyankar {
166032121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
166132121132SShri Abhyankar   PetscErrorCode    ierr;
166232121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1663b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
166432121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
166532121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1666b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1667b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1668b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1669b3260449SShri Abhyankar   const PetscScalar *b;
167032121132SShri Abhyankar 
167132121132SShri Abhyankar   PetscFunctionBegin;
16723649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
167332121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
167432121132SShri Abhyankar   t = a->solve_work;
167532121132SShri Abhyankar 
167632121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
167732121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
167832121132SShri Abhyankar 
167932121132SShri Abhyankar   /* copy b into temp work space according to permutation */
168032121132SShri Abhyankar   for(i=0;i<n;i++){
168132121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
168232121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
168332121132SShri Abhyankar     t[ii+4] = b[ic+4];
168432121132SShri Abhyankar   }
168532121132SShri Abhyankar 
168632121132SShri Abhyankar   /* forward solve the U^T */
168732121132SShri Abhyankar   idx = 0;
168832121132SShri Abhyankar   for (i=0; i<n; i++) {
168932121132SShri Abhyankar     v     = aa + bs2*diag[i];
169032121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
169132121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
169232121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
169332121132SShri Abhyankar     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
169432121132SShri Abhyankar     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
169532121132SShri Abhyankar     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
169632121132SShri Abhyankar     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
169732121132SShri Abhyankar     v -= bs2;
169832121132SShri Abhyankar 
169932121132SShri Abhyankar     vi    = aj + diag[i] - 1;
170032121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
170132121132SShri Abhyankar     for(j=0;j>-nz;j--){
170232121132SShri Abhyankar       oidx = bs*vi[j];
170332121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
170432121132SShri Abhyankar       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
170532121132SShri Abhyankar       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
170632121132SShri Abhyankar       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
170732121132SShri Abhyankar       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
170832121132SShri Abhyankar       v  -= bs2;
170932121132SShri Abhyankar     }
171032121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
171132121132SShri Abhyankar     idx += bs;
171232121132SShri Abhyankar   }
171332121132SShri Abhyankar   /* backward solve the L^T */
171432121132SShri Abhyankar   for (i=n-1; i>=0; i--){
171532121132SShri Abhyankar     v    = aa + bs2*ai[i];
171632121132SShri Abhyankar     vi   = aj + ai[i];
171732121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
171832121132SShri Abhyankar     idt  = bs*i;
171932121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
172032121132SShri Abhyankar     for(j=0;j<nz;j++){
172132121132SShri Abhyankar       idx   = bs*vi[j];
172232121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
172332121132SShri Abhyankar       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
172432121132SShri Abhyankar       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
172532121132SShri Abhyankar       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
172632121132SShri Abhyankar       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
172732121132SShri Abhyankar       v += bs2;
172832121132SShri Abhyankar     }
172932121132SShri Abhyankar   }
173032121132SShri Abhyankar 
173132121132SShri Abhyankar   /* copy t into x according to permutation */
173232121132SShri Abhyankar   for(i=0;i<n;i++){
173332121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
173432121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
173532121132SShri Abhyankar     x[ir+4] = t[ii+4];
173632121132SShri Abhyankar   }
173732121132SShri Abhyankar 
173832121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
173932121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
17403649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
174132121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
174232121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
174332121132SShri Abhyankar   PetscFunctionReturn(0);
174432121132SShri Abhyankar }
174532121132SShri Abhyankar 
174632121132SShri Abhyankar #undef __FUNCT__
174706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
174806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1749f1af5d2fSBarry Smith {
1750f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1751f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
17526849ba73SBarry Smith   PetscErrorCode    ierr;
17535d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1754b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1755b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1756b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1757b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1758b3260449SShri Abhyankar   const PetscScalar *b;
1759f1af5d2fSBarry Smith 
1760f1af5d2fSBarry Smith   PetscFunctionBegin;
17613649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
17621ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763f1af5d2fSBarry Smith   t  = a->solve_work;
1764f1af5d2fSBarry Smith 
1765f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1766f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1767f1af5d2fSBarry Smith 
1768f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1769f1af5d2fSBarry Smith   ii = 0;
1770f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1771f1af5d2fSBarry Smith     ic      = 6*c[i];
1772f1af5d2fSBarry Smith     t[ii]   = b[ic];
1773f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1774f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1775f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1776f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1777f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1778f1af5d2fSBarry Smith     ii += 6;
1779f1af5d2fSBarry Smith   }
1780f1af5d2fSBarry Smith 
1781f1af5d2fSBarry Smith   /* forward solve the U^T */
1782f1af5d2fSBarry Smith   idx = 0;
1783f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1784f1af5d2fSBarry Smith 
1785f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
1786f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1787f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1788f1af5d2fSBarry Smith     x6    = t[5+idx];
1789f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1790f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1791f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1792f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1793f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1794f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1795f1af5d2fSBarry Smith     v += 36;
1796f1af5d2fSBarry Smith 
1797f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1798f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1799f1af5d2fSBarry Smith     while (nz--) {
1800f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1801f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1802f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1803f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1804f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1805f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1806f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1807f1af5d2fSBarry Smith       v  += 36;
1808f1af5d2fSBarry Smith     }
1809f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1810f1af5d2fSBarry Smith     t[5+idx] = s6;
1811f1af5d2fSBarry Smith     idx += 6;
1812f1af5d2fSBarry Smith   }
1813f1af5d2fSBarry Smith   /* backward solve the L^T */
1814f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1815f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1816f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1817f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1818f1af5d2fSBarry Smith     idt  = 6*i;
1819f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1820f1af5d2fSBarry Smith     s6 = t[5+idt];
1821f1af5d2fSBarry Smith     while (nz--) {
1822f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1823f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1824f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1825f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1826f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1827f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1828f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1829f1af5d2fSBarry Smith       v -= 36;
1830f1af5d2fSBarry Smith     }
1831f1af5d2fSBarry Smith   }
1832f1af5d2fSBarry Smith 
1833f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1834f1af5d2fSBarry Smith   ii = 0;
1835f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1836f1af5d2fSBarry Smith     ir      = 6*r[i];
1837f1af5d2fSBarry Smith     x[ir]   = t[ii];
1838f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1839f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1840f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1841f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1842f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1843f1af5d2fSBarry Smith     ii += 6;
1844f1af5d2fSBarry Smith   }
1845f1af5d2fSBarry Smith 
1846f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1847f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18483649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
18491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1850dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1851f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1852f1af5d2fSBarry Smith }
1853f1af5d2fSBarry Smith 
18544a2ae208SSatish Balay #undef __FUNCT__
18554dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
18564dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
185732121132SShri Abhyankar {
185832121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
185932121132SShri Abhyankar   PetscErrorCode    ierr;
186032121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1861b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
186232121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
186332121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1864b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1865b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1866b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1867b3260449SShri Abhyankar   const PetscScalar *b;
186832121132SShri Abhyankar 
186932121132SShri Abhyankar   PetscFunctionBegin;
18703649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
187132121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
187232121132SShri Abhyankar   t = a->solve_work;
187332121132SShri Abhyankar 
187432121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
187532121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
187632121132SShri Abhyankar 
187732121132SShri Abhyankar   /* copy b into temp work space according to permutation */
187832121132SShri Abhyankar   for(i=0;i<n;i++){
187932121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
188032121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
188132121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
188232121132SShri Abhyankar   }
188332121132SShri Abhyankar 
188432121132SShri Abhyankar   /* forward solve the U^T */
188532121132SShri Abhyankar   idx = 0;
188632121132SShri Abhyankar   for (i=0; i<n; i++) {
188732121132SShri Abhyankar     v     = aa + bs2*diag[i];
188832121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
188932121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
189032121132SShri Abhyankar     x6    = t[5+idx];
189132121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
189232121132SShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
189332121132SShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
189432121132SShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
189532121132SShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
189632121132SShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
189732121132SShri Abhyankar     v -= bs2;
189832121132SShri Abhyankar 
189932121132SShri Abhyankar     vi    = aj + diag[i] - 1;
190032121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
190132121132SShri Abhyankar     for(j=0;j>-nz;j--){
190232121132SShri Abhyankar       oidx = bs*vi[j];
190332121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
190432121132SShri Abhyankar       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
190532121132SShri Abhyankar       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
190632121132SShri Abhyankar       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
190732121132SShri Abhyankar       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
190832121132SShri Abhyankar       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
190932121132SShri Abhyankar       v  -= bs2;
191032121132SShri Abhyankar     }
191132121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
191232121132SShri Abhyankar     t[5+idx] = s6;
191332121132SShri Abhyankar     idx += bs;
191432121132SShri Abhyankar   }
191532121132SShri Abhyankar   /* backward solve the L^T */
191632121132SShri Abhyankar   for (i=n-1; i>=0; i--){
191732121132SShri Abhyankar     v    = aa + bs2*ai[i];
191832121132SShri Abhyankar     vi   = aj + ai[i];
191932121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
192032121132SShri Abhyankar     idt  = bs*i;
192132121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
192232121132SShri Abhyankar     s6   = t[5+idt];
192332121132SShri Abhyankar    for(j=0;j<nz;j++){
192432121132SShri Abhyankar       idx   = bs*vi[j];
192532121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
192632121132SShri Abhyankar       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
192732121132SShri Abhyankar       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
192832121132SShri Abhyankar       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
192932121132SShri Abhyankar       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
193032121132SShri Abhyankar       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
193132121132SShri Abhyankar       v += bs2;
193232121132SShri Abhyankar     }
193332121132SShri Abhyankar   }
193432121132SShri Abhyankar 
193532121132SShri Abhyankar   /* copy t into x according to permutation */
193632121132SShri Abhyankar   for(i=0;i<n;i++){
193732121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
193832121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
193932121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
194032121132SShri Abhyankar   }
194132121132SShri Abhyankar 
194232121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
194332121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
19443649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
194532121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
194632121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
194732121132SShri Abhyankar   PetscFunctionReturn(0);
194832121132SShri Abhyankar }
194932121132SShri Abhyankar 
195032121132SShri Abhyankar #undef __FUNCT__
195106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
195206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1953f1af5d2fSBarry Smith {
1954f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1955f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
19566849ba73SBarry Smith   PetscErrorCode    ierr;
19575d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1958b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1959b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1960b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1961b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1962b3260449SShri Abhyankar   const PetscScalar *b;
1963f1af5d2fSBarry Smith 
1964f1af5d2fSBarry Smith   PetscFunctionBegin;
19653649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
19661ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1967f1af5d2fSBarry Smith   t  = a->solve_work;
1968f1af5d2fSBarry Smith 
1969f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1970f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1971f1af5d2fSBarry Smith 
1972f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1973f1af5d2fSBarry Smith   ii = 0;
1974f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1975f1af5d2fSBarry Smith     ic      = 7*c[i];
1976f1af5d2fSBarry Smith     t[ii]   = b[ic];
1977f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1978f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1979f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1980f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1981f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1982f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1983f1af5d2fSBarry Smith     ii += 7;
1984f1af5d2fSBarry Smith   }
1985f1af5d2fSBarry Smith 
1986f1af5d2fSBarry Smith   /* forward solve the U^T */
1987f1af5d2fSBarry Smith   idx = 0;
1988f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1989f1af5d2fSBarry Smith 
1990f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1991f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1992f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1993f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1994f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1995f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1996f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1997f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1998f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1999f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
2000f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
2001f1af5d2fSBarry Smith     v += 49;
2002f1af5d2fSBarry Smith 
2003f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
2004f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
2005f1af5d2fSBarry Smith     while (nz--) {
2006f1af5d2fSBarry Smith       oidx = 7*(*vi++);
2007f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2008f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2009f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2010f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2011f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2012f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2013f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2014f1af5d2fSBarry Smith       v  += 49;
2015f1af5d2fSBarry Smith     }
2016f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2017f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
2018f1af5d2fSBarry Smith     idx += 7;
2019f1af5d2fSBarry Smith   }
2020f1af5d2fSBarry Smith   /* backward solve the L^T */
2021f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
2022f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
2023f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
2024f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
2025f1af5d2fSBarry Smith     idt  = 7*i;
2026f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2027f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
2028f1af5d2fSBarry Smith     while (nz--) {
2029f1af5d2fSBarry Smith       idx   = 7*(*vi--);
2030f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2031f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2032f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2033f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2034f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2035f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2036f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2037f1af5d2fSBarry Smith       v -= 49;
2038f1af5d2fSBarry Smith     }
2039f1af5d2fSBarry Smith   }
2040f1af5d2fSBarry Smith 
2041f1af5d2fSBarry Smith   /* copy t into x according to permutation */
2042f1af5d2fSBarry Smith   ii = 0;
2043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
2044f1af5d2fSBarry Smith     ir      = 7*r[i];
2045f1af5d2fSBarry Smith     x[ir]   = t[ii];
2046f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
2047f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
2048f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
2049f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
2050f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
2051f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
2052f1af5d2fSBarry Smith     ii += 7;
2053f1af5d2fSBarry Smith   }
2054f1af5d2fSBarry Smith 
2055f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2056f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20573649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
20581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2059dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2060f1af5d2fSBarry Smith   PetscFunctionReturn(0);
2061f1af5d2fSBarry Smith }
206232121132SShri Abhyankar #undef __FUNCT__
20634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
20644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
206532121132SShri Abhyankar {
206632121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
206732121132SShri Abhyankar   PetscErrorCode    ierr;
206832121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
2069b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
207032121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
207132121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
2072b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2073b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2074b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2075b3260449SShri Abhyankar   const PetscScalar *b;
207632121132SShri Abhyankar 
207732121132SShri Abhyankar   PetscFunctionBegin;
20783649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
207932121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
208032121132SShri Abhyankar   t = a->solve_work;
208132121132SShri Abhyankar 
208232121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
208332121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
208432121132SShri Abhyankar 
208532121132SShri Abhyankar   /* copy b into temp work space according to permutation */
208632121132SShri Abhyankar   for(i=0;i<n;i++){
208732121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
208832121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
208932121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
209032121132SShri Abhyankar   }
209132121132SShri Abhyankar 
209232121132SShri Abhyankar   /* forward solve the U^T */
209332121132SShri Abhyankar   idx = 0;
209432121132SShri Abhyankar   for (i=0; i<n; i++) {
209532121132SShri Abhyankar     v     = aa + bs2*diag[i];
209632121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
209732121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
209832121132SShri Abhyankar     x6    = t[5+idx]; x7 = t[6+idx];
209932121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
210032121132SShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
210132121132SShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
210232121132SShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
210332121132SShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
210432121132SShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
210532121132SShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
210632121132SShri Abhyankar     v -= bs2;
210732121132SShri Abhyankar 
210832121132SShri Abhyankar     vi    = aj + diag[i] - 1;
210932121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
211032121132SShri Abhyankar     for(j=0;j>-nz;j--){
211132121132SShri Abhyankar       oidx = bs*vi[j];
211232121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
211332121132SShri Abhyankar       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
211432121132SShri Abhyankar       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
211532121132SShri Abhyankar       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
211632121132SShri Abhyankar       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
211732121132SShri Abhyankar       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
211832121132SShri Abhyankar       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
211932121132SShri Abhyankar       v  -= bs2;
212032121132SShri Abhyankar     }
212132121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
212232121132SShri Abhyankar     t[5+idx] = s6;  t[6+idx] = s7;
212332121132SShri Abhyankar     idx += bs;
212432121132SShri Abhyankar   }
212532121132SShri Abhyankar   /* backward solve the L^T */
212632121132SShri Abhyankar   for (i=n-1; i>=0; i--){
212732121132SShri Abhyankar     v    = aa + bs2*ai[i];
212832121132SShri Abhyankar     vi   = aj + ai[i];
212932121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
213032121132SShri Abhyankar     idt  = bs*i;
213132121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
213232121132SShri Abhyankar     s6   = t[5+idt];  s7 = t[6+idt];
213332121132SShri Abhyankar    for(j=0;j<nz;j++){
213432121132SShri Abhyankar       idx   = bs*vi[j];
213532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
213632121132SShri Abhyankar       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
213732121132SShri Abhyankar       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
213832121132SShri Abhyankar       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
213932121132SShri Abhyankar       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
214032121132SShri Abhyankar       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
214132121132SShri Abhyankar       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
214232121132SShri Abhyankar       v += bs2;
214332121132SShri Abhyankar     }
214432121132SShri Abhyankar   }
214532121132SShri Abhyankar 
214632121132SShri Abhyankar   /* copy t into x according to permutation */
214732121132SShri Abhyankar   for(i=0;i<n;i++){
214832121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
214932121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
215032121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
215132121132SShri Abhyankar   }
215232121132SShri Abhyankar 
215332121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
215432121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21553649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
215632121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
215732121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
215832121132SShri Abhyankar   PetscFunctionReturn(0);
215932121132SShri Abhyankar }
2160f1af5d2fSBarry Smith 
21614e2b4712SSatish Balay /* ----------------------------------------------------------- */
21624a2ae208SSatish Balay #undef __FUNCT__
216306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
216406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21654e2b4712SSatish Balay {
21664e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21674e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
21686849ba73SBarry Smith   PetscErrorCode    ierr;
2169b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2170b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2171b3260449SShri Abhyankar   PetscInt          i,nz;
2172b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2173b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2174b3260449SShri Abhyankar   PetscScalar       *x,*s,*t,*ls;
2175b3260449SShri Abhyankar   const PetscScalar *b;
21764e2b4712SSatish Balay 
21774e2b4712SSatish Balay   PetscFunctionBegin;
21783649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
21791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2180f1af5d2fSBarry Smith   t  = a->solve_work;
21814e2b4712SSatish Balay 
21824e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21834e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
21844e2b4712SSatish Balay 
21854e2b4712SSatish Balay   /* forward solve the lower triangular */
218687828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21874e2b4712SSatish Balay   for (i=1; i<n; i++) {
21884e2b4712SSatish Balay     v   = aa + bs2*ai[i];
21894e2b4712SSatish Balay     vi  = aj + ai[i];
21904e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
2191f1af5d2fSBarry Smith     s = t + bs*i;
219287828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
21934e2b4712SSatish Balay     while (nz--) {
2194f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
21954e2b4712SSatish Balay       v += bs2;
21964e2b4712SSatish Balay     }
21974e2b4712SSatish Balay   }
21984e2b4712SSatish Balay   /* backward solve the upper triangular */
2199d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
22004e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
22014e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
22024e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
22034e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
220487828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22054e2b4712SSatish Balay     while (nz--) {
2206f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
22074e2b4712SSatish Balay       v += bs2;
22084e2b4712SSatish Balay     }
2209f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
221087828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22114e2b4712SSatish Balay   }
22124e2b4712SSatish Balay 
22134e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22144e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22153649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
22161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2217dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22184e2b4712SSatish Balay   PetscFunctionReturn(0);
22194e2b4712SSatish Balay }
22204e2b4712SSatish Balay 
22215c42ef9dSBarry Smith /* ----------------------------------------------------------- */
22225c42ef9dSBarry Smith #undef __FUNCT__
222306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
222406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
22255c42ef9dSBarry Smith {
22265c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22275c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
22285c42ef9dSBarry Smith   PetscErrorCode    ierr;
22295c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2230b3260449SShri Abhyankar   PetscInt          i,nz,j;
2231b3260449SShri Abhyankar   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
22325c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
22335c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
22345c42ef9dSBarry Smith   const PetscScalar *b;
22355c42ef9dSBarry Smith   PetscFunctionBegin;
22363649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
22375c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22385c42ef9dSBarry Smith   t    = a->solve_work;
22395c42ef9dSBarry Smith 
22405c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22415c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22425c42ef9dSBarry Smith 
22435c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
22445c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22455c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22465c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
22475c42ef9dSBarry Smith     }
22485c42ef9dSBarry Smith   }
22495c42ef9dSBarry Smith 
22505c42ef9dSBarry Smith 
22515c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
22525c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
22535c42ef9dSBarry Smith   for (i=0; i<n; i++){
22545c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22555c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
22565c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
22575c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
22585c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
22595c42ef9dSBarry Smith     while (nz--) {
22605c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22615c42ef9dSBarry Smith       v += bs2;
22625c42ef9dSBarry Smith     }
22635c42ef9dSBarry Smith   }
22645c42ef9dSBarry Smith 
22655c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
22665c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
22675c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
22685c42ef9dSBarry Smith     vi  = aj + ai[i];
22695c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
22705c42ef9dSBarry Smith     while (nz--) {
22715c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
22725c42ef9dSBarry Smith       v += bs2;
22735c42ef9dSBarry Smith     }
22745c42ef9dSBarry Smith   }
22755c42ef9dSBarry Smith 
22765c42ef9dSBarry Smith   /* copy t into x according to permutation */
22775c42ef9dSBarry Smith   for (i=0; i<n; i++) {
22785c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
22795c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
22805c42ef9dSBarry Smith     }
22815c42ef9dSBarry Smith   }
22825c42ef9dSBarry Smith 
22835c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22845c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22853649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
22865c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22875c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22885c42ef9dSBarry Smith   PetscFunctionReturn(0);
22895c42ef9dSBarry Smith }
22905c42ef9dSBarry Smith 
22914a2ae208SSatish Balay #undef __FUNCT__
22924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
22934dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
22948499736aSShri Abhyankar {
22958499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22968499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
22978499736aSShri Abhyankar   PetscErrorCode    ierr;
2298b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2299b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2300b3260449SShri Abhyankar   PetscInt          i,j,nz;
2301b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
23028499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
23038499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
23048499736aSShri Abhyankar   const PetscScalar *b;
2305b3260449SShri Abhyankar 
23068499736aSShri Abhyankar   PetscFunctionBegin;
23073649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
23088499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23098499736aSShri Abhyankar   t    = a->solve_work;
23108499736aSShri Abhyankar 
23118499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
23128499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
23138499736aSShri Abhyankar 
23148499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
23158499736aSShri Abhyankar   for (i=0; i<n; i++) {
23168499736aSShri Abhyankar     for (j=0; j<bs; j++) {
23178499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
23188499736aSShri Abhyankar     }
23198499736aSShri Abhyankar   }
23208499736aSShri Abhyankar 
23218499736aSShri Abhyankar 
23228499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
23238499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
23248499736aSShri Abhyankar   for (i=0; i<n; i++){
23258499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
23268499736aSShri Abhyankar     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
23278499736aSShri Abhyankar     v   = aa + bs2*(diag[i] - 1);
23288499736aSShri Abhyankar     vi  = aj + diag[i] - 1;
23298499736aSShri Abhyankar     nz  = diag[i] - diag[i+1] - 1;
23308499736aSShri Abhyankar     for(j=0;j>-nz;j--){
23318499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
23328499736aSShri Abhyankar       v -= bs2;
23338499736aSShri Abhyankar     }
23348499736aSShri Abhyankar   }
23358499736aSShri Abhyankar 
23368499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
23378499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
23388499736aSShri Abhyankar     v   = aa + bs2*ai[i];
23398499736aSShri Abhyankar     vi  = aj + ai[i];
23408499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
23418499736aSShri Abhyankar     for(j=0;j<nz;j++){
23428499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
23438499736aSShri Abhyankar       v += bs2;
23448499736aSShri Abhyankar     }
23458499736aSShri Abhyankar   }
23468499736aSShri Abhyankar 
23478499736aSShri Abhyankar   /* copy t into x according to permutation */
23488499736aSShri Abhyankar   for (i=0; i<n; i++) {
23498499736aSShri Abhyankar     for (j=0; j<bs; j++) {
23508499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
23518499736aSShri Abhyankar     }
23528499736aSShri Abhyankar   }
23538499736aSShri Abhyankar 
23548499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23558499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23563649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
23578499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23588499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
23598499736aSShri Abhyankar   PetscFunctionReturn(0);
23608499736aSShri Abhyankar }
23618499736aSShri Abhyankar 
2362832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
236329a97285SShri Abhyankar 
23642b0b2ea7SShri Abhyankar #undef __FUNCT__
2365832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
2366832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
23672b0b2ea7SShri Abhyankar {
23682b0b2ea7SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
23692b0b2ea7SShri Abhyankar   PetscErrorCode    ierr;
2370b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
23710fa040f9SShri Abhyankar   PetscInt          i,nz,idx,idt,m;
23720b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
23732b0b2ea7SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
23742b0b2ea7SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
23750fa040f9SShri Abhyankar   PetscScalar       *x;
23760b68f018SBarry Smith   const PetscScalar *b;
23772b0b2ea7SShri Abhyankar 
23782b0b2ea7SShri Abhyankar   PetscFunctionBegin;
23793649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
23802b0b2ea7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23812b0b2ea7SShri Abhyankar 
23822b0b2ea7SShri Abhyankar   /* forward solve the lower triangular */
238329a97285SShri Abhyankar   idx    = 0;
23840fa040f9SShri Abhyankar   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
23850fa040f9SShri Abhyankar   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
23860fa040f9SShri Abhyankar   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
23872b0b2ea7SShri Abhyankar 
23882b0b2ea7SShri Abhyankar   for (i=1; i<n; i++) {
23892b0b2ea7SShri Abhyankar     v     = aa + bs2*ai[i];
23902b0b2ea7SShri Abhyankar     vi    = aj + ai[i];
23912b0b2ea7SShri Abhyankar     nz    = ai[i+1] - ai[i];
23920fa040f9SShri Abhyankar     idt   = bs*i;
23930fa040f9SShri Abhyankar     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
23940fa040f9SShri Abhyankar     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
23950fa040f9SShri Abhyankar     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
23962b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
23972b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
23980fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
23990fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
24000fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
24012b0b2ea7SShri Abhyankar 
24020b8f6341SShri Abhyankar 
24032b0b2ea7SShri Abhyankar       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
24042b0b2ea7SShri Abhyankar       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
24052b0b2ea7SShri Abhyankar       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
24062b0b2ea7SShri Abhyankar       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
24072b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
24082b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
24092b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
24102b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
24112b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
24122b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
24132b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
24142b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
24152b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
24162b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
24172b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
24182b0b2ea7SShri Abhyankar 
24192b0b2ea7SShri Abhyankar       v += bs2;
24202b0b2ea7SShri Abhyankar     }
24210fa040f9SShri Abhyankar     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
24220fa040f9SShri Abhyankar     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
24230fa040f9SShri Abhyankar     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
24242b0b2ea7SShri Abhyankar 
24252b0b2ea7SShri Abhyankar   }
24262b0b2ea7SShri Abhyankar   /* backward solve the upper triangular */
24272b0b2ea7SShri Abhyankar   for (i=n-1; i>=0; i--){
24282b0b2ea7SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
24292b0b2ea7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
24302b0b2ea7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
24312b0b2ea7SShri Abhyankar     idt  = bs*i;
24320fa040f9SShri Abhyankar     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
24330fa040f9SShri Abhyankar     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
24340fa040f9SShri Abhyankar     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
24352b0b2ea7SShri Abhyankar 
24362b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
24372b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
24380fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
24390fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
24400fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
24412b0b2ea7SShri Abhyankar 
24422b0b2ea7SShri Abhyankar       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
24432b0b2ea7SShri Abhyankar       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
24442b0b2ea7SShri Abhyankar       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
24452b0b2ea7SShri Abhyankar       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
24462b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
24472b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
24482b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
24492b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
24502b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
24512b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
24522b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
24532b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
24542b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
24552b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
24562b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
24572b0b2ea7SShri Abhyankar 
24582b0b2ea7SShri Abhyankar       v += bs2;
24592b0b2ea7SShri Abhyankar     }
24602b0b2ea7SShri Abhyankar 
24610fa040f9SShri Abhyankar     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
24620fa040f9SShri Abhyankar     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
24630fa040f9SShri Abhyankar     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
24640fa040f9SShri Abhyankar     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
24650fa040f9SShri Abhyankar     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
24660fa040f9SShri Abhyankar     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
24670fa040f9SShri Abhyankar     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
24680fa040f9SShri Abhyankar     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
24690fa040f9SShri Abhyankar     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
24700fa040f9SShri Abhyankar     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
24710fa040f9SShri Abhyankar     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
24720fa040f9SShri Abhyankar     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
24730fa040f9SShri Abhyankar     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
24740fa040f9SShri Abhyankar     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
24750fa040f9SShri Abhyankar     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
24762b0b2ea7SShri Abhyankar 
24772b0b2ea7SShri Abhyankar   }
24782b0b2ea7SShri Abhyankar 
24793649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
24802b0b2ea7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24812b0b2ea7SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
24822b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
24832b0b2ea7SShri Abhyankar }
24842b0b2ea7SShri Abhyankar 
/* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2486832cc040SShri Abhyankar /* Default MatSolve for block size 15 */
2487832cc040SShri Abhyankar 
24888499736aSShri Abhyankar #undef __FUNCT__
2489832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2490832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
24910b8f6341SShri Abhyankar {
24920b8f6341SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
24930b8f6341SShri Abhyankar   PetscErrorCode    ierr;
24940b8f6341SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
249553ef36baSBarry Smith   PetscInt          i,k,nz,idx,idt,m;
24960b8f6341SShri Abhyankar   const MatScalar   *aa=a->a,*v;
24970b8f6341SShri Abhyankar   PetscScalar       s[15];
249853ef36baSBarry Smith   PetscScalar       *x,xv;
24990b8f6341SShri Abhyankar   const PetscScalar *b;
25000b8f6341SShri Abhyankar 
25010b8f6341SShri Abhyankar   PetscFunctionBegin;
25023649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25030b8f6341SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25040b8f6341SShri Abhyankar 
25050b8f6341SShri Abhyankar   /* forward solve the lower triangular */
2506832cc040SShri Abhyankar   for (i=0; i<n; i++) {
25070b8f6341SShri Abhyankar     v     = aa + bs2*ai[i];
25080b8f6341SShri Abhyankar     vi    = aj + ai[i];
25090b8f6341SShri Abhyankar     nz    = ai[i+1] - ai[i];
25100fa040f9SShri Abhyankar     idt   = bs*i;
2511832cc040SShri Abhyankar     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2512832cc040SShri Abhyankar     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2513832cc040SShri Abhyankar     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
25140b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
25150b8f6341SShri Abhyankar       idx   = bs*vi[m];
25160b8f6341SShri Abhyankar       for(k=0;k<15;k++){
251753ef36baSBarry Smith 	xv        = x[k + idx];
251853ef36baSBarry Smith 	x[idt]    -= v[0]*xv;
251953ef36baSBarry Smith 	x[1+idt]  -= v[1]*xv;
252053ef36baSBarry Smith 	x[2+idt]  -= v[2]*xv;
252153ef36baSBarry Smith         x[3+idt]  -= v[3]*xv;
252253ef36baSBarry Smith 	x[4+idt]  -= v[4]*xv;
252353ef36baSBarry Smith 	x[5+idt]  -= v[5]*xv;
252453ef36baSBarry Smith 	x[6+idt]  -= v[6]*xv;
252553ef36baSBarry Smith         x[7+idt]  -= v[7]*xv;
252653ef36baSBarry Smith 	x[8+idt]  -= v[8]*xv;
252753ef36baSBarry Smith 	x[9+idt]  -= v[9]*xv;
252853ef36baSBarry Smith 	x[10+idt] -= v[10]*xv;
252953ef36baSBarry Smith         x[11+idt] -= v[11]*xv;
253053ef36baSBarry Smith 	x[12+idt] -= v[12]*xv;
253153ef36baSBarry Smith 	x[13+idt] -= v[13]*xv;
253253ef36baSBarry Smith 	x[14+idt] -= v[14]*xv;
25330b8f6341SShri Abhyankar 	v += 15;
25340b8f6341SShri Abhyankar       }
25350b8f6341SShri Abhyankar     }
25360b8f6341SShri Abhyankar   }
25370b8f6341SShri Abhyankar   /* backward solve the upper triangular */
25380b8f6341SShri Abhyankar   for (i=n-1; i>=0; i--){
25390b8f6341SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
25400b8f6341SShri Abhyankar     vi   = aj + adiag[i+1]+1;
25410b8f6341SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
25420b8f6341SShri Abhyankar     idt  = bs*i;
25430fa040f9SShri Abhyankar     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
25440fa040f9SShri Abhyankar     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
25450fa040f9SShri Abhyankar     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
25460b8f6341SShri Abhyankar 
25470b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
25480b8f6341SShri Abhyankar       idx   = bs*vi[m];
25490b8f6341SShri Abhyankar       for(k=0;k<15;k++){
255053ef36baSBarry Smith 	xv = x[k + idx];
255153ef36baSBarry Smith 	s[0]  -= v[0]*xv;
255253ef36baSBarry Smith 	s[1]  -= v[1]*xv;
255353ef36baSBarry Smith 	s[2]  -= v[2]*xv;
255453ef36baSBarry Smith         s[3]  -= v[3]*xv;
255553ef36baSBarry Smith 	s[4]  -= v[4]*xv;
255653ef36baSBarry Smith 	s[5]  -= v[5]*xv;
255753ef36baSBarry Smith 	s[6]  -= v[6]*xv;
255853ef36baSBarry Smith         s[7]  -= v[7]*xv;
255953ef36baSBarry Smith 	s[8]  -= v[8]*xv;
256053ef36baSBarry Smith 	s[9]  -= v[9]*xv;
256153ef36baSBarry Smith 	s[10] -= v[10]*xv;
256253ef36baSBarry Smith         s[11] -= v[11]*xv;
256353ef36baSBarry Smith 	s[12] -= v[12]*xv;
256453ef36baSBarry Smith 	s[13] -= v[13]*xv;
256553ef36baSBarry Smith 	s[14] -= v[14]*xv;
25660b8f6341SShri Abhyankar 	v += 15;
25670b8f6341SShri Abhyankar       }
25680b8f6341SShri Abhyankar     }
25690fa040f9SShri Abhyankar     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
25700b8f6341SShri Abhyankar     for(k=0;k<15;k++){
25710fa040f9SShri Abhyankar       x[idt]    += v[0]*s[k];
25720fa040f9SShri Abhyankar       x[1+idt]  += v[1]*s[k];
25730fa040f9SShri Abhyankar       x[2+idt]  += v[2]*s[k];
25740fa040f9SShri Abhyankar       x[3+idt]  += v[3]*s[k];
25750fa040f9SShri Abhyankar       x[4+idt]  += v[4]*s[k];
25760fa040f9SShri Abhyankar       x[5+idt]  += v[5]*s[k];
25770fa040f9SShri Abhyankar       x[6+idt]  += v[6]*s[k];
25780fa040f9SShri Abhyankar       x[7+idt]  += v[7]*s[k];
25790fa040f9SShri Abhyankar       x[8+idt]  += v[8]*s[k];
25800fa040f9SShri Abhyankar       x[9+idt]  += v[9]*s[k];
25810fa040f9SShri Abhyankar       x[10+idt] += v[10]*s[k];
25820fa040f9SShri Abhyankar       x[11+idt] += v[11]*s[k];
25830fa040f9SShri Abhyankar       x[12+idt] += v[12]*s[k];
25840fa040f9SShri Abhyankar       x[13+idt] += v[13]*s[k];
25850fa040f9SShri Abhyankar       x[14+idt] += v[14]*s[k];
25860b8f6341SShri Abhyankar       v += 15;
25870b8f6341SShri Abhyankar     }
25880b8f6341SShri Abhyankar   }
25893649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
25900b8f6341SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
25910b8f6341SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
25920b8f6341SShri Abhyankar   PetscFunctionReturn(0);
25930b8f6341SShri Abhyankar }
25940b8f6341SShri Abhyankar 
25950b8f6341SShri Abhyankar 
25960b8f6341SShri Abhyankar #undef __FUNCT__
259706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
259806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
25994e2b4712SSatish Balay {
26004e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
26014e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
26026849ba73SBarry Smith   PetscErrorCode    ierr;
2603b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2604b3260449SShri Abhyankar   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2605b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2606b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2607b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2608b3260449SShri Abhyankar   const PetscScalar *b;
26094e2b4712SSatish Balay 
26104e2b4712SSatish Balay   PetscFunctionBegin;
26113649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
26121ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2613f1af5d2fSBarry Smith   t  = a->solve_work;
26144e2b4712SSatish Balay 
26154e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
26164e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26174e2b4712SSatish Balay 
26184e2b4712SSatish Balay   /* forward solve the lower triangular */
26194e2b4712SSatish Balay   idx    = 7*(*r++);
2620f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2621f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2622f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
26234e2b4712SSatish Balay 
26244e2b4712SSatish Balay   for (i=1; i<n; i++) {
26254e2b4712SSatish Balay     v     = aa + 49*ai[i];
26264e2b4712SSatish Balay     vi    = aj + ai[i];
26274e2b4712SSatish Balay     nz    = diag[i] - ai[i];
26284e2b4712SSatish Balay     idx   = 7*(*r++);
2629f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2630f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
26314e2b4712SSatish Balay     while (nz--) {
26324e2b4712SSatish Balay       idx   = 7*(*vi++);
2633f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2634f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2635f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
2636f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2637f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2638f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2639f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2640f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2641f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2642f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26434e2b4712SSatish Balay       v += 49;
26444e2b4712SSatish Balay     }
26454e2b4712SSatish Balay     idx = 7*i;
2646f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2647f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2648f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
26494e2b4712SSatish Balay   }
26504e2b4712SSatish Balay   /* backward solve the upper triangular */
26514e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
26524e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
26534e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26544e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26554e2b4712SSatish Balay     idt  = 7*i;
2656f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2657f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2658f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
26594e2b4712SSatish Balay     while (nz--) {
26604e2b4712SSatish Balay       idx   = 7*(*vi++);
2661f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2662f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2663f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
2664f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2665f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2666f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2667f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2668f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2669f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2670f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
26714e2b4712SSatish Balay       v += 49;
26724e2b4712SSatish Balay     }
26734e2b4712SSatish Balay     idc = 7*(*c--);
26744e2b4712SSatish Balay     v   = aa + 49*diag[i];
2675f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2676f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2677f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2678f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2679f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2680f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2681f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2682f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2683f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2684f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2685f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2686f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2687f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2688f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
26894e2b4712SSatish Balay   }
26904e2b4712SSatish Balay 
26914e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26924e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
26933649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
26941ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2695dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
26964e2b4712SSatish Balay   PetscFunctionReturn(0);
26974e2b4712SSatish Balay }
26984e2b4712SSatish Balay 
26998f690400SShri Abhyankar #undef __FUNCT__
27004dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7"
27014dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
270235aa4fcfSShri Abhyankar {
270335aa4fcfSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
270435aa4fcfSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
270535aa4fcfSShri Abhyankar   PetscErrorCode    ierr;
2706b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2707b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2708b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
2709b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2710b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2711b3260449SShri Abhyankar   const PetscScalar *b;
271235aa4fcfSShri Abhyankar 
271335aa4fcfSShri Abhyankar   PetscFunctionBegin;
27143649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
271535aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
271635aa4fcfSShri Abhyankar   t  = a->solve_work;
271735aa4fcfSShri Abhyankar 
271835aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
271935aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
272035aa4fcfSShri Abhyankar 
272135aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
272235aa4fcfSShri Abhyankar   idx    = 7*r[0];
272335aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
272435aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
272535aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
272635aa4fcfSShri Abhyankar 
272735aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
272835aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
272935aa4fcfSShri Abhyankar     vi    = aj + ai[i];
273035aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
273135aa4fcfSShri Abhyankar     idx   = 7*r[i];
273235aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
273335aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
273435aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
273535aa4fcfSShri Abhyankar       idx   = 7*vi[m];
273635aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
273735aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
273835aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
273935aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
274035aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
274135aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
274235aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
274335aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
274435aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
274535aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
274635aa4fcfSShri Abhyankar       v += 49;
274735aa4fcfSShri Abhyankar     }
274835aa4fcfSShri Abhyankar     idx = 7*i;
274935aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
275035aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
275135aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
275235aa4fcfSShri Abhyankar   }
275335aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
275435aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
275535aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
275635aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
275735aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
275835aa4fcfSShri Abhyankar     idt  = 7*i;
275935aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
276035aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
276135aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
276235aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
276335aa4fcfSShri Abhyankar       idx   = 7*vi[m];
276435aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
276535aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
276635aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
276735aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
276835aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
276935aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
277035aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
277135aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
277235aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
277335aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
277435aa4fcfSShri Abhyankar       v += 49;
277535aa4fcfSShri Abhyankar     }
277635aa4fcfSShri Abhyankar     idc = 7*c[i];
277735aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
277835aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
277935aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
278035aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
278135aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
278235aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
278335aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
278435aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
278535aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
278635aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
278735aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
278835aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
278935aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
279035aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
279135aa4fcfSShri Abhyankar   }
279235aa4fcfSShri Abhyankar 
279335aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
279435aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27953649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
279635aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
279735aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
279835aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
279935aa4fcfSShri Abhyankar }
280035aa4fcfSShri Abhyankar 
280135aa4fcfSShri Abhyankar #undef __FUNCT__
280206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
280306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
280415091d37SBarry Smith {
280515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2806b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2807dfbe8321SBarry Smith   PetscErrorCode    ierr;
2808b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
2809d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2810d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2811d9fead3dSBarry Smith   const PetscScalar *b;
281215091d37SBarry Smith 
281315091d37SBarry Smith   PetscFunctionBegin;
28143649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
28151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
281615091d37SBarry Smith   /* forward solve the lower triangular */
281715091d37SBarry Smith   idx    = 0;
281815091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
281915091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
282015091d37SBarry Smith   x[6] = b[6+idx];
282115091d37SBarry Smith   for (i=1; i<n; i++) {
282215091d37SBarry Smith     v     =  aa + 49*ai[i];
282315091d37SBarry Smith     vi    =  aj + ai[i];
282415091d37SBarry Smith     nz    =  diag[i] - ai[i];
282515091d37SBarry Smith     idx   =  7*i;
2826f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2827f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2828f1af5d2fSBarry Smith     s7  =  b[6+idx];
282915091d37SBarry Smith     while (nz--) {
283015091d37SBarry Smith       jdx   = 7*(*vi++);
283115091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
283215091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
283315091d37SBarry Smith       x7    = x[6+jdx];
2834f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2835f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2836f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2837f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2838f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2839f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2840f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
284115091d37SBarry Smith       v += 49;
284215091d37SBarry Smith      }
2843f1af5d2fSBarry Smith     x[idx]   = s1;
2844f1af5d2fSBarry Smith     x[1+idx] = s2;
2845f1af5d2fSBarry Smith     x[2+idx] = s3;
2846f1af5d2fSBarry Smith     x[3+idx] = s4;
2847f1af5d2fSBarry Smith     x[4+idx] = s5;
2848f1af5d2fSBarry Smith     x[5+idx] = s6;
2849f1af5d2fSBarry Smith     x[6+idx] = s7;
285015091d37SBarry Smith   }
285115091d37SBarry Smith   /* backward solve the upper triangular */
285215091d37SBarry Smith   for (i=n-1; i>=0; i--){
285315091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
285415091d37SBarry Smith     vi   = aj + diag[i] + 1;
285515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
285615091d37SBarry Smith     idt  = 7*i;
2857f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2858f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2859f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
2860f1af5d2fSBarry Smith     s7 = x[6+idt];
286115091d37SBarry Smith     while (nz--) {
286215091d37SBarry Smith       idx   = 7*(*vi++);
286315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
286415091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
286515091d37SBarry Smith       x7    = x[6+idx];
2866f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2867f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2868f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2869f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2870f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2871f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2872f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
287315091d37SBarry Smith       v += 49;
287415091d37SBarry Smith     }
287515091d37SBarry Smith     v        = aa + 49*diag[i];
2876f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2877f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2878f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2879f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2880f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2881f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2882f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2883f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2884f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2885f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2886f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2887f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2888f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2889f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
289015091d37SBarry Smith   }
289115091d37SBarry Smith 
28923649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
28931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2894dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
289515091d37SBarry Smith   PetscFunctionReturn(0);
289615091d37SBarry Smith }
289715091d37SBarry Smith 
2898cee9d6f2SShri Abhyankar #undef __FUNCT__
28994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
29004dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
290153cca76cSShri Abhyankar {
290253cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2903b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
290453cca76cSShri Abhyankar     PetscErrorCode    ierr;
2905b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
2906b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
290753cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
290853cca76cSShri Abhyankar     PetscScalar       *x;
290953cca76cSShri Abhyankar     const PetscScalar *b;
291053cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
291153cca76cSShri Abhyankar 
291253cca76cSShri Abhyankar     PetscFunctionBegin;
29133649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
291453cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
291553cca76cSShri Abhyankar     /* forward solve the lower triangular */
291653cca76cSShri Abhyankar     idx    = 0;
291753cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
291853cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
291953cca76cSShri Abhyankar     for (i=1; i<n; i++) {
292053cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
292153cca76cSShri Abhyankar        vi   = aj + ai[i];
292253cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
292353cca76cSShri Abhyankar       idx   = bs*i;
292453cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
292553cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
292653cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
292753cca76cSShri Abhyankar           jdx   = bs*vi[k];
292853cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
292953cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
293053cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
293153cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
293253cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
293353cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
293453cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
293553cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
293653cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
293753cca76cSShri Abhyankar           v   +=  bs2;
293853cca76cSShri Abhyankar         }
293953cca76cSShri Abhyankar 
294053cca76cSShri Abhyankar        x[idx]   = s1;
294153cca76cSShri Abhyankar        x[1+idx] = s2;
294253cca76cSShri Abhyankar        x[2+idx] = s3;
294353cca76cSShri Abhyankar        x[3+idx] = s4;
294453cca76cSShri Abhyankar        x[4+idx] = s5;
294553cca76cSShri Abhyankar        x[5+idx] = s6;
294653cca76cSShri Abhyankar        x[6+idx] = s7;
294753cca76cSShri Abhyankar     }
294853cca76cSShri Abhyankar 
294953cca76cSShri Abhyankar    /* backward solve the upper triangular */
295053cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
295153cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
295253cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
295353cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
295453cca76cSShri Abhyankar      idt = bs*i;
295553cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
295653cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
295753cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
295853cca76cSShri Abhyankar       idx   = bs*vi[k];
295953cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
296053cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
296153cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
296253cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
296353cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
296453cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
296553cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
296653cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
296753cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
296853cca76cSShri Abhyankar         v   +=  bs2;
296953cca76cSShri Abhyankar     }
297053cca76cSShri Abhyankar     /* x = inv_diagonal*x */
297153cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
297253cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
297353cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
297453cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
297553cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
297653cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
297753cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
297853cca76cSShri Abhyankar   }
297953cca76cSShri Abhyankar 
29803649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
298153cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
298253cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
298353cca76cSShri Abhyankar   PetscFunctionReturn(0);
298453cca76cSShri Abhyankar }
298553cca76cSShri Abhyankar 
298653cca76cSShri Abhyankar #undef __FUNCT__
298706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
298806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
298915091d37SBarry Smith {
299015091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
299115091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
29926849ba73SBarry Smith   PetscErrorCode    ierr;
29935d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
2994b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2995b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2996d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2997d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2998d9fead3dSBarry Smith   const PetscScalar *b;
2999b3260449SShri Abhyankar 
300015091d37SBarry Smith   PetscFunctionBegin;
30013649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
30021ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3003f1af5d2fSBarry Smith   t  = a->solve_work;
300415091d37SBarry Smith 
300515091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
300615091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
300715091d37SBarry Smith 
300815091d37SBarry Smith   /* forward solve the lower triangular */
300915091d37SBarry Smith   idx    = 6*(*r++);
3010f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3011f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
3012f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
301315091d37SBarry Smith   for (i=1; i<n; i++) {
301415091d37SBarry Smith     v     = aa + 36*ai[i];
301515091d37SBarry Smith     vi    = aj + ai[i];
301615091d37SBarry Smith     nz    = diag[i] - ai[i];
301715091d37SBarry Smith     idx   = 6*(*r++);
3018f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3019f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
302015091d37SBarry Smith     while (nz--) {
302115091d37SBarry Smith       idx   = 6*(*vi++);
3022f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
3023f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
3024f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3025f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3026f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3027f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3028f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3029f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
303015091d37SBarry Smith       v += 36;
303115091d37SBarry Smith     }
303215091d37SBarry Smith     idx = 6*i;
3033f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3034f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
3035f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
303615091d37SBarry Smith   }
303715091d37SBarry Smith   /* backward solve the upper triangular */
303815091d37SBarry Smith   for (i=n-1; i>=0; i--){
303915091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
304015091d37SBarry Smith     vi   = aj + diag[i] + 1;
304115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
304215091d37SBarry Smith     idt  = 6*i;
3043f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3044f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
3045f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
304615091d37SBarry Smith     while (nz--) {
304715091d37SBarry Smith       idx   = 6*(*vi++);
3048f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3049f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3050f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
3051f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3052f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3053f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3054f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3055f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3056f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
305715091d37SBarry Smith       v += 36;
305815091d37SBarry Smith     }
305915091d37SBarry Smith     idc = 6*(*c--);
306015091d37SBarry Smith     v   = aa + 36*diag[i];
3061f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
3062f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
3063f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
3064f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
3065f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
3066f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
3067f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
3068f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
3069f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
3070f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
3071f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
3072f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
307315091d37SBarry Smith   }
307415091d37SBarry Smith 
307515091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
307615091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30773649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
30781ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3079dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
308015091d37SBarry Smith   PetscFunctionReturn(0);
308115091d37SBarry Smith }
308215091d37SBarry Smith 
30836506fda5SShri Abhyankar #undef __FUNCT__
30844dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6"
30854dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
30866506fda5SShri Abhyankar {
30876506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
30886506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
30896506fda5SShri Abhyankar   PetscErrorCode    ierr;
30906506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3091b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3092b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
30936506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
30946506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
30956506fda5SShri Abhyankar   const PetscScalar *b;
3096b3260449SShri Abhyankar 
30976506fda5SShri Abhyankar   PetscFunctionBegin;
30983649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
30996506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
31006506fda5SShri Abhyankar   t  = a->solve_work;
31016506fda5SShri Abhyankar 
31026506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
31036506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
31046506fda5SShri Abhyankar 
31056506fda5SShri Abhyankar   /* forward solve the lower triangular */
31066506fda5SShri Abhyankar   idx    = 6*r[0];
31076506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
31086506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
31096506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
31106506fda5SShri Abhyankar   for (i=1; i<n; i++) {
31116506fda5SShri Abhyankar     v     = aa + 36*ai[i];
31126506fda5SShri Abhyankar     vi    = aj + ai[i];
31136506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
31146506fda5SShri Abhyankar     idx   = 6*r[i];
31156506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
31166506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
31176506fda5SShri Abhyankar     for(m=0;m<nz;m++){
31186506fda5SShri Abhyankar       idx   = 6*vi[m];
31196506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
31206506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
31216506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
31226506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
31236506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
31246506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31256506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31266506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31276506fda5SShri Abhyankar       v += 36;
31286506fda5SShri Abhyankar     }
31296506fda5SShri Abhyankar     idx = 6*i;
31306506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
31316506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
31326506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
31336506fda5SShri Abhyankar   }
31346506fda5SShri Abhyankar   /* backward solve the upper triangular */
31356506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
31366506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
31376506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
31386506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
31396506fda5SShri Abhyankar     idt  = 6*i;
31406506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
31416506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
31426506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
31436506fda5SShri Abhyankar     for(m=0;m<nz;m++){
31446506fda5SShri Abhyankar       idx   = 6*vi[m];
31456506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
31466506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
31476506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
31486506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
31496506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
31506506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
31516506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31526506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31536506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31546506fda5SShri Abhyankar       v += 36;
31556506fda5SShri Abhyankar     }
31566506fda5SShri Abhyankar     idc = 6*c[i];
31576506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
31586506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
31596506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
31606506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
31616506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
31626506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
31636506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
31646506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
31656506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
31666506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
31676506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
31686506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
31696506fda5SShri Abhyankar   }
31706506fda5SShri Abhyankar 
31716506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
31726506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
31733649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
31746506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
31756506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
31766506fda5SShri Abhyankar   PetscFunctionReturn(0);
31776506fda5SShri Abhyankar }
31788f690400SShri Abhyankar 
31798f690400SShri Abhyankar #undef __FUNCT__
318006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
318106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
318215091d37SBarry Smith {
318315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3184b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3185dfbe8321SBarry Smith   PetscErrorCode    ierr;
3186b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3187d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3188d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3189d9fead3dSBarry Smith   const PetscScalar *b;
319015091d37SBarry Smith 
319115091d37SBarry Smith   PetscFunctionBegin;
31923649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
31931ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
319415091d37SBarry Smith   /* forward solve the lower triangular */
319515091d37SBarry Smith   idx    = 0;
319615091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
319715091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
319815091d37SBarry Smith   for (i=1; i<n; i++) {
319915091d37SBarry Smith     v     =  aa + 36*ai[i];
320015091d37SBarry Smith     vi    =  aj + ai[i];
320115091d37SBarry Smith     nz    =  diag[i] - ai[i];
320215091d37SBarry Smith     idx   =  6*i;
3203f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3204f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
320515091d37SBarry Smith     while (nz--) {
320615091d37SBarry Smith       jdx   = 6*(*vi++);
320715091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
320815091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3209f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3210f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3211f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3212f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3213f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3214f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
321515091d37SBarry Smith       v += 36;
321615091d37SBarry Smith      }
3217f1af5d2fSBarry Smith     x[idx]   = s1;
3218f1af5d2fSBarry Smith     x[1+idx] = s2;
3219f1af5d2fSBarry Smith     x[2+idx] = s3;
3220f1af5d2fSBarry Smith     x[3+idx] = s4;
3221f1af5d2fSBarry Smith     x[4+idx] = s5;
3222f1af5d2fSBarry Smith     x[5+idx] = s6;
322315091d37SBarry Smith   }
322415091d37SBarry Smith   /* backward solve the upper triangular */
322515091d37SBarry Smith   for (i=n-1; i>=0; i--){
322615091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
322715091d37SBarry Smith     vi   = aj + diag[i] + 1;
322815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
322915091d37SBarry Smith     idt  = 6*i;
3230f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
3231f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
3232f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
323315091d37SBarry Smith     while (nz--) {
323415091d37SBarry Smith       idx   = 6*(*vi++);
323515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
323615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3237f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3238f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3239f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3240f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3241f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3242f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
324315091d37SBarry Smith       v += 36;
324415091d37SBarry Smith     }
324515091d37SBarry Smith     v        = aa + 36*diag[i];
3246f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3247f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3248f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3249f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3250f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3251f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
325215091d37SBarry Smith   }
325315091d37SBarry Smith 
32543649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
32551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3256dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
325715091d37SBarry Smith   PetscFunctionReturn(0);
325815091d37SBarry Smith }
325915091d37SBarry Smith 
3260cee9d6f2SShri Abhyankar #undef __FUNCT__
32614dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
32624dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
326353cca76cSShri Abhyankar {
326453cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3265b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
326653cca76cSShri Abhyankar     PetscErrorCode    ierr;
3267b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
3268b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
326953cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
327053cca76cSShri Abhyankar     PetscScalar       *x;
327153cca76cSShri Abhyankar     const PetscScalar *b;
327253cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
327353cca76cSShri Abhyankar 
327453cca76cSShri Abhyankar     PetscFunctionBegin;
32753649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
327653cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
327753cca76cSShri Abhyankar     /* forward solve the lower triangular */
327853cca76cSShri Abhyankar     idx    = 0;
327953cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
328053cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
328153cca76cSShri Abhyankar     for (i=1; i<n; i++) {
328253cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
328353cca76cSShri Abhyankar        vi   = aj + ai[i];
328453cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
328553cca76cSShri Abhyankar       idx   = bs*i;
328653cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
328753cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
328853cca76cSShri Abhyankar        for(k=0;k<nz;k++){
328953cca76cSShri Abhyankar           jdx   = bs*vi[k];
329053cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
329153cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
329253cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
329353cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
329453cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
329553cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
329653cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
329753cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
329853cca76cSShri Abhyankar           v   +=  bs2;
329953cca76cSShri Abhyankar         }
330053cca76cSShri Abhyankar 
330153cca76cSShri Abhyankar        x[idx]   = s1;
330253cca76cSShri Abhyankar        x[1+idx] = s2;
330353cca76cSShri Abhyankar        x[2+idx] = s3;
330453cca76cSShri Abhyankar        x[3+idx] = s4;
330553cca76cSShri Abhyankar        x[4+idx] = s5;
330653cca76cSShri Abhyankar        x[5+idx] = s6;
330753cca76cSShri Abhyankar     }
330853cca76cSShri Abhyankar 
330953cca76cSShri Abhyankar    /* backward solve the upper triangular */
331053cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
331153cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
331253cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
331353cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
331453cca76cSShri Abhyankar      idt = bs*i;
331553cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
331653cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
331753cca76cSShri Abhyankar      for(k=0;k<nz;k++){
331853cca76cSShri Abhyankar       idx   = bs*vi[k];
331953cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
332053cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
332153cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
332253cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
332353cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
332453cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
332553cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
332653cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
332753cca76cSShri Abhyankar         v   +=  bs2;
332853cca76cSShri Abhyankar     }
332953cca76cSShri Abhyankar     /* x = inv_diagonal*x */
333053cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
333153cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
333253cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
333353cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
333453cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
333553cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
333653cca76cSShri Abhyankar   }
333753cca76cSShri Abhyankar 
33383649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
333953cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
334053cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
334153cca76cSShri Abhyankar   PetscFunctionReturn(0);
334253cca76cSShri Abhyankar }
334353cca76cSShri Abhyankar 
334453cca76cSShri Abhyankar #undef __FUNCT__
334506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
334606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
33474e2b4712SSatish Balay {
33484e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
33494e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
33506849ba73SBarry Smith   PetscErrorCode    ierr;
33515d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3352b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3353b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
3354d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3355d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3356d9fead3dSBarry Smith   const PetscScalar *b;
33574e2b4712SSatish Balay 
33584e2b4712SSatish Balay   PetscFunctionBegin;
33593649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
33601ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3361f1af5d2fSBarry Smith   t  = a->solve_work;
33624e2b4712SSatish Balay 
33634e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
33644e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
33654e2b4712SSatish Balay 
33664e2b4712SSatish Balay   /* forward solve the lower triangular */
33674e2b4712SSatish Balay   idx    = 5*(*r++);
3368f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3369f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
33704e2b4712SSatish Balay   for (i=1; i<n; i++) {
33714e2b4712SSatish Balay     v     = aa + 25*ai[i];
33724e2b4712SSatish Balay     vi    = aj + ai[i];
33734e2b4712SSatish Balay     nz    = diag[i] - ai[i];
33744e2b4712SSatish Balay     idx   = 5*(*r++);
3375f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3376f1af5d2fSBarry Smith     s5  = b[4+idx];
33774e2b4712SSatish Balay     while (nz--) {
33784e2b4712SSatish Balay       idx   = 5*(*vi++);
3379f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3380f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
3381f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3382f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3383f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3384f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3385f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33864e2b4712SSatish Balay       v += 25;
33874e2b4712SSatish Balay     }
33884e2b4712SSatish Balay     idx = 5*i;
3389f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3390f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
33914e2b4712SSatish Balay   }
33924e2b4712SSatish Balay   /* backward solve the upper triangular */
33934e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
33944e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
33954e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
33964e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
33974e2b4712SSatish Balay     idt  = 5*i;
3398f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3399f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
34004e2b4712SSatish Balay     while (nz--) {
34014e2b4712SSatish Balay       idx   = 5*(*vi++);
3402f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3403f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3404f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3405f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3406f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3407f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3408f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
34094e2b4712SSatish Balay       v += 25;
34104e2b4712SSatish Balay     }
34114e2b4712SSatish Balay     idc = 5*(*c--);
34124e2b4712SSatish Balay     v   = aa + 25*diag[i];
3413f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3414f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
3415f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3416f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
3417f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3418f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
3419f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3420f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
3421f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3422f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
34234e2b4712SSatish Balay   }
34244e2b4712SSatish Balay 
34254e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
34264e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
34273649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
34281ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3429dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
34304e2b4712SSatish Balay   PetscFunctionReturn(0);
34314e2b4712SSatish Balay }
34324e2b4712SSatish Balay 
343378bb4007SShri Abhyankar #undef __FUNCT__
34344dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5"
34354dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
343678bb4007SShri Abhyankar {
343778bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
343878bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
343978bb4007SShri Abhyankar   PetscErrorCode    ierr;
344078bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3441b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3442b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
344378bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
344478bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
344578bb4007SShri Abhyankar   const PetscScalar *b;
344678bb4007SShri Abhyankar 
344778bb4007SShri Abhyankar   PetscFunctionBegin;
34483649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
344978bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
345078bb4007SShri Abhyankar   t  = a->solve_work;
345178bb4007SShri Abhyankar 
345278bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
345378bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
345478bb4007SShri Abhyankar 
345578bb4007SShri Abhyankar   /* forward solve the lower triangular */
345678bb4007SShri Abhyankar   idx    = 5*r[0];
345778bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
345878bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
345978bb4007SShri Abhyankar   for (i=1; i<n; i++) {
346078bb4007SShri Abhyankar     v     = aa + 25*ai[i];
346178bb4007SShri Abhyankar     vi    = aj + ai[i];
346278bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
346378bb4007SShri Abhyankar     idx   = 5*r[i];
346478bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
346578bb4007SShri Abhyankar     s5  = b[4+idx];
346678bb4007SShri Abhyankar     for(m=0;m<nz;m++){
346778bb4007SShri Abhyankar       idx   = 5*vi[m];
346878bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
346978bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
347078bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
347178bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
347278bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
347378bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
347478bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
347578bb4007SShri Abhyankar       v += 25;
347678bb4007SShri Abhyankar     }
347778bb4007SShri Abhyankar     idx = 5*i;
347878bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
347978bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
348078bb4007SShri Abhyankar   }
348178bb4007SShri Abhyankar   /* backward solve the upper triangular */
348278bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
348378bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
348478bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
348578bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
348678bb4007SShri Abhyankar     idt  = 5*i;
348778bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
348878bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
348978bb4007SShri Abhyankar     for(m=0;m<nz;m++){
349078bb4007SShri Abhyankar       idx   = 5*vi[m];
349178bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
349278bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
349378bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
349478bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
349578bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
349678bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
349778bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
349878bb4007SShri Abhyankar       v += 25;
349978bb4007SShri Abhyankar     }
350078bb4007SShri Abhyankar     idc = 5*c[i];
350178bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
350278bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
350378bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
350478bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
350578bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
350678bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
350778bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
350878bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
350978bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
351078bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
351178bb4007SShri Abhyankar   }
351278bb4007SShri Abhyankar 
351378bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
351478bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
35153649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
351678bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
351778bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
351878bb4007SShri Abhyankar   PetscFunctionReturn(0);
351978bb4007SShri Abhyankar }
352078bb4007SShri Abhyankar 
35218f690400SShri Abhyankar #undef __FUNCT__
352206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
352306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
352415091d37SBarry Smith {
352515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3526b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3527b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3528dfbe8321SBarry Smith   PetscErrorCode    ierr;
3529d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3530d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3531d9fead3dSBarry Smith   const PetscScalar *b;
353215091d37SBarry Smith 
353315091d37SBarry Smith   PetscFunctionBegin;
35343649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
35351ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
353615091d37SBarry Smith   /* forward solve the lower triangular */
353715091d37SBarry Smith   idx    = 0;
353815091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
353915091d37SBarry Smith   for (i=1; i<n; i++) {
354015091d37SBarry Smith     v     =  aa + 25*ai[i];
354115091d37SBarry Smith     vi    =  aj + ai[i];
354215091d37SBarry Smith     nz    =  diag[i] - ai[i];
354315091d37SBarry Smith     idx   =  5*i;
3544f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
354515091d37SBarry Smith     while (nz--) {
354615091d37SBarry Smith       jdx   = 5*(*vi++);
354715091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3548f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3549f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3550f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3551f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3552f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
355315091d37SBarry Smith       v    += 25;
355415091d37SBarry Smith     }
3555f1af5d2fSBarry Smith     x[idx]   = s1;
3556f1af5d2fSBarry Smith     x[1+idx] = s2;
3557f1af5d2fSBarry Smith     x[2+idx] = s3;
3558f1af5d2fSBarry Smith     x[3+idx] = s4;
3559f1af5d2fSBarry Smith     x[4+idx] = s5;
356015091d37SBarry Smith   }
356115091d37SBarry Smith   /* backward solve the upper triangular */
356215091d37SBarry Smith   for (i=n-1; i>=0; i--){
356315091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
356415091d37SBarry Smith     vi   = aj + diag[i] + 1;
356515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
356615091d37SBarry Smith     idt  = 5*i;
3567f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3568f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
356915091d37SBarry Smith     while (nz--) {
357015091d37SBarry Smith       idx   = 5*(*vi++);
357115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3572f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3573f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3574f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3575f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3576f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
357715091d37SBarry Smith       v    += 25;
357815091d37SBarry Smith     }
357915091d37SBarry Smith     v        = aa + 25*diag[i];
3580f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3581f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3582f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3583f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3584f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
358515091d37SBarry Smith   }
358615091d37SBarry Smith 
35873649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
35881ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3589dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
359015091d37SBarry Smith   PetscFunctionReturn(0);
359115091d37SBarry Smith }
359215091d37SBarry Smith 
3593cee9d6f2SShri Abhyankar #undef __FUNCT__
35944dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
35954dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
359653cca76cSShri Abhyankar {
359753cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3598b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3599b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,idt,jdx;
360053cca76cSShri Abhyankar   PetscErrorCode    ierr;
360153cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
360253cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
360353cca76cSShri Abhyankar   const PetscScalar *b;
360453cca76cSShri Abhyankar 
360553cca76cSShri Abhyankar   PetscFunctionBegin;
36063649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
360753cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
360853cca76cSShri Abhyankar   /* forward solve the lower triangular */
360953cca76cSShri Abhyankar   idx    = 0;
361053cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
361153cca76cSShri Abhyankar   for (i=1; i<n; i++) {
361253cca76cSShri Abhyankar     v   = aa + 25*ai[i];
361353cca76cSShri Abhyankar     vi  = aj + ai[i];
361453cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
361553cca76cSShri Abhyankar     idx = 5*i;
361653cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
361753cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
361853cca76cSShri Abhyankar       jdx   = 5*vi[k];
361953cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
362053cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
362153cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
362253cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
362353cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
362453cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
362553cca76cSShri Abhyankar       v    += 25;
362653cca76cSShri Abhyankar     }
362753cca76cSShri Abhyankar     x[idx]   = s1;
362853cca76cSShri Abhyankar     x[1+idx] = s2;
362953cca76cSShri Abhyankar     x[2+idx] = s3;
363053cca76cSShri Abhyankar     x[3+idx] = s4;
363153cca76cSShri Abhyankar     x[4+idx] = s5;
363253cca76cSShri Abhyankar   }
363353cca76cSShri Abhyankar 
363453cca76cSShri Abhyankar   /* backward solve the upper triangular */
363553cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
363653cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
363753cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
363853cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
363953cca76cSShri Abhyankar     idt = 5*i;
364053cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
364153cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
364253cca76cSShri Abhyankar     for(k=0;k<nz;k++){
364353cca76cSShri Abhyankar       idx   = 5*vi[k];
364453cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
364553cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
364653cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
364753cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
364853cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
364953cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
365053cca76cSShri Abhyankar       v    += 25;
365153cca76cSShri Abhyankar     }
365253cca76cSShri Abhyankar     /* x = inv_diagonal*x */
365353cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
365453cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
365553cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
365653cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
365753cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
365853cca76cSShri Abhyankar   }
365953cca76cSShri Abhyankar 
36603649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
366153cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
366253cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
366353cca76cSShri Abhyankar   PetscFunctionReturn(0);
366453cca76cSShri Abhyankar }
366553cca76cSShri Abhyankar 
366653cca76cSShri Abhyankar #undef __FUNCT__
366706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
366806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
36694e2b4712SSatish Balay {
36704e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
36714e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
36726849ba73SBarry Smith   PetscErrorCode    ierr;
3673b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3674b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
36755d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3676d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3677d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3678d9fead3dSBarry Smith   const PetscScalar *b;
36794e2b4712SSatish Balay 
36804e2b4712SSatish Balay   PetscFunctionBegin;
36813649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
36821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3683f1af5d2fSBarry Smith   t  = a->solve_work;
36844e2b4712SSatish Balay 
36854e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
36864e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
36874e2b4712SSatish Balay 
36884e2b4712SSatish Balay   /* forward solve the lower triangular */
36894e2b4712SSatish Balay   idx    = 4*(*r++);
3690f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3691f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
36924e2b4712SSatish Balay   for (i=1; i<n; i++) {
36934e2b4712SSatish Balay     v     = aa + 16*ai[i];
36944e2b4712SSatish Balay     vi    = aj + ai[i];
36954e2b4712SSatish Balay     nz    = diag[i] - ai[i];
36964e2b4712SSatish Balay     idx   = 4*(*r++);
3697f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
36984e2b4712SSatish Balay     while (nz--) {
36994e2b4712SSatish Balay       idx   = 4*(*vi++);
3700f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3701f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3702f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3703f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3704f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
37054e2b4712SSatish Balay       v    += 16;
37064e2b4712SSatish Balay     }
37074e2b4712SSatish Balay     idx        = 4*i;
3708f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3709f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
37104e2b4712SSatish Balay   }
37114e2b4712SSatish Balay   /* backward solve the upper triangular */
37124e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
37134e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
37144e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
37154e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
37164e2b4712SSatish Balay     idt  = 4*i;
3717f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3718f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
37194e2b4712SSatish Balay     while (nz--) {
37204e2b4712SSatish Balay       idx   = 4*(*vi++);
3721f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3722f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3723f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3724f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3725f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3726f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
37274e2b4712SSatish Balay       v += 16;
37284e2b4712SSatish Balay     }
37294e2b4712SSatish Balay     idc      = 4*(*c--);
37304e2b4712SSatish Balay     v        = aa + 16*diag[i];
3731f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3732f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3733f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3734f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
37354e2b4712SSatish Balay   }
37364e2b4712SSatish Balay 
37374e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
37384e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37393649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
37401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3741dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
37424e2b4712SSatish Balay   PetscFunctionReturn(0);
37434e2b4712SSatish Balay }
3744f26ec98cSKris Buschelman 
37458f690400SShri Abhyankar #undef __FUNCT__
37464dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4"
37474dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
374878bb4007SShri Abhyankar {
374978bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
375078bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
375178bb4007SShri Abhyankar   PetscErrorCode    ierr;
3752b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3753b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
375478bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
375578bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
375678bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
375778bb4007SShri Abhyankar   const PetscScalar *b;
375878bb4007SShri Abhyankar 
375978bb4007SShri Abhyankar   PetscFunctionBegin;
37603649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
376178bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
376278bb4007SShri Abhyankar   t  = a->solve_work;
376378bb4007SShri Abhyankar 
376478bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
376578bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
376678bb4007SShri Abhyankar 
376778bb4007SShri Abhyankar   /* forward solve the lower triangular */
376878bb4007SShri Abhyankar   idx    = 4*r[0];
376978bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
377078bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
377178bb4007SShri Abhyankar   for (i=1; i<n; i++) {
377278bb4007SShri Abhyankar     v     = aa + 16*ai[i];
377378bb4007SShri Abhyankar     vi    = aj + ai[i];
377478bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
377578bb4007SShri Abhyankar     idx   = 4*r[i];
377678bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
377778bb4007SShri Abhyankar     for(m=0;m<nz;m++){
377878bb4007SShri Abhyankar       idx   = 4*vi[m];
377978bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
378078bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
378178bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
378278bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
378378bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
378478bb4007SShri Abhyankar       v    += 16;
378578bb4007SShri Abhyankar     }
378678bb4007SShri Abhyankar     idx        = 4*i;
378778bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
378878bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
378978bb4007SShri Abhyankar   }
379078bb4007SShri Abhyankar   /* backward solve the upper triangular */
379178bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
379278bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
379378bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
379478bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
379578bb4007SShri Abhyankar     idt  = 4*i;
379678bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
379778bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
379878bb4007SShri Abhyankar     for(m=0;m<nz;m++){
379978bb4007SShri Abhyankar       idx   = 4*vi[m];
380078bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
380178bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
380278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
380378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
380478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
380578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
380678bb4007SShri Abhyankar       v += 16;
380778bb4007SShri Abhyankar     }
380878bb4007SShri Abhyankar     idc      = 4*c[i];
380978bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
381078bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
381178bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
381278bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
381378bb4007SShri Abhyankar   }
381478bb4007SShri Abhyankar 
381578bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
381678bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
38173649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
381878bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
381978bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
382078bb4007SShri Abhyankar   PetscFunctionReturn(0);
382178bb4007SShri Abhyankar }
382278bb4007SShri Abhyankar 
382378bb4007SShri Abhyankar #undef __FUNCT__
3824f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3825dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3826f26ec98cSKris Buschelman {
3827f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3828f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
38296849ba73SBarry Smith   PetscErrorCode    ierr;
3830b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3831b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
38325d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3833d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3834d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3835d9fead3dSBarry Smith   PetscScalar       *x;
3836d9fead3dSBarry Smith   const PetscScalar *b;
3837f26ec98cSKris Buschelman 
3838f26ec98cSKris Buschelman   PetscFunctionBegin;
38393649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
38401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3841f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3842f26ec98cSKris Buschelman 
3843f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3844f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3845f26ec98cSKris Buschelman 
3846f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3847f26ec98cSKris Buschelman   idx    = 4*(*r++);
3848f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3849f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3850f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3851f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3852f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3853f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3854f26ec98cSKris Buschelman     vi    = aj + ai[i];
3855f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3856f26ec98cSKris Buschelman     idx   = 4*(*r++);
3857f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3858f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3859f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3860f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3861f26ec98cSKris Buschelman     while (nz--) {
3862f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3863f26ec98cSKris Buschelman       x1  = t[idx];
3864f26ec98cSKris Buschelman       x2  = t[1+idx];
3865f26ec98cSKris Buschelman       x3  = t[2+idx];
3866f26ec98cSKris Buschelman       x4  = t[3+idx];
3867f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3868f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3869f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3870f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3871f26ec98cSKris Buschelman       v    += 16;
3872f26ec98cSKris Buschelman     }
3873f26ec98cSKris Buschelman     idx        = 4*i;
3874f26ec98cSKris Buschelman     t[idx]   = s1;
3875f26ec98cSKris Buschelman     t[1+idx] = s2;
3876f26ec98cSKris Buschelman     t[2+idx] = s3;
3877f26ec98cSKris Buschelman     t[3+idx] = s4;
3878f26ec98cSKris Buschelman   }
3879f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3880f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
3881f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3882f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3883f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3884f26ec98cSKris Buschelman     idt  = 4*i;
3885f26ec98cSKris Buschelman     s1 = t[idt];
3886f26ec98cSKris Buschelman     s2 = t[1+idt];
3887f26ec98cSKris Buschelman     s3 = t[2+idt];
3888f26ec98cSKris Buschelman     s4 = t[3+idt];
3889f26ec98cSKris Buschelman     while (nz--) {
3890f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3891f26ec98cSKris Buschelman       x1  = t[idx];
3892f26ec98cSKris Buschelman       x2  = t[1+idx];
3893f26ec98cSKris Buschelman       x3  = t[2+idx];
3894f26ec98cSKris Buschelman       x4  = t[3+idx];
3895f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3896f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3897f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3898f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3899f26ec98cSKris Buschelman       v += 16;
3900f26ec98cSKris Buschelman     }
3901f26ec98cSKris Buschelman     idc      = 4*(*c--);
3902f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3903f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3904f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3905f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3906f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3907f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3908f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3909f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3910f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3911f26ec98cSKris Buschelman  }
3912f26ec98cSKris Buschelman 
3913f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3914f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
39153649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
39161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3917dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3918f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3919f26ec98cSKris Buschelman }
3920f26ec98cSKris Buschelman 
392124c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
392224c233c2SKris Buschelman 
392324c233c2SKris Buschelman #include PETSC_HAVE_SSE
392424c233c2SKris Buschelman 
392524c233c2SKris Buschelman #undef __FUNCT__
392624c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3927dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
392824c233c2SKris Buschelman {
392924c233c2SKris Buschelman   /*
393024c233c2SKris Buschelman      Note: This code uses demotion of double
393124c233c2SKris Buschelman      to float when performing the mixed-mode computation.
393224c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
393324c233c2SKris Buschelman   */
393424c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
393524c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
39366849ba73SBarry Smith   PetscErrorCode ierr;
39375d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
39385d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
393924c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
394087828ca2SBarry Smith   PetscScalar    *x,*b,*t;
394124c233c2SKris Buschelman 
394224c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
394324c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
394424c233c2SKris Buschelman   unsigned long   offset;
394524c233c2SKris Buschelman 
394624c233c2SKris Buschelman   PetscFunctionBegin;
394724c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
394824c233c2SKris Buschelman 
394924c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
395024c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
395124c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
395224c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
395324c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
395424c233c2SKris Buschelman 
39551ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
39561ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
395724c233c2SKris Buschelman     t  = a->solve_work;
395824c233c2SKris Buschelman 
395924c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
396024c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
396124c233c2SKris Buschelman 
396224c233c2SKris Buschelman     /* forward solve the lower triangular */
396324c233c2SKris Buschelman     idx  = 4*(*r++);
396424c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
396524c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
396624c233c2SKris Buschelman     v    =  aa + 16*ai[1];
396724c233c2SKris Buschelman 
396824c233c2SKris Buschelman     for (i=1; i<n;) {
396924c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
397024c233c2SKris Buschelman       vi   =  aj      + ai[i];
397124c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
397224c233c2SKris Buschelman       idx  =  4*(*r++);
397324c233c2SKris Buschelman 
397424c233c2SKris Buschelman       /* Demote sum from double to float */
397524c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
397624c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
397724c233c2SKris Buschelman 
397824c233c2SKris Buschelman       while (nz--) {
397924c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
398024c233c2SKris Buschelman         idx = 4*(*vi++);
398124c233c2SKris Buschelman 
398224c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
398324c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
398424c233c2SKris Buschelman 
398524c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
398624c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
398724c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
398824c233c2SKris Buschelman 
398924c233c2SKris Buschelman           /* First Column */
399024c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
399124c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
399224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
399324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
399424c233c2SKris Buschelman 
399524c233c2SKris Buschelman           /* Second Column */
399624c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
399724c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
399824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
399924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
400024c233c2SKris Buschelman 
400124c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
400224c233c2SKris Buschelman 
400324c233c2SKris Buschelman           /* Third Column */
400424c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
400524c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
400624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
400724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
400824c233c2SKris Buschelman 
400924c233c2SKris Buschelman           /* Fourth Column */
401024c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
401124c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
401224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
401324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
401424c233c2SKris Buschelman         SSE_INLINE_END_2
401524c233c2SKris Buschelman 
401624c233c2SKris Buschelman         v  += 16;
401724c233c2SKris Buschelman       }
401824c233c2SKris Buschelman       idx = 4*i;
401924c233c2SKris Buschelman       v   = aa + 16*ai[++i];
402024c233c2SKris Buschelman       PREFETCH_NTA(v);
402124c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
402224c233c2SKris Buschelman 
402324c233c2SKris Buschelman       /* Promote result from float to double */
402424c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
402524c233c2SKris Buschelman     }
402624c233c2SKris Buschelman     /* backward solve the upper triangular */
402724c233c2SKris Buschelman     idt  = 4*(n-1);
402824c233c2SKris Buschelman     ai16 = 16*diag[n-1];
402924c233c2SKris Buschelman     v    = aa + ai16 + 16;
403024c233c2SKris Buschelman     for (i=n-1; i>=0;){
403124c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
403224c233c2SKris Buschelman       vi = aj + diag[i] + 1;
403324c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
403424c233c2SKris Buschelman 
403524c233c2SKris Buschelman       /* Demote accumulator from double to float */
403624c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
403724c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
403824c233c2SKris Buschelman 
403924c233c2SKris Buschelman       while (nz--) {
404024c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
404124c233c2SKris Buschelman         idx = 4*(*vi++);
404224c233c2SKris Buschelman 
404324c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
404424c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
404524c233c2SKris Buschelman 
404624c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
404724c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
404824c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
404924c233c2SKris Buschelman 
405024c233c2SKris Buschelman           /* First Column */
405124c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
405224c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
405324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
405424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
405524c233c2SKris Buschelman 
405624c233c2SKris Buschelman           /* Second Column */
405724c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
405824c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
405924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
406024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
406124c233c2SKris Buschelman 
406224c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
406324c233c2SKris Buschelman 
406424c233c2SKris Buschelman           /* Third Column */
406524c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
406624c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
406724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
406824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
406924c233c2SKris Buschelman 
407024c233c2SKris Buschelman           /* Fourth Column */
407124c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
407224c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
407324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
407424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
407524c233c2SKris Buschelman         SSE_INLINE_END_2
407624c233c2SKris Buschelman         v  += 16;
407724c233c2SKris Buschelman       }
407824c233c2SKris Buschelman       v    = aa + ai16;
407924c233c2SKris Buschelman       ai16 = 16*diag[--i];
408024c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
408124c233c2SKris Buschelman       /*
408224c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
408324c233c2SKris Buschelman          which was inverted as part of the factorization
408424c233c2SKris Buschelman       */
408524c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
408624c233c2SKris Buschelman         /* First Column */
408724c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
408824c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
408924c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
409024c233c2SKris Buschelman 
409124c233c2SKris Buschelman         /* Second Column */
409224c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
409324c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
409424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
409524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
409624c233c2SKris Buschelman 
409724c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
409824c233c2SKris Buschelman 
409924c233c2SKris Buschelman         /* Third Column */
410024c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
410124c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
410224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
410324c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
410424c233c2SKris Buschelman 
410524c233c2SKris Buschelman         /* Fourth Column */
410624c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
410724c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
410824c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
410924c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
411024c233c2SKris Buschelman 
411124c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
411224c233c2SKris Buschelman       SSE_INLINE_END_3
411324c233c2SKris Buschelman 
411424c233c2SKris Buschelman       /* Promote solution from float to double */
411524c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
411624c233c2SKris Buschelman 
411724c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
411824c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
411924c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
412024c233c2SKris Buschelman       idc  = 4*(*c--);
412124c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
412224c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
412324c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
412424c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
412524c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
412624c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
412724c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
412824c233c2SKris Buschelman       SSE_INLINE_END_2
412924c233c2SKris Buschelman       v    = aa + ai16 + 16;
413024c233c2SKris Buschelman       idt -= 4;
413124c233c2SKris Buschelman     }
413224c233c2SKris Buschelman 
413324c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
413424c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
41351ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
41361ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4137dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
413824c233c2SKris Buschelman   SSE_SCOPE_END;
413924c233c2SKris Buschelman   PetscFunctionReturn(0);
414024c233c2SKris Buschelman }
414124c233c2SKris Buschelman 
414224c233c2SKris Buschelman #endif
41430ef38995SBarry Smith 
41440ef38995SBarry Smith 
41454e2b4712SSatish Balay /*
41464e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
41474e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
41484e2b4712SSatish Balay */
41494a2ae208SSatish Balay #undef __FUNCT__
415006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
415106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
41524e2b4712SSatish Balay {
41534e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4154356650c2SBarry Smith   PetscInt          n=a->mbs;
4155356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
4156dfbe8321SBarry Smith   PetscErrorCode    ierr;
4157356650c2SBarry Smith   const PetscInt    *diag = a->diag;
4158d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
4159d9fead3dSBarry Smith   PetscScalar       *x;
4160d9fead3dSBarry Smith   const PetscScalar *b;
41614e2b4712SSatish Balay 
41624e2b4712SSatish Balay   PetscFunctionBegin;
41633649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
41641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
41654e2b4712SSatish Balay 
4166aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
41672853dc0eSBarry Smith   {
416887828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41692853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
41702853dc0eSBarry Smith   }
4171aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
41722853dc0eSBarry Smith   {
417387828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
41742853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
41752853dc0eSBarry Smith   }
4176aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
41772853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4178e1293385SBarry Smith #else
417930d4dcafSBarry Smith   {
418087828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4181d9fead3dSBarry Smith     const MatScalar *v;
4182356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
4183356650c2SBarry Smith     const PetscInt  *vi;
4184e1293385SBarry Smith 
41854e2b4712SSatish Balay   /* forward solve the lower triangular */
41864e2b4712SSatish Balay   idx    = 0;
4187e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
41884e2b4712SSatish Balay   for (i=1; i<n; i++) {
41894e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
41904e2b4712SSatish Balay     vi    =  aj      + ai[i];
41914e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
4192e1293385SBarry Smith     idx   +=  4;
4193f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
41944e2b4712SSatish Balay     while (nz--) {
41954e2b4712SSatish Balay       jdx   = 4*(*vi++);
41964e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4197f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4198f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4199f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4200f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
42014e2b4712SSatish Balay       v    += 16;
42024e2b4712SSatish Balay     }
4203f1af5d2fSBarry Smith     x[idx]   = s1;
4204f1af5d2fSBarry Smith     x[1+idx] = s2;
4205f1af5d2fSBarry Smith     x[2+idx] = s3;
4206f1af5d2fSBarry Smith     x[3+idx] = s4;
42074e2b4712SSatish Balay   }
42084e2b4712SSatish Balay   /* backward solve the upper triangular */
42094e555682SBarry Smith   idt = 4*(n-1);
42104e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
42114e555682SBarry Smith     ai16 = 16*diag[i];
42124e555682SBarry Smith     v    = aa + ai16 + 16;
42134e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
42144e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4215f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4216f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
42174e2b4712SSatish Balay     while (nz--) {
42184e2b4712SSatish Balay       idx   = 4*(*vi++);
42194e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4220f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4221f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4222f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4223f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
42244e2b4712SSatish Balay       v    += 16;
42254e2b4712SSatish Balay     }
42264e555682SBarry Smith     v        = aa + ai16;
4227f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4228f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4229f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4230f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4231329f5518SBarry Smith     idt -= 4;
42324e2b4712SSatish Balay   }
423330d4dcafSBarry Smith   }
4234e1293385SBarry Smith #endif
42354e2b4712SSatish Balay 
42363649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
42371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4238dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
42394e2b4712SSatish Balay   PetscFunctionReturn(0);
42404e2b4712SSatish Balay }
42414e2b4712SSatish Balay 
4242b2b2dd24SShri Abhyankar #undef __FUNCT__
42434dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
42444dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4245b2b2dd24SShri Abhyankar {
4246b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4247b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4248b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4249b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4250b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4251b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4252b2b2dd24SShri Abhyankar     PetscScalar       *x;
4253b2b2dd24SShri Abhyankar     const PetscScalar *b;
4254b2b2dd24SShri Abhyankar     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4255cee9d6f2SShri Abhyankar 
4256b2b2dd24SShri Abhyankar     PetscFunctionBegin;
42573649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
4258b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4259b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4260b2b2dd24SShri Abhyankar     idx    = 0;
4261b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4262b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4263b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4264b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4265b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4266b2b2dd24SShri Abhyankar       idx   = bs*i;
4267b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4268b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
4269b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
4270b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4271b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4272b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4273b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4274b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4275b2b2dd24SShri Abhyankar 
4276b2b2dd24SShri Abhyankar           v   +=  bs2;
4277b2b2dd24SShri Abhyankar         }
4278b2b2dd24SShri Abhyankar 
4279b2b2dd24SShri Abhyankar        x[idx]   = s1;
4280b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4281b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4282b2b2dd24SShri Abhyankar        x[3+idx] = s4;
4283b2b2dd24SShri Abhyankar     }
4284b2b2dd24SShri Abhyankar 
4285b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4286b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4287b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4288b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4289b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4290b2b2dd24SShri Abhyankar      idt = bs*i;
4291b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4292b2b2dd24SShri Abhyankar 
4293b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
4294b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
4295b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4296b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4297b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4298b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4299b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4300b2b2dd24SShri Abhyankar 
4301b2b2dd24SShri Abhyankar         v   +=  bs2;
4302b2b2dd24SShri Abhyankar     }
4303b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4304b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4305b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4306b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4307b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4308b2b2dd24SShri Abhyankar 
4309b2b2dd24SShri Abhyankar   }
4310b2b2dd24SShri Abhyankar 
43113649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4312b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4313b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4314b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4315b2b2dd24SShri Abhyankar }
4316cee9d6f2SShri Abhyankar 
4317cee9d6f2SShri Abhyankar #undef __FUNCT__
4318f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4319dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4320f26ec98cSKris Buschelman {
4321f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4322b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4323dfbe8321SBarry Smith   PetscErrorCode    ierr;
4324b3260449SShri Abhyankar   const MatScalar   *aa=a->a;
4325b3260449SShri Abhyankar   const PetscScalar *b;
4326b3260449SShri Abhyankar   PetscScalar       *x;
4327f26ec98cSKris Buschelman 
4328f26ec98cSKris Buschelman   PetscFunctionBegin;
43293649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
43301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4331f26ec98cSKris Buschelman 
4332f26ec98cSKris Buschelman   {
4333f26ec98cSKris Buschelman     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4334b3260449SShri Abhyankar     const MatScalar  *v;
4335b3260449SShri Abhyankar     MatScalar        *t=(MatScalar *)x;
4336b3260449SShri Abhyankar     PetscInt         jdx,idt,idx,nz,i,ai16;
4337b3260449SShri Abhyankar     const PetscInt   *vi;
4338f26ec98cSKris Buschelman 
4339f26ec98cSKris Buschelman     /* forward solve the lower triangular */
4340f26ec98cSKris Buschelman     idx  = 0;
4341f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
4342f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
4343f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
4344f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
4345f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
4346f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
4347f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
4348f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
4349f26ec98cSKris Buschelman       idx   +=  4;
4350f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
4351f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
4352f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
4353f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
4354f26ec98cSKris Buschelman       while (nz--) {
4355f26ec98cSKris Buschelman         jdx = 4*(*vi++);
4356f26ec98cSKris Buschelman         x1  = t[jdx];
4357f26ec98cSKris Buschelman         x2  = t[1+jdx];
4358f26ec98cSKris Buschelman         x3  = t[2+jdx];
4359f26ec98cSKris Buschelman         x4  = t[3+jdx];
4360f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4361f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4362f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4363f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4364f26ec98cSKris Buschelman         v    += 16;
4365f26ec98cSKris Buschelman       }
4366f26ec98cSKris Buschelman       t[idx]   = s1;
4367f26ec98cSKris Buschelman       t[1+idx] = s2;
4368f26ec98cSKris Buschelman       t[2+idx] = s3;
4369f26ec98cSKris Buschelman       t[3+idx] = s4;
4370f26ec98cSKris Buschelman     }
4371f26ec98cSKris Buschelman     /* backward solve the upper triangular */
4372f26ec98cSKris Buschelman     idt = 4*(n-1);
4373f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
4374f26ec98cSKris Buschelman       ai16 = 16*diag[i];
4375f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
4376f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
4377f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
4378f26ec98cSKris Buschelman       s1   = t[idt];
4379f26ec98cSKris Buschelman       s2   = t[1+idt];
4380f26ec98cSKris Buschelman       s3   = t[2+idt];
4381f26ec98cSKris Buschelman       s4   = t[3+idt];
4382f26ec98cSKris Buschelman       while (nz--) {
4383f26ec98cSKris Buschelman         idx = 4*(*vi++);
4384f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
4385f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
4386f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
4387f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
4388f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4389f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4390f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4391f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4392f26ec98cSKris Buschelman         v    += 16;
4393f26ec98cSKris Buschelman       }
4394f26ec98cSKris Buschelman       v        = aa + ai16;
4395f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4396f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4397f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4398f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4399f26ec98cSKris Buschelman       idt -= 4;
4400f26ec98cSKris Buschelman     }
4401f26ec98cSKris Buschelman   }
4402f26ec98cSKris Buschelman 
44033649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
44041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4405dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4406f26ec98cSKris Buschelman   PetscFunctionReturn(0);
4407f26ec98cSKris Buschelman }
4408f26ec98cSKris Buschelman 
44093660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
44103660e330SKris Buschelman 
44113660e330SKris Buschelman #include PETSC_HAVE_SSE
44123660e330SKris Buschelman #undef __FUNCT__
44137cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4414dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
44153660e330SKris Buschelman {
44163660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
44172aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
4418dfbe8321SBarry Smith   PetscErrorCode ierr;
4419dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
44203660e330SKris Buschelman   MatScalar      *aa=a->a;
442187828ca2SBarry Smith   PetscScalar    *x,*b;
44223660e330SKris Buschelman 
44233660e330SKris Buschelman   PetscFunctionBegin;
44243660e330SKris Buschelman   SSE_SCOPE_BEGIN;
44253660e330SKris Buschelman   /*
44263660e330SKris Buschelman      Note: This code currently uses demotion of double
44273660e330SKris Buschelman      to float when performing the mixed-mode computation.
44283660e330SKris Buschelman      This may not be numerically reasonable for all applications.
44293660e330SKris Buschelman   */
44303660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
44313660e330SKris Buschelman 
44321ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
44331ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
44343660e330SKris Buschelman   {
4435eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
4436eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
44372aa5897fSKris Buschelman     int            nz,i,idt,ai16;
44382aa5897fSKris Buschelman     unsigned int   jdx,idx;
44392aa5897fSKris Buschelman     unsigned short *vi;
4440eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
44413660e330SKris Buschelman 
4442eb05f457SKris Buschelman     /* First block is the identity. */
44433660e330SKris Buschelman     idx  = 0;
4444eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
44452aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
44463660e330SKris Buschelman 
44473660e330SKris Buschelman     for (i=1; i<n;) {
44483660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
44493660e330SKris Buschelman       vi   =  aj      + ai[i];
44503660e330SKris Buschelman       nz   =  diag[i] - ai[i];
44513660e330SKris Buschelman       idx +=  4;
44523660e330SKris Buschelman 
4453eb05f457SKris Buschelman       /* Demote RHS from double to float. */
4454eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4455eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
44563660e330SKris Buschelman 
44573660e330SKris Buschelman       while (nz--) {
44583660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44592aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
44603660e330SKris Buschelman 
44613660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
4462eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
44633660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44643660e330SKris Buschelman 
44653660e330SKris Buschelman           /* First Column */
44663660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
44673660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
44683660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44693660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
44703660e330SKris Buschelman 
44713660e330SKris Buschelman           /* Second Column */
44723660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
44733660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
44743660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44753660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
44763660e330SKris Buschelman 
44773660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44783660e330SKris Buschelman 
44793660e330SKris Buschelman           /* Third Column */
44803660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
44813660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
44823660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44833660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
44843660e330SKris Buschelman 
44853660e330SKris Buschelman           /* Fourth Column */
44863660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
44873660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
44883660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
44893660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
44903660e330SKris Buschelman         SSE_INLINE_END_2
44913660e330SKris Buschelman 
44923660e330SKris Buschelman         v  += 16;
44933660e330SKris Buschelman       }
44943660e330SKris Buschelman       v    =  aa + 16*ai[++i];
44953660e330SKris Buschelman       PREFETCH_NTA(v);
4496eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
44973660e330SKris Buschelman     }
4498eb05f457SKris Buschelman 
4499eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
4500eb05f457SKris Buschelman 
45013660e330SKris Buschelman     idt  = 4*(n-1);
45023660e330SKris Buschelman     ai16 = 16*diag[n-1];
45033660e330SKris Buschelman     v    = aa + ai16 + 16;
45043660e330SKris Buschelman     for (i=n-1; i>=0;){
45053660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
45063660e330SKris Buschelman       vi = aj + diag[i] + 1;
45073660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
45083660e330SKris Buschelman 
4509eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
45103660e330SKris Buschelman 
45113660e330SKris Buschelman       while (nz--) {
45123660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
45132aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
45143660e330SKris Buschelman 
45153660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
4516eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
45173660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
45183660e330SKris Buschelman 
45193660e330SKris Buschelman           /* First Column */
45203660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
45213660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
45223660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
45233660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
45243660e330SKris Buschelman 
45253660e330SKris Buschelman           /* Second Column */
45263660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
45273660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
45283660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
45293660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
45303660e330SKris Buschelman 
45313660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
45323660e330SKris Buschelman 
45333660e330SKris Buschelman           /* Third Column */
45343660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
45353660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
45363660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
45373660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
45383660e330SKris Buschelman 
45393660e330SKris Buschelman           /* Fourth Column */
45403660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
45413660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
45423660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
45433660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
45443660e330SKris Buschelman         SSE_INLINE_END_2
45453660e330SKris Buschelman         v  += 16;
45463660e330SKris Buschelman       }
45473660e330SKris Buschelman       v    = aa + ai16;
45483660e330SKris Buschelman       ai16 = 16*diag[--i];
45493660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
45503660e330SKris Buschelman       /*
45513660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
45523660e330SKris Buschelman          which was inverted as part of the factorization
45533660e330SKris Buschelman       */
4554eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
45553660e330SKris Buschelman         /* First Column */
45563660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
45573660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
45583660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
45593660e330SKris Buschelman 
45603660e330SKris Buschelman         /* Second Column */
45613660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
45623660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
45633660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
45643660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
45653660e330SKris Buschelman 
45663660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
45673660e330SKris Buschelman 
45683660e330SKris Buschelman         /* Third Column */
45693660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
45703660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
45713660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
45723660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
45733660e330SKris Buschelman 
45743660e330SKris Buschelman         /* Fourth Column */
45753660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
45763660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
45773660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
45783660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
45793660e330SKris Buschelman 
45803660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
45813660e330SKris Buschelman       SSE_INLINE_END_3
45823660e330SKris Buschelman 
45833660e330SKris Buschelman       v    = aa + ai16 + 16;
45843660e330SKris Buschelman       idt -= 4;
45853660e330SKris Buschelman     }
4586eb05f457SKris Buschelman 
4587eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
4588eb05f457SKris Buschelman     idt = 4*(n-1);
4589eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
4590eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4591eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4592eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
4593eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
4594eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
4595eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
4596eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
4597eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
459854693613SKris Buschelman       idt -= 4;
45993660e330SKris Buschelman     }
4600eb05f457SKris Buschelman 
4601eb05f457SKris Buschelman   } /* End of artificial scope. */
46021ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
46031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4604dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
46053660e330SKris Buschelman   SSE_SCOPE_END;
46063660e330SKris Buschelman   PetscFunctionReturn(0);
46073660e330SKris Buschelman }
46083660e330SKris Buschelman 
46097cf1b8d3SKris Buschelman #undef __FUNCT__
46107cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4611dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
46127cf1b8d3SKris Buschelman {
46137cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
46147cf1b8d3SKris Buschelman   int            *aj=a->j;
4615dfbe8321SBarry Smith   PetscErrorCode ierr;
4616dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
46177cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
46187cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
46197cf1b8d3SKris Buschelman 
46207cf1b8d3SKris Buschelman   PetscFunctionBegin;
46217cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
46227cf1b8d3SKris Buschelman   /*
46237cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
46247cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
46257cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
46267cf1b8d3SKris Buschelman   */
46277cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
46287cf1b8d3SKris Buschelman 
46291ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
46301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
46317cf1b8d3SKris Buschelman   {
46327cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
46337cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
46347cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
46357cf1b8d3SKris Buschelman     int       jdx,idx;
46367cf1b8d3SKris Buschelman     int       *vi;
46377cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
46387cf1b8d3SKris Buschelman 
46397cf1b8d3SKris Buschelman     /* First block is the identity. */
46407cf1b8d3SKris Buschelman     idx  = 0;
46417cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
46427cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
46437cf1b8d3SKris Buschelman 
46447cf1b8d3SKris Buschelman     for (i=1; i<n;) {
46457cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
46467cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
46477cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
46487cf1b8d3SKris Buschelman       idx +=  4;
46497cf1b8d3SKris Buschelman 
46507cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
46517cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
46527cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
46537cf1b8d3SKris Buschelman 
46547cf1b8d3SKris Buschelman       while (nz--) {
46557cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46567cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
46577cf1b8d3SKris Buschelman /*          jdx = *vi++; */
46587cf1b8d3SKris Buschelman 
46597cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
46607cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
46617cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46627cf1b8d3SKris Buschelman 
46637cf1b8d3SKris Buschelman           /* First Column */
46647cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
46657cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
46667cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46677cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
46687cf1b8d3SKris Buschelman 
46697cf1b8d3SKris Buschelman           /* Second Column */
46707cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
46717cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
46727cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46737cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
46747cf1b8d3SKris Buschelman 
46757cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46767cf1b8d3SKris Buschelman 
46777cf1b8d3SKris Buschelman           /* Third Column */
46787cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
46797cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
46807cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46817cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
46827cf1b8d3SKris Buschelman 
46837cf1b8d3SKris Buschelman           /* Fourth Column */
46847cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
46857cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
46867cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
46877cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
46887cf1b8d3SKris Buschelman         SSE_INLINE_END_2
46897cf1b8d3SKris Buschelman 
46907cf1b8d3SKris Buschelman         v  += 16;
46917cf1b8d3SKris Buschelman       }
46927cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
46937cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
46947cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
46957cf1b8d3SKris Buschelman     }
46967cf1b8d3SKris Buschelman 
46977cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
46987cf1b8d3SKris Buschelman 
46997cf1b8d3SKris Buschelman     idt  = 4*(n-1);
47007cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
47017cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
47027cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
47037cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
47047cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
47057cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
47067cf1b8d3SKris Buschelman 
47077cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
47087cf1b8d3SKris Buschelman 
47097cf1b8d3SKris Buschelman       while (nz--) {
47107cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
47117cf1b8d3SKris Buschelman         idx = 4*(*vi++);
47127cf1b8d3SKris Buschelman /*          idx = *vi++; */
47137cf1b8d3SKris Buschelman 
47147cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
47157cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
47167cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
47177cf1b8d3SKris Buschelman 
47187cf1b8d3SKris Buschelman           /* First Column */
47197cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
47207cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
47217cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
47227cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
47237cf1b8d3SKris Buschelman 
47247cf1b8d3SKris Buschelman           /* Second Column */
47257cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
47267cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
47277cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
47287cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
47297cf1b8d3SKris Buschelman 
47307cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
47317cf1b8d3SKris Buschelman 
47327cf1b8d3SKris Buschelman           /* Third Column */
47337cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
47347cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
47357cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
47367cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
47377cf1b8d3SKris Buschelman 
47387cf1b8d3SKris Buschelman           /* Fourth Column */
47397cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
47407cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
47417cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
47427cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
47437cf1b8d3SKris Buschelman         SSE_INLINE_END_2
47447cf1b8d3SKris Buschelman         v  += 16;
47457cf1b8d3SKris Buschelman       }
47467cf1b8d3SKris Buschelman       v    = aa + ai16;
47477cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
47487cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
47497cf1b8d3SKris Buschelman       /*
47507cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
47517cf1b8d3SKris Buschelman          which was inverted as part of the factorization
47527cf1b8d3SKris Buschelman       */
47537cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
47547cf1b8d3SKris Buschelman         /* First Column */
47557cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
47567cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
47577cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
47587cf1b8d3SKris Buschelman 
47597cf1b8d3SKris Buschelman         /* Second Column */
47607cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
47617cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
47627cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
47637cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
47647cf1b8d3SKris Buschelman 
47657cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
47667cf1b8d3SKris Buschelman 
47677cf1b8d3SKris Buschelman         /* Third Column */
47687cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
47697cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
47707cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
47717cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
47727cf1b8d3SKris Buschelman 
47737cf1b8d3SKris Buschelman         /* Fourth Column */
47747cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
47757cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
47767cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
47777cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
47787cf1b8d3SKris Buschelman 
47797cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
47807cf1b8d3SKris Buschelman       SSE_INLINE_END_3
47817cf1b8d3SKris Buschelman 
47827cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
47837cf1b8d3SKris Buschelman       idt -= 4;
47847cf1b8d3SKris Buschelman     }
47857cf1b8d3SKris Buschelman 
47867cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
47877cf1b8d3SKris Buschelman     idt = 4*(n-1);
47887cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
47897cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
47907cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
47917cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
47927cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
47937cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
47947cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
47957cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
47967cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
47977cf1b8d3SKris Buschelman       idt -= 4;
47987cf1b8d3SKris Buschelman     }
47997cf1b8d3SKris Buschelman 
48007cf1b8d3SKris Buschelman   } /* End of artificial scope. */
48011ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
48021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4803dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
48047cf1b8d3SKris Buschelman   SSE_SCOPE_END;
48057cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
48067cf1b8d3SKris Buschelman }
48077cf1b8d3SKris Buschelman 
48083660e330SKris Buschelman #endif
48098f690400SShri Abhyankar 
48104a2ae208SSatish Balay #undef __FUNCT__
481106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
481206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
48134e2b4712SSatish Balay {
48144e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
48154e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
48166849ba73SBarry Smith   PetscErrorCode    ierr;
4817b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4818b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
48195d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4820d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4821d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4822d9fead3dSBarry Smith   const PetscScalar *b;
48234e2b4712SSatish Balay 
48244e2b4712SSatish Balay   PetscFunctionBegin;
48253649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
48261ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4827f1af5d2fSBarry Smith   t  = a->solve_work;
48284e2b4712SSatish Balay 
48294e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48304e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
48314e2b4712SSatish Balay 
48324e2b4712SSatish Balay   /* forward solve the lower triangular */
48334e2b4712SSatish Balay   idx    = 3*(*r++);
4834f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
48354e2b4712SSatish Balay   for (i=1; i<n; i++) {
48364e2b4712SSatish Balay     v     = aa + 9*ai[i];
48374e2b4712SSatish Balay     vi    = aj + ai[i];
48384e2b4712SSatish Balay     nz    = diag[i] - ai[i];
48394e2b4712SSatish Balay     idx   = 3*(*r++);
4840f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48414e2b4712SSatish Balay     while (nz--) {
48424e2b4712SSatish Balay       idx   = 3*(*vi++);
4843f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4844f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4845f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4846f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48474e2b4712SSatish Balay       v += 9;
48484e2b4712SSatish Balay     }
48494e2b4712SSatish Balay     idx = 3*i;
4850f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48514e2b4712SSatish Balay   }
48524e2b4712SSatish Balay   /* backward solve the upper triangular */
48534e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
48544e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
48554e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
48564e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
48574e2b4712SSatish Balay     idt  = 3*i;
4858f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48594e2b4712SSatish Balay     while (nz--) {
48604e2b4712SSatish Balay       idx   = 3*(*vi++);
4861f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4862f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4863f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4864f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48654e2b4712SSatish Balay       v += 9;
48664e2b4712SSatish Balay     }
48674e2b4712SSatish Balay     idc = 3*(*c--);
48684e2b4712SSatish Balay     v   = aa + 9*diag[i];
4869f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4870f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4871f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
48724e2b4712SSatish Balay   }
48734e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48744e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48753649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
48761ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4877dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
48784e2b4712SSatish Balay   PetscFunctionReturn(0);
48794e2b4712SSatish Balay }
48804e2b4712SSatish Balay 
48810c4413a7SShri Abhyankar #undef __FUNCT__
48824dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3"
48834dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
48840c4413a7SShri Abhyankar {
48850c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
48860c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
48870c4413a7SShri Abhyankar   PetscErrorCode    ierr;
4888b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4889b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
48900c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
48910c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
48920c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
48930c4413a7SShri Abhyankar   const PetscScalar *b;
48940c4413a7SShri Abhyankar 
48950c4413a7SShri Abhyankar   PetscFunctionBegin;
48963649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
48970c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
48980c4413a7SShri Abhyankar   t  = a->solve_work;
48990c4413a7SShri Abhyankar 
49000c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
49010c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
49020c4413a7SShri Abhyankar 
49030c4413a7SShri Abhyankar   /* forward solve the lower triangular */
49040c4413a7SShri Abhyankar   idx    = 3*r[0];
49050c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
49060c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
49070c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
49080c4413a7SShri Abhyankar     vi    = aj + ai[i];
49090c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
49100c4413a7SShri Abhyankar     idx   = 3*r[i];
49110c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
49120c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
49130c4413a7SShri Abhyankar       idx   = 3*vi[m];
49140c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
49150c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
49160c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
49170c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
49180c4413a7SShri Abhyankar       v += 9;
49190c4413a7SShri Abhyankar     }
49200c4413a7SShri Abhyankar     idx = 3*i;
49210c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
49220c4413a7SShri Abhyankar   }
49230c4413a7SShri Abhyankar   /* backward solve the upper triangular */
49240c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
49250c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
49260c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
49270c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
49280c4413a7SShri Abhyankar     idt  = 3*i;
49290c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
49300c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
49310c4413a7SShri Abhyankar       idx   = 3*vi[m];
49320c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
49330c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
49340c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
49350c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
49360c4413a7SShri Abhyankar       v += 9;
49370c4413a7SShri Abhyankar     }
49380c4413a7SShri Abhyankar     idc = 3*c[i];
49390c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
49400c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
49410c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
49420c4413a7SShri Abhyankar   }
49430c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
49440c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
49453649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
49460c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
49470c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
49480c4413a7SShri Abhyankar   PetscFunctionReturn(0);
49490c4413a7SShri Abhyankar }
49500c4413a7SShri Abhyankar 
495115091d37SBarry Smith /*
495215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
495315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
495415091d37SBarry Smith */
49554a2ae208SSatish Balay #undef __FUNCT__
495606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
495706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
495815091d37SBarry Smith {
495915091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
49600b68f018SBarry Smith   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4961dfbe8321SBarry Smith   PetscErrorCode    ierr;
49620b68f018SBarry Smith   const PetscInt    *diag = a->diag,*vi;
4963d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4964d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4965d9fead3dSBarry Smith   const PetscScalar *b;
49660b68f018SBarry Smith   PetscInt          jdx,idt,idx,nz,i;
496715091d37SBarry Smith 
496815091d37SBarry Smith   PetscFunctionBegin;
49693649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
49701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
497115091d37SBarry Smith 
497215091d37SBarry Smith   /* forward solve the lower triangular */
497315091d37SBarry Smith   idx    = 0;
497415091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
497515091d37SBarry Smith   for (i=1; i<n; i++) {
497615091d37SBarry Smith     v     =  aa      + 9*ai[i];
497715091d37SBarry Smith     vi    =  aj      + ai[i];
497815091d37SBarry Smith     nz    =  diag[i] - ai[i];
497915091d37SBarry Smith     idx   +=  3;
4980f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
498115091d37SBarry Smith     while (nz--) {
498215091d37SBarry Smith       jdx   = 3*(*vi++);
498315091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4984f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4985f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4986f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
498715091d37SBarry Smith       v    += 9;
498815091d37SBarry Smith     }
4989f1af5d2fSBarry Smith     x[idx]   = s1;
4990f1af5d2fSBarry Smith     x[1+idx] = s2;
4991f1af5d2fSBarry Smith     x[2+idx] = s3;
499215091d37SBarry Smith   }
499315091d37SBarry Smith   /* backward solve the upper triangular */
499415091d37SBarry Smith   for (i=n-1; i>=0; i--){
499515091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
499615091d37SBarry Smith     vi   = aj + diag[i] + 1;
499715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
499815091d37SBarry Smith     idt  = 3*i;
4999f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
5000f1af5d2fSBarry Smith     s3 = x[2+idt];
500115091d37SBarry Smith     while (nz--) {
500215091d37SBarry Smith       idx   = 3*(*vi++);
500315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
5004f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5005f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5006f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
500715091d37SBarry Smith       v    += 9;
500815091d37SBarry Smith     }
500915091d37SBarry Smith     v        = aa +  9*diag[i];
5010f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5011f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5012f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
501315091d37SBarry Smith   }
501415091d37SBarry Smith 
50153649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
50161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5017dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
501815091d37SBarry Smith   PetscFunctionReturn(0);
501915091d37SBarry Smith }
502015091d37SBarry Smith 
5021cee9d6f2SShri Abhyankar #undef __FUNCT__
50224dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
50234dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
5024b2b2dd24SShri Abhyankar {
5025b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5026b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5027b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
5028b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
5029b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
5030b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
5031b2b2dd24SShri Abhyankar     PetscScalar       *x;
5032b2b2dd24SShri Abhyankar     const PetscScalar *b;
5033b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
5034b2b2dd24SShri Abhyankar 
5035b2b2dd24SShri Abhyankar     PetscFunctionBegin;
50363649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5037b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5038b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5039b2b2dd24SShri Abhyankar     idx    = 0;
5040b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
5041b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5042b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
5043b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5044b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5045b2b2dd24SShri Abhyankar       idx   = bs*i;
5046b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
5047b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
5048b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
5049b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
5050b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5051b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5052b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5053b2b2dd24SShri Abhyankar 
5054b2b2dd24SShri Abhyankar           v   +=  bs2;
5055b2b2dd24SShri Abhyankar         }
5056b2b2dd24SShri Abhyankar 
5057b2b2dd24SShri Abhyankar        x[idx]   = s1;
5058b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5059b2b2dd24SShri Abhyankar        x[2+idx] = s3;
5060b2b2dd24SShri Abhyankar     }
5061b2b2dd24SShri Abhyankar 
5062b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5063b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
5064b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
5065b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5066b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5067b2b2dd24SShri Abhyankar      idt = bs*i;
5068b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
5069b2b2dd24SShri Abhyankar 
5070b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
5071b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
5072b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
5073b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
5074b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
5075b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
5076b2b2dd24SShri Abhyankar 
5077b2b2dd24SShri Abhyankar         v   +=  bs2;
5078b2b2dd24SShri Abhyankar     }
5079b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5080b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
5081b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
5082b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
5083b2b2dd24SShri Abhyankar 
5084b2b2dd24SShri Abhyankar   }
5085b2b2dd24SShri Abhyankar 
50863649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5087b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5088b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5089b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5090b2b2dd24SShri Abhyankar }
5091b2b2dd24SShri Abhyankar 
5092b2b2dd24SShri Abhyankar #undef __FUNCT__
509306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
509406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
50954e2b4712SSatish Balay {
50964e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
50974e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
50986849ba73SBarry Smith   PetscErrorCode    ierr;
5099b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5100b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
51015d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5102d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5103d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
5104d9fead3dSBarry Smith   const PetscScalar *b;
51054e2b4712SSatish Balay 
51064e2b4712SSatish Balay   PetscFunctionBegin;
51073649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
51081ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5109f1af5d2fSBarry Smith   t  = a->solve_work;
51104e2b4712SSatish Balay 
51114e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
51124e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
51134e2b4712SSatish Balay 
51144e2b4712SSatish Balay   /* forward solve the lower triangular */
51154e2b4712SSatish Balay   idx    = 2*(*r++);
5116f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
51174e2b4712SSatish Balay   for (i=1; i<n; i++) {
51184e2b4712SSatish Balay     v     = aa + 4*ai[i];
51194e2b4712SSatish Balay     vi    = aj + ai[i];
51204e2b4712SSatish Balay     nz    = diag[i] - ai[i];
51214e2b4712SSatish Balay     idx   = 2*(*r++);
5122f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
51234e2b4712SSatish Balay     while (nz--) {
51244e2b4712SSatish Balay       idx   = 2*(*vi++);
5125f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5126f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5127f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
51284e2b4712SSatish Balay       v += 4;
51294e2b4712SSatish Balay     }
51304e2b4712SSatish Balay     idx = 2*i;
5131f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
51324e2b4712SSatish Balay   }
51334e2b4712SSatish Balay   /* backward solve the upper triangular */
51344e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
51354e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
51364e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
51374e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
51384e2b4712SSatish Balay     idt  = 2*i;
5139f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
51404e2b4712SSatish Balay     while (nz--) {
51414e2b4712SSatish Balay       idx   = 2*(*vi++);
5142f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5143f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5144f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
51454e2b4712SSatish Balay       v += 4;
51464e2b4712SSatish Balay     }
51474e2b4712SSatish Balay     idc = 2*(*c--);
51484e2b4712SSatish Balay     v   = aa + 4*diag[i];
5149f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5150f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51514e2b4712SSatish Balay   }
51524e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51534e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51543649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
51551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5156dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51574e2b4712SSatish Balay   PetscFunctionReturn(0);
51584e2b4712SSatish Balay }
51594e2b4712SSatish Balay 
51600c4413a7SShri Abhyankar #undef __FUNCT__
51614dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2"
51624dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
51630c4413a7SShri Abhyankar {
51640c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
51650c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
51660c4413a7SShri Abhyankar   PetscErrorCode    ierr;
5167b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5168b3260449SShri Abhyankar   PetscInt          i,nz,idx,jdx,idt,idc,m;
51690c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
51700c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
51710c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
51720c4413a7SShri Abhyankar   const PetscScalar *b;
51730c4413a7SShri Abhyankar 
51740c4413a7SShri Abhyankar   PetscFunctionBegin;
51753649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
51760c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
51770c4413a7SShri Abhyankar   t  = a->solve_work;
51780c4413a7SShri Abhyankar 
51790c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
51800c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
51810c4413a7SShri Abhyankar 
51820c4413a7SShri Abhyankar   /* forward solve the lower triangular */
51830c4413a7SShri Abhyankar   idx    = 2*r[0];
51840c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
51850c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
51860c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
51870c4413a7SShri Abhyankar     vi    = aj + ai[i];
51880c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
51890c4413a7SShri Abhyankar     idx   = 2*r[i];
51900c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
51910c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
51920c4413a7SShri Abhyankar       jdx   = 2*vi[m];
51930c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
51940c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51950c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51960c4413a7SShri Abhyankar       v += 4;
51970c4413a7SShri Abhyankar     }
51980c4413a7SShri Abhyankar     idx = 2*i;
51990c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
52000c4413a7SShri Abhyankar   }
52010c4413a7SShri Abhyankar   /* backward solve the upper triangular */
52020c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
52030c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
52040c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
52050c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
52060c4413a7SShri Abhyankar     idt  = 2*i;
52070c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
52080c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
52090c4413a7SShri Abhyankar       idx   = 2*vi[m];
52100c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
52110c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
52120c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
52130c4413a7SShri Abhyankar       v += 4;
52140c4413a7SShri Abhyankar     }
52150c4413a7SShri Abhyankar     idc = 2*c[i];
52160c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
52170c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
52180c4413a7SShri Abhyankar   }
52190c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
52200c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
52213649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
52220c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52230c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
52240c4413a7SShri Abhyankar   PetscFunctionReturn(0);
52250c4413a7SShri Abhyankar }
52268f690400SShri Abhyankar 
522715091d37SBarry Smith /*
522815091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
522915091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
523015091d37SBarry Smith */
52314a2ae208SSatish Balay #undef __FUNCT__
523206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
523306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
523415091d37SBarry Smith {
523515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5236b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5237dfbe8321SBarry Smith   PetscErrorCode    ierr;
5238d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5239d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
5240d9fead3dSBarry Smith   const PetscScalar *b;
5241b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
524215091d37SBarry Smith 
524315091d37SBarry Smith   PetscFunctionBegin;
52443649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
52451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524615091d37SBarry Smith 
524715091d37SBarry Smith   /* forward solve the lower triangular */
524815091d37SBarry Smith   idx    = 0;
524915091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
525015091d37SBarry Smith   for (i=1; i<n; i++) {
525115091d37SBarry Smith     v     =  aa      + 4*ai[i];
525215091d37SBarry Smith     vi    =  aj      + ai[i];
525315091d37SBarry Smith     nz    =  diag[i] - ai[i];
525415091d37SBarry Smith     idx   +=  2;
5255f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
525615091d37SBarry Smith     while (nz--) {
525715091d37SBarry Smith       jdx   = 2*(*vi++);
525815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
5259f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5260f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
526115091d37SBarry Smith       v    += 4;
526215091d37SBarry Smith     }
5263f1af5d2fSBarry Smith     x[idx]   = s1;
5264f1af5d2fSBarry Smith     x[1+idx] = s2;
526515091d37SBarry Smith   }
526615091d37SBarry Smith   /* backward solve the upper triangular */
526715091d37SBarry Smith   for (i=n-1; i>=0; i--){
526815091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
526915091d37SBarry Smith     vi   = aj + diag[i] + 1;
527015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
527115091d37SBarry Smith     idt  = 2*i;
5272f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
527315091d37SBarry Smith     while (nz--) {
527415091d37SBarry Smith       idx   = 2*(*vi++);
527515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
5276f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5277f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
527815091d37SBarry Smith       v    += 4;
527915091d37SBarry Smith     }
528015091d37SBarry Smith     v        = aa +  4*diag[i];
5281f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
5282f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
528315091d37SBarry Smith   }
528415091d37SBarry Smith 
52853649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
52861ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5287dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
528815091d37SBarry Smith   PetscFunctionReturn(0);
528915091d37SBarry Smith }
529015091d37SBarry Smith 
5291cee9d6f2SShri Abhyankar #undef __FUNCT__
52924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
52934dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5294b2b2dd24SShri Abhyankar {
5295b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5296b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5297b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,idt,jdx;
5298b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
5299b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
5300b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
5301b2b2dd24SShri Abhyankar     const PetscScalar *b;
5302b2b2dd24SShri Abhyankar 
5303b2b2dd24SShri Abhyankar     PetscFunctionBegin;
53043649974fSBarry Smith     ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5305b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5306b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5307b2b2dd24SShri Abhyankar     idx    = 0;
5308b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
5309b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5310b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
5311b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5312b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5313b2b2dd24SShri Abhyankar        idx  = 2*i;
5314b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
5315*4c0dbd8dSJed Brown        PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5316*4c0dbd8dSJed Brown        PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5317b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
5318b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
5319b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
5320b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
5321b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
5322b2b2dd24SShri Abhyankar            v   +=  4;
5323b2b2dd24SShri Abhyankar         }
5324b2b2dd24SShri Abhyankar        x[idx]   = s1;
5325b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5326b2b2dd24SShri Abhyankar     }
5327b2b2dd24SShri Abhyankar 
5328b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5329b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
5330b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
5331b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5332b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5333b2b2dd24SShri Abhyankar      idt = 2*i;
5334b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
5335*4c0dbd8dSJed Brown      PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA);
5336*4c0dbd8dSJed Brown      PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA);
5337b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
5338b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
5339b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
5340b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
5341b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
5342b2b2dd24SShri Abhyankar          v    += 4;
5343b2b2dd24SShri Abhyankar     }
5344b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5345b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
5346b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
5347b2b2dd24SShri Abhyankar   }
5348b2b2dd24SShri Abhyankar 
53493649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5350b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5351b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5352b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5353b2b2dd24SShri Abhyankar }
5354b2b2dd24SShri Abhyankar 
5355b2b2dd24SShri Abhyankar #undef __FUNCT__
535606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
535706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
53584e2b4712SSatish Balay {
53594e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
53604e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
53616849ba73SBarry Smith   PetscErrorCode    ierr;
5362b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5363b3260449SShri Abhyankar   PetscInt          i,nz;
53645d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5365b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5366b3260449SShri Abhyankar   PetscScalar       *x,s1,*t;
5367b3260449SShri Abhyankar   const PetscScalar *b;
53684e2b4712SSatish Balay 
53694e2b4712SSatish Balay   PetscFunctionBegin;
53704e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
53714e2b4712SSatish Balay 
53723649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
53731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5374f1af5d2fSBarry Smith   t  = a->solve_work;
53754e2b4712SSatish Balay 
53764e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
53774e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
53784e2b4712SSatish Balay 
53794e2b4712SSatish Balay   /* forward solve the lower triangular */
5380f1af5d2fSBarry Smith   t[0] = b[*r++];
53814e2b4712SSatish Balay   for (i=1; i<n; i++) {
53824e2b4712SSatish Balay     v     = aa + ai[i];
53834e2b4712SSatish Balay     vi    = aj + ai[i];
53844e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5385f1af5d2fSBarry Smith     s1  = b[*r++];
53864e2b4712SSatish Balay     while (nz--) {
5387f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53884e2b4712SSatish Balay     }
5389f1af5d2fSBarry Smith     t[i] = s1;
53904e2b4712SSatish Balay   }
53914e2b4712SSatish Balay   /* backward solve the upper triangular */
53924e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
53934e2b4712SSatish Balay     v    = aa + diag[i] + 1;
53944e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
53954e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5396f1af5d2fSBarry Smith     s1 = t[i];
53974e2b4712SSatish Balay     while (nz--) {
5398f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
53994e2b4712SSatish Balay     }
5400f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
54014e2b4712SSatish Balay   }
54024e2b4712SSatish Balay 
54034e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
54044e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54053649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
54061ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5407dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
54084e2b4712SSatish Balay   PetscFunctionReturn(0);
54094e2b4712SSatish Balay }
5410048b5e81SShri Abhyankar 
5411048b5e81SShri Abhyankar #undef __FUNCT__
5412048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5413048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5414048b5e81SShri Abhyankar {
5415048b5e81SShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5416048b5e81SShri Abhyankar   IS                iscol = a->col,isrow = a->row;
5417048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5418048b5e81SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5419048b5e81SShri Abhyankar   const PetscInt    *rout,*cout,*r,*c;
5420048b5e81SShri Abhyankar   PetscScalar       *x,*tmp,sum;
5421048b5e81SShri Abhyankar   const PetscScalar *b;
5422048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5423048b5e81SShri Abhyankar 
5424048b5e81SShri Abhyankar   PetscFunctionBegin;
5425048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5426048b5e81SShri Abhyankar 
54273649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5428048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5429048b5e81SShri Abhyankar   tmp  = a->solve_work;
5430048b5e81SShri Abhyankar 
5431048b5e81SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5432048b5e81SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5433048b5e81SShri Abhyankar 
5434048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5435048b5e81SShri Abhyankar   tmp[0] = b[r[0]];
5436048b5e81SShri Abhyankar   v      = aa;
5437048b5e81SShri Abhyankar   vi     = aj;
5438048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5439048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5440048b5e81SShri Abhyankar     sum = b[r[i]];
5441048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5442048b5e81SShri Abhyankar     tmp[i] = sum;
5443048b5e81SShri Abhyankar     v += nz; vi += nz;
5444048b5e81SShri Abhyankar   }
5445048b5e81SShri Abhyankar 
5446048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5447048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5448048b5e81SShri Abhyankar     v   = aa + adiag[i+1]+1;
5449048b5e81SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5450048b5e81SShri Abhyankar     nz  = adiag[i]-adiag[i+1]-1;
5451048b5e81SShri Abhyankar     sum = tmp[i];
5452048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5453048b5e81SShri Abhyankar     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5454048b5e81SShri Abhyankar   }
5455048b5e81SShri Abhyankar 
5456048b5e81SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5457048b5e81SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54583649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5459048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5460048b5e81SShri Abhyankar   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5461048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5462048b5e81SShri Abhyankar }
5463048b5e81SShri Abhyankar 
546415091d37SBarry Smith /*
546515091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
546615091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
546715091d37SBarry Smith */
54684a2ae208SSatish Balay #undef __FUNCT__
546906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
547006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
547115091d37SBarry Smith {
547215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5473b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5474dfbe8321SBarry Smith   PetscErrorCode    ierr;
5475b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5476b3260449SShri Abhyankar   PetscScalar       *x;
5477b3260449SShri Abhyankar   const PetscScalar *b;
547887828ca2SBarry Smith   PetscScalar       s1,x1;
5479b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
548015091d37SBarry Smith 
548115091d37SBarry Smith   PetscFunctionBegin;
54823649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
54831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
548415091d37SBarry Smith 
548515091d37SBarry Smith   /* forward solve the lower triangular */
548615091d37SBarry Smith   idx    = 0;
548715091d37SBarry Smith   x[0]   = b[0];
548815091d37SBarry Smith   for (i=1; i<n; i++) {
548915091d37SBarry Smith     v     =  aa      + ai[i];
549015091d37SBarry Smith     vi    =  aj      + ai[i];
549115091d37SBarry Smith     nz    =  diag[i] - ai[i];
549215091d37SBarry Smith     idx   +=  1;
5493f1af5d2fSBarry Smith     s1  =  b[idx];
549415091d37SBarry Smith     while (nz--) {
549515091d37SBarry Smith       jdx   = *vi++;
549615091d37SBarry Smith       x1    = x[jdx];
5497f1af5d2fSBarry Smith       s1 -= v[0]*x1;
549815091d37SBarry Smith       v    += 1;
549915091d37SBarry Smith     }
5500f1af5d2fSBarry Smith     x[idx]   = s1;
550115091d37SBarry Smith   }
550215091d37SBarry Smith   /* backward solve the upper triangular */
550315091d37SBarry Smith   for (i=n-1; i>=0; i--){
550415091d37SBarry Smith     v    = aa + diag[i] + 1;
550515091d37SBarry Smith     vi   = aj + diag[i] + 1;
550615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
550715091d37SBarry Smith     idt  = i;
5508f1af5d2fSBarry Smith     s1 = x[idt];
550915091d37SBarry Smith     while (nz--) {
551015091d37SBarry Smith       idx   = *vi++;
551115091d37SBarry Smith       x1    = x[idx];
5512f1af5d2fSBarry Smith       s1 -= v[0]*x1;
551315091d37SBarry Smith       v    += 1;
551415091d37SBarry Smith     }
551515091d37SBarry Smith     v        = aa +  diag[i];
5516f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
551715091d37SBarry Smith   }
55183649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
55191ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5520dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
552115091d37SBarry Smith   PetscFunctionReturn(0);
552215091d37SBarry Smith }
55234e2b4712SSatish Balay 
5524048b5e81SShri Abhyankar 
5525048b5e81SShri Abhyankar #undef __FUNCT__
5526048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5527048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5528048b5e81SShri Abhyankar {
5529048b5e81SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5530048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5531048b5e81SShri Abhyankar   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5532048b5e81SShri Abhyankar   PetscScalar       *x,sum;
5533048b5e81SShri Abhyankar   const PetscScalar *b;
5534048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5535048b5e81SShri Abhyankar   PetscInt          i,nz;
5536048b5e81SShri Abhyankar 
5537048b5e81SShri Abhyankar   PetscFunctionBegin;
5538048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5539048b5e81SShri Abhyankar 
55403649974fSBarry Smith   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
5541048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5542048b5e81SShri Abhyankar 
5543048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5544048b5e81SShri Abhyankar   x[0] = b[0];
5545048b5e81SShri Abhyankar   v    = aa;
5546048b5e81SShri Abhyankar   vi   = aj;
5547048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5548048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5549048b5e81SShri Abhyankar     sum = b[i];
5550048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5551048b5e81SShri Abhyankar     v  += nz;
5552048b5e81SShri Abhyankar     vi += nz;
5553048b5e81SShri Abhyankar     x[i] = sum;
5554048b5e81SShri Abhyankar   }
5555048b5e81SShri Abhyankar 
5556048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5557048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5558048b5e81SShri Abhyankar     v   = aa + adiag[i+1] + 1;
5559048b5e81SShri Abhyankar     vi  = aj + adiag[i+1] + 1;
5560048b5e81SShri Abhyankar     nz = adiag[i] - adiag[i+1]-1;
5561048b5e81SShri Abhyankar     sum = x[i];
5562048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5563048b5e81SShri Abhyankar     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5564048b5e81SShri Abhyankar   }
5565048b5e81SShri Abhyankar 
5566048b5e81SShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
55673649974fSBarry Smith   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5568048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5569048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5570048b5e81SShri Abhyankar }
5571048b5e81SShri Abhyankar 
55724e2b4712SSatish Balay /* ----------------------------------------------------------------*/
557309573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool );
55746bce7ff8SHong Zhang 
55752b0b2ea7SShri Abhyankar #undef __FUNCT__
557629a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5577766f9fbaSBarry Smith /*
5578766f9fbaSBarry Smith    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5579766f9fbaSBarry Smith */
558029a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
55812b0b2ea7SShri Abhyankar {
55822b0b2ea7SShri Abhyankar   Mat             C=B;
55832b0b2ea7SShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
55842b0b2ea7SShri Abhyankar   PetscErrorCode  ierr;
5585766f9fbaSBarry Smith   PetscInt        i,j,k,ipvt[15];
5586766f9fbaSBarry Smith   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5587766f9fbaSBarry Smith   PetscInt        nz,nzL,row;
5588766f9fbaSBarry Smith   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5589766f9fbaSBarry Smith   const MatScalar *v,*aa=a->a;
55902b0b2ea7SShri Abhyankar   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
55910fa040f9SShri Abhyankar   PetscInt        sol_ver;
55922b0b2ea7SShri Abhyankar 
55932b0b2ea7SShri Abhyankar   PetscFunctionBegin;
55942b0b2ea7SShri Abhyankar 
55950fa040f9SShri Abhyankar   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
55960fa040f9SShri Abhyankar 
55972b0b2ea7SShri Abhyankar   /* generate work space needed by the factorization */
55982b0b2ea7SShri Abhyankar   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
55992b0b2ea7SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
56002b0b2ea7SShri Abhyankar 
56012b0b2ea7SShri Abhyankar   for (i=0; i<n; i++){
56022b0b2ea7SShri Abhyankar     /* zero rtmp */
56032b0b2ea7SShri Abhyankar     /* L part */
56042b0b2ea7SShri Abhyankar     nz    = bi[i+1] - bi[i];
56052b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
56062b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
56072b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56082b0b2ea7SShri Abhyankar     }
56092b0b2ea7SShri Abhyankar 
56102b0b2ea7SShri Abhyankar     /* U part */
56112b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
56122b0b2ea7SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
56132b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
56142b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56152b0b2ea7SShri Abhyankar     }
56162b0b2ea7SShri Abhyankar 
56172b0b2ea7SShri Abhyankar     /* load in initial (unfactored row) */
561829a97285SShri Abhyankar     nz    = ai[i+1] - ai[i];
561929a97285SShri Abhyankar     ajtmp = aj + ai[i];
562029a97285SShri Abhyankar     v     = aa + bs2*ai[i];
56212b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
562229a97285SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
56232b0b2ea7SShri Abhyankar     }
56242b0b2ea7SShri Abhyankar 
56252b0b2ea7SShri Abhyankar     /* elimination */
56262b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
56272b0b2ea7SShri Abhyankar     nzL   = bi[i+1] - bi[i];
56282b0b2ea7SShri Abhyankar     for(k=0;k < nzL;k++) {
56292b0b2ea7SShri Abhyankar       row = bjtmp[k];
56302b0b2ea7SShri Abhyankar       pc = rtmp + bs2*row;
56312b0b2ea7SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
56322b0b2ea7SShri Abhyankar       if (flg) {
56332b0b2ea7SShri Abhyankar         pv = b->a + bs2*bdiag[row];
5634766f9fbaSBarry Smith 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5635766f9fbaSBarry Smith 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
56362b0b2ea7SShri Abhyankar 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
56372b0b2ea7SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
56382b0b2ea7SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
56392b0b2ea7SShri Abhyankar         for (j=0; j<nz; j++) {
5640766f9fbaSBarry Smith           vv   = rtmp + bs2*pj[j];
5641766f9fbaSBarry Smith           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5642766f9fbaSBarry Smith 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
56432b0b2ea7SShri Abhyankar 	  pv  += bs2;
56442b0b2ea7SShri Abhyankar         }
5645766f9fbaSBarry Smith         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
56462b0b2ea7SShri Abhyankar       }
56472b0b2ea7SShri Abhyankar     }
56482b0b2ea7SShri Abhyankar 
56492b0b2ea7SShri Abhyankar     /* finished row so stick it into b->a */
56502b0b2ea7SShri Abhyankar     /* L part */
56512b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
56522b0b2ea7SShri Abhyankar     pj   = b->j + bi[i] ;
56532b0b2ea7SShri Abhyankar     nz   = bi[i+1] - bi[i];
56542b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
56552b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56562b0b2ea7SShri Abhyankar     }
56572b0b2ea7SShri Abhyankar 
56582b0b2ea7SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
56592b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
56602b0b2ea7SShri Abhyankar     pj   = b->j + bdiag[i];
56612b0b2ea7SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5662766f9fbaSBarry Smith     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5663182b8fbaSHong Zhang     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
56642b0b2ea7SShri Abhyankar 
56652b0b2ea7SShri Abhyankar     /* U part */
56662b0b2ea7SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
56672b0b2ea7SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
56682b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
56692b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++){
56702b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56712b0b2ea7SShri Abhyankar     }
56722b0b2ea7SShri Abhyankar   }
56732b0b2ea7SShri Abhyankar 
56742b0b2ea7SShri Abhyankar   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5675832cc040SShri Abhyankar   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5676766f9fbaSBarry Smith   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
56772b0b2ea7SShri Abhyankar   C->assembled = PETSC_TRUE;
5678766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
56792b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
56802b0b2ea7SShri Abhyankar }
56812b0b2ea7SShri Abhyankar 
56826bce7ff8SHong Zhang #undef __FUNCT__
56834dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
56844dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
56856bce7ff8SHong Zhang {
56866bce7ff8SHong Zhang   Mat            C=B;
56876bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
56886bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
56896bce7ff8SHong Zhang   PetscErrorCode ierr;
56905a586d82SBarry Smith   const PetscInt *r,*ic;
56916bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
56926bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5693b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5694914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5695914a18a2SHong Zhang   MatScalar      *v_work;
5696ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity;
56976bce7ff8SHong Zhang 
56986bce7ff8SHong Zhang   PetscFunctionBegin;
56996bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
57006bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5701ae3d28f0SHong Zhang 
5702fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5703fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
57046bce7ff8SHong Zhang 
5705914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5706fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5707914a18a2SHong Zhang 
57086bce7ff8SHong Zhang   for (i=0; i<n; i++){
57096bce7ff8SHong Zhang     /* zero rtmp */
57106bce7ff8SHong Zhang     /* L part */
57116bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
57126bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5713914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5714914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5715914a18a2SHong Zhang     }
57166bce7ff8SHong Zhang 
57176bce7ff8SHong Zhang     /* U part */
57181a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
57191a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
57201a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
57211a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57221a83e813SShri Abhyankar     }
57231a83e813SShri Abhyankar 
57241a83e813SShri Abhyankar     /* load in initial (unfactored row) */
57251a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
57261a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
57271a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
57281a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57291a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
57301a83e813SShri Abhyankar     }
57311a83e813SShri Abhyankar 
57321a83e813SShri Abhyankar     /* elimination */
57331a83e813SShri Abhyankar     bjtmp = bj + bi[i];
57341a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
57351a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
57361a83e813SShri Abhyankar       row = bjtmp[k];
57371a83e813SShri Abhyankar       pc = rtmp + bs2*row;
57381a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
57391a83e813SShri Abhyankar       if (flg) {
57401a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
57411a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
57421a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
57431a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
57441a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
57451a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
57461a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
57471a83e813SShri Abhyankar         }
57481a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
57491a83e813SShri Abhyankar       }
57501a83e813SShri Abhyankar     }
57511a83e813SShri Abhyankar 
57521a83e813SShri Abhyankar     /* finished row so stick it into b->a */
57531a83e813SShri Abhyankar     /* L part */
57541a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
57551a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
57561a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
57571a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
57581a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57591a83e813SShri Abhyankar     }
57601a83e813SShri Abhyankar 
57611a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
57621a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
57631a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
5764e32f2f54SBarry Smith     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
57651a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57661a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
57671a83e813SShri Abhyankar 
57681a83e813SShri Abhyankar     /* U part */
57691a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
57701a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
57711a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
57721a83e813SShri Abhyankar     for (j=0; j<nz; j++){
57731a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
57741a83e813SShri Abhyankar     }
57751a83e813SShri Abhyankar   }
57761a83e813SShri Abhyankar 
57771a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5778fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
57791a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
57801a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
57811a83e813SShri Abhyankar 
5782ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5783ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5784ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
5785ae3d28f0SHong Zhang   if (both_identity){
57864dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5787ae3d28f0SHong Zhang   } else {
57884dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N;
5789ae3d28f0SHong Zhang   }
57904dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5791ae3d28f0SHong Zhang 
57921a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
5793766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
57941a83e813SShri Abhyankar   PetscFunctionReturn(0);
57951a83e813SShri Abhyankar }
57961a83e813SShri Abhyankar 
57976bce7ff8SHong Zhang /*
57986bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
57994dd39f65SShri Abhyankar    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
58004dd39f65SShri Abhyankar    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
58016bce7ff8SHong Zhang */
5802c0c7eb62SShri Abhyankar 
58036bce7ff8SHong Zhang #undef __FUNCT__
58044dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
58054dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
58066bce7ff8SHong Zhang {
58076bce7ff8SHong Zhang 
58086bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
58096bce7ff8SHong Zhang   PetscErrorCode     ierr;
581016a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
581135aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
581235aa4fcfSShri Abhyankar 
581335aa4fcfSShri Abhyankar   PetscFunctionBegin;
581435aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
581535aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
581635aa4fcfSShri Abhyankar 
581735aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
581835aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
581935aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
582035aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
582135aa4fcfSShri Abhyankar   if (!b->diag){
582235aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
582335aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
582435aa4fcfSShri Abhyankar   }
582535aa4fcfSShri Abhyankar   bdiag = b->diag;
582635aa4fcfSShri Abhyankar 
582735aa4fcfSShri Abhyankar   if (n > 0) {
582835aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
582935aa4fcfSShri Abhyankar   }
583035aa4fcfSShri Abhyankar 
583135aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
583235aa4fcfSShri Abhyankar   bi = b->i;
583335aa4fcfSShri Abhyankar   bj = b->j;
583435aa4fcfSShri Abhyankar 
583535aa4fcfSShri Abhyankar   /* L part */
583635aa4fcfSShri Abhyankar   bi[0] = 0;
583735aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
583835aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
583935aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
584035aa4fcfSShri Abhyankar     aj = a->j + ai[i];
584135aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
584235aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
584335aa4fcfSShri Abhyankar     }
584435aa4fcfSShri Abhyankar   }
584535aa4fcfSShri Abhyankar 
584635aa4fcfSShri Abhyankar   /* U part */
584735aa4fcfSShri Abhyankar   bi_temp = bi[n];
584835aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
584935aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
585035aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
585135aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
585235aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
585335aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
585435aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
585535aa4fcfSShri Abhyankar     }
585635aa4fcfSShri Abhyankar     /* diag[i] */
585735aa4fcfSShri Abhyankar     *bj = i; bj++;
585835aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
585935aa4fcfSShri Abhyankar   }
586035aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
586135aa4fcfSShri Abhyankar }
586235aa4fcfSShri Abhyankar 
586335aa4fcfSShri Abhyankar #undef __FUNCT__
58644dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
58654dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
586616a2bf60SHong Zhang {
586716a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
586816a2bf60SHong Zhang   IS                 isicol;
586916a2bf60SHong Zhang   PetscErrorCode     ierr;
587016a2bf60SHong Zhang   const PetscInt     *r,*ic;
58717fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
587216a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
587316a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
587416a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
5875ace3abfcSBarry Smith   PetscBool          col_identity,row_identity,both_identity;
587616a2bf60SHong Zhang   PetscReal          f;
587716a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
587816a2bf60SHong Zhang   PetscBT            lnkbt;
587916a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
588016a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
588116a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5882ace3abfcSBarry Smith   PetscBool          missing;
58837fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
588416a2bf60SHong Zhang 
588516a2bf60SHong Zhang   PetscFunctionBegin;
5886e32f2f54SBarry Smith   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
58876ba06ab7SHong Zhang   if (bs>1){  /* check shifttype */
58886ba06ab7SHong Zhang     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
58896ba06ab7SHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
58906ba06ab7SHong Zhang   }
58916ba06ab7SHong Zhang 
589216a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5893e32f2f54SBarry Smith   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
589416a2bf60SHong Zhang 
589516a2bf60SHong Zhang   f             = info->fill;
589616a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
589716a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
589816a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
589916a2bf60SHong Zhang 
590016a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
590116a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5902ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
590316a2bf60SHong Zhang 
59047fa3a6a0SHong Zhang   if (!levels && both_identity) {
590516a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
59064dd39f65SShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
59074dd39f65SShri Abhyankar     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
590835aa4fcfSShri Abhyankar 
5909d5f3da31SBarry Smith     fact->factortype               = MAT_FACTOR_ILU;
591035aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
591135aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
591235aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
591335aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
591435aa4fcfSShri Abhyankar     b->row           = isrow;
591535aa4fcfSShri Abhyankar     b->col           = iscol;
591635aa4fcfSShri Abhyankar     b->icol          = isicol;
591735aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
591835aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
591935aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
592035aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
592135aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
592235aa4fcfSShri Abhyankar   }
592335aa4fcfSShri Abhyankar 
592435aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
592535aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
592635aa4fcfSShri Abhyankar 
592735aa4fcfSShri Abhyankar   /* get new row pointers */
592835aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
592935aa4fcfSShri Abhyankar   bi[0] = 0;
593035aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
593135aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
593235aa4fcfSShri Abhyankar   bdiag[0]  = 0;
593335aa4fcfSShri Abhyankar 
5934fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
593535aa4fcfSShri Abhyankar 
593635aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
593735aa4fcfSShri Abhyankar   nlnk = n + 1;
593835aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
593935aa4fcfSShri Abhyankar 
594035aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
594135aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
594235aa4fcfSShri Abhyankar   current_space = free_space;
594335aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
594435aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
594535aa4fcfSShri Abhyankar 
594635aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
594735aa4fcfSShri Abhyankar     nzi = 0;
594835aa4fcfSShri Abhyankar     /* copy current row into linked list */
594935aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
5950e32f2f54SBarry Smith     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
595135aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
595235aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
595335aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
595435aa4fcfSShri Abhyankar     nzi += nlnk;
595535aa4fcfSShri Abhyankar 
595635aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
595735aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
595835aa4fcfSShri Abhyankar       fm = n;
595935aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
596035aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
596135aa4fcfSShri Abhyankar       lnk[fm]    = i;
596235aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
596335aa4fcfSShri Abhyankar       nzi++; dcount++;
596435aa4fcfSShri Abhyankar     }
596535aa4fcfSShri Abhyankar 
596635aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
596735aa4fcfSShri Abhyankar     nzbd = 0;
596835aa4fcfSShri Abhyankar     prow = lnk[n];
596935aa4fcfSShri Abhyankar     while (prow < i) {
597035aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
597135aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
597235aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
597335aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
597435aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
597535aa4fcfSShri Abhyankar       nzi += nlnk;
597635aa4fcfSShri Abhyankar       prow = lnk[prow];
597735aa4fcfSShri Abhyankar       nzbd++;
597835aa4fcfSShri Abhyankar     }
597935aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
598035aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
598135aa4fcfSShri Abhyankar 
598235aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
598335aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
598435aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
598535aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
598635aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
598735aa4fcfSShri Abhyankar       reallocs++;
598835aa4fcfSShri Abhyankar     }
598935aa4fcfSShri Abhyankar 
599035aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
599135aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
599235aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
599335aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
599435aa4fcfSShri Abhyankar 
599535aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
599665e19b50SBarry Smith     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
599735aa4fcfSShri Abhyankar 
599835aa4fcfSShri Abhyankar     current_space->array           += nzi;
599935aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
600035aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
600135aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
600235aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
600335aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
600435aa4fcfSShri Abhyankar   }
600535aa4fcfSShri Abhyankar 
600635aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
600735aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
600835aa4fcfSShri Abhyankar 
600935aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
60109263d837SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
60112ce24eb6SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
601235aa4fcfSShri Abhyankar 
601335aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
601435aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
6015fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
601635aa4fcfSShri Abhyankar 
601735aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
601835aa4fcfSShri Abhyankar   {
6019aef85c9fSShri Abhyankar     PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
602035aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
602135aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
602235aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
602335aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
602435aa4fcfSShri Abhyankar     if (diagonal_fill) {
602535aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
602635aa4fcfSShri Abhyankar     }
602735aa4fcfSShri Abhyankar   }
602835aa4fcfSShri Abhyankar #endif
602935aa4fcfSShri Abhyankar 
603035aa4fcfSShri Abhyankar   /* put together the new matrix */
603135aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
603235aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
603335aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
603435aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
603535aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
603635aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
603735aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
603835aa4fcfSShri Abhyankar   b->j          = bj;
603935aa4fcfSShri Abhyankar   b->i          = bi;
604035aa4fcfSShri Abhyankar   b->diag       = bdiag;
604135aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
604235aa4fcfSShri Abhyankar   b->ilen       = 0;
604335aa4fcfSShri Abhyankar   b->imax       = 0;
604435aa4fcfSShri Abhyankar   b->row        = isrow;
604535aa4fcfSShri Abhyankar   b->col        = iscol;
604635aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
604735aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
604835aa4fcfSShri Abhyankar   b->icol       = isicol;
604935aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
605035aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
605135aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
605235aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
605335aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
6054ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
6055ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
6056ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
60574dd39f65SShri Abhyankar   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
605835aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
605935aa4fcfSShri Abhyankar }
606035aa4fcfSShri Abhyankar 
60614e2b4712SSatish Balay /*
60624e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
60634e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
60644e2b4712SSatish Balay    Not a good example of code reuse.
60654e2b4712SSatish Balay */
60664a2ae208SSatish Balay #undef __FUNCT__
606706e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
606806e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
60694e2b4712SSatish Balay {
60704e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
60714e2b4712SSatish Balay   IS             isicol;
60726849ba73SBarry Smith   PetscErrorCode ierr;
60735d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
60745d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
6075a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
6076d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
6077ace3abfcSBarry Smith   PetscBool      col_identity,row_identity,both_identity,flg;
6078329f5518SBarry Smith   PetscReal      f;
60794e2b4712SSatish Balay 
60804e2b4712SSatish Balay   PetscFunctionBegin;
60816bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
6082e32f2f54SBarry Smith   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
60836bce7ff8SHong Zhang 
6084435faa5fSBarry Smith   f             = info->fill;
6085690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
6086690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
60874c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
608816a2bf60SHong Zhang 
6089667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
6090667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6091ace3abfcSBarry Smith   both_identity = (PetscBool) (row_identity && col_identity);
6092309c388cSBarry Smith 
609341df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
609416a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
60958b1456e3SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
60966bce7ff8SHong Zhang 
6097d5f3da31SBarry Smith     fact->factortype = MAT_FACTOR_ILU;
6098ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
6099bb3d539aSBarry Smith     b->row       = isrow;
6100bb3d539aSBarry Smith     b->col       = iscol;
6101bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6102bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6103bb3d539aSBarry Smith     b->icol      = isicol;
6104bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6105b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
61066bce7ff8SHong Zhang     PetscFunctionReturn(0);
61076bce7ff8SHong Zhang   }
61086bce7ff8SHong Zhang 
61096bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
61104e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
61114e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
61124e2b4712SSatish Balay 
61134e2b4712SSatish Balay     /* get new row pointers */
6114690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
61154e2b4712SSatish Balay     ainew[0] = 0;
61164e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
6117690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
6118690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
61194e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
6120690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
61214e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
6122690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
61234e2b4712SSatish Balay     /* im is level for each filled value */
6124690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
61254e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
6126690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
61274e2b4712SSatish Balay     dloc[0]  = 0;
61284e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
6129435faa5fSBarry Smith 
6130435faa5fSBarry Smith       /* copy prow into linked list */
61314e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6132e32f2f54SBarry Smith       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
61334e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
61344e2b4712SSatish Balay       fill[n]    = n;
6135435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
61364e2b4712SSatish Balay       while (nz--) {
61374e2b4712SSatish Balay 	fm  = n;
61384e2b4712SSatish Balay 	idx = ic[*xi++];
61394e2b4712SSatish Balay 	do {
61404e2b4712SSatish Balay 	  m  = fm;
61414e2b4712SSatish Balay 	  fm = fill[m];
61424e2b4712SSatish Balay 	} while (fm < idx);
61434e2b4712SSatish Balay 	fill[m]   = idx;
61444e2b4712SSatish Balay 	fill[idx] = fm;
61454e2b4712SSatish Balay 	im[idx]   = 0;
61464e2b4712SSatish Balay       }
6147435faa5fSBarry Smith 
6148435faa5fSBarry Smith       /* make sure diagonal entry is included */
6149435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
6150435faa5fSBarry Smith 	fm = n;
6151435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
6152435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
6153435faa5fSBarry Smith 	fill[fm]   = prow;
6154435faa5fSBarry Smith 	im[prow]   = 0;
6155435faa5fSBarry Smith 	nzf++;
6156335d9088SBarry Smith 	dcount++;
6157435faa5fSBarry Smith       }
6158435faa5fSBarry Smith 
61594e2b4712SSatish Balay       nzi = 0;
61604e2b4712SSatish Balay       row = fill[n];
61614e2b4712SSatish Balay       while (row < prow) {
61624e2b4712SSatish Balay 	incrlev = im[row] + 1;
61634e2b4712SSatish Balay 	nz      = dloc[row];
6164435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
61654e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
61664e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
61674e2b4712SSatish Balay 	fm      = row;
61684e2b4712SSatish Balay 	while (nnz-- > 0) {
61694e2b4712SSatish Balay 	  idx = *xi++;
61704e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
61714e2b4712SSatish Balay 	    flev++;
61724e2b4712SSatish Balay 	    continue;
61734e2b4712SSatish Balay 	  }
61744e2b4712SSatish Balay 	  do {
61754e2b4712SSatish Balay 	    m  = fm;
61764e2b4712SSatish Balay 	    fm = fill[m];
61774e2b4712SSatish Balay 	  } while (fm < idx);
61784e2b4712SSatish Balay 	  if (fm != idx) {
61794e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
61804e2b4712SSatish Balay 	    fill[m]   = idx;
61814e2b4712SSatish Balay 	    fill[idx] = fm;
61824e2b4712SSatish Balay 	    fm        = idx;
61834e2b4712SSatish Balay 	    nzf++;
6184ecf371e4SBarry Smith 	  } else {
61854e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
61864e2b4712SSatish Balay 	  }
61874e2b4712SSatish Balay 	  flev++;
61884e2b4712SSatish Balay 	}
61894e2b4712SSatish Balay 	row = fill[row];
61904e2b4712SSatish Balay 	nzi++;
61914e2b4712SSatish Balay       }
61924e2b4712SSatish Balay       /* copy new filled row into permanent storage */
61934e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
61944e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
6195ecf371e4SBarry Smith 
6196ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
6197ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6198ecf371e4SBarry Smith 	/* just double the memory each time */
6199690b6cddSBarry Smith 	PetscInt maxadd = jmax;
6200ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
62014e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
62024e2b4712SSatish Balay 	jmax += maxadd;
6203ecf371e4SBarry Smith 
6204ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
62055d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
62065d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6207606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
62085d0c19d7SBarry Smith 	ajnew = xitmp;
62095d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
62105d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6211606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
62125d0c19d7SBarry Smith 	ajfill = xitmp;
6213eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
62144e2b4712SSatish Balay       }
62155d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
62164e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
62174e2b4712SSatish Balay       dloc[prow]  = nzi;
62184e2b4712SSatish Balay       fm          = fill[n];
62194e2b4712SSatish Balay       while (nzf--) {
62205d0c19d7SBarry Smith 	*xitmp++ = fm;
62214e2b4712SSatish Balay 	*flev++ = im[fm];
62224e2b4712SSatish Balay 	fm      = fill[fm];
62234e2b4712SSatish Balay       }
6224435faa5fSBarry Smith       /* make sure row has diagonal entry */
6225435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6226e32f2f54SBarry Smith 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
62272401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6228435faa5fSBarry Smith       }
62294e2b4712SSatish Balay     }
6230606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
62314e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
62324e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6233606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
6234606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
62354e2b4712SSatish Balay 
62366cf91177SBarry Smith #if defined(PETSC_USE_INFO)
62374e2b4712SSatish Balay     {
6238329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6239ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6240ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6241ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6242ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6243335d9088SBarry Smith       if (diagonal_fill) {
6244ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6245335d9088SBarry Smith       }
62464e2b4712SSatish Balay     }
624763ba0a88SBarry Smith #endif
62484e2b4712SSatish Balay 
62494e2b4712SSatish Balay     /* put together the new matrix */
6250719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6251719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6252ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
6253e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
6254e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
62557c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
6256a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
62574e2b4712SSatish Balay     b->j          = ajnew;
62584e2b4712SSatish Balay     b->i          = ainew;
62594e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
62604e2b4712SSatish Balay     b->diag       = dloc;
62617f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
62624e2b4712SSatish Balay     b->ilen       = 0;
62634e2b4712SSatish Balay     b->imax       = 0;
62644e2b4712SSatish Balay     b->row        = isrow;
62654e2b4712SSatish Balay     b->col        = iscol;
6266bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6267c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6268c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6269e51c0b9cSSatish Balay     b->icol       = isicol;
627087828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
62714e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
62724e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
6273719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
62744e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
62754e2b4712SSatish Balay 
6276ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
6277ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
6278ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
62796bce7ff8SHong Zhang 
62808b1456e3SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
62818661488fSKris Buschelman   PetscFunctionReturn(0);
62828661488fSKris Buschelman }
62838661488fSKris Buschelman 
6284732ee342SKris Buschelman #undef __FUNCT__
62857e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6286dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
62877e7071cdSKris Buschelman {
628812272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
628912272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
62905a9542e3SKris Buschelman   PetscFunctionBegin;
62917cf1b8d3SKris Buschelman   /* Undo Column scaling */
62927cf1b8d3SKris Buschelman /*    while (nz--) { */
62937cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
62947cf1b8d3SKris Buschelman /*    } */
6295c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
6296c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62977cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
62987cf1b8d3SKris Buschelman }
62997cf1b8d3SKris Buschelman 
63007cf1b8d3SKris Buschelman #undef __FUNCT__
63017cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6302dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
63037cf1b8d3SKris Buschelman {
63047cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6305b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
63062aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
63075a9542e3SKris Buschelman   PetscFunctionBegin;
63080b9da03eSKris Buschelman   /* Is this really necessary? */
630920235379SKris Buschelman   while (nz--) {
63100b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
63117e7071cdSKris Buschelman   }
6312c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
63137e7071cdSKris Buschelman   PetscFunctionReturn(0);
63147e7071cdSKris Buschelman }
63157e7071cdSKris Buschelman 
6316732ee342SKris Buschelman 
6317