xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact.c (revision 49b5e25f194e2f2348d54770c8c9996e6dec7aec)
1*49b5e25fSSatish Balay /* Using Modified Sparse Row (MSR) storage.
2*49b5e25fSSatish Balay See page 85, "Iterative Methods ..." by Saad. */
3*49b5e25fSSatish Balay 
4*49b5e25fSSatish Balay /*$Id: baijfact.c,v 1.78 2000/01/11 21:00:52 bsmith Exp bsmith $*/
5*49b5e25fSSatish Balay /*
6*49b5e25fSSatish Balay     Factorization code for SBAIJ format.
7*49b5e25fSSatish Balay */
8*49b5e25fSSatish Balay #include "sbaij.h"
9*49b5e25fSSatish Balay #include "src/mat/impls/baij/seq/baij.h"
10*49b5e25fSSatish Balay #include "src/vec/vecimpl.h"
11*49b5e25fSSatish Balay #include "src/inline/ilu.h"
12*49b5e25fSSatish Balay 
13*49b5e25fSSatish Balay #undef __FUNC__
14*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorSymbolic_SeqSBAIJ"
15*49b5e25fSSatish Balay int MatLUFactorSymbolic_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatLUInfo *info,Mat *B)
16*49b5e25fSSatish Balay {
17*49b5e25fSSatish Balay   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
18*49b5e25fSSatish Balay   IS          isicol;
19*49b5e25fSSatish Balay   int         *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
20*49b5e25fSSatish Balay   int         *fill,*jutmp,nz,bs = a->bs,bs2=a->bs2;
21*49b5e25fSSatish Balay   int         *idnew,idx,row,m,fm,nnz,nzi,realloc = 0,nzbd,*im;
22*49b5e25fSSatish Balay   int         *jl,*q,jumin,jumax,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
23*49b5e25fSSatish Balay   PetscReal   f = 1.0;
24*49b5e25fSSatish Balay 
25*49b5e25fSSatish Balay   PetscFunctionBegin;
26*49b5e25fSSatish Balay   PetscValidHeaderSpecific(isrow,IS_COOKIE);
27*49b5e25fSSatish Balay   PetscValidHeaderSpecific(iscol,IS_COOKIE);
28*49b5e25fSSatish Balay   /* if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square");*/
29*49b5e25fSSatish Balay   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
30*49b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr);
31*49b5e25fSSatish Balay   ierr = ISGetIndices(isicol,&riip);CHKERRQ(ierr);
32*49b5e25fSSatish Balay   for (k=0; k<mbs; k++) {
33*49b5e25fSSatish Balay     if ( rip[k] - riip[k] != 0 ) {
34*49b5e25fSSatish Balay       printf("Non-symm. permutation, use symm. permutation or general matrix format\n");
35*49b5e25fSSatish Balay       break;
36*49b5e25fSSatish Balay     }
37*49b5e25fSSatish Balay   }
38*49b5e25fSSatish Balay 
39*49b5e25fSSatish Balay   /* initialization */
40*49b5e25fSSatish Balay   /* Don't know how many column pointers are needed so estimate.
41*49b5e25fSSatish Balay      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
42*49b5e25fSSatish Balay   if (info) f = info->fill;
43*49b5e25fSSatish Balay   umax = (int)(f*ai[mbs] + 1); umax += mbs + 1;
44*49b5e25fSSatish Balay   ju = iu = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju);
45*49b5e25fSSatish Balay   iu[0] = mbs+1;
46*49b5e25fSSatish Balay   juptr = mbs;
47*49b5e25fSSatish Balay   jl =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
48*49b5e25fSSatish Balay   q  =  (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q);
49*49b5e25fSSatish Balay   for (i=0; i<mbs; i++){
50*49b5e25fSSatish Balay     jl[i] = mbs; q[i] = 0;
51*49b5e25fSSatish Balay   }
52*49b5e25fSSatish Balay 
53*49b5e25fSSatish Balay   /* for each row k */
54*49b5e25fSSatish Balay   for (k=0; k<mbs; k++){
55*49b5e25fSSatish Balay     nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */
56*49b5e25fSSatish Balay     q[k] = mbs;
57*49b5e25fSSatish Balay     /* initialize nonzero structure of k-th row to row rip[k] of A */
58*49b5e25fSSatish Balay     jmin = ai[rip[k]];
59*49b5e25fSSatish Balay     jmax = ai[rip[k]+1];
60*49b5e25fSSatish Balay     for (j=jmin; j<jmax; j++){
61*49b5e25fSSatish Balay       vj = riip[aj[j]]; /* col. value */
62*49b5e25fSSatish Balay       if(vj > k){
63*49b5e25fSSatish Balay         qm = k;
64*49b5e25fSSatish Balay         do {
65*49b5e25fSSatish Balay           m  = qm; qm = q[m];
66*49b5e25fSSatish Balay         } while(qm < vj);
67*49b5e25fSSatish Balay         if (qm == vj) {
68*49b5e25fSSatish Balay           printf(" error: duplicate entry in A\n"); break;
69*49b5e25fSSatish Balay         }
70*49b5e25fSSatish Balay         nzk++;
71*49b5e25fSSatish Balay         q[m] = vj;
72*49b5e25fSSatish Balay         q[vj] = qm;
73*49b5e25fSSatish Balay       } /* if(vj > k) */
74*49b5e25fSSatish Balay     } /* for (j=jmin; j<jmax; j++) */
75*49b5e25fSSatish Balay 
76*49b5e25fSSatish Balay     /* modify nonzero structure of k-th row by computing fill-in
77*49b5e25fSSatish Balay        for each row i to be merged in */
78*49b5e25fSSatish Balay     i = k;
79*49b5e25fSSatish Balay     i = jl[i]; /* next pivot row (== mbs for symbolic factorization) */
80*49b5e25fSSatish Balay     /* printf(" next pivot row i=%d\n",i); */
81*49b5e25fSSatish Balay     while (i < mbs){
82*49b5e25fSSatish Balay       /* merge row i into k-th row */
83*49b5e25fSSatish Balay       nzi = iu[i+1] - (iu[i]+1);
84*49b5e25fSSatish Balay       jmin = iu[i] + 1; jmax = iu[i] + nzi;
85*49b5e25fSSatish Balay       qm = k;
86*49b5e25fSSatish Balay       for (j=jmin; j<jmax+1; j++){
87*49b5e25fSSatish Balay         vj = ju[j];
88*49b5e25fSSatish Balay         do {
89*49b5e25fSSatish Balay           m = qm; qm = q[m];
90*49b5e25fSSatish Balay         } while (qm < vj);
91*49b5e25fSSatish Balay         if (qm != vj){
92*49b5e25fSSatish Balay          nzk++; q[m] = vj; q[vj] = qm; qm = vj;
93*49b5e25fSSatish Balay         }
94*49b5e25fSSatish Balay       }
95*49b5e25fSSatish Balay       i = jl[i]; /* next pivot row */
96*49b5e25fSSatish Balay     }
97*49b5e25fSSatish Balay 
98*49b5e25fSSatish Balay     /* add k to row list for first nonzero element in k-th row */
99*49b5e25fSSatish Balay     if (nzk > 0){
100*49b5e25fSSatish Balay       i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */
101*49b5e25fSSatish Balay       jl[k] = jl[i]; jl[i] = k;
102*49b5e25fSSatish Balay     }
103*49b5e25fSSatish Balay     iu[k+1] = iu[k] + nzk;   /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/
104*49b5e25fSSatish Balay 
105*49b5e25fSSatish Balay     /* allocate more space to ju if needed */
106*49b5e25fSSatish Balay     if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax);
107*49b5e25fSSatish Balay       /* estimate how much additional space we will need */
108*49b5e25fSSatish Balay       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
109*49b5e25fSSatish Balay       /* just double the memory each time */
110*49b5e25fSSatish Balay       maxadd = umax;
111*49b5e25fSSatish Balay       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
112*49b5e25fSSatish Balay       umax += maxadd;
113*49b5e25fSSatish Balay 
114*49b5e25fSSatish Balay       /* allocate a longer ju (NOTE: iu poits to the beginning of ju) */
115*49b5e25fSSatish Balay       jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp);
116*49b5e25fSSatish Balay       ierr  = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
117*49b5e25fSSatish Balay       ierr = PetscFree(ju);CHKERRQ(ierr);
118*49b5e25fSSatish Balay       ju = iu = jutmp;
119*49b5e25fSSatish Balay       realloc++; /* count how many times we realloc */
120*49b5e25fSSatish Balay     }
121*49b5e25fSSatish Balay 
122*49b5e25fSSatish Balay     /* save nonzero structure of k-th row in ju */
123*49b5e25fSSatish Balay     i=k;
124*49b5e25fSSatish Balay     jumin = juptr + 1; juptr += nzk;
125*49b5e25fSSatish Balay     for (j=jumin; j<juptr+1; j++){
126*49b5e25fSSatish Balay       i=q[i];
127*49b5e25fSSatish Balay       ju[j]=i;
128*49b5e25fSSatish Balay       /* printf(" k=%d, ju[%d]=%d\n",k,j,ju[j]);*/
129*49b5e25fSSatish Balay     }
130*49b5e25fSSatish Balay     /* printf("\n");  */
131*49b5e25fSSatish Balay   } /* for (k=0; k<mbs; k++) */
132*49b5e25fSSatish Balay 
133*49b5e25fSSatish Balay   if (ai[mbs] != 0) {
134*49b5e25fSSatish Balay     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
135*49b5e25fSSatish Balay     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
136*49b5e25fSSatish Balay     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af);
137*49b5e25fSSatish Balay     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af);
138*49b5e25fSSatish Balay     PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:for best performance.\n");
139*49b5e25fSSatish Balay   } else {
140*49b5e25fSSatish Balay      PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
141*49b5e25fSSatish Balay   }
142*49b5e25fSSatish Balay 
143*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr);
144*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&riip);CHKERRQ(ierr);
145*49b5e25fSSatish Balay 
146*49b5e25fSSatish Balay   ierr = PetscFree(q);CHKERRQ(ierr);
147*49b5e25fSSatish Balay   ierr = PetscFree(jl);CHKERRQ(ierr);
148*49b5e25fSSatish Balay 
149*49b5e25fSSatish Balay   /* put together the new matrix */
150*49b5e25fSSatish Balay   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
151*49b5e25fSSatish Balay   PLogObjectParent(*B,isicol);
152*49b5e25fSSatish Balay   b = (Mat_SeqSBAIJ*)(*B)->data;
153*49b5e25fSSatish Balay   ierr = PetscFree(b->imax);CHKERRQ(ierr);
154*49b5e25fSSatish Balay   b->singlemalloc = PETSC_FALSE;
155*49b5e25fSSatish Balay   /* the next line frees the default space generated by the Create() */
156*49b5e25fSSatish Balay   ierr = PetscFree(b->a);CHKERRQ(ierr);
157*49b5e25fSSatish Balay   ierr = PetscFree(b->ilen);CHKERRQ(ierr);
158*49b5e25fSSatish Balay   b->a          = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a);
159*49b5e25fSSatish Balay   b->j          = ju;
160*49b5e25fSSatish Balay   b->i          = iu;
161*49b5e25fSSatish Balay   b->diag       = 0;
162*49b5e25fSSatish Balay   b->ilen       = 0;
163*49b5e25fSSatish Balay   b->imax       = 0;
164*49b5e25fSSatish Balay   b->row        = isrow;
165*49b5e25fSSatish Balay   b->col        = iscol;
166*49b5e25fSSatish Balay   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
167*49b5e25fSSatish Balay   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
168*49b5e25fSSatish Balay   b->icol       = isicol;
169*49b5e25fSSatish Balay   b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
170*49b5e25fSSatish Balay   /* In b structure:  Free imax, ilen, old a, old j.
171*49b5e25fSSatish Balay      Allocate idnew, solve_work, new a, new j */
172*49b5e25fSSatish Balay   PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
173*49b5e25fSSatish Balay   b->s_maxnz = b->s_nz = iu[mbs];
174*49b5e25fSSatish Balay 
175*49b5e25fSSatish Balay   (*B)->factor                 = FACTOR_LU;
176*49b5e25fSSatish Balay   (*B)->info.factor_mallocs    = realloc;
177*49b5e25fSSatish Balay   (*B)->info.fill_ratio_given  = f;
178*49b5e25fSSatish Balay   if (ai[mbs] != 0) {
179*49b5e25fSSatish Balay     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
180*49b5e25fSSatish Balay   } else {
181*49b5e25fSSatish Balay     (*B)->info.fill_ratio_needed = 0.0;
182*49b5e25fSSatish Balay   }
183*49b5e25fSSatish Balay 
184*49b5e25fSSatish Balay 
185*49b5e25fSSatish Balay   PetscFunctionReturn(0);
186*49b5e25fSSatish Balay }
187*49b5e25fSSatish Balay 
188*49b5e25fSSatish Balay /* ----------------------------------------------------------- */
189*49b5e25fSSatish Balay #undef __FUNC__
190*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_N"
191*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_N(Mat A,Mat *B)
192*49b5e25fSSatish Balay {
193*49b5e25fSSatish Balay   Mat                C = *B;
194*49b5e25fSSatish Balay   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
195*49b5e25fSSatish Balay   IS                 isrow = b->row,isicol = b->icol;
196*49b5e25fSSatish Balay   int                *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
197*49b5e25fSSatish Balay   int                *ajtmpold,*ajtmp,nz,row,bslog,*ai=a->i,*aj=a->j,k,flg;
198*49b5e25fSSatish Balay   int                *diag_offset=b->diag,diag,bs=a->bs,bs2 = a->bs2,*v_pivots,*pj;
199*49b5e25fSSatish Balay   MatScalar          *ba = b->a,*aa = a->a,*pv,*v,*rtmp,*multiplier,*v_work,*pc,*w;
200*49b5e25fSSatish Balay 
201*49b5e25fSSatish Balay   PetscFunctionBegin;
202*49b5e25fSSatish Balay   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
203*49b5e25fSSatish Balay   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
204*49b5e25fSSatish Balay   rtmp = (MatScalar*)PetscMalloc(bs2*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
205*49b5e25fSSatish Balay   ierr = PetscMemzero(rtmp,bs2*(n+1)*sizeof(MatScalar));CHKERRQ(ierr);
206*49b5e25fSSatish Balay   /* generate work space needed by dense LU factorization */
207*49b5e25fSSatish Balay   v_work     = (MatScalar*)PetscMalloc(bs*sizeof(int) + (bs+bs2)*sizeof(MatScalar));CHKPTRQ(v_work);
208*49b5e25fSSatish Balay   multiplier = v_work + bs;
209*49b5e25fSSatish Balay   v_pivots   = (int*)(multiplier + bs2);
210*49b5e25fSSatish Balay 
211*49b5e25fSSatish Balay   /* flops in while loop */
212*49b5e25fSSatish Balay   bslog = 2*bs*bs2;
213*49b5e25fSSatish Balay 
214*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
215*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
216*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
217*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
218*49b5e25fSSatish Balay       ierr = PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
219*49b5e25fSSatish Balay     }
220*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
221*49b5e25fSSatish Balay     nz       = ai[r[i]+1] - ai[r[i]];
222*49b5e25fSSatish Balay     ajtmpold = aj + ai[r[i]];
223*49b5e25fSSatish Balay     v        = aa + bs2*ai[r[i]];
224*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
225*49b5e25fSSatish Balay       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
226*49b5e25fSSatish Balay     }
227*49b5e25fSSatish Balay     row = *ajtmp++;
228*49b5e25fSSatish Balay     while (row < i) {
229*49b5e25fSSatish Balay       pc = rtmp + bs2*row;
230*49b5e25fSSatish Balay /*      if (*pc) { */
231*49b5e25fSSatish Balay       for (flg=0,k=0; k<bs2; k++) { if (pc[k]!=0.0) { flg =1; break; }}
232*49b5e25fSSatish Balay       if (flg) {
233*49b5e25fSSatish Balay         pv = ba + bs2*diag_offset[row];
234*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
235*49b5e25fSSatish Balay         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier);
236*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
237*49b5e25fSSatish Balay         pv += bs2;
238*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
239*49b5e25fSSatish Balay           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
240*49b5e25fSSatish Balay         }
241*49b5e25fSSatish Balay         PLogFlops(bslog*(nz+1)-bs);
242*49b5e25fSSatish Balay       }
243*49b5e25fSSatish Balay         row = *ajtmp++;
244*49b5e25fSSatish Balay     }
245*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
246*49b5e25fSSatish Balay     pv = ba + bs2*bi[i];
247*49b5e25fSSatish Balay     pj = bj + bi[i];
248*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
249*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
250*49b5e25fSSatish Balay       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
251*49b5e25fSSatish Balay     }
252*49b5e25fSSatish Balay     diag = diag_offset[i] - bi[i];
253*49b5e25fSSatish Balay     /* invert diagonal block */
254*49b5e25fSSatish Balay     w = pv + bs2*diag;
255*49b5e25fSSatish Balay     Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);
256*49b5e25fSSatish Balay   }
257*49b5e25fSSatish Balay 
258*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
259*49b5e25fSSatish Balay   ierr = PetscFree(v_work);CHKERRQ(ierr);
260*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
261*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
262*49b5e25fSSatish Balay   C->factor = FACTOR_LU;
263*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
264*49b5e25fSSatish Balay   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
265*49b5e25fSSatish Balay   PetscFunctionReturn(0);
266*49b5e25fSSatish Balay }
267*49b5e25fSSatish Balay /* ------------------------------------------------------------*/
268*49b5e25fSSatish Balay /*
269*49b5e25fSSatish Balay       Version for when blocks are 7 by 7
270*49b5e25fSSatish Balay */
271*49b5e25fSSatish Balay #undef __FUNC__
272*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_7"
273*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_7(Mat A,Mat *B)
274*49b5e25fSSatish Balay {
275*49b5e25fSSatish Balay   Mat         C = *B;
276*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
277*49b5e25fSSatish Balay   IS          isrow = b->row,isicol = b->icol;
278*49b5e25fSSatish Balay   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
279*49b5e25fSSatish Balay   int         *ajtmpold,*ajtmp,nz,row;
280*49b5e25fSSatish Balay   int         *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
281*49b5e25fSSatish Balay   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
282*49b5e25fSSatish Balay   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
283*49b5e25fSSatish Balay   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
284*49b5e25fSSatish Balay   MatScalar   x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14;
285*49b5e25fSSatish Balay   MatScalar   p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12;
286*49b5e25fSSatish Balay   MatScalar   m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
287*49b5e25fSSatish Balay   MatScalar   p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
288*49b5e25fSSatish Balay   MatScalar   p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49;
289*49b5e25fSSatish Balay   MatScalar   x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
290*49b5e25fSSatish Balay   MatScalar   x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49;
291*49b5e25fSSatish Balay   MatScalar   m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
292*49b5e25fSSatish Balay   MatScalar   m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49;
293*49b5e25fSSatish Balay   MatScalar   *ba = b->a,*aa = a->a;
294*49b5e25fSSatish Balay 
295*49b5e25fSSatish Balay   PetscFunctionBegin;
296*49b5e25fSSatish Balay   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
297*49b5e25fSSatish Balay   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
298*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
299*49b5e25fSSatish Balay 
300*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
301*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
302*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
303*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
304*49b5e25fSSatish Balay       x = rtmp+49*ajtmp[j];
305*49b5e25fSSatish Balay       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
306*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
307*49b5e25fSSatish Balay       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
308*49b5e25fSSatish Balay       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
309*49b5e25fSSatish Balay       x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ;
310*49b5e25fSSatish Balay       x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ;
311*49b5e25fSSatish Balay     }
312*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
313*49b5e25fSSatish Balay     idx      = r[i];
314*49b5e25fSSatish Balay     nz       = ai[idx+1] - ai[idx];
315*49b5e25fSSatish Balay     ajtmpold = aj + ai[idx];
316*49b5e25fSSatish Balay     v        = aa + 49*ai[idx];
317*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
318*49b5e25fSSatish Balay       x    = rtmp+49*ic[ajtmpold[j]];
319*49b5e25fSSatish Balay       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
320*49b5e25fSSatish Balay       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
321*49b5e25fSSatish Balay       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
322*49b5e25fSSatish Balay       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
323*49b5e25fSSatish Balay       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
324*49b5e25fSSatish Balay       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
325*49b5e25fSSatish Balay       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
326*49b5e25fSSatish Balay       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
327*49b5e25fSSatish Balay       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
328*49b5e25fSSatish Balay       x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39];
329*49b5e25fSSatish Balay       x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43];
330*49b5e25fSSatish Balay       x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47];
331*49b5e25fSSatish Balay       x[48] = v[48];
332*49b5e25fSSatish Balay       v    += 49;
333*49b5e25fSSatish Balay     }
334*49b5e25fSSatish Balay     row = *ajtmp++;
335*49b5e25fSSatish Balay     while (row < i) {
336*49b5e25fSSatish Balay       pc  =  rtmp + 49*row;
337*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
338*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
339*49b5e25fSSatish Balay       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
340*49b5e25fSSatish Balay       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
341*49b5e25fSSatish Balay       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
342*49b5e25fSSatish Balay       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
343*49b5e25fSSatish Balay       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
344*49b5e25fSSatish Balay       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
345*49b5e25fSSatish Balay       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
346*49b5e25fSSatish Balay       p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39];
347*49b5e25fSSatish Balay       p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43];
348*49b5e25fSSatish Balay       p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47];
349*49b5e25fSSatish Balay       p49 = pc[48];
350*49b5e25fSSatish Balay       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
351*49b5e25fSSatish Balay           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
352*49b5e25fSSatish Balay           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
353*49b5e25fSSatish Balay           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
354*49b5e25fSSatish Balay           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
355*49b5e25fSSatish Balay           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
356*49b5e25fSSatish Balay           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
357*49b5e25fSSatish Balay           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
358*49b5e25fSSatish Balay           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 ||
359*49b5e25fSSatish Balay           p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 ||
360*49b5e25fSSatish Balay           p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 ||
361*49b5e25fSSatish Balay           p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 ||
362*49b5e25fSSatish Balay           p49 != 0.0) {
363*49b5e25fSSatish Balay         pv = ba + 49*diag_offset[row];
364*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
365*49b5e25fSSatish Balay 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
366*49b5e25fSSatish Balay 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
367*49b5e25fSSatish Balay 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
368*49b5e25fSSatish Balay 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
369*49b5e25fSSatish Balay 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
370*49b5e25fSSatish Balay 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
371*49b5e25fSSatish Balay 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
372*49b5e25fSSatish Balay 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
373*49b5e25fSSatish Balay 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
374*49b5e25fSSatish Balay 	x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
375*49b5e25fSSatish Balay 	x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
376*49b5e25fSSatish Balay 	x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
377*49b5e25fSSatish Balay 	x49 = pv[48];
378*49b5e25fSSatish Balay         pc[0]  = m1  = p1*x1  + p8*x2   + p15*x3  + p22*x4  + p29*x5  + p36*x6 + p43*x7;
379*49b5e25fSSatish Balay         pc[1]  = m2  = p2*x1  + p9*x2   + p16*x3  + p23*x4  + p30*x5  + p37*x6 + p44*x7;
380*49b5e25fSSatish Balay         pc[2]  = m3  = p3*x1  + p10*x2  + p17*x3  + p24*x4  + p31*x5  + p38*x6 + p45*x7;
381*49b5e25fSSatish Balay         pc[3]  = m4  = p4*x1  + p11*x2  + p18*x3  + p25*x4  + p32*x5  + p39*x6 + p46*x7;
382*49b5e25fSSatish Balay         pc[4]  = m5  = p5*x1  + p12*x2  + p19*x3  + p26*x4  + p33*x5  + p40*x6 + p47*x7;
383*49b5e25fSSatish Balay         pc[5]  = m6  = p6*x1  + p13*x2  + p20*x3  + p27*x4  + p34*x5  + p41*x6 + p48*x7;
384*49b5e25fSSatish Balay         pc[6]  = m7  = p7*x1  + p14*x2  + p21*x3  + p28*x4  + p35*x5  + p42*x6 + p49*x7;
385*49b5e25fSSatish Balay 
386*49b5e25fSSatish Balay         pc[7]  = m8  = p1*x8  + p8*x9   + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14;
387*49b5e25fSSatish Balay         pc[8]  = m9  = p2*x8  + p9*x9   + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14;
388*49b5e25fSSatish Balay         pc[9]  = m10 = p3*x8  + p10*x9  + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14;
389*49b5e25fSSatish Balay         pc[10] = m11 = p4*x8  + p11*x9  + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14;
390*49b5e25fSSatish Balay         pc[11] = m12 = p5*x8  + p12*x9  + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14;
391*49b5e25fSSatish Balay         pc[12] = m13 = p6*x8  + p13*x9  + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14;
392*49b5e25fSSatish Balay         pc[13] = m14 = p7*x8  + p14*x9  + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14;
393*49b5e25fSSatish Balay 
394*49b5e25fSSatish Balay         pc[14] = m15 = p1*x15 + p8*x16  + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21;
395*49b5e25fSSatish Balay         pc[15] = m16 = p2*x15 + p9*x16  + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21;
396*49b5e25fSSatish Balay         pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21;
397*49b5e25fSSatish Balay         pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21;
398*49b5e25fSSatish Balay         pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21;
399*49b5e25fSSatish Balay         pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21;
400*49b5e25fSSatish Balay         pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21;
401*49b5e25fSSatish Balay 
402*49b5e25fSSatish Balay         pc[21] = m22 = p1*x22 + p8*x23  + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28;
403*49b5e25fSSatish Balay         pc[22] = m23 = p2*x22 + p9*x23  + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28;
404*49b5e25fSSatish Balay         pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28;
405*49b5e25fSSatish Balay         pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28;
406*49b5e25fSSatish Balay         pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28;
407*49b5e25fSSatish Balay         pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28;
408*49b5e25fSSatish Balay         pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28;
409*49b5e25fSSatish Balay 
410*49b5e25fSSatish Balay         pc[28] = m29 = p1*x29 + p8*x30  + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35;
411*49b5e25fSSatish Balay         pc[29] = m30 = p2*x29 + p9*x30  + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35;
412*49b5e25fSSatish Balay         pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35;
413*49b5e25fSSatish Balay         pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35;
414*49b5e25fSSatish Balay         pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35;
415*49b5e25fSSatish Balay         pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35;
416*49b5e25fSSatish Balay         pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35;
417*49b5e25fSSatish Balay 
418*49b5e25fSSatish Balay         pc[35] = m36 = p1*x36 + p8*x37  + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42;
419*49b5e25fSSatish Balay         pc[36] = m37 = p2*x36 + p9*x37  + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42;
420*49b5e25fSSatish Balay         pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42;
421*49b5e25fSSatish Balay         pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42;
422*49b5e25fSSatish Balay         pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42;
423*49b5e25fSSatish Balay         pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42;
424*49b5e25fSSatish Balay         pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42;
425*49b5e25fSSatish Balay 
426*49b5e25fSSatish Balay         pc[42] = m43 = p1*x43 + p8*x44  + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49;
427*49b5e25fSSatish Balay         pc[43] = m44 = p2*x43 + p9*x44  + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49;
428*49b5e25fSSatish Balay         pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49;
429*49b5e25fSSatish Balay         pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49;
430*49b5e25fSSatish Balay         pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49;
431*49b5e25fSSatish Balay         pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49;
432*49b5e25fSSatish Balay         pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49;
433*49b5e25fSSatish Balay 
434*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
435*49b5e25fSSatish Balay         pv += 49;
436*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
437*49b5e25fSSatish Balay 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
438*49b5e25fSSatish Balay 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
439*49b5e25fSSatish Balay 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
440*49b5e25fSSatish Balay 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
441*49b5e25fSSatish Balay 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
442*49b5e25fSSatish Balay 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
443*49b5e25fSSatish Balay 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
444*49b5e25fSSatish Balay 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
445*49b5e25fSSatish Balay 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
446*49b5e25fSSatish Balay 	  x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
447*49b5e25fSSatish Balay 	  x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
448*49b5e25fSSatish Balay 	  x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
449*49b5e25fSSatish Balay 	  x49 = pv[48];
450*49b5e25fSSatish Balay 	  x    = rtmp + 49*pj[j];
451*49b5e25fSSatish Balay 	  x[0]  -= m1*x1  + m8*x2   + m15*x3  + m22*x4  + m29*x5  + m36*x6 + m43*x7;
452*49b5e25fSSatish Balay 	  x[1]  -= m2*x1  + m9*x2   + m16*x3  + m23*x4  + m30*x5  + m37*x6 + m44*x7;
453*49b5e25fSSatish Balay 	  x[2]  -= m3*x1  + m10*x2  + m17*x3  + m24*x4  + m31*x5  + m38*x6 + m45*x7;
454*49b5e25fSSatish Balay 	  x[3]  -= m4*x1  + m11*x2  + m18*x3  + m25*x4  + m32*x5  + m39*x6 + m46*x7;
455*49b5e25fSSatish Balay 	  x[4]  -= m5*x1  + m12*x2  + m19*x3  + m26*x4  + m33*x5  + m40*x6 + m47*x7;
456*49b5e25fSSatish Balay 	  x[5]  -= m6*x1  + m13*x2  + m20*x3  + m27*x4  + m34*x5  + m41*x6 + m48*x7;
457*49b5e25fSSatish Balay 	  x[6]  -= m7*x1  + m14*x2  + m21*x3  + m28*x4  + m35*x5  + m42*x6 + m49*x7;
458*49b5e25fSSatish Balay 
459*49b5e25fSSatish Balay 	  x[7]  -= m1*x8  + m8*x9   + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14;
460*49b5e25fSSatish Balay 	  x[8]  -= m2*x8  + m9*x9   + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14;
461*49b5e25fSSatish Balay 	  x[9]  -= m3*x8  + m10*x9  + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14;
462*49b5e25fSSatish Balay 	  x[10] -= m4*x8  + m11*x9  + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14;
463*49b5e25fSSatish Balay 	  x[11] -= m5*x8  + m12*x9  + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14;
464*49b5e25fSSatish Balay 	  x[12] -= m6*x8  + m13*x9  + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14;
465*49b5e25fSSatish Balay 	  x[13] -= m7*x8  + m14*x9  + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14;
466*49b5e25fSSatish Balay 
467*49b5e25fSSatish Balay 	  x[14] -= m1*x15 + m8*x16  + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21;
468*49b5e25fSSatish Balay 	  x[15] -= m2*x15 + m9*x16  + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21;
469*49b5e25fSSatish Balay 	  x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21;
470*49b5e25fSSatish Balay 	  x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21;
471*49b5e25fSSatish Balay 	  x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21;
472*49b5e25fSSatish Balay 	  x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21;
473*49b5e25fSSatish Balay 	  x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21;
474*49b5e25fSSatish Balay 
475*49b5e25fSSatish Balay 	  x[21] -= m1*x22 + m8*x23  + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28;
476*49b5e25fSSatish Balay 	  x[22] -= m2*x22 + m9*x23  + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28;
477*49b5e25fSSatish Balay 	  x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28;
478*49b5e25fSSatish Balay 	  x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28;
479*49b5e25fSSatish Balay 	  x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28;
480*49b5e25fSSatish Balay 	  x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28;
481*49b5e25fSSatish Balay 	  x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28;
482*49b5e25fSSatish Balay 
483*49b5e25fSSatish Balay 	  x[28] -= m1*x29 + m8*x30  + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35;
484*49b5e25fSSatish Balay 	  x[29] -= m2*x29 + m9*x30  + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35;
485*49b5e25fSSatish Balay 	  x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35;
486*49b5e25fSSatish Balay 	  x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35;
487*49b5e25fSSatish Balay 	  x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35;
488*49b5e25fSSatish Balay 	  x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35;
489*49b5e25fSSatish Balay 	  x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35;
490*49b5e25fSSatish Balay 
491*49b5e25fSSatish Balay 	  x[35] -= m1*x36 + m8*x37  + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42;
492*49b5e25fSSatish Balay 	  x[36] -= m2*x36 + m9*x37  + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42;
493*49b5e25fSSatish Balay 	  x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42;
494*49b5e25fSSatish Balay 	  x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42;
495*49b5e25fSSatish Balay 	  x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42;
496*49b5e25fSSatish Balay 	  x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42;
497*49b5e25fSSatish Balay 	  x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42;
498*49b5e25fSSatish Balay 
499*49b5e25fSSatish Balay 	  x[42] -= m1*x43 + m8*x44  + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49;
500*49b5e25fSSatish Balay 	  x[43] -= m2*x43 + m9*x44  + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49;
501*49b5e25fSSatish Balay 	  x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49;
502*49b5e25fSSatish Balay 	  x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49;
503*49b5e25fSSatish Balay 	  x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49;
504*49b5e25fSSatish Balay 	  x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49;
505*49b5e25fSSatish Balay 	  x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49;
506*49b5e25fSSatish Balay           pv   += 49;
507*49b5e25fSSatish Balay         }
508*49b5e25fSSatish Balay         PLogFlops(686*nz+637);
509*49b5e25fSSatish Balay       }
510*49b5e25fSSatish Balay       row = *ajtmp++;
511*49b5e25fSSatish Balay     }
512*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
513*49b5e25fSSatish Balay     pv = ba + 49*bi[i];
514*49b5e25fSSatish Balay     pj = bj + bi[i];
515*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
516*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
517*49b5e25fSSatish Balay       x      = rtmp+49*pj[j];
518*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
519*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
520*49b5e25fSSatish Balay       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
521*49b5e25fSSatish Balay       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
522*49b5e25fSSatish Balay       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
523*49b5e25fSSatish Balay       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
524*49b5e25fSSatish Balay       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
525*49b5e25fSSatish Balay       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
526*49b5e25fSSatish Balay       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
527*49b5e25fSSatish Balay       pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39];
528*49b5e25fSSatish Balay       pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43];
529*49b5e25fSSatish Balay       pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47];
530*49b5e25fSSatish Balay       pv[48] = x[48];
531*49b5e25fSSatish Balay       pv   += 49;
532*49b5e25fSSatish Balay     }
533*49b5e25fSSatish Balay     /* invert diagonal block */
534*49b5e25fSSatish Balay     w = ba + 49*diag_offset[i];
535*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr);
536*49b5e25fSSatish Balay   }
537*49b5e25fSSatish Balay 
538*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
539*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
540*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
541*49b5e25fSSatish Balay   C->factor = FACTOR_LU;
542*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
543*49b5e25fSSatish Balay   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
544*49b5e25fSSatish Balay   PetscFunctionReturn(0);
545*49b5e25fSSatish Balay }
546*49b5e25fSSatish Balay 
547*49b5e25fSSatish Balay /*
548*49b5e25fSSatish Balay       Version for when blocks are 7 by 7 Using natural ordering
549*49b5e25fSSatish Balay */
550*49b5e25fSSatish Balay #undef __FUNC__
551*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_7_NaturalOrdering"
552*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_7_NaturalOrdering(Mat A,Mat *B)
553*49b5e25fSSatish Balay {
554*49b5e25fSSatish Balay   Mat          C = *B;
555*49b5e25fSSatish Balay   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
556*49b5e25fSSatish Balay   int          ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
557*49b5e25fSSatish Balay   int          *ajtmpold,*ajtmp,nz,row;
558*49b5e25fSSatish Balay   int          *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
559*49b5e25fSSatish Balay   MatScalar    *pv,*v,*rtmp,*pc,*w,*x;
560*49b5e25fSSatish Balay   MatScalar    x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
561*49b5e25fSSatish Balay   MatScalar    x16,x17,x18,x19,x20,x21,x22,x23,x24,x25;
562*49b5e25fSSatish Balay   MatScalar    p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15;
563*49b5e25fSSatish Balay   MatScalar    p16,p17,p18,p19,p20,p21,p22,p23,p24,p25;
564*49b5e25fSSatish Balay   MatScalar    m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15;
565*49b5e25fSSatish Balay   MatScalar    m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
566*49b5e25fSSatish Balay   MatScalar    p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
567*49b5e25fSSatish Balay   MatScalar    p37,p38,p39,p40,p41,p42,p43,p44,p45,p46,p47,p48,p49;
568*49b5e25fSSatish Balay   MatScalar    x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
569*49b5e25fSSatish Balay   MatScalar    x37,x38,x39,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49;
570*49b5e25fSSatish Balay   MatScalar    m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
571*49b5e25fSSatish Balay   MatScalar    m37,m38,m39,m40,m41,m42,m43,m44,m45,m46,m47,m48,m49;
572*49b5e25fSSatish Balay   MatScalar    *ba = b->a,*aa = a->a;
573*49b5e25fSSatish Balay 
574*49b5e25fSSatish Balay   PetscFunctionBegin;
575*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(49*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
576*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
577*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
578*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
579*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
580*49b5e25fSSatish Balay       x = rtmp+49*ajtmp[j];
581*49b5e25fSSatish Balay       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
582*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
583*49b5e25fSSatish Balay       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
584*49b5e25fSSatish Balay       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
585*49b5e25fSSatish Balay       x[34] = x[35] = x[36] = x[37] = x[38] = x[39] = x[40] = x[41] = 0.0 ;
586*49b5e25fSSatish Balay       x[42] = x[43] = x[44] = x[45] = x[46] = x[47] = x[48] = 0.0 ;
587*49b5e25fSSatish Balay     }
588*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
589*49b5e25fSSatish Balay     nz       = ai[i+1] - ai[i];
590*49b5e25fSSatish Balay     ajtmpold = aj + ai[i];
591*49b5e25fSSatish Balay     v        = aa + 49*ai[i];
592*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
593*49b5e25fSSatish Balay       x    = rtmp+49*ajtmpold[j];
594*49b5e25fSSatish Balay       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
595*49b5e25fSSatish Balay       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
596*49b5e25fSSatish Balay       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
597*49b5e25fSSatish Balay       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
598*49b5e25fSSatish Balay       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
599*49b5e25fSSatish Balay       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
600*49b5e25fSSatish Balay       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
601*49b5e25fSSatish Balay       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
602*49b5e25fSSatish Balay       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
603*49b5e25fSSatish Balay       x[36] = v[36]; x[37] = v[37]; x[38] = v[38]; x[39] = v[39];
604*49b5e25fSSatish Balay       x[40] = v[40]; x[41] = v[41]; x[42] = v[42]; x[43] = v[43];
605*49b5e25fSSatish Balay       x[44] = v[44]; x[45] = v[45]; x[46] = v[46]; x[47] = v[47];
606*49b5e25fSSatish Balay       x[48] = v[48];
607*49b5e25fSSatish Balay       v    += 49;
608*49b5e25fSSatish Balay     }
609*49b5e25fSSatish Balay     row = *ajtmp++;
610*49b5e25fSSatish Balay     while (row < i) {
611*49b5e25fSSatish Balay       pc  = rtmp + 49*row;
612*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
613*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
614*49b5e25fSSatish Balay       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
615*49b5e25fSSatish Balay       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
616*49b5e25fSSatish Balay       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
617*49b5e25fSSatish Balay       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
618*49b5e25fSSatish Balay       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
619*49b5e25fSSatish Balay       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
620*49b5e25fSSatish Balay       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
621*49b5e25fSSatish Balay       p37 = pc[36]; p38 = pc[37]; p39 = pc[38]; p40 = pc[39];
622*49b5e25fSSatish Balay       p41 = pc[40]; p42 = pc[41]; p43 = pc[42]; p44 = pc[43];
623*49b5e25fSSatish Balay       p45 = pc[44]; p46 = pc[45]; p47 = pc[46]; p48 = pc[47];
624*49b5e25fSSatish Balay       p49 = pc[48];
625*49b5e25fSSatish Balay       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
626*49b5e25fSSatish Balay           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
627*49b5e25fSSatish Balay           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
628*49b5e25fSSatish Balay           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
629*49b5e25fSSatish Balay           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
630*49b5e25fSSatish Balay           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
631*49b5e25fSSatish Balay           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
632*49b5e25fSSatish Balay           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
633*49b5e25fSSatish Balay           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0 ||
634*49b5e25fSSatish Balay           p37 != 0.0 || p38 != 0.0 || p39 != 0.0 || p40 != 0.0 ||
635*49b5e25fSSatish Balay           p41 != 0.0 || p42 != 0.0 || p43 != 0.0 || p44 != 0.0 ||
636*49b5e25fSSatish Balay           p45 != 0.0 || p46 != 0.0 || p47 != 0.0 || p48 != 0.0 ||
637*49b5e25fSSatish Balay           p49 != 0.0) {
638*49b5e25fSSatish Balay         pv = ba + 49*diag_offset[row];
639*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
640*49b5e25fSSatish Balay 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
641*49b5e25fSSatish Balay 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
642*49b5e25fSSatish Balay 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
643*49b5e25fSSatish Balay 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
644*49b5e25fSSatish Balay 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
645*49b5e25fSSatish Balay 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
646*49b5e25fSSatish Balay 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
647*49b5e25fSSatish Balay 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
648*49b5e25fSSatish Balay 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
649*49b5e25fSSatish Balay 	x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
650*49b5e25fSSatish Balay 	x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
651*49b5e25fSSatish Balay 	x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
652*49b5e25fSSatish Balay         x49 = pv[48];
653*49b5e25fSSatish Balay         pc[0]  = m1  = p1*x1  + p8*x2   + p15*x3  + p22*x4  + p29*x5  + p36*x6 + p43*x7;
654*49b5e25fSSatish Balay         pc[1]  = m2  = p2*x1  + p9*x2   + p16*x3  + p23*x4  + p30*x5  + p37*x6 + p44*x7;
655*49b5e25fSSatish Balay         pc[2]  = m3  = p3*x1  + p10*x2  + p17*x3  + p24*x4  + p31*x5  + p38*x6 + p45*x7;
656*49b5e25fSSatish Balay         pc[3]  = m4  = p4*x1  + p11*x2  + p18*x3  + p25*x4  + p32*x5  + p39*x6 + p46*x7;
657*49b5e25fSSatish Balay         pc[4]  = m5  = p5*x1  + p12*x2  + p19*x3  + p26*x4  + p33*x5  + p40*x6 + p47*x7;
658*49b5e25fSSatish Balay         pc[5]  = m6  = p6*x1  + p13*x2  + p20*x3  + p27*x4  + p34*x5  + p41*x6 + p48*x7;
659*49b5e25fSSatish Balay         pc[6]  = m7  = p7*x1  + p14*x2  + p21*x3  + p28*x4  + p35*x5  + p42*x6 + p49*x7;
660*49b5e25fSSatish Balay 
661*49b5e25fSSatish Balay         pc[7]  = m8  = p1*x8  + p8*x9   + p15*x10 + p22*x11 + p29*x12 + p36*x13 + p43*x14;
662*49b5e25fSSatish Balay         pc[8]  = m9  = p2*x8  + p9*x9   + p16*x10 + p23*x11 + p30*x12 + p37*x13 + p44*x14;
663*49b5e25fSSatish Balay         pc[9]  = m10 = p3*x8  + p10*x9  + p17*x10 + p24*x11 + p31*x12 + p38*x13 + p45*x14;
664*49b5e25fSSatish Balay         pc[10] = m11 = p4*x8  + p11*x9  + p18*x10 + p25*x11 + p32*x12 + p39*x13 + p46*x14;
665*49b5e25fSSatish Balay         pc[11] = m12 = p5*x8  + p12*x9  + p19*x10 + p26*x11 + p33*x12 + p40*x13 + p47*x14;
666*49b5e25fSSatish Balay         pc[12] = m13 = p6*x8  + p13*x9  + p20*x10 + p27*x11 + p34*x12 + p41*x13 + p48*x14;
667*49b5e25fSSatish Balay         pc[13] = m14 = p7*x8  + p14*x9  + p21*x10 + p28*x11 + p35*x12 + p42*x13 + p49*x14;
668*49b5e25fSSatish Balay 
669*49b5e25fSSatish Balay         pc[14] = m15 = p1*x15 + p8*x16  + p15*x17 + p22*x18 + p29*x19 + p36*x20 + p43*x21;
670*49b5e25fSSatish Balay         pc[15] = m16 = p2*x15 + p9*x16  + p16*x17 + p23*x18 + p30*x19 + p37*x20 + p44*x21;
671*49b5e25fSSatish Balay         pc[16] = m17 = p3*x15 + p10*x16 + p17*x17 + p24*x18 + p31*x19 + p38*x20 + p45*x21;
672*49b5e25fSSatish Balay         pc[17] = m18 = p4*x15 + p11*x16 + p18*x17 + p25*x18 + p32*x19 + p39*x20 + p46*x21;
673*49b5e25fSSatish Balay         pc[18] = m19 = p5*x15 + p12*x16 + p19*x17 + p26*x18 + p33*x19 + p40*x20 + p47*x21;
674*49b5e25fSSatish Balay         pc[19] = m20 = p6*x15 + p13*x16 + p20*x17 + p27*x18 + p34*x19 + p41*x20 + p48*x21;
675*49b5e25fSSatish Balay         pc[20] = m21 = p7*x15 + p14*x16 + p21*x17 + p28*x18 + p35*x19 + p42*x20 + p49*x21;
676*49b5e25fSSatish Balay 
677*49b5e25fSSatish Balay         pc[21] = m22 = p1*x22 + p8*x23  + p15*x24 + p22*x25 + p29*x26 + p36*x27 + p43*x28;
678*49b5e25fSSatish Balay         pc[22] = m23 = p2*x22 + p9*x23  + p16*x24 + p23*x25 + p30*x26 + p37*x27 + p44*x28;
679*49b5e25fSSatish Balay         pc[23] = m24 = p3*x22 + p10*x23 + p17*x24 + p24*x25 + p31*x26 + p38*x27 + p45*x28;
680*49b5e25fSSatish Balay         pc[24] = m25 = p4*x22 + p11*x23 + p18*x24 + p25*x25 + p32*x26 + p39*x27 + p46*x28;
681*49b5e25fSSatish Balay         pc[25] = m26 = p5*x22 + p12*x23 + p19*x24 + p26*x25 + p33*x26 + p40*x27 + p47*x28;
682*49b5e25fSSatish Balay         pc[26] = m27 = p6*x22 + p13*x23 + p20*x24 + p27*x25 + p34*x26 + p41*x27 + p48*x28;
683*49b5e25fSSatish Balay         pc[27] = m28 = p7*x22 + p14*x23 + p21*x24 + p28*x25 + p35*x26 + p42*x27 + p49*x28;
684*49b5e25fSSatish Balay 
685*49b5e25fSSatish Balay         pc[28] = m29 = p1*x29 + p8*x30  + p15*x31 + p22*x32 + p29*x33 + p36*x34 + p43*x35;
686*49b5e25fSSatish Balay         pc[29] = m30 = p2*x29 + p9*x30  + p16*x31 + p23*x32 + p30*x33 + p37*x34 + p44*x35;
687*49b5e25fSSatish Balay         pc[30] = m31 = p3*x29 + p10*x30 + p17*x31 + p24*x32 + p31*x33 + p38*x34 + p45*x35;
688*49b5e25fSSatish Balay         pc[31] = m32 = p4*x29 + p11*x30 + p18*x31 + p25*x32 + p32*x33 + p39*x34 + p46*x35;
689*49b5e25fSSatish Balay         pc[32] = m33 = p5*x29 + p12*x30 + p19*x31 + p26*x32 + p33*x33 + p40*x34 + p47*x35;
690*49b5e25fSSatish Balay         pc[33] = m34 = p6*x29 + p13*x30 + p20*x31 + p27*x32 + p34*x33 + p41*x34 + p48*x35;
691*49b5e25fSSatish Balay         pc[34] = m35 = p7*x29 + p14*x30 + p21*x31 + p28*x32 + p35*x33 + p42*x34 + p49*x35;
692*49b5e25fSSatish Balay 
693*49b5e25fSSatish Balay         pc[35] = m36 = p1*x36 + p8*x37  + p15*x38 + p22*x39 + p29*x40 + p36*x41 + p43*x42;
694*49b5e25fSSatish Balay         pc[36] = m37 = p2*x36 + p9*x37  + p16*x38 + p23*x39 + p30*x40 + p37*x41 + p44*x42;
695*49b5e25fSSatish Balay         pc[37] = m38 = p3*x36 + p10*x37 + p17*x38 + p24*x39 + p31*x40 + p38*x41 + p45*x42;
696*49b5e25fSSatish Balay         pc[38] = m39 = p4*x36 + p11*x37 + p18*x38 + p25*x39 + p32*x40 + p39*x41 + p46*x42;
697*49b5e25fSSatish Balay         pc[39] = m40 = p5*x36 + p12*x37 + p19*x38 + p26*x39 + p33*x40 + p40*x41 + p47*x42;
698*49b5e25fSSatish Balay         pc[40] = m41 = p6*x36 + p13*x37 + p20*x38 + p27*x39 + p34*x40 + p41*x41 + p48*x42;
699*49b5e25fSSatish Balay         pc[41] = m42 = p7*x36 + p14*x37 + p21*x38 + p28*x39 + p35*x40 + p42*x41 + p49*x42;
700*49b5e25fSSatish Balay 
701*49b5e25fSSatish Balay         pc[42] = m43 = p1*x43 + p8*x44  + p15*x45 + p22*x46 + p29*x47 + p36*x48 + p43*x49;
702*49b5e25fSSatish Balay         pc[43] = m44 = p2*x43 + p9*x44  + p16*x45 + p23*x46 + p30*x47 + p37*x48 + p44*x49;
703*49b5e25fSSatish Balay         pc[44] = m45 = p3*x43 + p10*x44 + p17*x45 + p24*x46 + p31*x47 + p38*x48 + p45*x49;
704*49b5e25fSSatish Balay         pc[45] = m46 = p4*x43 + p11*x44 + p18*x45 + p25*x46 + p32*x47 + p39*x48 + p46*x49;
705*49b5e25fSSatish Balay         pc[46] = m47 = p5*x43 + p12*x44 + p19*x45 + p26*x46 + p33*x47 + p40*x48 + p47*x49;
706*49b5e25fSSatish Balay         pc[47] = m48 = p6*x43 + p13*x44 + p20*x45 + p27*x46 + p34*x47 + p41*x48 + p48*x49;
707*49b5e25fSSatish Balay         pc[48] = m49 = p7*x43 + p14*x44 + p21*x45 + p28*x46 + p35*x47 + p42*x48 + p49*x49;
708*49b5e25fSSatish Balay 
709*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
710*49b5e25fSSatish Balay         pv += 49;
711*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
712*49b5e25fSSatish Balay 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
713*49b5e25fSSatish Balay 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
714*49b5e25fSSatish Balay 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
715*49b5e25fSSatish Balay 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
716*49b5e25fSSatish Balay 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
717*49b5e25fSSatish Balay 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
718*49b5e25fSSatish Balay 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
719*49b5e25fSSatish Balay 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
720*49b5e25fSSatish Balay 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
721*49b5e25fSSatish Balay 	  x37 = pv[36]; x38 = pv[37]; x39 = pv[38]; x40 = pv[39];
722*49b5e25fSSatish Balay 	  x41 = pv[40]; x42 = pv[41]; x43 = pv[42]; x44 = pv[43];
723*49b5e25fSSatish Balay 	  x45 = pv[44]; x46 = pv[45]; x47 = pv[46]; x48 = pv[47];
724*49b5e25fSSatish Balay 	  x49 = pv[48];
725*49b5e25fSSatish Balay 	  x    = rtmp + 49*pj[j];
726*49b5e25fSSatish Balay 	  x[0]  -= m1*x1  + m8*x2   + m15*x3  + m22*x4  + m29*x5  + m36*x6 + m43*x7;
727*49b5e25fSSatish Balay 	  x[1]  -= m2*x1  + m9*x2   + m16*x3  + m23*x4  + m30*x5  + m37*x6 + m44*x7;
728*49b5e25fSSatish Balay 	  x[2]  -= m3*x1  + m10*x2  + m17*x3  + m24*x4  + m31*x5  + m38*x6 + m45*x7;
729*49b5e25fSSatish Balay 	  x[3]  -= m4*x1  + m11*x2  + m18*x3  + m25*x4  + m32*x5  + m39*x6 + m46*x7;
730*49b5e25fSSatish Balay 	  x[4]  -= m5*x1  + m12*x2  + m19*x3  + m26*x4  + m33*x5  + m40*x6 + m47*x7;
731*49b5e25fSSatish Balay 	  x[5]  -= m6*x1  + m13*x2  + m20*x3  + m27*x4  + m34*x5  + m41*x6 + m48*x7;
732*49b5e25fSSatish Balay 	  x[6]  -= m7*x1  + m14*x2  + m21*x3  + m28*x4  + m35*x5  + m42*x6 + m49*x7;
733*49b5e25fSSatish Balay 
734*49b5e25fSSatish Balay 	  x[7]  -= m1*x8  + m8*x9   + m15*x10 + m22*x11 + m29*x12 + m36*x13 + m43*x14;
735*49b5e25fSSatish Balay 	  x[8]  -= m2*x8  + m9*x9   + m16*x10 + m23*x11 + m30*x12 + m37*x13 + m44*x14;
736*49b5e25fSSatish Balay 	  x[9]  -= m3*x8  + m10*x9  + m17*x10 + m24*x11 + m31*x12 + m38*x13 + m45*x14;
737*49b5e25fSSatish Balay 	  x[10] -= m4*x8  + m11*x9  + m18*x10 + m25*x11 + m32*x12 + m39*x13 + m46*x14;
738*49b5e25fSSatish Balay 	  x[11] -= m5*x8  + m12*x9  + m19*x10 + m26*x11 + m33*x12 + m40*x13 + m47*x14;
739*49b5e25fSSatish Balay 	  x[12] -= m6*x8  + m13*x9  + m20*x10 + m27*x11 + m34*x12 + m41*x13 + m48*x14;
740*49b5e25fSSatish Balay 	  x[13] -= m7*x8  + m14*x9  + m21*x10 + m28*x11 + m35*x12 + m42*x13 + m49*x14;
741*49b5e25fSSatish Balay 
742*49b5e25fSSatish Balay 	  x[14] -= m1*x15 + m8*x16  + m15*x17 + m22*x18 + m29*x19 + m36*x20 + m43*x21;
743*49b5e25fSSatish Balay 	  x[15] -= m2*x15 + m9*x16  + m16*x17 + m23*x18 + m30*x19 + m37*x20 + m44*x21;
744*49b5e25fSSatish Balay 	  x[16] -= m3*x15 + m10*x16 + m17*x17 + m24*x18 + m31*x19 + m38*x20 + m45*x21;
745*49b5e25fSSatish Balay 	  x[17] -= m4*x15 + m11*x16 + m18*x17 + m25*x18 + m32*x19 + m39*x20 + m46*x21;
746*49b5e25fSSatish Balay 	  x[18] -= m5*x15 + m12*x16 + m19*x17 + m26*x18 + m33*x19 + m40*x20 + m47*x21;
747*49b5e25fSSatish Balay 	  x[19] -= m6*x15 + m13*x16 + m20*x17 + m27*x18 + m34*x19 + m41*x20 + m48*x21;
748*49b5e25fSSatish Balay 	  x[20] -= m7*x15 + m14*x16 + m21*x17 + m28*x18 + m35*x19 + m42*x20 + m49*x21;
749*49b5e25fSSatish Balay 
750*49b5e25fSSatish Balay 	  x[21] -= m1*x22 + m8*x23  + m15*x24 + m22*x25 + m29*x26 + m36*x27 + m43*x28;
751*49b5e25fSSatish Balay 	  x[22] -= m2*x22 + m9*x23  + m16*x24 + m23*x25 + m30*x26 + m37*x27 + m44*x28;
752*49b5e25fSSatish Balay 	  x[23] -= m3*x22 + m10*x23 + m17*x24 + m24*x25 + m31*x26 + m38*x27 + m45*x28;
753*49b5e25fSSatish Balay 	  x[24] -= m4*x22 + m11*x23 + m18*x24 + m25*x25 + m32*x26 + m39*x27 + m46*x28;
754*49b5e25fSSatish Balay 	  x[25] -= m5*x22 + m12*x23 + m19*x24 + m26*x25 + m33*x26 + m40*x27 + m47*x28;
755*49b5e25fSSatish Balay 	  x[26] -= m6*x22 + m13*x23 + m20*x24 + m27*x25 + m34*x26 + m41*x27 + m48*x28;
756*49b5e25fSSatish Balay 	  x[27] -= m7*x22 + m14*x23 + m21*x24 + m28*x25 + m35*x26 + m42*x27 + m49*x28;
757*49b5e25fSSatish Balay 
758*49b5e25fSSatish Balay 	  x[28] -= m1*x29 + m8*x30  + m15*x31 + m22*x32 + m29*x33 + m36*x34 + m43*x35;
759*49b5e25fSSatish Balay 	  x[29] -= m2*x29 + m9*x30  + m16*x31 + m23*x32 + m30*x33 + m37*x34 + m44*x35;
760*49b5e25fSSatish Balay 	  x[30] -= m3*x29 + m10*x30 + m17*x31 + m24*x32 + m31*x33 + m38*x34 + m45*x35;
761*49b5e25fSSatish Balay 	  x[31] -= m4*x29 + m11*x30 + m18*x31 + m25*x32 + m32*x33 + m39*x34 + m46*x35;
762*49b5e25fSSatish Balay 	  x[32] -= m5*x29 + m12*x30 + m19*x31 + m26*x32 + m33*x33 + m40*x34 + m47*x35;
763*49b5e25fSSatish Balay 	  x[33] -= m6*x29 + m13*x30 + m20*x31 + m27*x32 + m34*x33 + m41*x34 + m48*x35;
764*49b5e25fSSatish Balay 	  x[34] -= m7*x29 + m14*x30 + m21*x31 + m28*x32 + m35*x33 + m42*x34 + m49*x35;
765*49b5e25fSSatish Balay 
766*49b5e25fSSatish Balay 	  x[35] -= m1*x36 + m8*x37  + m15*x38 + m22*x39 + m29*x40 + m36*x41 + m43*x42;
767*49b5e25fSSatish Balay 	  x[36] -= m2*x36 + m9*x37  + m16*x38 + m23*x39 + m30*x40 + m37*x41 + m44*x42;
768*49b5e25fSSatish Balay 	  x[37] -= m3*x36 + m10*x37 + m17*x38 + m24*x39 + m31*x40 + m38*x41 + m45*x42;
769*49b5e25fSSatish Balay 	  x[38] -= m4*x36 + m11*x37 + m18*x38 + m25*x39 + m32*x40 + m39*x41 + m46*x42;
770*49b5e25fSSatish Balay 	  x[39] -= m5*x36 + m12*x37 + m19*x38 + m26*x39 + m33*x40 + m40*x41 + m47*x42;
771*49b5e25fSSatish Balay 	  x[40] -= m6*x36 + m13*x37 + m20*x38 + m27*x39 + m34*x40 + m41*x41 + m48*x42;
772*49b5e25fSSatish Balay 	  x[41] -= m7*x36 + m14*x37 + m21*x38 + m28*x39 + m35*x40 + m42*x41 + m49*x42;
773*49b5e25fSSatish Balay 
774*49b5e25fSSatish Balay 	  x[42] -= m1*x43 + m8*x44  + m15*x45 + m22*x46 + m29*x47 + m36*x48 + m43*x49;
775*49b5e25fSSatish Balay 	  x[43] -= m2*x43 + m9*x44  + m16*x45 + m23*x46 + m30*x47 + m37*x48 + m44*x49;
776*49b5e25fSSatish Balay 	  x[44] -= m3*x43 + m10*x44 + m17*x45 + m24*x46 + m31*x47 + m38*x48 + m45*x49;
777*49b5e25fSSatish Balay 	  x[45] -= m4*x43 + m11*x44 + m18*x45 + m25*x46 + m32*x47 + m39*x48 + m46*x49;
778*49b5e25fSSatish Balay 	  x[46] -= m5*x43 + m12*x44 + m19*x45 + m26*x46 + m33*x47 + m40*x48 + m47*x49;
779*49b5e25fSSatish Balay 	  x[47] -= m6*x43 + m13*x44 + m20*x45 + m27*x46 + m34*x47 + m41*x48 + m48*x49;
780*49b5e25fSSatish Balay 	  x[48] -= m7*x43 + m14*x44 + m21*x45 + m28*x46 + m35*x47 + m42*x48 + m49*x49;
781*49b5e25fSSatish Balay           pv   += 49;
782*49b5e25fSSatish Balay         }
783*49b5e25fSSatish Balay         PLogFlops(686*nz+637);
784*49b5e25fSSatish Balay       }
785*49b5e25fSSatish Balay       row = *ajtmp++;
786*49b5e25fSSatish Balay     }
787*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
788*49b5e25fSSatish Balay     pv = ba + 49*bi[i];
789*49b5e25fSSatish Balay     pj = bj + bi[i];
790*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
791*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
792*49b5e25fSSatish Balay       x      = rtmp+49*pj[j];
793*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
794*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
795*49b5e25fSSatish Balay       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
796*49b5e25fSSatish Balay       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
797*49b5e25fSSatish Balay       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
798*49b5e25fSSatish Balay       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
799*49b5e25fSSatish Balay       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
800*49b5e25fSSatish Balay       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
801*49b5e25fSSatish Balay       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
802*49b5e25fSSatish Balay       pv[36] = x[36]; pv[37] = x[37]; pv[38] = x[38]; pv[39] = x[39];
803*49b5e25fSSatish Balay       pv[40] = x[40]; pv[41] = x[41]; pv[42] = x[42]; pv[43] = x[43];
804*49b5e25fSSatish Balay       pv[44] = x[44]; pv[45] = x[45]; pv[46] = x[46]; pv[47] = x[47];
805*49b5e25fSSatish Balay       pv[48] = x[48];
806*49b5e25fSSatish Balay       pv   += 49;
807*49b5e25fSSatish Balay     }
808*49b5e25fSSatish Balay     /* invert diagonal block */
809*49b5e25fSSatish Balay     w = ba + 49*diag_offset[i];
810*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_7(w);CHKERRQ(ierr);
811*49b5e25fSSatish Balay   }
812*49b5e25fSSatish Balay 
813*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
814*49b5e25fSSatish Balay   C->factor    = FACTOR_LU;
815*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
816*49b5e25fSSatish Balay   PLogFlops(1.3333*343*b->mbs); /* from inverting diagonal blocks */
817*49b5e25fSSatish Balay   PetscFunctionReturn(0);
818*49b5e25fSSatish Balay }
819*49b5e25fSSatish Balay 
820*49b5e25fSSatish Balay /* ------------------------------------------------------------*/
821*49b5e25fSSatish Balay /*
822*49b5e25fSSatish Balay       Version for when blocks are 6 by 6
823*49b5e25fSSatish Balay */
824*49b5e25fSSatish Balay #undef __FUNC__
825*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_6"
826*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_6(Mat A,Mat *B)
827*49b5e25fSSatish Balay {
828*49b5e25fSSatish Balay   Mat          C = *B;
829*49b5e25fSSatish Balay   Mat_SeqBAIJ  *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
830*49b5e25fSSatish Balay   IS           isrow = b->row,isicol = b->icol;
831*49b5e25fSSatish Balay   int          *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
832*49b5e25fSSatish Balay   int          *ajtmpold,*ajtmp,nz,row;
833*49b5e25fSSatish Balay   int          *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
834*49b5e25fSSatish Balay   MatScalar    *pv,*v,*rtmp,*pc,*w,*x;
835*49b5e25fSSatish Balay   MatScalar    p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
836*49b5e25fSSatish Balay   MatScalar    p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
837*49b5e25fSSatish Balay   MatScalar    x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14;
838*49b5e25fSSatish Balay   MatScalar    p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12;
839*49b5e25fSSatish Balay   MatScalar    m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
840*49b5e25fSSatish Balay   MatScalar    p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
841*49b5e25fSSatish Balay   MatScalar    x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
842*49b5e25fSSatish Balay   MatScalar    m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
843*49b5e25fSSatish Balay   MatScalar    *ba = b->a,*aa = a->a;
844*49b5e25fSSatish Balay 
845*49b5e25fSSatish Balay   PetscFunctionBegin;
846*49b5e25fSSatish Balay   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
847*49b5e25fSSatish Balay   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
848*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
849*49b5e25fSSatish Balay 
850*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
851*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
852*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
853*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
854*49b5e25fSSatish Balay       x = rtmp+36*ajtmp[j];
855*49b5e25fSSatish Balay       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
856*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
857*49b5e25fSSatish Balay       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
858*49b5e25fSSatish Balay       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
859*49b5e25fSSatish Balay       x[34] = x[35] = 0.0 ;
860*49b5e25fSSatish Balay     }
861*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
862*49b5e25fSSatish Balay     idx      = r[i];
863*49b5e25fSSatish Balay     nz       = ai[idx+1] - ai[idx];
864*49b5e25fSSatish Balay     ajtmpold = aj + ai[idx];
865*49b5e25fSSatish Balay     v        = aa + 36*ai[idx];
866*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
867*49b5e25fSSatish Balay       x    = rtmp+36*ic[ajtmpold[j]];
868*49b5e25fSSatish Balay       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
869*49b5e25fSSatish Balay       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
870*49b5e25fSSatish Balay       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
871*49b5e25fSSatish Balay       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
872*49b5e25fSSatish Balay       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
873*49b5e25fSSatish Balay       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
874*49b5e25fSSatish Balay       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
875*49b5e25fSSatish Balay       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
876*49b5e25fSSatish Balay       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
877*49b5e25fSSatish Balay       v    += 36;
878*49b5e25fSSatish Balay     }
879*49b5e25fSSatish Balay     row = *ajtmp++;
880*49b5e25fSSatish Balay     while (row < i) {
881*49b5e25fSSatish Balay       pc  =  rtmp + 36*row;
882*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
883*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
884*49b5e25fSSatish Balay       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
885*49b5e25fSSatish Balay       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
886*49b5e25fSSatish Balay       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
887*49b5e25fSSatish Balay       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
888*49b5e25fSSatish Balay       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
889*49b5e25fSSatish Balay       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
890*49b5e25fSSatish Balay       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
891*49b5e25fSSatish Balay       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
892*49b5e25fSSatish Balay           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
893*49b5e25fSSatish Balay           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
894*49b5e25fSSatish Balay           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
895*49b5e25fSSatish Balay           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
896*49b5e25fSSatish Balay           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
897*49b5e25fSSatish Balay           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
898*49b5e25fSSatish Balay           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
899*49b5e25fSSatish Balay           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) {
900*49b5e25fSSatish Balay         pv = ba + 36*diag_offset[row];
901*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
902*49b5e25fSSatish Balay 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
903*49b5e25fSSatish Balay 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
904*49b5e25fSSatish Balay 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
905*49b5e25fSSatish Balay 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
906*49b5e25fSSatish Balay 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
907*49b5e25fSSatish Balay 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
908*49b5e25fSSatish Balay 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
909*49b5e25fSSatish Balay 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
910*49b5e25fSSatish Balay 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
911*49b5e25fSSatish Balay         pc[0]  = m1  = p1*x1  + p7*x2   + p13*x3  + p19*x4  + p25*x5  + p31*x6;
912*49b5e25fSSatish Balay         pc[1]  = m2  = p2*x1  + p8*x2   + p14*x3  + p20*x4  + p26*x5  + p32*x6;
913*49b5e25fSSatish Balay         pc[2]  = m3  = p3*x1  + p9*x2   + p15*x3  + p21*x4  + p27*x5  + p33*x6;
914*49b5e25fSSatish Balay         pc[3]  = m4  = p4*x1  + p10*x2  + p16*x3  + p22*x4  + p28*x5  + p34*x6;
915*49b5e25fSSatish Balay         pc[4]  = m5  = p5*x1  + p11*x2  + p17*x3  + p23*x4  + p29*x5  + p35*x6;
916*49b5e25fSSatish Balay         pc[5]  = m6  = p6*x1  + p12*x2  + p18*x3  + p24*x4  + p30*x5  + p36*x6;
917*49b5e25fSSatish Balay 
918*49b5e25fSSatish Balay         pc[6]  = m7  = p1*x7  + p7*x8   + p13*x9  + p19*x10 + p25*x11 + p31*x12;
919*49b5e25fSSatish Balay         pc[7]  = m8  = p2*x7  + p8*x8   + p14*x9  + p20*x10 + p26*x11 + p32*x12;
920*49b5e25fSSatish Balay         pc[8]  = m9  = p3*x7  + p9*x8   + p15*x9  + p21*x10 + p27*x11 + p33*x12;
921*49b5e25fSSatish Balay         pc[9]  = m10 = p4*x7  + p10*x8  + p16*x9  + p22*x10 + p28*x11 + p34*x12;
922*49b5e25fSSatish Balay         pc[10] = m11 = p5*x7  + p11*x8  + p17*x9  + p23*x10 + p29*x11 + p35*x12;
923*49b5e25fSSatish Balay         pc[11] = m12 = p6*x7  + p12*x8  + p18*x9  + p24*x10 + p30*x11 + p36*x12;
924*49b5e25fSSatish Balay 
925*49b5e25fSSatish Balay         pc[12] = m13 = p1*x13 + p7*x14  + p13*x15 + p19*x16 + p25*x17 + p31*x18;
926*49b5e25fSSatish Balay         pc[13] = m14 = p2*x13 + p8*x14  + p14*x15 + p20*x16 + p26*x17 + p32*x18;
927*49b5e25fSSatish Balay         pc[14] = m15 = p3*x13 + p9*x14  + p15*x15 + p21*x16 + p27*x17 + p33*x18;
928*49b5e25fSSatish Balay         pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18;
929*49b5e25fSSatish Balay         pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18;
930*49b5e25fSSatish Balay         pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18;
931*49b5e25fSSatish Balay 
932*49b5e25fSSatish Balay         pc[18] = m19 = p1*x19 + p7*x20  + p13*x21 + p19*x22 + p25*x23 + p31*x24;
933*49b5e25fSSatish Balay         pc[19] = m20 = p2*x19 + p8*x20  + p14*x21 + p20*x22 + p26*x23 + p32*x24;
934*49b5e25fSSatish Balay         pc[20] = m21 = p3*x19 + p9*x20  + p15*x21 + p21*x22 + p27*x23 + p33*x24;
935*49b5e25fSSatish Balay         pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24;
936*49b5e25fSSatish Balay         pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24;
937*49b5e25fSSatish Balay         pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24;
938*49b5e25fSSatish Balay 
939*49b5e25fSSatish Balay         pc[24] = m25 = p1*x25 + p7*x26  + p13*x27 + p19*x28 + p25*x29 + p31*x30;
940*49b5e25fSSatish Balay         pc[25] = m26 = p2*x25 + p8*x26  + p14*x27 + p20*x28 + p26*x29 + p32*x30;
941*49b5e25fSSatish Balay         pc[26] = m27 = p3*x25 + p9*x26  + p15*x27 + p21*x28 + p27*x29 + p33*x30;
942*49b5e25fSSatish Balay         pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30;
943*49b5e25fSSatish Balay         pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30;
944*49b5e25fSSatish Balay         pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30;
945*49b5e25fSSatish Balay 
946*49b5e25fSSatish Balay         pc[30] = m31 = p1*x31 + p7*x32  + p13*x33 + p19*x34 + p25*x35 + p31*x36;
947*49b5e25fSSatish Balay         pc[31] = m32 = p2*x31 + p8*x32  + p14*x33 + p20*x34 + p26*x35 + p32*x36;
948*49b5e25fSSatish Balay         pc[32] = m33 = p3*x31 + p9*x32  + p15*x33 + p21*x34 + p27*x35 + p33*x36;
949*49b5e25fSSatish Balay         pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36;
950*49b5e25fSSatish Balay         pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36;
951*49b5e25fSSatish Balay         pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36;
952*49b5e25fSSatish Balay 
953*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
954*49b5e25fSSatish Balay         pv += 36;
955*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
956*49b5e25fSSatish Balay 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
957*49b5e25fSSatish Balay 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
958*49b5e25fSSatish Balay 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
959*49b5e25fSSatish Balay 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
960*49b5e25fSSatish Balay 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
961*49b5e25fSSatish Balay 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
962*49b5e25fSSatish Balay 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
963*49b5e25fSSatish Balay 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
964*49b5e25fSSatish Balay 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
965*49b5e25fSSatish Balay 	  x    = rtmp + 36*pj[j];
966*49b5e25fSSatish Balay           x[0]  -= m1*x1  + m7*x2   + m13*x3  + m19*x4  + m25*x5  + m31*x6;
967*49b5e25fSSatish Balay           x[1]  -= m2*x1  + m8*x2   + m14*x3  + m20*x4  + m26*x5  + m32*x6;
968*49b5e25fSSatish Balay           x[2]  -= m3*x1  + m9*x2   + m15*x3  + m21*x4  + m27*x5  + m33*x6;
969*49b5e25fSSatish Balay           x[3]  -= m4*x1  + m10*x2  + m16*x3  + m22*x4  + m28*x5  + m34*x6;
970*49b5e25fSSatish Balay           x[4]  -= m5*x1  + m11*x2  + m17*x3  + m23*x4  + m29*x5  + m35*x6;
971*49b5e25fSSatish Balay           x[5]  -= m6*x1  + m12*x2  + m18*x3  + m24*x4  + m30*x5  + m36*x6;
972*49b5e25fSSatish Balay 
973*49b5e25fSSatish Balay 	  x[6]  -= m1*x7  + m7*x8   + m13*x9  + m19*x10 + m25*x11 + m31*x12;
974*49b5e25fSSatish Balay 	  x[7]  -= m2*x7  + m8*x8   + m14*x9  + m20*x10 + m26*x11 + m32*x12;
975*49b5e25fSSatish Balay 	  x[8]  -= m3*x7  + m9*x8   + m15*x9  + m21*x10 + m27*x11 + m33*x12;
976*49b5e25fSSatish Balay 	  x[9]  -= m4*x7  + m10*x8  + m16*x9  + m22*x10 + m28*x11 + m34*x12;
977*49b5e25fSSatish Balay 	  x[10] -= m5*x7  + m11*x8  + m17*x9  + m23*x10 + m29*x11 + m35*x12;
978*49b5e25fSSatish Balay 	  x[11] -= m6*x7  + m12*x8  + m18*x9  + m24*x10 + m30*x11 + m36*x12;
979*49b5e25fSSatish Balay 
980*49b5e25fSSatish Balay 	  x[12] -= m1*x13 + m7*x14  + m13*x15 + m19*x16 + m25*x17 + m31*x18;
981*49b5e25fSSatish Balay 	  x[13] -= m2*x13 + m8*x14  + m14*x15 + m20*x16 + m26*x17 + m32*x18;
982*49b5e25fSSatish Balay 	  x[14] -= m3*x13 + m9*x14  + m15*x15 + m21*x16 + m27*x17 + m33*x18;
983*49b5e25fSSatish Balay 	  x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18;
984*49b5e25fSSatish Balay 	  x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18;
985*49b5e25fSSatish Balay 	  x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18;
986*49b5e25fSSatish Balay 
987*49b5e25fSSatish Balay 	  x[18] -= m1*x19 + m7*x20  + m13*x21 + m19*x22 + m25*x23 + m31*x24;
988*49b5e25fSSatish Balay 	  x[19] -= m2*x19 + m8*x20  + m14*x21 + m20*x22 + m26*x23 + m32*x24;
989*49b5e25fSSatish Balay 	  x[20] -= m3*x19 + m9*x20  + m15*x21 + m21*x22 + m27*x23 + m33*x24;
990*49b5e25fSSatish Balay 	  x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24;
991*49b5e25fSSatish Balay 	  x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24;
992*49b5e25fSSatish Balay 	  x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24;
993*49b5e25fSSatish Balay 
994*49b5e25fSSatish Balay 	  x[24] -= m1*x25 + m7*x26  + m13*x27 + m19*x28 + m25*x29 + m31*x30;
995*49b5e25fSSatish Balay 	  x[25] -= m2*x25 + m8*x26  + m14*x27 + m20*x28 + m26*x29 + m32*x30;
996*49b5e25fSSatish Balay 	  x[26] -= m3*x25 + m9*x26  + m15*x27 + m21*x28 + m27*x29 + m33*x30;
997*49b5e25fSSatish Balay 	  x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30;
998*49b5e25fSSatish Balay 	  x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30;
999*49b5e25fSSatish Balay 	  x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30;
1000*49b5e25fSSatish Balay 
1001*49b5e25fSSatish Balay 	  x[30] -= m1*x31 + m7*x32  + m13*x33 + m19*x34 + m25*x35 + m31*x36;
1002*49b5e25fSSatish Balay 	  x[31] -= m2*x31 + m8*x32  + m14*x33 + m20*x34 + m26*x35 + m32*x36;
1003*49b5e25fSSatish Balay 	  x[32] -= m3*x31 + m9*x32  + m15*x33 + m21*x34 + m27*x35 + m33*x36;
1004*49b5e25fSSatish Balay 	  x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36;
1005*49b5e25fSSatish Balay 	  x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36;
1006*49b5e25fSSatish Balay 	  x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36;
1007*49b5e25fSSatish Balay 
1008*49b5e25fSSatish Balay           pv   += 36;
1009*49b5e25fSSatish Balay         }
1010*49b5e25fSSatish Balay         PLogFlops(432*nz+396);
1011*49b5e25fSSatish Balay       }
1012*49b5e25fSSatish Balay       row = *ajtmp++;
1013*49b5e25fSSatish Balay     }
1014*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
1015*49b5e25fSSatish Balay     pv = ba + 36*bi[i];
1016*49b5e25fSSatish Balay     pj = bj + bi[i];
1017*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
1018*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1019*49b5e25fSSatish Balay       x      = rtmp+36*pj[j];
1020*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1021*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
1022*49b5e25fSSatish Balay       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
1023*49b5e25fSSatish Balay       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1024*49b5e25fSSatish Balay       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
1025*49b5e25fSSatish Balay       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
1026*49b5e25fSSatish Balay       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
1027*49b5e25fSSatish Balay       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
1028*49b5e25fSSatish Balay       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
1029*49b5e25fSSatish Balay       pv   += 36;
1030*49b5e25fSSatish Balay     }
1031*49b5e25fSSatish Balay     /* invert diagonal block */
1032*49b5e25fSSatish Balay     w = ba + 36*diag_offset[i];
1033*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr);
1034*49b5e25fSSatish Balay   }
1035*49b5e25fSSatish Balay 
1036*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1037*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1038*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1039*49b5e25fSSatish Balay   C->factor = FACTOR_LU;
1040*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
1041*49b5e25fSSatish Balay   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1042*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1043*49b5e25fSSatish Balay }
1044*49b5e25fSSatish Balay /*
1045*49b5e25fSSatish Balay       Version for when blocks are 6 by 6 Using natural ordering
1046*49b5e25fSSatish Balay */
1047*49b5e25fSSatish Balay #undef __FUNC__
1048*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_6_NaturalOrdering"
1049*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_6_NaturalOrdering(Mat A,Mat *B)
1050*49b5e25fSSatish Balay {
1051*49b5e25fSSatish Balay   Mat         C = *B;
1052*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1053*49b5e25fSSatish Balay   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1054*49b5e25fSSatish Balay   int         *ajtmpold,*ajtmp,nz,row;
1055*49b5e25fSSatish Balay   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
1056*49b5e25fSSatish Balay   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1057*49b5e25fSSatish Balay   MatScalar   x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
1058*49b5e25fSSatish Balay   MatScalar   x16,x17,x18,x19,x20,x21,x22,x23,x24,x25;
1059*49b5e25fSSatish Balay   MatScalar   p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15;
1060*49b5e25fSSatish Balay   MatScalar   p16,p17,p18,p19,p20,p21,p22,p23,p24,p25;
1061*49b5e25fSSatish Balay   MatScalar   m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15;
1062*49b5e25fSSatish Balay   MatScalar   m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
1063*49b5e25fSSatish Balay   MatScalar   p26,p27,p28,p29,p30,p31,p32,p33,p34,p35,p36;
1064*49b5e25fSSatish Balay   MatScalar   x26,x27,x28,x29,x30,x31,x32,x33,x34,x35,x36;
1065*49b5e25fSSatish Balay   MatScalar   m26,m27,m28,m29,m30,m31,m32,m33,m34,m35,m36;
1066*49b5e25fSSatish Balay   MatScalar   *ba = b->a,*aa = a->a;
1067*49b5e25fSSatish Balay 
1068*49b5e25fSSatish Balay   PetscFunctionBegin;
1069*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(36*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1070*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
1071*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
1072*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
1073*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
1074*49b5e25fSSatish Balay       x = rtmp+36*ajtmp[j];
1075*49b5e25fSSatish Balay       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
1076*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
1077*49b5e25fSSatish Balay       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = x[25] = 0.0 ;
1078*49b5e25fSSatish Balay       x[26] = x[27] = x[28] = x[29] = x[30] = x[31] = x[32] = x[33] = 0.0 ;
1079*49b5e25fSSatish Balay       x[34] = x[35] = 0.0 ;
1080*49b5e25fSSatish Balay     }
1081*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
1082*49b5e25fSSatish Balay     nz       = ai[i+1] - ai[i];
1083*49b5e25fSSatish Balay     ajtmpold = aj + ai[i];
1084*49b5e25fSSatish Balay     v        = aa + 36*ai[i];
1085*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1086*49b5e25fSSatish Balay       x    = rtmp+36*ajtmpold[j];
1087*49b5e25fSSatish Balay       x[0] =  v[0];  x[1] =  v[1];  x[2] =  v[2];  x[3] =  v[3];
1088*49b5e25fSSatish Balay       x[4] =  v[4];  x[5] =  v[5];  x[6] =  v[6];  x[7] =  v[7];
1089*49b5e25fSSatish Balay       x[8] =  v[8];  x[9] =  v[9];  x[10] = v[10]; x[11] = v[11];
1090*49b5e25fSSatish Balay       x[12] = v[12]; x[13] = v[13]; x[14] = v[14]; x[15] = v[15];
1091*49b5e25fSSatish Balay       x[16] = v[16]; x[17] = v[17]; x[18] = v[18]; x[19] = v[19];
1092*49b5e25fSSatish Balay       x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
1093*49b5e25fSSatish Balay       x[24] = v[24]; x[25] = v[25]; x[26] = v[26]; x[27] = v[27];
1094*49b5e25fSSatish Balay       x[28] = v[28]; x[29] = v[29]; x[30] = v[30]; x[31] = v[31];
1095*49b5e25fSSatish Balay       x[32] = v[32]; x[33] = v[33]; x[34] = v[34]; x[35] = v[35];
1096*49b5e25fSSatish Balay       v    += 36;
1097*49b5e25fSSatish Balay     }
1098*49b5e25fSSatish Balay     row = *ajtmp++;
1099*49b5e25fSSatish Balay     while (row < i) {
1100*49b5e25fSSatish Balay       pc  = rtmp + 36*row;
1101*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1102*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];
1103*49b5e25fSSatish Balay       p9  = pc[8];  p10 = pc[9];  p11 = pc[10]; p12 = pc[11];
1104*49b5e25fSSatish Balay       p13 = pc[12]; p14 = pc[13]; p15 = pc[14]; p16 = pc[15];
1105*49b5e25fSSatish Balay       p17 = pc[16]; p18 = pc[17]; p19 = pc[18]; p20 = pc[19];
1106*49b5e25fSSatish Balay       p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
1107*49b5e25fSSatish Balay       p25 = pc[24]; p26 = pc[25]; p27 = pc[26]; p28 = pc[27];
1108*49b5e25fSSatish Balay       p29 = pc[28]; p30 = pc[29]; p31 = pc[30]; p32 = pc[31];
1109*49b5e25fSSatish Balay       p33 = pc[32]; p34 = pc[33]; p35 = pc[34]; p36 = pc[35];
1110*49b5e25fSSatish Balay       if (p1  != 0.0 || p2  != 0.0 || p3  != 0.0 || p4  != 0.0 ||
1111*49b5e25fSSatish Balay           p5  != 0.0 || p6  != 0.0 || p7  != 0.0 || p8  != 0.0 ||
1112*49b5e25fSSatish Balay           p9  != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 ||
1113*49b5e25fSSatish Balay           p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0 ||
1114*49b5e25fSSatish Balay           p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0 ||
1115*49b5e25fSSatish Balay           p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 ||
1116*49b5e25fSSatish Balay           p25 != 0.0 || p26 != 0.0 || p27 != 0.0 || p28 != 0.0 ||
1117*49b5e25fSSatish Balay           p29 != 0.0 || p30 != 0.0 || p31 != 0.0 || p32 != 0.0 ||
1118*49b5e25fSSatish Balay           p33 != 0.0 || p34 != 0.0 || p35 != 0.0 || p36 != 0.0) {
1119*49b5e25fSSatish Balay         pv = ba + 36*diag_offset[row];
1120*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
1121*49b5e25fSSatish Balay 	x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1122*49b5e25fSSatish Balay 	x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
1123*49b5e25fSSatish Balay 	x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
1124*49b5e25fSSatish Balay 	x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
1125*49b5e25fSSatish Balay 	x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
1126*49b5e25fSSatish Balay 	x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
1127*49b5e25fSSatish Balay 	x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
1128*49b5e25fSSatish Balay 	x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
1129*49b5e25fSSatish Balay 	x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
1130*49b5e25fSSatish Balay         pc[0]  = m1  = p1*x1  + p7*x2   + p13*x3  + p19*x4  + p25*x5  + p31*x6;
1131*49b5e25fSSatish Balay         pc[1]  = m2  = p2*x1  + p8*x2   + p14*x3  + p20*x4  + p26*x5  + p32*x6;
1132*49b5e25fSSatish Balay         pc[2]  = m3  = p3*x1  + p9*x2   + p15*x3  + p21*x4  + p27*x5  + p33*x6;
1133*49b5e25fSSatish Balay         pc[3]  = m4  = p4*x1  + p10*x2  + p16*x3  + p22*x4  + p28*x5  + p34*x6;
1134*49b5e25fSSatish Balay         pc[4]  = m5  = p5*x1  + p11*x2  + p17*x3  + p23*x4  + p29*x5  + p35*x6;
1135*49b5e25fSSatish Balay         pc[5]  = m6  = p6*x1  + p12*x2  + p18*x3  + p24*x4  + p30*x5  + p36*x6;
1136*49b5e25fSSatish Balay 
1137*49b5e25fSSatish Balay         pc[6]  = m7  = p1*x7  + p7*x8   + p13*x9  + p19*x10 + p25*x11 + p31*x12;
1138*49b5e25fSSatish Balay         pc[7]  = m8  = p2*x7  + p8*x8   + p14*x9  + p20*x10 + p26*x11 + p32*x12;
1139*49b5e25fSSatish Balay         pc[8]  = m9  = p3*x7  + p9*x8   + p15*x9  + p21*x10 + p27*x11 + p33*x12;
1140*49b5e25fSSatish Balay         pc[9]  = m10 = p4*x7  + p10*x8  + p16*x9  + p22*x10 + p28*x11 + p34*x12;
1141*49b5e25fSSatish Balay         pc[10] = m11 = p5*x7  + p11*x8  + p17*x9  + p23*x10 + p29*x11 + p35*x12;
1142*49b5e25fSSatish Balay         pc[11] = m12 = p6*x7  + p12*x8  + p18*x9  + p24*x10 + p30*x11 + p36*x12;
1143*49b5e25fSSatish Balay 
1144*49b5e25fSSatish Balay         pc[12] = m13 = p1*x13 + p7*x14  + p13*x15 + p19*x16 + p25*x17 + p31*x18;
1145*49b5e25fSSatish Balay         pc[13] = m14 = p2*x13 + p8*x14  + p14*x15 + p20*x16 + p26*x17 + p32*x18;
1146*49b5e25fSSatish Balay         pc[14] = m15 = p3*x13 + p9*x14  + p15*x15 + p21*x16 + p27*x17 + p33*x18;
1147*49b5e25fSSatish Balay         pc[15] = m16 = p4*x13 + p10*x14 + p16*x15 + p22*x16 + p28*x17 + p34*x18;
1148*49b5e25fSSatish Balay         pc[16] = m17 = p5*x13 + p11*x14 + p17*x15 + p23*x16 + p29*x17 + p35*x18;
1149*49b5e25fSSatish Balay         pc[17] = m18 = p6*x13 + p12*x14 + p18*x15 + p24*x16 + p30*x17 + p36*x18;
1150*49b5e25fSSatish Balay 
1151*49b5e25fSSatish Balay         pc[18] = m19 = p1*x19 + p7*x20  + p13*x21 + p19*x22 + p25*x23 + p31*x24;
1152*49b5e25fSSatish Balay         pc[19] = m20 = p2*x19 + p8*x20  + p14*x21 + p20*x22 + p26*x23 + p32*x24;
1153*49b5e25fSSatish Balay         pc[20] = m21 = p3*x19 + p9*x20  + p15*x21 + p21*x22 + p27*x23 + p33*x24;
1154*49b5e25fSSatish Balay         pc[21] = m22 = p4*x19 + p10*x20 + p16*x21 + p22*x22 + p28*x23 + p34*x24;
1155*49b5e25fSSatish Balay         pc[22] = m23 = p5*x19 + p11*x20 + p17*x21 + p23*x22 + p29*x23 + p35*x24;
1156*49b5e25fSSatish Balay         pc[23] = m24 = p6*x19 + p12*x20 + p18*x21 + p24*x22 + p30*x23 + p36*x24;
1157*49b5e25fSSatish Balay 
1158*49b5e25fSSatish Balay         pc[24] = m25 = p1*x25 + p7*x26  + p13*x27 + p19*x28 + p25*x29 + p31*x30;
1159*49b5e25fSSatish Balay         pc[25] = m26 = p2*x25 + p8*x26  + p14*x27 + p20*x28 + p26*x29 + p32*x30;
1160*49b5e25fSSatish Balay         pc[26] = m27 = p3*x25 + p9*x26  + p15*x27 + p21*x28 + p27*x29 + p33*x30;
1161*49b5e25fSSatish Balay         pc[27] = m28 = p4*x25 + p10*x26 + p16*x27 + p22*x28 + p28*x29 + p34*x30;
1162*49b5e25fSSatish Balay         pc[28] = m29 = p5*x25 + p11*x26 + p17*x27 + p23*x28 + p29*x29 + p35*x30;
1163*49b5e25fSSatish Balay         pc[29] = m30 = p6*x25 + p12*x26 + p18*x27 + p24*x28 + p30*x29 + p36*x30;
1164*49b5e25fSSatish Balay 
1165*49b5e25fSSatish Balay         pc[30] = m31 = p1*x31 + p7*x32  + p13*x33 + p19*x34 + p25*x35 + p31*x36;
1166*49b5e25fSSatish Balay         pc[31] = m32 = p2*x31 + p8*x32  + p14*x33 + p20*x34 + p26*x35 + p32*x36;
1167*49b5e25fSSatish Balay         pc[32] = m33 = p3*x31 + p9*x32  + p15*x33 + p21*x34 + p27*x35 + p33*x36;
1168*49b5e25fSSatish Balay         pc[33] = m34 = p4*x31 + p10*x32 + p16*x33 + p22*x34 + p28*x35 + p34*x36;
1169*49b5e25fSSatish Balay         pc[34] = m35 = p5*x31 + p11*x32 + p17*x33 + p23*x34 + p29*x35 + p35*x36;
1170*49b5e25fSSatish Balay         pc[35] = m36 = p6*x31 + p12*x32 + p18*x33 + p24*x34 + p30*x35 + p36*x36;
1171*49b5e25fSSatish Balay 
1172*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
1173*49b5e25fSSatish Balay         pv += 36;
1174*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
1175*49b5e25fSSatish Balay 	  x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1176*49b5e25fSSatish Balay 	  x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];
1177*49b5e25fSSatish Balay 	  x9  = pv[8];  x10 = pv[9];  x11 = pv[10]; x12 = pv[11];
1178*49b5e25fSSatish Balay 	  x13 = pv[12]; x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
1179*49b5e25fSSatish Balay 	  x17 = pv[16]; x18 = pv[17]; x19 = pv[18]; x20 = pv[19];
1180*49b5e25fSSatish Balay 	  x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
1181*49b5e25fSSatish Balay 	  x25 = pv[24]; x26 = pv[25]; x27 = pv[26]; x28 = pv[27];
1182*49b5e25fSSatish Balay 	  x29 = pv[28]; x30 = pv[29]; x31 = pv[30]; x32 = pv[31];
1183*49b5e25fSSatish Balay 	  x33 = pv[32]; x34 = pv[33]; x35 = pv[34]; x36 = pv[35];
1184*49b5e25fSSatish Balay 	  x    = rtmp + 36*pj[j];
1185*49b5e25fSSatish Balay           x[0]  -= m1*x1  + m7*x2   + m13*x3  + m19*x4  + m25*x5  + m31*x6;
1186*49b5e25fSSatish Balay           x[1]  -= m2*x1  + m8*x2   + m14*x3  + m20*x4  + m26*x5  + m32*x6;
1187*49b5e25fSSatish Balay           x[2]  -= m3*x1  + m9*x2   + m15*x3  + m21*x4  + m27*x5  + m33*x6;
1188*49b5e25fSSatish Balay           x[3]  -= m4*x1  + m10*x2  + m16*x3  + m22*x4  + m28*x5  + m34*x6;
1189*49b5e25fSSatish Balay           x[4]  -= m5*x1  + m11*x2  + m17*x3  + m23*x4  + m29*x5  + m35*x6;
1190*49b5e25fSSatish Balay           x[5]  -= m6*x1  + m12*x2  + m18*x3  + m24*x4  + m30*x5  + m36*x6;
1191*49b5e25fSSatish Balay 
1192*49b5e25fSSatish Balay 	  x[6]  -= m1*x7  + m7*x8   + m13*x9  + m19*x10 + m25*x11 + m31*x12;
1193*49b5e25fSSatish Balay 	  x[7]  -= m2*x7  + m8*x8   + m14*x9  + m20*x10 + m26*x11 + m32*x12;
1194*49b5e25fSSatish Balay 	  x[8]  -= m3*x7  + m9*x8   + m15*x9  + m21*x10 + m27*x11 + m33*x12;
1195*49b5e25fSSatish Balay 	  x[9]  -= m4*x7  + m10*x8  + m16*x9  + m22*x10 + m28*x11 + m34*x12;
1196*49b5e25fSSatish Balay 	  x[10] -= m5*x7  + m11*x8  + m17*x9  + m23*x10 + m29*x11 + m35*x12;
1197*49b5e25fSSatish Balay 	  x[11] -= m6*x7  + m12*x8  + m18*x9  + m24*x10 + m30*x11 + m36*x12;
1198*49b5e25fSSatish Balay 
1199*49b5e25fSSatish Balay 	  x[12] -= m1*x13 + m7*x14  + m13*x15 + m19*x16 + m25*x17 + m31*x18;
1200*49b5e25fSSatish Balay 	  x[13] -= m2*x13 + m8*x14  + m14*x15 + m20*x16 + m26*x17 + m32*x18;
1201*49b5e25fSSatish Balay 	  x[14] -= m3*x13 + m9*x14  + m15*x15 + m21*x16 + m27*x17 + m33*x18;
1202*49b5e25fSSatish Balay 	  x[15] -= m4*x13 + m10*x14 + m16*x15 + m22*x16 + m28*x17 + m34*x18;
1203*49b5e25fSSatish Balay 	  x[16] -= m5*x13 + m11*x14 + m17*x15 + m23*x16 + m29*x17 + m35*x18;
1204*49b5e25fSSatish Balay 	  x[17] -= m6*x13 + m12*x14 + m18*x15 + m24*x16 + m30*x17 + m36*x18;
1205*49b5e25fSSatish Balay 
1206*49b5e25fSSatish Balay 	  x[18] -= m1*x19 + m7*x20  + m13*x21 + m19*x22 + m25*x23 + m31*x24;
1207*49b5e25fSSatish Balay 	  x[19] -= m2*x19 + m8*x20  + m14*x21 + m20*x22 + m26*x23 + m32*x24;
1208*49b5e25fSSatish Balay 	  x[20] -= m3*x19 + m9*x20  + m15*x21 + m21*x22 + m27*x23 + m33*x24;
1209*49b5e25fSSatish Balay 	  x[21] -= m4*x19 + m10*x20 + m16*x21 + m22*x22 + m28*x23 + m34*x24;
1210*49b5e25fSSatish Balay 	  x[22] -= m5*x19 + m11*x20 + m17*x21 + m23*x22 + m29*x23 + m35*x24;
1211*49b5e25fSSatish Balay 	  x[23] -= m6*x19 + m12*x20 + m18*x21 + m24*x22 + m30*x23 + m36*x24;
1212*49b5e25fSSatish Balay 
1213*49b5e25fSSatish Balay 	  x[24] -= m1*x25 + m7*x26  + m13*x27 + m19*x28 + m25*x29 + m31*x30;
1214*49b5e25fSSatish Balay 	  x[25] -= m2*x25 + m8*x26  + m14*x27 + m20*x28 + m26*x29 + m32*x30;
1215*49b5e25fSSatish Balay 	  x[26] -= m3*x25 + m9*x26  + m15*x27 + m21*x28 + m27*x29 + m33*x30;
1216*49b5e25fSSatish Balay 	  x[27] -= m4*x25 + m10*x26 + m16*x27 + m22*x28 + m28*x29 + m34*x30;
1217*49b5e25fSSatish Balay 	  x[28] -= m5*x25 + m11*x26 + m17*x27 + m23*x28 + m29*x29 + m35*x30;
1218*49b5e25fSSatish Balay 	  x[29] -= m6*x25 + m12*x26 + m18*x27 + m24*x28 + m30*x29 + m36*x30;
1219*49b5e25fSSatish Balay 
1220*49b5e25fSSatish Balay 	  x[30] -= m1*x31 + m7*x32  + m13*x33 + m19*x34 + m25*x35 + m31*x36;
1221*49b5e25fSSatish Balay 	  x[31] -= m2*x31 + m8*x32  + m14*x33 + m20*x34 + m26*x35 + m32*x36;
1222*49b5e25fSSatish Balay 	  x[32] -= m3*x31 + m9*x32  + m15*x33 + m21*x34 + m27*x35 + m33*x36;
1223*49b5e25fSSatish Balay 	  x[33] -= m4*x31 + m10*x32 + m16*x33 + m22*x34 + m28*x35 + m34*x36;
1224*49b5e25fSSatish Balay 	  x[34] -= m5*x31 + m11*x32 + m17*x33 + m23*x34 + m29*x35 + m35*x36;
1225*49b5e25fSSatish Balay 	  x[35] -= m6*x31 + m12*x32 + m18*x33 + m24*x34 + m30*x35 + m36*x36;
1226*49b5e25fSSatish Balay 
1227*49b5e25fSSatish Balay           pv   += 36;
1228*49b5e25fSSatish Balay         }
1229*49b5e25fSSatish Balay         PLogFlops(432*nz+396);
1230*49b5e25fSSatish Balay       }
1231*49b5e25fSSatish Balay       row = *ajtmp++;
1232*49b5e25fSSatish Balay     }
1233*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
1234*49b5e25fSSatish Balay     pv = ba + 36*bi[i];
1235*49b5e25fSSatish Balay     pj = bj + bi[i];
1236*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
1237*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1238*49b5e25fSSatish Balay       x      = rtmp+36*pj[j];
1239*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1240*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];
1241*49b5e25fSSatish Balay       pv[8]  = x[8];  pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11];
1242*49b5e25fSSatish Balay       pv[12] = x[12]; pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1243*49b5e25fSSatish Balay       pv[16] = x[16]; pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19];
1244*49b5e25fSSatish Balay       pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23];
1245*49b5e25fSSatish Balay       pv[24] = x[24]; pv[25] = x[25]; pv[26] = x[26]; pv[27] = x[27];
1246*49b5e25fSSatish Balay       pv[28] = x[28]; pv[29] = x[29]; pv[30] = x[30]; pv[31] = x[31];
1247*49b5e25fSSatish Balay       pv[32] = x[32]; pv[33] = x[33]; pv[34] = x[34]; pv[35] = x[35];
1248*49b5e25fSSatish Balay       pv   += 36;
1249*49b5e25fSSatish Balay     }
1250*49b5e25fSSatish Balay     /* invert diagonal block */
1251*49b5e25fSSatish Balay     w = ba + 36*diag_offset[i];
1252*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_6(w);CHKERRQ(ierr);
1253*49b5e25fSSatish Balay   }
1254*49b5e25fSSatish Balay 
1255*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1256*49b5e25fSSatish Balay   C->factor    = FACTOR_LU;
1257*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
1258*49b5e25fSSatish Balay   PLogFlops(1.3333*216*b->mbs); /* from inverting diagonal blocks */
1259*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1260*49b5e25fSSatish Balay }
1261*49b5e25fSSatish Balay 
1262*49b5e25fSSatish Balay /* ------------------------------------------------------------*/
1263*49b5e25fSSatish Balay /*
1264*49b5e25fSSatish Balay       Version for when blocks are 5 by 5
1265*49b5e25fSSatish Balay */
1266*49b5e25fSSatish Balay #undef __FUNC__
1267*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_5"
1268*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_5(Mat A,Mat *B)
1269*49b5e25fSSatish Balay {
1270*49b5e25fSSatish Balay   Mat         C = *B;
1271*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1272*49b5e25fSSatish Balay   IS          isrow = b->row,isicol = b->icol;
1273*49b5e25fSSatish Balay   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1274*49b5e25fSSatish Balay   int         *ajtmpold,*ajtmp,nz,row;
1275*49b5e25fSSatish Balay   int         *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
1276*49b5e25fSSatish Balay   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1277*49b5e25fSSatish Balay   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1278*49b5e25fSSatish Balay   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
1279*49b5e25fSSatish Balay   MatScalar   x17,x18,x19,x20,x21,x22,x23,x24,x25,p10,p11,p12,p13,p14;
1280*49b5e25fSSatish Balay   MatScalar   p15,p16,p17,p18,p19,p20,p21,p22,p23,p24,p25,m10,m11,m12;
1281*49b5e25fSSatish Balay   MatScalar   m13,m14,m15,m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
1282*49b5e25fSSatish Balay   MatScalar   *ba = b->a,*aa = a->a;
1283*49b5e25fSSatish Balay 
1284*49b5e25fSSatish Balay   PetscFunctionBegin;
1285*49b5e25fSSatish Balay   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1286*49b5e25fSSatish Balay   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1287*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1288*49b5e25fSSatish Balay 
1289*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
1290*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
1291*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
1292*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
1293*49b5e25fSSatish Balay       x = rtmp+25*ajtmp[j];
1294*49b5e25fSSatish Balay       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
1295*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = x[16] = x[17] = 0.0;
1296*49b5e25fSSatish Balay       x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
1297*49b5e25fSSatish Balay     }
1298*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
1299*49b5e25fSSatish Balay     idx      = r[i];
1300*49b5e25fSSatish Balay     nz       = ai[idx+1] - ai[idx];
1301*49b5e25fSSatish Balay     ajtmpold = aj + ai[idx];
1302*49b5e25fSSatish Balay     v        = aa + 25*ai[idx];
1303*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1304*49b5e25fSSatish Balay       x    = rtmp+25*ic[ajtmpold[j]];
1305*49b5e25fSSatish Balay       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
1306*49b5e25fSSatish Balay       x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
1307*49b5e25fSSatish Balay       x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1308*49b5e25fSSatish Balay       x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17];
1309*49b5e25fSSatish Balay       x[18] = v[18]; x[19] = v[19]; x[20] = v[20]; x[21] = v[21];
1310*49b5e25fSSatish Balay       x[22] = v[22]; x[23] = v[23]; x[24] = v[24];
1311*49b5e25fSSatish Balay       v    += 25;
1312*49b5e25fSSatish Balay     }
1313*49b5e25fSSatish Balay     row = *ajtmp++;
1314*49b5e25fSSatish Balay     while (row < i) {
1315*49b5e25fSSatish Balay       pc = rtmp + 25*row;
1316*49b5e25fSSatish Balay       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
1317*49b5e25fSSatish Balay       p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
1318*49b5e25fSSatish Balay       p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1319*49b5e25fSSatish Balay       p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17]; p19 = pc[18];
1320*49b5e25fSSatish Balay       p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22]; p24 = pc[23];
1321*49b5e25fSSatish Balay       p25 = pc[24];
1322*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1323*49b5e25fSSatish Balay           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1324*49b5e25fSSatish Balay           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1325*49b5e25fSSatish Balay           || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 ||
1326*49b5e25fSSatish Balay           p20 != 0.0 || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 ||
1327*49b5e25fSSatish Balay           p24 != 0.0 || p25 != 0.0) {
1328*49b5e25fSSatish Balay         pv = ba + 25*diag_offset[row];
1329*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
1330*49b5e25fSSatish Balay         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
1331*49b5e25fSSatish Balay         x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
1332*49b5e25fSSatish Balay         x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1333*49b5e25fSSatish Balay         x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17];
1334*49b5e25fSSatish Balay         x19 = pv[18]; x20 = pv[19]; x21 = pv[20]; x22 = pv[21];
1335*49b5e25fSSatish Balay         x23 = pv[22]; x24 = pv[23]; x25 = pv[24];
1336*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p6*x2  + p11*x3 + p16*x4 + p21*x5;
1337*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p7*x2  + p12*x3 + p17*x4 + p22*x5;
1338*49b5e25fSSatish Balay         pc[2] = m3 = p3*x1 + p8*x2  + p13*x3 + p18*x4 + p23*x5;
1339*49b5e25fSSatish Balay         pc[3] = m4 = p4*x1 + p9*x2  + p14*x3 + p19*x4 + p24*x5;
1340*49b5e25fSSatish Balay         pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5;
1341*49b5e25fSSatish Balay 
1342*49b5e25fSSatish Balay         pc[5] = m6 = p1*x6 + p6*x7  + p11*x8 + p16*x9 + p21*x10;
1343*49b5e25fSSatish Balay         pc[6] = m7 = p2*x6 + p7*x7  + p12*x8 + p17*x9 + p22*x10;
1344*49b5e25fSSatish Balay         pc[7] = m8 = p3*x6 + p8*x7  + p13*x8 + p18*x9 + p23*x10;
1345*49b5e25fSSatish Balay         pc[8] = m9 = p4*x6 + p9*x7  + p14*x8 + p19*x9 + p24*x10;
1346*49b5e25fSSatish Balay         pc[9] = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10;
1347*49b5e25fSSatish Balay 
1348*49b5e25fSSatish Balay         pc[10] = m11 = p1*x11 + p6*x12  + p11*x13 + p16*x14 + p21*x15;
1349*49b5e25fSSatish Balay         pc[11] = m12 = p2*x11 + p7*x12  + p12*x13 + p17*x14 + p22*x15;
1350*49b5e25fSSatish Balay         pc[12] = m13 = p3*x11 + p8*x12  + p13*x13 + p18*x14 + p23*x15;
1351*49b5e25fSSatish Balay         pc[13] = m14 = p4*x11 + p9*x12  + p14*x13 + p19*x14 + p24*x15;
1352*49b5e25fSSatish Balay         pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15;
1353*49b5e25fSSatish Balay 
1354*49b5e25fSSatish Balay         pc[15] = m16 = p1*x16 + p6*x17  + p11*x18 + p16*x19 + p21*x20;
1355*49b5e25fSSatish Balay         pc[16] = m17 = p2*x16 + p7*x17  + p12*x18 + p17*x19 + p22*x20;
1356*49b5e25fSSatish Balay         pc[17] = m18 = p3*x16 + p8*x17  + p13*x18 + p18*x19 + p23*x20;
1357*49b5e25fSSatish Balay         pc[18] = m19 = p4*x16 + p9*x17  + p14*x18 + p19*x19 + p24*x20;
1358*49b5e25fSSatish Balay         pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20;
1359*49b5e25fSSatish Balay 
1360*49b5e25fSSatish Balay         pc[20] = m21 = p1*x21 + p6*x22  + p11*x23 + p16*x24 + p21*x25;
1361*49b5e25fSSatish Balay         pc[21] = m22 = p2*x21 + p7*x22  + p12*x23 + p17*x24 + p22*x25;
1362*49b5e25fSSatish Balay         pc[22] = m23 = p3*x21 + p8*x22  + p13*x23 + p18*x24 + p23*x25;
1363*49b5e25fSSatish Balay         pc[23] = m24 = p4*x21 + p9*x22  + p14*x23 + p19*x24 + p24*x25;
1364*49b5e25fSSatish Balay         pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25;
1365*49b5e25fSSatish Balay 
1366*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
1367*49b5e25fSSatish Balay         pv += 25;
1368*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
1369*49b5e25fSSatish Balay           x1   = pv[0];  x2 = pv[1];   x3  = pv[2];  x4  = pv[3];
1370*49b5e25fSSatish Balay           x5   = pv[4];  x6 = pv[5];   x7  = pv[6];  x8  = pv[7]; x9 = pv[8];
1371*49b5e25fSSatish Balay           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1372*49b5e25fSSatish Balay           x14  = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16];
1373*49b5e25fSSatish Balay           x18  = pv[17]; x19 = pv[18]; x20 = pv[19]; x21 = pv[20];
1374*49b5e25fSSatish Balay           x22  = pv[21]; x23 = pv[22]; x24 = pv[23]; x25 = pv[24];
1375*49b5e25fSSatish Balay           x    = rtmp + 25*pj[j];
1376*49b5e25fSSatish Balay           x[0] -= m1*x1 + m6*x2  + m11*x3 + m16*x4 + m21*x5;
1377*49b5e25fSSatish Balay           x[1] -= m2*x1 + m7*x2  + m12*x3 + m17*x4 + m22*x5;
1378*49b5e25fSSatish Balay           x[2] -= m3*x1 + m8*x2  + m13*x3 + m18*x4 + m23*x5;
1379*49b5e25fSSatish Balay           x[3] -= m4*x1 + m9*x2  + m14*x3 + m19*x4 + m24*x5;
1380*49b5e25fSSatish Balay           x[4] -= m5*x1 + m10*x2 + m15*x3 + m20*x4 + m25*x5;
1381*49b5e25fSSatish Balay 
1382*49b5e25fSSatish Balay           x[5] -= m1*x6 + m6*x7  + m11*x8 + m16*x9 + m21*x10;
1383*49b5e25fSSatish Balay           x[6] -= m2*x6 + m7*x7  + m12*x8 + m17*x9 + m22*x10;
1384*49b5e25fSSatish Balay           x[7] -= m3*x6 + m8*x7  + m13*x8 + m18*x9 + m23*x10;
1385*49b5e25fSSatish Balay           x[8] -= m4*x6 + m9*x7  + m14*x8 + m19*x9 + m24*x10;
1386*49b5e25fSSatish Balay           x[9] -= m5*x6 + m10*x7 + m15*x8 + m20*x9 + m25*x10;
1387*49b5e25fSSatish Balay 
1388*49b5e25fSSatish Balay           x[10] -= m1*x11 + m6*x12  + m11*x13 + m16*x14 + m21*x15;
1389*49b5e25fSSatish Balay           x[11] -= m2*x11 + m7*x12  + m12*x13 + m17*x14 + m22*x15;
1390*49b5e25fSSatish Balay           x[12] -= m3*x11 + m8*x12  + m13*x13 + m18*x14 + m23*x15;
1391*49b5e25fSSatish Balay           x[13] -= m4*x11 + m9*x12  + m14*x13 + m19*x14 + m24*x15;
1392*49b5e25fSSatish Balay           x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15;
1393*49b5e25fSSatish Balay 
1394*49b5e25fSSatish Balay           x[15] -= m1*x16 + m6*x17  + m11*x18 + m16*x19 + m21*x20;
1395*49b5e25fSSatish Balay           x[16] -= m2*x16 + m7*x17  + m12*x18 + m17*x19 + m22*x20;
1396*49b5e25fSSatish Balay           x[17] -= m3*x16 + m8*x17  + m13*x18 + m18*x19 + m23*x20;
1397*49b5e25fSSatish Balay           x[18] -= m4*x16 + m9*x17  + m14*x18 + m19*x19 + m24*x20;
1398*49b5e25fSSatish Balay           x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20;
1399*49b5e25fSSatish Balay 
1400*49b5e25fSSatish Balay           x[20] -= m1*x21 + m6*x22  + m11*x23 + m16*x24 + m21*x25;
1401*49b5e25fSSatish Balay           x[21] -= m2*x21 + m7*x22  + m12*x23 + m17*x24 + m22*x25;
1402*49b5e25fSSatish Balay           x[22] -= m3*x21 + m8*x22  + m13*x23 + m18*x24 + m23*x25;
1403*49b5e25fSSatish Balay           x[23] -= m4*x21 + m9*x22  + m14*x23 + m19*x24 + m24*x25;
1404*49b5e25fSSatish Balay           x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25;
1405*49b5e25fSSatish Balay 
1406*49b5e25fSSatish Balay           pv   += 25;
1407*49b5e25fSSatish Balay         }
1408*49b5e25fSSatish Balay         PLogFlops(250*nz+225);
1409*49b5e25fSSatish Balay       }
1410*49b5e25fSSatish Balay       row = *ajtmp++;
1411*49b5e25fSSatish Balay     }
1412*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
1413*49b5e25fSSatish Balay     pv = ba + 25*bi[i];
1414*49b5e25fSSatish Balay     pj = bj + bi[i];
1415*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
1416*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1417*49b5e25fSSatish Balay       x     = rtmp+25*pj[j];
1418*49b5e25fSSatish Balay       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
1419*49b5e25fSSatish Balay       pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
1420*49b5e25fSSatish Balay       pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1421*49b5e25fSSatish Balay       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16];
1422*49b5e25fSSatish Balay       pv[17] = x[17]; pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20];
1423*49b5e25fSSatish Balay       pv[21] = x[21]; pv[22] = x[22]; pv[23] = x[23]; pv[24] = x[24];
1424*49b5e25fSSatish Balay       pv   += 25;
1425*49b5e25fSSatish Balay     }
1426*49b5e25fSSatish Balay     /* invert diagonal block */
1427*49b5e25fSSatish Balay     w = ba + 25*diag_offset[i];
1428*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr);
1429*49b5e25fSSatish Balay   }
1430*49b5e25fSSatish Balay 
1431*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1432*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1433*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1434*49b5e25fSSatish Balay   C->factor = FACTOR_LU;
1435*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
1436*49b5e25fSSatish Balay   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1437*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1438*49b5e25fSSatish Balay }
1439*49b5e25fSSatish Balay /*
1440*49b5e25fSSatish Balay       Version for when blocks are 5 by 5 Using natural ordering
1441*49b5e25fSSatish Balay */
1442*49b5e25fSSatish Balay #undef __FUNC__
1443*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_5_NaturalOrdering"
1444*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_5_NaturalOrdering(Mat A,Mat *B)
1445*49b5e25fSSatish Balay {
1446*49b5e25fSSatish Balay   Mat         C = *B;
1447*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1448*49b5e25fSSatish Balay   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1449*49b5e25fSSatish Balay   int         *ajtmpold,*ajtmp,nz,row;
1450*49b5e25fSSatish Balay   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
1451*49b5e25fSSatish Balay   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1452*49b5e25fSSatish Balay   MatScalar   x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
1453*49b5e25fSSatish Balay   MatScalar   x16,x17,x18,x19,x20,x21,x22,x23,x24,x25;
1454*49b5e25fSSatish Balay   MatScalar   p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15;
1455*49b5e25fSSatish Balay   MatScalar   p16,p17,p18,p19,p20,p21,p22,p23,p24,p25;
1456*49b5e25fSSatish Balay   MatScalar   m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,m15;
1457*49b5e25fSSatish Balay   MatScalar   m16,m17,m18,m19,m20,m21,m22,m23,m24,m25;
1458*49b5e25fSSatish Balay   MatScalar   *ba = b->a,*aa = a->a;
1459*49b5e25fSSatish Balay 
1460*49b5e25fSSatish Balay   PetscFunctionBegin;
1461*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(25*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1462*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
1463*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
1464*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
1465*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
1466*49b5e25fSSatish Balay       x = rtmp+25*ajtmp[j];
1467*49b5e25fSSatish Balay       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
1468*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
1469*49b5e25fSSatish Balay       x[16] = x[17] = x[18] = x[19] = x[20] = x[21] = x[22] = x[23] = x[24] = 0.0;
1470*49b5e25fSSatish Balay     }
1471*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
1472*49b5e25fSSatish Balay     nz       = ai[i+1] - ai[i];
1473*49b5e25fSSatish Balay     ajtmpold = aj + ai[i];
1474*49b5e25fSSatish Balay     v        = aa + 25*ai[i];
1475*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1476*49b5e25fSSatish Balay       x    = rtmp+25*ajtmpold[j];
1477*49b5e25fSSatish Balay       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
1478*49b5e25fSSatish Balay       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
1479*49b5e25fSSatish Balay       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1480*49b5e25fSSatish Balay       x[14] = v[14]; x[15] = v[15]; x[16] = v[16]; x[17] = v[17]; x[18] = v[18];
1481*49b5e25fSSatish Balay       x[19] = v[19]; x[20] = v[20]; x[21] = v[21]; x[22] = v[22]; x[23] = v[23];
1482*49b5e25fSSatish Balay       x[24] = v[24];
1483*49b5e25fSSatish Balay       v    += 25;
1484*49b5e25fSSatish Balay     }
1485*49b5e25fSSatish Balay     row = *ajtmp++;
1486*49b5e25fSSatish Balay     while (row < i) {
1487*49b5e25fSSatish Balay       pc  = rtmp + 25*row;
1488*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1489*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
1490*49b5e25fSSatish Balay       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1491*49b5e25fSSatish Balay       p15 = pc[14]; p16 = pc[15]; p17 = pc[16]; p18 = pc[17];
1492*49b5e25fSSatish Balay       p19 = pc[18]; p20 = pc[19]; p21 = pc[20]; p22 = pc[21]; p23 = pc[22];
1493*49b5e25fSSatish Balay       p24 = pc[23]; p25 = pc[24];
1494*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1495*49b5e25fSSatish Balay           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1496*49b5e25fSSatish Balay           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1497*49b5e25fSSatish Balay           || p16 != 0.0 || p17 != 0.0 || p18 != 0.0 || p19 != 0.0 || p20 != 0.0
1498*49b5e25fSSatish Balay           || p21 != 0.0 || p22 != 0.0 || p23 != 0.0 || p24 != 0.0 || p25 != 0.0) {
1499*49b5e25fSSatish Balay         pv = ba + 25*diag_offset[row];
1500*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
1501*49b5e25fSSatish Balay         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1502*49b5e25fSSatish Balay         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
1503*49b5e25fSSatish Balay         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1504*49b5e25fSSatish Balay         x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17]; x19 = pv[18];
1505*49b5e25fSSatish Balay         x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22]; x24 = pv[23];
1506*49b5e25fSSatish Balay         x25 = pv[24];
1507*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p6*x2  + p11*x3 + p16*x4 + p21*x5;
1508*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p7*x2  + p12*x3 + p17*x4 + p22*x5;
1509*49b5e25fSSatish Balay         pc[2] = m3 = p3*x1 + p8*x2  + p13*x3 + p18*x4 + p23*x5;
1510*49b5e25fSSatish Balay         pc[3] = m4 = p4*x1 + p9*x2  + p14*x3 + p19*x4 + p24*x5;
1511*49b5e25fSSatish Balay         pc[4] = m5 = p5*x1 + p10*x2 + p15*x3 + p20*x4 + p25*x5;
1512*49b5e25fSSatish Balay 
1513*49b5e25fSSatish Balay         pc[5]  = m6  = p1*x6 + p6*x7  + p11*x8 + p16*x9 + p21*x10;
1514*49b5e25fSSatish Balay         pc[6]  = m7  = p2*x6 + p7*x7  + p12*x8 + p17*x9 + p22*x10;
1515*49b5e25fSSatish Balay         pc[7]  = m8  = p3*x6 + p8*x7  + p13*x8 + p18*x9 + p23*x10;
1516*49b5e25fSSatish Balay         pc[8]  = m9  = p4*x6 + p9*x7  + p14*x8 + p19*x9 + p24*x10;
1517*49b5e25fSSatish Balay         pc[9]  = m10 = p5*x6 + p10*x7 + p15*x8 + p20*x9 + p25*x10;
1518*49b5e25fSSatish Balay 
1519*49b5e25fSSatish Balay         pc[10] = m11 = p1*x11 + p6*x12  + p11*x13 + p16*x14 + p21*x15;
1520*49b5e25fSSatish Balay         pc[11] = m12 = p2*x11 + p7*x12  + p12*x13 + p17*x14 + p22*x15;
1521*49b5e25fSSatish Balay         pc[12] = m13 = p3*x11 + p8*x12  + p13*x13 + p18*x14 + p23*x15;
1522*49b5e25fSSatish Balay         pc[13] = m14 = p4*x11 + p9*x12  + p14*x13 + p19*x14 + p24*x15;
1523*49b5e25fSSatish Balay         pc[14] = m15 = p5*x11 + p10*x12 + p15*x13 + p20*x14 + p25*x15;
1524*49b5e25fSSatish Balay 
1525*49b5e25fSSatish Balay         pc[15] = m16 = p1*x16 + p6*x17  + p11*x18 + p16*x19 + p21*x20;
1526*49b5e25fSSatish Balay         pc[16] = m17 = p2*x16 + p7*x17  + p12*x18 + p17*x19 + p22*x20;
1527*49b5e25fSSatish Balay         pc[17] = m18 = p3*x16 + p8*x17  + p13*x18 + p18*x19 + p23*x20;
1528*49b5e25fSSatish Balay         pc[18] = m19 = p4*x16 + p9*x17  + p14*x18 + p19*x19 + p24*x20;
1529*49b5e25fSSatish Balay         pc[19] = m20 = p5*x16 + p10*x17 + p15*x18 + p20*x19 + p25*x20;
1530*49b5e25fSSatish Balay 
1531*49b5e25fSSatish Balay         pc[20] = m21 = p1*x21 + p6*x22  + p11*x23 + p16*x24 + p21*x25;
1532*49b5e25fSSatish Balay         pc[21] = m22 = p2*x21 + p7*x22  + p12*x23 + p17*x24 + p22*x25;
1533*49b5e25fSSatish Balay         pc[22] = m23 = p3*x21 + p8*x22  + p13*x23 + p18*x24 + p23*x25;
1534*49b5e25fSSatish Balay         pc[23] = m24 = p4*x21 + p9*x22  + p14*x23 + p19*x24 + p24*x25;
1535*49b5e25fSSatish Balay         pc[24] = m25 = p5*x21 + p10*x22 + p15*x23 + p20*x24 + p25*x25;
1536*49b5e25fSSatish Balay 
1537*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
1538*49b5e25fSSatish Balay         pv += 25;
1539*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
1540*49b5e25fSSatish Balay           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
1541*49b5e25fSSatish Balay           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
1542*49b5e25fSSatish Balay           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1543*49b5e25fSSatish Balay           x14  = pv[13]; x15 = pv[14]; x16 = pv[15]; x17 = pv[16]; x18 = pv[17];
1544*49b5e25fSSatish Balay           x19 = pv[18];  x20 = pv[19]; x21 = pv[20]; x22 = pv[21]; x23 = pv[22];
1545*49b5e25fSSatish Balay           x24 = pv[23];  x25 = pv[24];
1546*49b5e25fSSatish Balay           x    = rtmp + 25*pj[j];
1547*49b5e25fSSatish Balay           x[0] -= m1*x1 + m6*x2   + m11*x3  + m16*x4 + m21*x5;
1548*49b5e25fSSatish Balay           x[1] -= m2*x1 + m7*x2   + m12*x3  + m17*x4 + m22*x5;
1549*49b5e25fSSatish Balay           x[2] -= m3*x1 + m8*x2   + m13*x3  + m18*x4 + m23*x5;
1550*49b5e25fSSatish Balay           x[3] -= m4*x1 + m9*x2   + m14*x3  + m19*x4 + m24*x5;
1551*49b5e25fSSatish Balay           x[4] -= m5*x1 + m10*x2  + m15*x3  + m20*x4 + m25*x5;
1552*49b5e25fSSatish Balay 
1553*49b5e25fSSatish Balay           x[5] -= m1*x6 + m6*x7   + m11*x8  + m16*x9 + m21*x10;
1554*49b5e25fSSatish Balay           x[6] -= m2*x6 + m7*x7   + m12*x8  + m17*x9 + m22*x10;
1555*49b5e25fSSatish Balay           x[7] -= m3*x6 + m8*x7   + m13*x8  + m18*x9 + m23*x10;
1556*49b5e25fSSatish Balay           x[8] -= m4*x6 + m9*x7   + m14*x8  + m19*x9 + m24*x10;
1557*49b5e25fSSatish Balay           x[9] -= m5*x6 + m10*x7  + m15*x8  + m20*x9 + m25*x10;
1558*49b5e25fSSatish Balay 
1559*49b5e25fSSatish Balay           x[10] -= m1*x11 + m6*x12  + m11*x13 + m16*x14 + m21*x15;
1560*49b5e25fSSatish Balay           x[11] -= m2*x11 + m7*x12  + m12*x13 + m17*x14 + m22*x15;
1561*49b5e25fSSatish Balay           x[12] -= m3*x11 + m8*x12  + m13*x13 + m18*x14 + m23*x15;
1562*49b5e25fSSatish Balay           x[13] -= m4*x11 + m9*x12  + m14*x13 + m19*x14 + m24*x15;
1563*49b5e25fSSatish Balay           x[14] -= m5*x11 + m10*x12 + m15*x13 + m20*x14 + m25*x15;
1564*49b5e25fSSatish Balay 
1565*49b5e25fSSatish Balay           x[15] -= m1*x16 + m6*x17  + m11*x18 + m16*x19 + m21*x20;
1566*49b5e25fSSatish Balay           x[16] -= m2*x16 + m7*x17  + m12*x18 + m17*x19 + m22*x20;
1567*49b5e25fSSatish Balay           x[17] -= m3*x16 + m8*x17  + m13*x18 + m18*x19 + m23*x20;
1568*49b5e25fSSatish Balay           x[18] -= m4*x16 + m9*x17  + m14*x18 + m19*x19 + m24*x20;
1569*49b5e25fSSatish Balay           x[19] -= m5*x16 + m10*x17 + m15*x18 + m20*x19 + m25*x20;
1570*49b5e25fSSatish Balay 
1571*49b5e25fSSatish Balay           x[20] -= m1*x21 + m6*x22  + m11*x23 + m16*x24 + m21*x25;
1572*49b5e25fSSatish Balay           x[21] -= m2*x21 + m7*x22  + m12*x23 + m17*x24 + m22*x25;
1573*49b5e25fSSatish Balay           x[22] -= m3*x21 + m8*x22  + m13*x23 + m18*x24 + m23*x25;
1574*49b5e25fSSatish Balay           x[23] -= m4*x21 + m9*x22  + m14*x23 + m19*x24 + m24*x25;
1575*49b5e25fSSatish Balay           x[24] -= m5*x21 + m10*x22 + m15*x23 + m20*x24 + m25*x25;
1576*49b5e25fSSatish Balay           pv   += 25;
1577*49b5e25fSSatish Balay         }
1578*49b5e25fSSatish Balay         PLogFlops(250*nz+225);
1579*49b5e25fSSatish Balay       }
1580*49b5e25fSSatish Balay       row = *ajtmp++;
1581*49b5e25fSSatish Balay     }
1582*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
1583*49b5e25fSSatish Balay     pv = ba + 25*bi[i];
1584*49b5e25fSSatish Balay     pj = bj + bi[i];
1585*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
1586*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1587*49b5e25fSSatish Balay       x      = rtmp+25*pj[j];
1588*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1589*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
1590*49b5e25fSSatish Balay       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1591*49b5e25fSSatish Balay       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; pv[16] = x[16]; pv[17] = x[17];
1592*49b5e25fSSatish Balay       pv[18] = x[18]; pv[19] = x[19]; pv[20] = x[20]; pv[21] = x[21]; pv[22] = x[22];
1593*49b5e25fSSatish Balay       pv[23] = x[23]; pv[24] = x[24];
1594*49b5e25fSSatish Balay       pv   += 25;
1595*49b5e25fSSatish Balay     }
1596*49b5e25fSSatish Balay     /* invert diagonal block */
1597*49b5e25fSSatish Balay     w = ba + 25*diag_offset[i];
1598*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_5(w);CHKERRQ(ierr);
1599*49b5e25fSSatish Balay   }
1600*49b5e25fSSatish Balay 
1601*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1602*49b5e25fSSatish Balay   C->factor    = FACTOR_LU;
1603*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
1604*49b5e25fSSatish Balay   PLogFlops(1.3333*125*b->mbs); /* from inverting diagonal blocks */
1605*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1606*49b5e25fSSatish Balay }
1607*49b5e25fSSatish Balay 
1608*49b5e25fSSatish Balay /* ------------------------------------------------------------*/
1609*49b5e25fSSatish Balay /*
1610*49b5e25fSSatish Balay       Version for when blocks are 4 by 4
1611*49b5e25fSSatish Balay */
1612*49b5e25fSSatish Balay #undef __FUNC__
1613*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_4"
1614*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_4(Mat A,Mat *B)
1615*49b5e25fSSatish Balay {
1616*49b5e25fSSatish Balay   Mat         C = *B;
1617*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1618*49b5e25fSSatish Balay   IS          isrow = b->row,isicol = b->icol;
1619*49b5e25fSSatish Balay   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1620*49b5e25fSSatish Balay   int         *ajtmpold,*ajtmp,nz,row;
1621*49b5e25fSSatish Balay   int         *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
1622*49b5e25fSSatish Balay   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1623*49b5e25fSSatish Balay   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1624*49b5e25fSSatish Balay   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
1625*49b5e25fSSatish Balay   MatScalar   p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
1626*49b5e25fSSatish Balay   MatScalar   m13,m14,m15,m16;
1627*49b5e25fSSatish Balay   MatScalar   *ba = b->a,*aa = a->a;
1628*49b5e25fSSatish Balay 
1629*49b5e25fSSatish Balay   PetscFunctionBegin;
1630*49b5e25fSSatish Balay   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1631*49b5e25fSSatish Balay   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1632*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1633*49b5e25fSSatish Balay 
1634*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
1635*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
1636*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
1637*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
1638*49b5e25fSSatish Balay       x = rtmp+16*ajtmp[j];
1639*49b5e25fSSatish Balay       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
1640*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
1641*49b5e25fSSatish Balay     }
1642*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
1643*49b5e25fSSatish Balay     idx      = r[i];
1644*49b5e25fSSatish Balay     nz       = ai[idx+1] - ai[idx];
1645*49b5e25fSSatish Balay     ajtmpold = aj + ai[idx];
1646*49b5e25fSSatish Balay     v        = aa + 16*ai[idx];
1647*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1648*49b5e25fSSatish Balay       x    = rtmp+16*ic[ajtmpold[j]];
1649*49b5e25fSSatish Balay       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
1650*49b5e25fSSatish Balay       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
1651*49b5e25fSSatish Balay       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1652*49b5e25fSSatish Balay       x[14] = v[14]; x[15] = v[15];
1653*49b5e25fSSatish Balay       v    += 16;
1654*49b5e25fSSatish Balay     }
1655*49b5e25fSSatish Balay     row = *ajtmp++;
1656*49b5e25fSSatish Balay     while (row < i) {
1657*49b5e25fSSatish Balay       pc  = rtmp + 16*row;
1658*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1659*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
1660*49b5e25fSSatish Balay       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1661*49b5e25fSSatish Balay       p15 = pc[14]; p16 = pc[15];
1662*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1663*49b5e25fSSatish Balay           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1664*49b5e25fSSatish Balay           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1665*49b5e25fSSatish Balay           || p16 != 0.0) {
1666*49b5e25fSSatish Balay         pv = ba + 16*diag_offset[row];
1667*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
1668*49b5e25fSSatish Balay         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1669*49b5e25fSSatish Balay         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
1670*49b5e25fSSatish Balay         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1671*49b5e25fSSatish Balay         x15 = pv[14]; x16 = pv[15];
1672*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
1673*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
1674*49b5e25fSSatish Balay         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
1675*49b5e25fSSatish Balay         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
1676*49b5e25fSSatish Balay 
1677*49b5e25fSSatish Balay         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
1678*49b5e25fSSatish Balay         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
1679*49b5e25fSSatish Balay         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
1680*49b5e25fSSatish Balay         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
1681*49b5e25fSSatish Balay 
1682*49b5e25fSSatish Balay         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
1683*49b5e25fSSatish Balay         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
1684*49b5e25fSSatish Balay         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
1685*49b5e25fSSatish Balay         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
1686*49b5e25fSSatish Balay 
1687*49b5e25fSSatish Balay         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
1688*49b5e25fSSatish Balay         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
1689*49b5e25fSSatish Balay         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
1690*49b5e25fSSatish Balay         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
1691*49b5e25fSSatish Balay 
1692*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
1693*49b5e25fSSatish Balay         pv += 16;
1694*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
1695*49b5e25fSSatish Balay           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
1696*49b5e25fSSatish Balay           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
1697*49b5e25fSSatish Balay           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1698*49b5e25fSSatish Balay           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
1699*49b5e25fSSatish Balay           x    = rtmp + 16*pj[j];
1700*49b5e25fSSatish Balay           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
1701*49b5e25fSSatish Balay           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
1702*49b5e25fSSatish Balay           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
1703*49b5e25fSSatish Balay           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
1704*49b5e25fSSatish Balay 
1705*49b5e25fSSatish Balay           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
1706*49b5e25fSSatish Balay           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
1707*49b5e25fSSatish Balay           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
1708*49b5e25fSSatish Balay           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
1709*49b5e25fSSatish Balay 
1710*49b5e25fSSatish Balay           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
1711*49b5e25fSSatish Balay           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
1712*49b5e25fSSatish Balay           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
1713*49b5e25fSSatish Balay           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
1714*49b5e25fSSatish Balay 
1715*49b5e25fSSatish Balay           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
1716*49b5e25fSSatish Balay           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
1717*49b5e25fSSatish Balay           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
1718*49b5e25fSSatish Balay           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
1719*49b5e25fSSatish Balay 
1720*49b5e25fSSatish Balay           pv   += 16;
1721*49b5e25fSSatish Balay         }
1722*49b5e25fSSatish Balay         PLogFlops(128*nz+112);
1723*49b5e25fSSatish Balay       }
1724*49b5e25fSSatish Balay       row = *ajtmp++;
1725*49b5e25fSSatish Balay     }
1726*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
1727*49b5e25fSSatish Balay     pv = ba + 16*bi[i];
1728*49b5e25fSSatish Balay     pj = bj + bi[i];
1729*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
1730*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1731*49b5e25fSSatish Balay       x      = rtmp+16*pj[j];
1732*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1733*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
1734*49b5e25fSSatish Balay       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1735*49b5e25fSSatish Balay       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1736*49b5e25fSSatish Balay       pv   += 16;
1737*49b5e25fSSatish Balay     }
1738*49b5e25fSSatish Balay     /* invert diagonal block */
1739*49b5e25fSSatish Balay     w = ba + 16*diag_offset[i];
1740*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr);
1741*49b5e25fSSatish Balay   }
1742*49b5e25fSSatish Balay 
1743*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1744*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1745*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1746*49b5e25fSSatish Balay   C->factor = FACTOR_LU;
1747*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
1748*49b5e25fSSatish Balay   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
1749*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1750*49b5e25fSSatish Balay }
1751*49b5e25fSSatish Balay /*
1752*49b5e25fSSatish Balay       Version for when blocks are 4 by 4 Using natural ordering

      Numeric factorization of A into the factor matrix *B for 4 by 4 blocks
      (16 consecutive scalars per block).  Row i of A is read directly with
      index i and its block-column indices are used unchanged; no row or
      column index sets are consulted in this routine.

      Input:
        A - matrix to factor; A->data is accessed as Mat_SeqBAIJ and its
            mbs, i, j and a fields are read
        B - address of the factor matrix C; C->data is accessed as
            Mat_SeqBAIJ, its i, j and diag arrays are read and its value
            array a is overwritten with the computed factor

      On completion C->factor is set to FACTOR_LU, C->assembled is set to
      PETSC_TRUE and 0 is returned.  Failures reported by PetscMalloc,
      PetscFree or Kernel_A_gets_inverse_A_4 are handed to CHKPTRQ/CHKERRQ.

      The diagonal block of every finished row is passed to
      Kernel_A_gets_inverse_A_4 (defined elsewhere; presumably an in-place
      inversion - confirm), and later rows multiply by that stored block.

      The unrolled products below equal M = P times X when each run of 16
      scalars is read as a column-major 4 by 4 block (entry (r,c) at r+4c).
      NOTE(review): the column-major reading is inferred from the index
      pattern of the formulas only - confirm against the storage convention.
1753*49b5e25fSSatish Balay */
1754*49b5e25fSSatish Balay #undef __FUNC__
1755*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_4_NaturalOrdering"
1756*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_4_NaturalOrdering(Mat A,Mat *B)
1757*49b5e25fSSatish Balay {
1758*49b5e25fSSatish Balay   Mat         C = *B;
1759*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1760*49b5e25fSSatish Balay   int         ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1761*49b5e25fSSatish Balay   int         *ajtmpold,*ajtmp,nz,row;
1762*49b5e25fSSatish Balay   int         *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
1763*49b5e25fSSatish Balay   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1764*49b5e25fSSatish Balay   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1765*49b5e25fSSatish Balay   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
1766*49b5e25fSSatish Balay   MatScalar   p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
1767*49b5e25fSSatish Balay   MatScalar   m13,m14,m15,m16;
1768*49b5e25fSSatish Balay   MatScalar   *ba = b->a,*aa = a->a;
1769*49b5e25fSSatish Balay 
1770*49b5e25fSSatish Balay   PetscFunctionBegin;
                             /* rtmp: work row with room for n+1 blocks of 16 scalars, addressed by block column */
1771*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(16*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1772*49b5e25fSSatish Balay 
1773*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
                               /* zero the work-row blocks at the block columns listed for row i of the factor */
1774*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
1775*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
1776*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
1777*49b5e25fSSatish Balay       x = rtmp+16*ajtmp[j];
1778*49b5e25fSSatish Balay       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
1779*49b5e25fSSatish Balay       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
1780*49b5e25fSSatish Balay     }
1781*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
                               /* the blocks of row i of A are copied into the work row at their own block columns */
1782*49b5e25fSSatish Balay     nz       = ai[i+1] - ai[i];
1783*49b5e25fSSatish Balay     ajtmpold = aj + ai[i];
1784*49b5e25fSSatish Balay     v        = aa + 16*ai[i];
1785*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1786*49b5e25fSSatish Balay       x    = rtmp+16*ajtmpold[j];
1787*49b5e25fSSatish Balay       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
1788*49b5e25fSSatish Balay       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
1789*49b5e25fSSatish Balay       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
1790*49b5e25fSSatish Balay       x[14] = v[14]; x[15] = v[15];
1791*49b5e25fSSatish Balay       v    += 16;
1792*49b5e25fSSatish Balay     }
                               /* walk the block columns listed for row i of the factor; each index below i
                                  names a row finished in an earlier iteration and is eliminated here.  The
                                  loop has no separate bound: it ends at the first listed index that is not
                                  below i (presumably the diagonal entry - confirm) */
1793*49b5e25fSSatish Balay     row = *ajtmp++;
1794*49b5e25fSSatish Balay     while (row < i) {
                                 /* pc: work-row block in column row; copied to p1..p16 before it is overwritten */
1795*49b5e25fSSatish Balay       pc  = rtmp + 16*row;
1796*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
1797*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
1798*49b5e25fSSatish Balay       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
1799*49b5e25fSSatish Balay       p15 = pc[14]; p16 = pc[15];
                                 /* nothing to eliminate when all 16 entries of the block are zero */
1800*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1801*49b5e25fSSatish Balay           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
1802*49b5e25fSSatish Balay           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
1803*49b5e25fSSatish Balay           || p16 != 0.0) {
                                   /* pv: factor block at diag_offset[row]; it was handed to
                                      Kernel_A_gets_inverse_A_4 when that row was finished.
                                      pj: block columns stored after it in the same row */
1804*49b5e25fSSatish Balay         pv = ba + 16*diag_offset[row];
1805*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
1806*49b5e25fSSatish Balay         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
1807*49b5e25fSSatish Balay         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
1808*49b5e25fSSatish Balay         x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
1809*49b5e25fSSatish Balay         x15 = pv[14]; x16 = pv[15];
                                   /* multiplier m1..m16 = (p block) times (x block); it is also written
                                      back into the work row, replacing the block at column row */
1810*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
1811*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
1812*49b5e25fSSatish Balay         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
1813*49b5e25fSSatish Balay         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
1814*49b5e25fSSatish Balay 
1815*49b5e25fSSatish Balay         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
1816*49b5e25fSSatish Balay         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
1817*49b5e25fSSatish Balay         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
1818*49b5e25fSSatish Balay         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
1819*49b5e25fSSatish Balay 
1820*49b5e25fSSatish Balay         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
1821*49b5e25fSSatish Balay         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
1822*49b5e25fSSatish Balay         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
1823*49b5e25fSSatish Balay         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
1824*49b5e25fSSatish Balay 
1825*49b5e25fSSatish Balay         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
1826*49b5e25fSSatish Balay         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
1827*49b5e25fSSatish Balay         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
1828*49b5e25fSSatish Balay         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
1829*49b5e25fSSatish Balay 
                                   /* nz: number of blocks stored after diag_offset[row] in that row of the factor */
1830*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
1831*49b5e25fSSatish Balay         pv += 16;
                                   /* for each of those blocks, subtract (multiplier) times (block) from the
                                      work-row block in column pj[j] */
1832*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
1833*49b5e25fSSatish Balay           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
1834*49b5e25fSSatish Balay           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
1835*49b5e25fSSatish Balay           x10  = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
1836*49b5e25fSSatish Balay           x14  = pv[13]; x15 = pv[14]; x16 = pv[15];
1837*49b5e25fSSatish Balay           x    = rtmp + 16*pj[j];
1838*49b5e25fSSatish Balay           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
1839*49b5e25fSSatish Balay           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
1840*49b5e25fSSatish Balay           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
1841*49b5e25fSSatish Balay           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
1842*49b5e25fSSatish Balay 
1843*49b5e25fSSatish Balay           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
1844*49b5e25fSSatish Balay           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
1845*49b5e25fSSatish Balay           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
1846*49b5e25fSSatish Balay           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
1847*49b5e25fSSatish Balay 
1848*49b5e25fSSatish Balay           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
1849*49b5e25fSSatish Balay           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
1850*49b5e25fSSatish Balay           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
1851*49b5e25fSSatish Balay           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
1852*49b5e25fSSatish Balay 
1853*49b5e25fSSatish Balay           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
1854*49b5e25fSSatish Balay           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
1855*49b5e25fSSatish Balay           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
1856*49b5e25fSSatish Balay           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
1857*49b5e25fSSatish Balay 
1858*49b5e25fSSatish Balay           pv   += 16;
1859*49b5e25fSSatish Balay         }
                                   /* 112 = 16 entries x (4 mul + 3 add) for the multiplier;
                                      128 = 16 entries x (4 mul + 3 add + 1 subtract) per updated block */
1860*49b5e25fSSatish Balay         PLogFlops(128*nz+112);
1861*49b5e25fSSatish Balay       }
1862*49b5e25fSSatish Balay       row = *ajtmp++;
1863*49b5e25fSSatish Balay     }
1864*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
                               /* the work-row blocks are gathered in the column order stored in bj for row i */
1865*49b5e25fSSatish Balay     pv = ba + 16*bi[i];
1866*49b5e25fSSatish Balay     pj = bj + bi[i];
1867*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
1868*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1869*49b5e25fSSatish Balay       x      = rtmp+16*pj[j];
1870*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
1871*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
1872*49b5e25fSSatish Balay       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
1873*49b5e25fSSatish Balay       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
1874*49b5e25fSSatish Balay       pv   += 16;
1875*49b5e25fSSatish Balay     }
1876*49b5e25fSSatish Balay     /* invert diagonal block */
                               /* w points at the block of row i stored at diag_offset[i] inside b->a */
1877*49b5e25fSSatish Balay     w = ba + 16*diag_offset[i];
1878*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_4(w);CHKERRQ(ierr);
1879*49b5e25fSSatish Balay   }
1880*49b5e25fSSatish Balay 
1881*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1882*49b5e25fSSatish Balay   C->factor    = FACTOR_LU;
1883*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
1884*49b5e25fSSatish Balay   PLogFlops(1.3333*64*b->mbs); /* from inverting diagonal blocks */
1885*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1886*49b5e25fSSatish Balay }
1887*49b5e25fSSatish Balay 
1888*49b5e25fSSatish Balay 
1889*49b5e25fSSatish Balay /* ------------------------------------------------------------*/
1890*49b5e25fSSatish Balay /*
1891*49b5e25fSSatish Balay       Version for when blocks are 3 by 3

      Numeric factorization of A into the factor matrix *B for 3 by 3 blocks
      (9 consecutive scalars per block), using the index sets stored in the
      factor: r is obtained from b->row and ic from b->icol.  Row i of the
      factor is built from row r[i] of A, and every block-column index c of
      that row is mapped to ic[c] before it is placed in the work row.

      Input:
        A - matrix to factor; A->data is accessed as Mat_SeqBAIJ and its
            mbs, i, j and a fields are read
        B - address of the factor matrix C; C->data is accessed as
            Mat_SeqBAIJ, its row, icol, i, j and diag fields are read and
            its value array a is overwritten with the computed factor

      On completion C->factor is set to FACTOR_LU, C->assembled is set to
      PETSC_TRUE and 0 is returned.  Failures reported by the IS routines,
      PetscMalloc, PetscFree or Kernel_A_gets_inverse_A_3 are handed to
      CHKPTRQ/CHKERRQ.

      The diagonal block of every finished row is passed to
      Kernel_A_gets_inverse_A_3 (defined elsewhere; presumably an in-place
      inversion - confirm), and later rows multiply by that stored block.

      The unrolled products below equal M = P times X when each run of 9
      scalars is read as a column-major 3 by 3 block (entry (r,c) at r+3c).
      NOTE(review): the column-major reading is inferred from the index
      pattern of the formulas only - confirm against the storage convention.
1892*49b5e25fSSatish Balay */
1893*49b5e25fSSatish Balay #undef __FUNC__
1894*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_3"
1895*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_3(Mat A,Mat *B)
1896*49b5e25fSSatish Balay {
1897*49b5e25fSSatish Balay   Mat         C = *B;
1898*49b5e25fSSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
1899*49b5e25fSSatish Balay   IS          isrow = b->row,isicol = b->icol;
1900*49b5e25fSSatish Balay   int         *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
1901*49b5e25fSSatish Balay   int         *ajtmpold,*ajtmp,nz,row,*ai=a->i,*aj=a->j;
1902*49b5e25fSSatish Balay   int         *diag_offset = b->diag,idx,*pj;
1903*49b5e25fSSatish Balay   MatScalar   *pv,*v,*rtmp,*pc,*w,*x;
1904*49b5e25fSSatish Balay   MatScalar   p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
1905*49b5e25fSSatish Balay   MatScalar   p5,p6,p7,p8,p9,x5,x6,x7,x8,x9;
1906*49b5e25fSSatish Balay   MatScalar   *ba = b->a,*aa = a->a;
1907*49b5e25fSSatish Balay 
1908*49b5e25fSSatish Balay   PetscFunctionBegin;
                             /* r and ic stay checked out until the matching ISRestoreIndices calls at the end */
1909*49b5e25fSSatish Balay   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1910*49b5e25fSSatish Balay   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
                             /* rtmp: work row with room for n+1 blocks of 9 scalars, addressed by block column */
1911*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
1912*49b5e25fSSatish Balay 
1913*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
                               /* zero the work-row blocks at the block columns listed for row i of the factor */
1914*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
1915*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
1916*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
1917*49b5e25fSSatish Balay       x = rtmp + 9*ajtmp[j];
1918*49b5e25fSSatish Balay       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
1919*49b5e25fSSatish Balay     }
1920*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
                               /* the source row is r[i] of A; each of its blocks lands at column ic[...] of the work row */
1921*49b5e25fSSatish Balay     idx      = r[i];
1922*49b5e25fSSatish Balay     nz       = ai[idx+1] - ai[idx];
1923*49b5e25fSSatish Balay     ajtmpold = aj + ai[idx];
1924*49b5e25fSSatish Balay     v        = aa + 9*ai[idx];
1925*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1926*49b5e25fSSatish Balay       x    = rtmp + 9*ic[ajtmpold[j]];
1927*49b5e25fSSatish Balay       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
1928*49b5e25fSSatish Balay       x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
1929*49b5e25fSSatish Balay       v    += 9;
1930*49b5e25fSSatish Balay     }
                               /* walk the block columns listed for row i of the factor; each index below i
                                  names a row finished in an earlier iteration and is eliminated here.  The
                                  loop has no separate bound: it ends at the first listed index that is not
                                  below i (presumably the diagonal entry - confirm) */
1931*49b5e25fSSatish Balay     row = *ajtmp++;
1932*49b5e25fSSatish Balay     while (row < i) {
                                 /* pc: work-row block in column row; copied to p1..p9 before it is overwritten */
1933*49b5e25fSSatish Balay       pc = rtmp + 9*row;
1934*49b5e25fSSatish Balay       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
1935*49b5e25fSSatish Balay       p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
                                 /* nothing to eliminate when all 9 entries of the block are zero */
1936*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
1937*49b5e25fSSatish Balay           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
                                   /* pv: factor block at diag_offset[row]; it was handed to
                                      Kernel_A_gets_inverse_A_3 when that row was finished.
                                      pj: block columns stored after it in the same row */
1938*49b5e25fSSatish Balay         pv = ba + 9*diag_offset[row];
1939*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
1940*49b5e25fSSatish Balay         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
1941*49b5e25fSSatish Balay         x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
                                   /* multiplier m1..m9 = (p block) times (x block); it is also written
                                      back into the work row, replacing the block at column row */
1942*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p4*x2 + p7*x3;
1943*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p5*x2 + p8*x3;
1944*49b5e25fSSatish Balay         pc[2] = m3 = p3*x1 + p6*x2 + p9*x3;
1945*49b5e25fSSatish Balay 
1946*49b5e25fSSatish Balay         pc[3] = m4 = p1*x4 + p4*x5 + p7*x6;
1947*49b5e25fSSatish Balay         pc[4] = m5 = p2*x4 + p5*x5 + p8*x6;
1948*49b5e25fSSatish Balay         pc[5] = m6 = p3*x4 + p6*x5 + p9*x6;
1949*49b5e25fSSatish Balay 
1950*49b5e25fSSatish Balay         pc[6] = m7 = p1*x7 + p4*x8 + p7*x9;
1951*49b5e25fSSatish Balay         pc[7] = m8 = p2*x7 + p5*x8 + p8*x9;
1952*49b5e25fSSatish Balay         pc[8] = m9 = p3*x7 + p6*x8 + p9*x9;
                                   /* nz: number of blocks stored after diag_offset[row] in that row of the factor */
1953*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
1954*49b5e25fSSatish Balay         pv += 9;
                                   /* for each of those blocks, subtract (multiplier) times (block) from the
                                      work-row block in column pj[j] */
1955*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
1956*49b5e25fSSatish Balay           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
1957*49b5e25fSSatish Balay           x5   = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
1958*49b5e25fSSatish Balay           x    = rtmp + 9*pj[j];
1959*49b5e25fSSatish Balay           x[0] -= m1*x1 + m4*x2 + m7*x3;
1960*49b5e25fSSatish Balay           x[1] -= m2*x1 + m5*x2 + m8*x3;
1961*49b5e25fSSatish Balay           x[2] -= m3*x1 + m6*x2 + m9*x3;
1962*49b5e25fSSatish Balay 
1963*49b5e25fSSatish Balay           x[3] -= m1*x4 + m4*x5 + m7*x6;
1964*49b5e25fSSatish Balay           x[4] -= m2*x4 + m5*x5 + m8*x6;
1965*49b5e25fSSatish Balay           x[5] -= m3*x4 + m6*x5 + m9*x6;
1966*49b5e25fSSatish Balay 
1967*49b5e25fSSatish Balay           x[6] -= m1*x7 + m4*x8 + m7*x9;
1968*49b5e25fSSatish Balay           x[7] -= m2*x7 + m5*x8 + m8*x9;
1969*49b5e25fSSatish Balay           x[8] -= m3*x7 + m6*x8 + m9*x9;
1970*49b5e25fSSatish Balay           pv   += 9;
1971*49b5e25fSSatish Balay         }
                                   /* 54 = 9 entries x (3 mul + 2 add + 1 subtract) per updated block.
                                      NOTE(review): the multiplier above performs 9 x (3 mul + 2 add) = 45
                                      operations while 36 is logged - confirm the intended count */
1972*49b5e25fSSatish Balay         PLogFlops(54*nz+36);
1973*49b5e25fSSatish Balay       }
1974*49b5e25fSSatish Balay       row = *ajtmp++;
1975*49b5e25fSSatish Balay     }
1976*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
                               /* the work-row blocks are gathered in the column order stored in bj for row i */
1977*49b5e25fSSatish Balay     pv = ba + 9*bi[i];
1978*49b5e25fSSatish Balay     pj = bj + bi[i];
1979*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
1980*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
1981*49b5e25fSSatish Balay       x     = rtmp + 9*pj[j];
1982*49b5e25fSSatish Balay       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
1983*49b5e25fSSatish Balay       pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
1984*49b5e25fSSatish Balay       pv   += 9;
1985*49b5e25fSSatish Balay     }
1986*49b5e25fSSatish Balay     /* invert diagonal block */
                               /* w points at the block of row i stored at diag_offset[i] inside b->a */
1987*49b5e25fSSatish Balay     w = ba + 9*diag_offset[i];
1988*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr);
1989*49b5e25fSSatish Balay   }
1990*49b5e25fSSatish Balay 
1991*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1992*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1993*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1994*49b5e25fSSatish Balay   C->factor = FACTOR_LU;
1995*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
1996*49b5e25fSSatish Balay   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
1997*49b5e25fSSatish Balay   PetscFunctionReturn(0);
1998*49b5e25fSSatish Balay }
1999*49b5e25fSSatish Balay /*
2000*49b5e25fSSatish Balay       Version for when blocks are 3 by 3 Using natural ordering

      Numeric factorization of A into the factor matrix *B for 3 by 3 blocks
      (9 consecutive scalars per block).  Row i of A is read directly with
      index i and its block-column indices are used unchanged; no row or
      column index sets are consulted in this routine.

      Input:
        A - matrix to factor; A->data is accessed as Mat_SeqBAIJ and its
            mbs, i, j and a fields are read
        B - address of the factor matrix C; C->data is accessed as
            Mat_SeqBAIJ, its i, j and diag arrays are read and its value
            array a is overwritten with the computed factor

      On completion C->factor is set to FACTOR_LU, C->assembled is set to
      PETSC_TRUE and 0 is returned.  Failures reported by PetscMalloc,
      PetscFree or Kernel_A_gets_inverse_A_3 are handed to CHKPTRQ/CHKERRQ.

      The diagonal block of every finished row is passed to
      Kernel_A_gets_inverse_A_3 (defined elsewhere; presumably an in-place
      inversion - confirm), and later rows multiply by that stored block.

      The unrolled products below equal M = P times X when each run of 9
      scalars is read as a column-major 3 by 3 block (entry (r,c) at r+3c).
      NOTE(review): the column-major reading is inferred from the index
      pattern of the formulas only - confirm against the storage convention.
2001*49b5e25fSSatish Balay */
2002*49b5e25fSSatish Balay #undef __FUNC__
2003*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_3_NaturalOrdering"
2004*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_3_NaturalOrdering(Mat A,Mat *B)
2005*49b5e25fSSatish Balay {
2006*49b5e25fSSatish Balay   Mat                C = *B;
2007*49b5e25fSSatish Balay   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
2008*49b5e25fSSatish Balay   int                ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
2009*49b5e25fSSatish Balay   int                *ajtmpold,*ajtmp,nz,row;
2010*49b5e25fSSatish Balay   int                *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
2011*49b5e25fSSatish Balay   MatScalar          *pv,*v,*rtmp,*pc,*w,*x;
2012*49b5e25fSSatish Balay   MatScalar          p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
2013*49b5e25fSSatish Balay   MatScalar          p5,p6,p7,p8,p9,x5,x6,x7,x8,x9;
2014*49b5e25fSSatish Balay   MatScalar          *ba = b->a,*aa = a->a;
2015*49b5e25fSSatish Balay 
2016*49b5e25fSSatish Balay   PetscFunctionBegin;
                             /* rtmp: work row with room for n+1 blocks of 9 scalars, addressed by block column */
2017*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(9*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
2018*49b5e25fSSatish Balay 
2019*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
                               /* zero the work-row blocks at the block columns listed for row i of the factor */
2020*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
2021*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
2022*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
2023*49b5e25fSSatish Balay       x = rtmp+9*ajtmp[j];
2024*49b5e25fSSatish Balay       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = 0.0;
2025*49b5e25fSSatish Balay     }
2026*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
                               /* the blocks of row i of A are copied into the work row at their own block columns */
2027*49b5e25fSSatish Balay     nz       = ai[i+1] - ai[i];
2028*49b5e25fSSatish Balay     ajtmpold = aj + ai[i];
2029*49b5e25fSSatish Balay     v        = aa + 9*ai[i];
2030*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
2031*49b5e25fSSatish Balay       x    = rtmp+9*ajtmpold[j];
2032*49b5e25fSSatish Balay       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
2033*49b5e25fSSatish Balay       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
2034*49b5e25fSSatish Balay       v    += 9;
2035*49b5e25fSSatish Balay     }
                               /* walk the block columns listed for row i of the factor; each index below i
                                  names a row finished in an earlier iteration and is eliminated here.  The
                                  loop has no separate bound: it ends at the first listed index that is not
                                  below i (presumably the diagonal entry - confirm) */
2036*49b5e25fSSatish Balay     row = *ajtmp++;
2037*49b5e25fSSatish Balay     while (row < i) {
                                 /* pc: work-row block in column row; copied to p1..p9 before it is overwritten */
2038*49b5e25fSSatish Balay       pc  = rtmp + 9*row;
2039*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
2040*49b5e25fSSatish Balay       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
                                 /* nothing to eliminate when all 9 entries of the block are zero */
2041*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
2042*49b5e25fSSatish Balay           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
                                   /* pv: factor block at diag_offset[row]; it was handed to
                                      Kernel_A_gets_inverse_A_3 when that row was finished.
                                      pj: block columns stored after it in the same row */
2043*49b5e25fSSatish Balay         pv = ba + 9*diag_offset[row];
2044*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
2045*49b5e25fSSatish Balay         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
2046*49b5e25fSSatish Balay         x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
                                   /* multiplier m1..m9 = (p block) times (x block); it is also written
                                      back into the work row, replacing the block at column row */
2047*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p4*x2 + p7*x3;
2048*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p5*x2 + p8*x3;
2049*49b5e25fSSatish Balay         pc[2] = m3 = p3*x1 + p6*x2 + p9*x3;
2050*49b5e25fSSatish Balay 
2051*49b5e25fSSatish Balay         pc[3] = m4 = p1*x4 + p4*x5 + p7*x6;
2052*49b5e25fSSatish Balay         pc[4] = m5 = p2*x4 + p5*x5 + p8*x6;
2053*49b5e25fSSatish Balay         pc[5] = m6 = p3*x4 + p6*x5 + p9*x6;
2054*49b5e25fSSatish Balay 
2055*49b5e25fSSatish Balay         pc[6] = m7 = p1*x7 + p4*x8 + p7*x9;
2056*49b5e25fSSatish Balay         pc[7] = m8 = p2*x7 + p5*x8 + p8*x9;
2057*49b5e25fSSatish Balay         pc[8] = m9 = p3*x7 + p6*x8 + p9*x9;
2058*49b5e25fSSatish Balay 
                                   /* nz: number of blocks stored after diag_offset[row] in that row of the factor */
2059*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
2060*49b5e25fSSatish Balay         pv += 9;
                                   /* for each of those blocks, subtract (multiplier) times (block) from the
                                      work-row block in column pj[j] */
2061*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
2062*49b5e25fSSatish Balay           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
2063*49b5e25fSSatish Balay           x5   = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
2064*49b5e25fSSatish Balay           x    = rtmp + 9*pj[j];
2065*49b5e25fSSatish Balay           x[0] -= m1*x1 + m4*x2 + m7*x3;
2066*49b5e25fSSatish Balay           x[1] -= m2*x1 + m5*x2 + m8*x3;
2067*49b5e25fSSatish Balay           x[2] -= m3*x1 + m6*x2 + m9*x3;
2068*49b5e25fSSatish Balay 
2069*49b5e25fSSatish Balay           x[3] -= m1*x4 + m4*x5 + m7*x6;
2070*49b5e25fSSatish Balay           x[4] -= m2*x4 + m5*x5 + m8*x6;
2071*49b5e25fSSatish Balay           x[5] -= m3*x4 + m6*x5 + m9*x6;
2072*49b5e25fSSatish Balay 
2073*49b5e25fSSatish Balay           x[6] -= m1*x7 + m4*x8 + m7*x9;
2074*49b5e25fSSatish Balay           x[7] -= m2*x7 + m5*x8 + m8*x9;
2075*49b5e25fSSatish Balay           x[8] -= m3*x7 + m6*x8 + m9*x9;
2076*49b5e25fSSatish Balay           pv   += 9;
2077*49b5e25fSSatish Balay         }
                                   /* 54 = 9 entries x (3 mul + 2 add + 1 subtract) per updated block.
                                      NOTE(review): the multiplier above performs 9 x (3 mul + 2 add) = 45
                                      operations while 36 is logged - confirm the intended count */
2078*49b5e25fSSatish Balay         PLogFlops(54*nz+36);
2079*49b5e25fSSatish Balay       }
2080*49b5e25fSSatish Balay       row = *ajtmp++;
2081*49b5e25fSSatish Balay     }
2082*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
                               /* the work-row blocks are gathered in the column order stored in bj for row i */
2083*49b5e25fSSatish Balay     pv = ba + 9*bi[i];
2084*49b5e25fSSatish Balay     pj = bj + bi[i];
2085*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
2086*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
2087*49b5e25fSSatish Balay       x      = rtmp+9*pj[j];
2088*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
2089*49b5e25fSSatish Balay       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
2090*49b5e25fSSatish Balay       pv   += 9;
2091*49b5e25fSSatish Balay     }
2092*49b5e25fSSatish Balay     /* invert diagonal block */
                               /* w points at the block of row i stored at diag_offset[i] inside b->a */
2093*49b5e25fSSatish Balay     w = ba + 9*diag_offset[i];
2094*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_3(w);CHKERRQ(ierr);
2095*49b5e25fSSatish Balay   }
2096*49b5e25fSSatish Balay 
2097*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2098*49b5e25fSSatish Balay   C->factor    = FACTOR_LU;
2099*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
2100*49b5e25fSSatish Balay   PLogFlops(1.3333*27*b->mbs); /* from inverting diagonal blocks */
2101*49b5e25fSSatish Balay   PetscFunctionReturn(0);
2102*49b5e25fSSatish Balay }
2103*49b5e25fSSatish Balay 
2104*49b5e25fSSatish Balay /* ------------------------------------------------------------*/
2105*49b5e25fSSatish Balay /*
2106*49b5e25fSSatish Balay       Version for when blocks are 2 by 2
2107*49b5e25fSSatish Balay */
2108*49b5e25fSSatish Balay #undef __FUNC__
2109*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_2"
2110*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_2(Mat A,Mat *B)
2111*49b5e25fSSatish Balay {
2112*49b5e25fSSatish Balay   Mat                C = *B;
2113*49b5e25fSSatish Balay   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
2114*49b5e25fSSatish Balay   IS                 isrow = b->row,isicol = b->icol;
2115*49b5e25fSSatish Balay   int                *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
2116*49b5e25fSSatish Balay   int                *ajtmpold,*ajtmp,nz,row;
2117*49b5e25fSSatish Balay   int                *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj;
2118*49b5e25fSSatish Balay   MatScalar          *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4;
2119*49b5e25fSSatish Balay   MatScalar          p1,p2,p3,p4;
2120*49b5e25fSSatish Balay   MatScalar          *ba = b->a,*aa = a->a;
2121*49b5e25fSSatish Balay 
2122*49b5e25fSSatish Balay   PetscFunctionBegin;
2123*49b5e25fSSatish Balay   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
2124*49b5e25fSSatish Balay   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
2125*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
2126*49b5e25fSSatish Balay 
2127*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
2128*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
2129*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
2130*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
2131*49b5e25fSSatish Balay       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
2132*49b5e25fSSatish Balay     }
2133*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
2134*49b5e25fSSatish Balay     idx      = r[i];
2135*49b5e25fSSatish Balay     nz       = ai[idx+1] - ai[idx];
2136*49b5e25fSSatish Balay     ajtmpold = aj + ai[idx];
2137*49b5e25fSSatish Balay     v        = aa + 4*ai[idx];
2138*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
2139*49b5e25fSSatish Balay       x    = rtmp+4*ic[ajtmpold[j]];
2140*49b5e25fSSatish Balay       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
2141*49b5e25fSSatish Balay       v    += 4;
2142*49b5e25fSSatish Balay     }
2143*49b5e25fSSatish Balay     row = *ajtmp++;
2144*49b5e25fSSatish Balay     while (row < i) {
2145*49b5e25fSSatish Balay       pc = rtmp + 4*row;
2146*49b5e25fSSatish Balay       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
2147*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
2148*49b5e25fSSatish Balay         pv = ba + 4*diag_offset[row];
2149*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
2150*49b5e25fSSatish Balay         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
2151*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p3*x2;
2152*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p4*x2;
2153*49b5e25fSSatish Balay         pc[2] = m3 = p1*x3 + p3*x4;
2154*49b5e25fSSatish Balay         pc[3] = m4 = p2*x3 + p4*x4;
2155*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
2156*49b5e25fSSatish Balay         pv += 4;
2157*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
2158*49b5e25fSSatish Balay           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
2159*49b5e25fSSatish Balay           x    = rtmp + 4*pj[j];
2160*49b5e25fSSatish Balay           x[0] -= m1*x1 + m3*x2;
2161*49b5e25fSSatish Balay           x[1] -= m2*x1 + m4*x2;
2162*49b5e25fSSatish Balay           x[2] -= m1*x3 + m3*x4;
2163*49b5e25fSSatish Balay           x[3] -= m2*x3 + m4*x4;
2164*49b5e25fSSatish Balay           pv   += 4;
2165*49b5e25fSSatish Balay         }
2166*49b5e25fSSatish Balay         PLogFlops(16*nz+12);
2167*49b5e25fSSatish Balay       }
2168*49b5e25fSSatish Balay       row = *ajtmp++;
2169*49b5e25fSSatish Balay     }
2170*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
2171*49b5e25fSSatish Balay     pv = ba + 4*bi[i];
2172*49b5e25fSSatish Balay     pj = bj + bi[i];
2173*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
2174*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
2175*49b5e25fSSatish Balay       x     = rtmp+4*pj[j];
2176*49b5e25fSSatish Balay       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
2177*49b5e25fSSatish Balay       pv   += 4;
2178*49b5e25fSSatish Balay     }
2179*49b5e25fSSatish Balay     /* invert diagonal block */
2180*49b5e25fSSatish Balay     w = ba + 4*diag_offset[i];
2181*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr);
2182*49b5e25fSSatish Balay     /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/
2183*49b5e25fSSatish Balay   }
2184*49b5e25fSSatish Balay 
2185*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2186*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2187*49b5e25fSSatish Balay   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2188*49b5e25fSSatish Balay   C->factor = FACTOR_LU;
2189*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
2190*49b5e25fSSatish Balay   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2191*49b5e25fSSatish Balay   PetscFunctionReturn(0);
2192*49b5e25fSSatish Balay }
2193*49b5e25fSSatish Balay /*
2194*49b5e25fSSatish Balay       Version for when blocks are 2 by 2 Using natural ordering
2195*49b5e25fSSatish Balay */
2196*49b5e25fSSatish Balay #undef __FUNC__
2197*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_2_NaturalOrdering"
2198*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat A,Mat *B)
2199*49b5e25fSSatish Balay {
2200*49b5e25fSSatish Balay   Mat                C = *B;
2201*49b5e25fSSatish Balay   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
2202*49b5e25fSSatish Balay   int                ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j;
2203*49b5e25fSSatish Balay   int                *ajtmpold,*ajtmp,nz,row;
2204*49b5e25fSSatish Balay   int                *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
2205*49b5e25fSSatish Balay   MatScalar          *pv,*v,*rtmp,*pc,*w,*x;
2206*49b5e25fSSatish Balay   MatScalar          p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4;
2207*49b5e25fSSatish Balay   MatScalar          *ba = b->a,*aa = a->a;
2208*49b5e25fSSatish Balay 
2209*49b5e25fSSatish Balay   PetscFunctionBegin;
2210*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(4*(n+1)*sizeof(MatScalar));CHKPTRQ(rtmp);
2211*49b5e25fSSatish Balay 
2212*49b5e25fSSatish Balay   for (i=0; i<n; i++) {
2213*49b5e25fSSatish Balay     nz    = bi[i+1] - bi[i];
2214*49b5e25fSSatish Balay     ajtmp = bj + bi[i];
2215*49b5e25fSSatish Balay     for  (j=0; j<nz; j++) {
2216*49b5e25fSSatish Balay       x = rtmp+4*ajtmp[j];
2217*49b5e25fSSatish Balay       x[0]  = x[1]  = x[2]  = x[3]  = 0.0;
2218*49b5e25fSSatish Balay     }
2219*49b5e25fSSatish Balay     /* load in initial (unfactored row) */
2220*49b5e25fSSatish Balay     nz       = ai[i+1] - ai[i];
2221*49b5e25fSSatish Balay     ajtmpold = aj + ai[i];
2222*49b5e25fSSatish Balay     v        = aa + 4*ai[i];
2223*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
2224*49b5e25fSSatish Balay       x    = rtmp+4*ajtmpold[j];
2225*49b5e25fSSatish Balay       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
2226*49b5e25fSSatish Balay       v    += 4;
2227*49b5e25fSSatish Balay     }
2228*49b5e25fSSatish Balay     row = *ajtmp++;
2229*49b5e25fSSatish Balay     while (row < i) {
2230*49b5e25fSSatish Balay       pc  = rtmp + 4*row;
2231*49b5e25fSSatish Balay       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
2232*49b5e25fSSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
2233*49b5e25fSSatish Balay         pv = ba + 4*diag_offset[row];
2234*49b5e25fSSatish Balay         pj = bj + diag_offset[row] + 1;
2235*49b5e25fSSatish Balay         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
2236*49b5e25fSSatish Balay         pc[0] = m1 = p1*x1 + p3*x2;
2237*49b5e25fSSatish Balay         pc[1] = m2 = p2*x1 + p4*x2;
2238*49b5e25fSSatish Balay         pc[2] = m3 = p1*x3 + p3*x4;
2239*49b5e25fSSatish Balay         pc[3] = m4 = p2*x3 + p4*x4;
2240*49b5e25fSSatish Balay         nz = bi[row+1] - diag_offset[row] - 1;
2241*49b5e25fSSatish Balay         pv += 4;
2242*49b5e25fSSatish Balay         for (j=0; j<nz; j++) {
2243*49b5e25fSSatish Balay           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
2244*49b5e25fSSatish Balay           x    = rtmp + 4*pj[j];
2245*49b5e25fSSatish Balay           x[0] -= m1*x1 + m3*x2;
2246*49b5e25fSSatish Balay           x[1] -= m2*x1 + m4*x2;
2247*49b5e25fSSatish Balay           x[2] -= m1*x3 + m3*x4;
2248*49b5e25fSSatish Balay           x[3] -= m2*x3 + m4*x4;
2249*49b5e25fSSatish Balay           pv   += 4;
2250*49b5e25fSSatish Balay         }
2251*49b5e25fSSatish Balay         PLogFlops(16*nz+12);
2252*49b5e25fSSatish Balay       }
2253*49b5e25fSSatish Balay       row = *ajtmp++;
2254*49b5e25fSSatish Balay     }
2255*49b5e25fSSatish Balay     /* finished row so stick it into b->a */
2256*49b5e25fSSatish Balay     pv = ba + 4*bi[i];
2257*49b5e25fSSatish Balay     pj = bj + bi[i];
2258*49b5e25fSSatish Balay     nz = bi[i+1] - bi[i];
2259*49b5e25fSSatish Balay     for (j=0; j<nz; j++) {
2260*49b5e25fSSatish Balay       x      = rtmp+4*pj[j];
2261*49b5e25fSSatish Balay       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
2262*49b5e25fSSatish Balay       pv   += 4;
2263*49b5e25fSSatish Balay     }
2264*49b5e25fSSatish Balay     /* invert diagonal block */
2265*49b5e25fSSatish Balay     w = ba + 4*diag_offset[i];
2266*49b5e25fSSatish Balay     ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr);
2267*49b5e25fSSatish Balay     /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/
2268*49b5e25fSSatish Balay   }
2269*49b5e25fSSatish Balay 
2270*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2271*49b5e25fSSatish Balay   C->factor    = FACTOR_LU;
2272*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
2273*49b5e25fSSatish Balay   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
2274*49b5e25fSSatish Balay   PetscFunctionReturn(0);
2275*49b5e25fSSatish Balay }
2276*49b5e25fSSatish Balay 
2277*49b5e25fSSatish Balay /* ----------------------------------------------------------- */
2278*49b5e25fSSatish Balay /*
2279*49b5e25fSSatish Balay      Version for when blocks are 1 by 1.
2280*49b5e25fSSatish Balay */
2281*49b5e25fSSatish Balay #undef __FUNC__
2282*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactorNumeric_SeqSBAIJ_1"
2283*49b5e25fSSatish Balay int MatLUFactorNumeric_SeqSBAIJ_1(Mat A,Mat *B)
2284*49b5e25fSSatish Balay {
2285*49b5e25fSSatish Balay   Mat                C = *B;
2286*49b5e25fSSatish Balay   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data;
2287*49b5e25fSSatish Balay   IS                 ip = b->row;
2288*49b5e25fSSatish Balay   int                *rip,*riip,*ic,ierr,i,j,mbs = a->mbs,*bi = b->i,*bj = b->j;
2289*49b5e25fSSatish Balay   int                *ajtmpold,*ajtmp,nz,row,*ai = a->i,*aj = a->j;
2290*49b5e25fSSatish Balay   int                *diag_offset = b->diag,diag,*pj;
2291*49b5e25fSSatish Balay   MatScalar          *pv,*v,*rtmp,multiplier,*pc;
2292*49b5e25fSSatish Balay   MatScalar          *ba = b->a,*aa = a->a;
2293*49b5e25fSSatish Balay   MatScalar          dk,uikdi;
2294*49b5e25fSSatish Balay   int                k,jmin,jmax,*jl,*il,vj,nexti,juj,ili;
2295*49b5e25fSSatish Balay 
2296*49b5e25fSSatish Balay   PetscFunctionBegin;
2297*49b5e25fSSatish Balay   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
2298*49b5e25fSSatish Balay   riip = rip;
2299*49b5e25fSSatish Balay 
2300*49b5e25fSSatish Balay   /* INITIALIZATION */
2301*49b5e25fSSatish Balay   /* il and jl record the first nonzero element in each row of the accessing
2302*49b5e25fSSatish Balay      window U(0:k, k:mbs-1).
2303*49b5e25fSSatish Balay      jl:    list of rows to be added to uneliminated rows
2304*49b5e25fSSatish Balay             i>= k: jl(i) is the first row to be added to row i
2305*49b5e25fSSatish Balay             i<  k: jl(i) is the row following row i in some list of rows
2306*49b5e25fSSatish Balay             jl(i) = mbs indicates the end of a list
2307*49b5e25fSSatish Balay      il(i): points to the first nonzero element in columns k,...,mbs-1 of
2308*49b5e25fSSatish Balay             row i of U */
2309*49b5e25fSSatish Balay   rtmp  = (MatScalar*)PetscMalloc(mbs*sizeof(MatScalar));CHKPTRQ(rtmp);
2310*49b5e25fSSatish Balay   il = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(il);
2311*49b5e25fSSatish Balay   jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl);
2312*49b5e25fSSatish Balay   for (i=0; i<mbs; i++) {
2313*49b5e25fSSatish Balay     rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
2314*49b5e25fSSatish Balay   }
2315*49b5e25fSSatish Balay 
2316*49b5e25fSSatish Balay   /* FOR EACH ROW K */
2317*49b5e25fSSatish Balay   for (k = 0; k<mbs; k++){
2318*49b5e25fSSatish Balay 
2319*49b5e25fSSatish Balay     /* INITIALIZE K-TH ROW WITH ELEMENTS NONZERO IN ROW P(K) OF A */
2320*49b5e25fSSatish Balay     jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
2321*49b5e25fSSatish Balay     if (jmin < jmax) {
2322*49b5e25fSSatish Balay       for (j = jmin; j < jmax; j++){
2323*49b5e25fSSatish Balay         vj = riip[aj[j]];
2324*49b5e25fSSatish Balay         if (k <= vj) rtmp[vj] = aa[j];
2325*49b5e25fSSatish Balay       }
2326*49b5e25fSSatish Balay     }
2327*49b5e25fSSatish Balay 
2328*49b5e25fSSatish Balay     /* MODIFY K-TH ROW BY ADDING IN THOSE ROWS I WITH U(I,K) NE 0
2329*49b5e25fSSatish Balay        FOR EACH ROW I TO BE ADDED IN */
2330*49b5e25fSSatish Balay     dk = rtmp[k];
2331*49b5e25fSSatish Balay     i = jl[k]; /* first row to be added to k_th row  */
2332*49b5e25fSSatish Balay     /* printf(" k=%d, pivot row = %d\n",k,i); */
2333*49b5e25fSSatish Balay 
2334*49b5e25fSSatish Balay     while (i < mbs){
2335*49b5e25fSSatish Balay       nexti = jl[i]; /* next row to be added to k_th row */
2336*49b5e25fSSatish Balay       /* printf("      pivot row = %d\n", nexti); */
2337*49b5e25fSSatish Balay 
2338*49b5e25fSSatish Balay       /* COMPUTE MULTIPLIER AND UPDATE DIAGONAL ELEMENT */
2339*49b5e25fSSatish Balay       ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
2340*49b5e25fSSatish Balay       uikdi = - ba[ili]*ba[i];
2341*49b5e25fSSatish Balay       dk += uikdi*ba[ili];
2342*49b5e25fSSatish Balay       ba[ili] = uikdi; /* update U(i,k) */
2343*49b5e25fSSatish Balay 
2344*49b5e25fSSatish Balay       /* ADD MULTIPLE OF ROW I TO K-TH ROW ... */
2345*49b5e25fSSatish Balay       jmin = ili + 1; jmax = bi[i+1];
2346*49b5e25fSSatish Balay       if (jmin < jmax){
2347*49b5e25fSSatish Balay         for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
2348*49b5e25fSSatish Balay         /* ... AND ADD I TO ROW LIST FOR NEXT NONZERO ENTRY */
2349*49b5e25fSSatish Balay          il[i] = jmin;             /* update il(i) in column k+1, ... mbs-1 */
2350*49b5e25fSSatish Balay          j     = bj[jmin];
2351*49b5e25fSSatish Balay          jl[i] = jl[j]; jl[j] = i; /* update jl */
2352*49b5e25fSSatish Balay       }
2353*49b5e25fSSatish Balay       i = nexti;
2354*49b5e25fSSatish Balay       /* printf("                  pivot row i=%d\n",i);  */
2355*49b5e25fSSatish Balay     }
2356*49b5e25fSSatish Balay 
2357*49b5e25fSSatish Balay     /* CHECK FOR ZERO PIVOT AND SAVE DIAGONAL ELEMENT */
2358*49b5e25fSSatish Balay     if (dk == 0.0){
2359*49b5e25fSSatish Balay       SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,0,"Zero pivot");
2360*49b5e25fSSatish Balay     }
2361*49b5e25fSSatish Balay 
2362*49b5e25fSSatish Balay     /* SAVE NONZERO ENTRIES IN K-TH ROW OF U ... */
2363*49b5e25fSSatish Balay     ba[k] = 1/dk;
2364*49b5e25fSSatish Balay     jmin = bi[k]; jmax = bi[k+1];
2365*49b5e25fSSatish Balay     if (jmin < jmax) {
2366*49b5e25fSSatish Balay       for (j=jmin; j<jmax; j++){
2367*49b5e25fSSatish Balay          juj = bj[j]; ba[j] = rtmp[juj]; rtmp[juj] = 0.0;
2368*49b5e25fSSatish Balay       }
2369*49b5e25fSSatish Balay 
2370*49b5e25fSSatish Balay       /* ... AND ADD K TO ROW LIST FOR FIRST NONZERO ENTRY IN K-TH ROW */
2371*49b5e25fSSatish Balay       il[k] = jmin;
2372*49b5e25fSSatish Balay       i     = bj[jmin];
2373*49b5e25fSSatish Balay       jl[k] = jl[i]; jl[i] = k;
2374*49b5e25fSSatish Balay     }
2375*49b5e25fSSatish Balay   }
2376*49b5e25fSSatish Balay 
2377*49b5e25fSSatish Balay   ierr = PetscFree(rtmp);CHKERRQ(ierr);
2378*49b5e25fSSatish Balay   ierr = PetscFree(il);CHKERRQ(ierr);
2379*49b5e25fSSatish Balay   ierr = PetscFree(jl);CHKERRQ(ierr);
2380*49b5e25fSSatish Balay 
2381*49b5e25fSSatish Balay   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
2382*49b5e25fSSatish Balay   C->factor    = FACTOR_LU;
2383*49b5e25fSSatish Balay   C->assembled = PETSC_TRUE;
2384*49b5e25fSSatish Balay   PLogFlops(b->mbs);
2385*49b5e25fSSatish Balay   PetscFunctionReturn(0);
2386*49b5e25fSSatish Balay }
2387*49b5e25fSSatish Balay 
2388*49b5e25fSSatish Balay #undef __FUNC__
2389*49b5e25fSSatish Balay #define __FUNC__ "MatLUFactor_SeqSBAIJ"
2390*49b5e25fSSatish Balay int MatLUFactor_SeqSBAIJ(Mat A,IS row,IS col,MatLUInfo *info)
2391*49b5e25fSSatish Balay {
2392*49b5e25fSSatish Balay   Mat_SeqBAIJ    *mat = (Mat_SeqBAIJ*)A->data;
2393*49b5e25fSSatish Balay   int            ierr,refct;
2394*49b5e25fSSatish Balay   Mat            C;
2395*49b5e25fSSatish Balay   PetscOps *Abops;
2396*49b5e25fSSatish Balay   MatOps   Aops;
2397*49b5e25fSSatish Balay 
2398*49b5e25fSSatish Balay   PetscFunctionBegin;
2399*49b5e25fSSatish Balay   ierr = MatLUFactorSymbolic(A,row,col,info,&C);CHKERRQ(ierr);
2400*49b5e25fSSatish Balay   ierr = MatLUFactorNumeric(A,&C);CHKERRQ(ierr);
2401*49b5e25fSSatish Balay 
2402*49b5e25fSSatish Balay   /* free all the data structures from mat */
2403*49b5e25fSSatish Balay   ierr = PetscFree(mat->a);CHKERRQ(ierr);
2404*49b5e25fSSatish Balay   if (!mat->singlemalloc) {
2405*49b5e25fSSatish Balay     ierr = PetscFree(mat->i);CHKERRQ(ierr);
2406*49b5e25fSSatish Balay     ierr = PetscFree(mat->j);CHKERRQ(ierr);
2407*49b5e25fSSatish Balay   }
2408*49b5e25fSSatish Balay   if (mat->diag) {ierr = PetscFree(mat->diag);CHKERRQ(ierr);}
2409*49b5e25fSSatish Balay   if (mat->ilen) {ierr = PetscFree(mat->ilen);CHKERRQ(ierr);}
2410*49b5e25fSSatish Balay   if (mat->imax) {ierr = PetscFree(mat->imax);CHKERRQ(ierr);}
2411*49b5e25fSSatish Balay   if (mat->solve_work) {ierr = PetscFree(mat->solve_work);CHKERRQ(ierr);}
2412*49b5e25fSSatish Balay   if (mat->mult_work) {ierr = PetscFree(mat->mult_work);CHKERRQ(ierr);}
2413*49b5e25fSSatish Balay   if (mat->icol) {ierr = ISDestroy(mat->icol);CHKERRQ(ierr);}
2414*49b5e25fSSatish Balay   ierr = PetscFree(mat);CHKERRQ(ierr);
2415*49b5e25fSSatish Balay 
2416*49b5e25fSSatish Balay   ierr = MapDestroy(A->rmap);CHKERRQ(ierr);
2417*49b5e25fSSatish Balay   ierr = MapDestroy(A->cmap);CHKERRQ(ierr);
2418*49b5e25fSSatish Balay 
2419*49b5e25fSSatish Balay   /*
2420*49b5e25fSSatish Balay        This is horrible,horrible code. We need to keep the
2421*49b5e25fSSatish Balay     A pointers for the bops and ops but copy everything
2422*49b5e25fSSatish Balay     else from C.
2423*49b5e25fSSatish Balay   */
2424*49b5e25fSSatish Balay   Abops = A->bops;
2425*49b5e25fSSatish Balay   Aops  = A->ops;
2426*49b5e25fSSatish Balay   refct = A->refct;
2427*49b5e25fSSatish Balay   ierr  = PetscMemcpy(A,C,sizeof(struct _p_Mat));CHKERRQ(ierr);
2428*49b5e25fSSatish Balay   mat   = (Mat_SeqBAIJ*)A->data;
2429*49b5e25fSSatish Balay   PLogObjectParent(A,mat->icol);
2430*49b5e25fSSatish Balay 
2431*49b5e25fSSatish Balay   A->bops  = Abops;
2432*49b5e25fSSatish Balay   A->ops   = Aops;
2433*49b5e25fSSatish Balay   A->qlist = 0;
2434*49b5e25fSSatish Balay   A->refct = refct;
2435*49b5e25fSSatish Balay   /* copy over the type_name and name */
2436*49b5e25fSSatish Balay   ierr     = PetscStrallocpy(C->type_name,&A->type_name);CHKERRQ(ierr);
2437*49b5e25fSSatish Balay   ierr     = PetscStrallocpy(C->name,&A->name);CHKERRQ(ierr);
2438*49b5e25fSSatish Balay 
2439*49b5e25fSSatish Balay   PetscHeaderDestroy(C);
2440*49b5e25fSSatish Balay   PetscFunctionReturn(0);
2441*49b5e25fSSatish Balay }
2442*49b5e25fSSatish Balay 
2443*49b5e25fSSatish Balay 
2444