xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision 7fc0212e64316908372d1091343c7f3bcc655bd0)
1 #ifndef lint
2 static char vcid[] = "$Id: baijfact.c,v 1.3 1996/02/17 05:41:36 bsmith Exp bsmith $";
3 #endif
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "baij.h"
9 
10 /*
11     The symbolic factorization code is identical to that for AIJ format,
12   except for very small changes since this is now a SeqBAIJ datastructure.
13   NOT good code reuse.
14 */
15 int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B)
16 {
17   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
18   IS          isicol;
19   int         *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j;
20   int         *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs;
21   int         *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im;
22 
23   if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square");
24   if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation");
25   if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation");
26 
27   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
28   ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic);
29 
30   /* get new row pointers */
31   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
32   ainew[0] = 0;
33   /* don't know how many column pointers are needed so estimate */
34   jmax = (int) (f*ai[n] + 1);
35   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
36   /* fill is a linked list of nonzeros in active row */
37   fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill);
38   im = fill + n + 1;
39   /* idnew is location of diagonal in factor */
40   idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew);
41   idnew[0] = 0;
42 
43   for ( i=0; i<n; i++ ) {
44     /* first copy previous fill into linked list */
45     nnz     = nz    = ai[r[i]+1] - ai[r[i]];
46     ajtmp   = aj + ai[r[i]];
47     fill[n] = n;
48     while (nz--) {
49       fm  = n;
50       idx = ic[*ajtmp++];
51       do {
52         m  = fm;
53         fm = fill[m];
54       } while (fm < idx);
55       fill[m]   = idx;
56       fill[idx] = fm;
57     }
58     row = fill[n];
59     while ( row < i ) {
60       ajtmp = ajnew + idnew[row] + 1;
61       nzbd  = 1 + idnew[row] - ainew[row];
62       nz    = im[row] - nzbd;
63       fm    = row;
64       while (nz-- > 0) {
65         idx = *ajtmp++;
66         nzbd++;
67         if (idx == i) im[row] = nzbd;
68         do {
69           m  = fm;
70           fm = fill[m];
71         } while (fm < idx);
72         if (fm != idx) {
73           fill[m]   = idx;
74           fill[idx] = fm;
75           fm        = idx;
76           nnz++;
77         }
78       }
79       row = fill[row];
80     }
81     /* copy new filled row into permanent storage */
82     ainew[i+1] = ainew[i] + nnz;
83     if (ainew[i+1] > jmax+1) {
84       /* allocate a longer ajnew */
85       int maxadd;
86       maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n);
87       if (maxadd < nnz) maxadd = (n-i)*(nnz+1);
88       jmax += maxadd;
89       ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp);
90       PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int));
91       PetscFree(ajnew);
92       ajnew = ajtmp;
93       realloc++; /* count how many times we realloc */
94     }
95     ajtmp = ajnew + ainew[i];
96     fm    = fill[n];
97     nzi   = 0;
98     im[i] = nnz;
99     while (nnz--) {
100       if (fm < i) nzi++;
101       *ajtmp++ = fm;
102       fm       = fill[fm];
103     }
104     idnew[i] = ainew[i] + nzi;
105   }
106 
107   PLogInfo((PetscObject)A,
108     "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
109                              realloc,f,((double)ainew[n])/((double)ai[i]));
110 
111   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
112   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
113 
114   PetscFree(fill);
115 
116   /* put together the new matrix */
117   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr);
118   PLogObjectParent(*B,isicol);
119   ierr = ISDestroy(isicol); CHKERRQ(ierr);
120   b = (Mat_SeqBAIJ *) (*B)->data;
121   PetscFree(b->imax);
122   b->singlemalloc = 0;
123   len             = ainew[n]*sizeof(Scalar);
124   /* the next line frees the default space generated by the Create() */
125   PetscFree(b->a); PetscFree(b->ilen);
126   b->a          = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a);
127   b->j          = ajnew;
128   b->i          = ainew;
129   b->diag       = idnew;
130   b->ilen       = 0;
131   b->imax       = 0;
132   b->row        = isrow;
133   b->col        = iscol;
134   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
135   CHKPTRQ(b->solve_work);
136   /* In b structure:  Free imax, ilen, old a, old j.
137      Allocate idnew, solve_work, new a, new j */
138   PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar)));
139   b->maxnz = b->nz = ainew[n];
140 
141   return 0;
142 }
143 
144 #include "pinclude/plapack.h"
145 
146 #define BlockZero(v,idx) {PetscMemzero(v+bs2*(idx),bs2*sizeof(Scalar));}
147 
148 #define BlockCopy(v_in,v_out,idx_in,idx_out) \
149   {PetscMemcpy(v_out + bs2*(idx_out),v_in + bs2*(idx_in),bs2*sizeof(Scalar));}
150 
151 #define BlockInvert(vv,idx) \
152   { int _i,info; Scalar *w = vv + bs2*idx; \
153     LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info); \
154     PetscMemzero(v_work,bs2*sizeof(Scalar));  \
155     for ( _i=0; _i<bs; _i++ ) { v_work[_i + bs*_i] = 1.0; } \
156     LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);\
157     PetscMemcpy(w,v_work,bs2*sizeof(Scalar)); \
158   }
159 
160 #define BlockMult(v1,v2,v3) \
161   {Scalar _DOne=1.0, _DZero=0.0;\
162   BLgemm_("N","N",&bs,&bs,&bs,&_DOne,v1,&bs,v2,&bs,&_DZero,v3,&bs);}
163 
164 #define BlockMultSub(v1,v2,v3,idx2,idx3) \
165   {Scalar _DOne=1.0, _DMOne=-1.0;\
166   BLgemm_("N","N",&bs,&bs,&bs,&_DMOne,v1,&bs,v2+bs2*idx2,&bs,&_DOne,v3+bs2*idx3,&bs);}
167 
168 /* ----------------------------------------------------------- */
169 int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B)
170 {
171   Mat         C = *B;
172   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data;
173   IS          iscol = b->col, isrow = b->row, isicol;
174   int         *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
175   int         *ajtmpold, *ajtmp, nz, row, bslog;
176   int         *diag_offset = b->diag,diag,bs = a->bs,bs2 = bs*bs, *v_pivots;
177   register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc;
178   register int    *pj;
179 
180   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
181   PLogObjectParent(*B,isicol);
182   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
183   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
184   rtmp  = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
185 
186   /* generate work space needed by dense LU factorization */
187   v_work     = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work);
188   multiplier = v_work + bs2;
189   v_pivots   = (int *) (multiplier + bs2);
190 
191   /* flops in while loop */
192   bslog = 4*bs*bs2 - bs;
193 
194   for ( i=0; i<n; i++ ) {
195     nz    = ai[i+1] - ai[i];
196     ajtmp = aj + ai[i];
197     for  ( j=0; j<nz; j++ ) BlockZero(rtmp,ajtmp[j]);
198     /* load in initial (unfactored row) */
199     nz       = a->i[r[i]+1] - a->i[r[i]];
200     ajtmpold = a->j + a->i[r[i]];
201     v        = a->a + bs2*a->i[r[i]];
202     for ( j=0; j<nz; j++ ) BlockCopy(v,rtmp,j,ic[ajtmpold[j]]);
203     row = *ajtmp++;
204     while (row < i) {
205       pc = rtmp + bs2*row;
206 /*      if (*pc) { */
207         pv = b->a + bs2*diag_offset[row];
208         pj = b->j + diag_offset[row] + 1;
209 	BlockMult(pc,pv,multiplier);
210         BlockCopy(multiplier,pc,0,0);
211         nz = ai[row+1] - diag_offset[row] - 1;
212         pv += bs2;
213         for (j=0; j<nz; j++) BlockMultSub(multiplier,pv,rtmp,j,pj[j]);
214         PLogFlops(bslog*nz);
215 /*      } */
216         row = *ajtmp++;
217     }
218     /* finished row so stick it into b->a */
219     pv = b->a + bs2*ai[i];
220     pj = b->j + ai[i];
221     nz = ai[i+1] - ai[i];
222     for ( j=0; j<nz; j++ ) BlockCopy(rtmp,pv,pj[j],j);
223     diag = diag_offset[i] - ai[i];
224     /* invert diagonal block */
225     BlockInvert(pv,diag);
226   }
227 
228   PetscFree(rtmp); PetscFree(v_work);
229   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
230   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
231   ierr = ISDestroy(isicol); CHKERRQ(ierr);
232   C->factor = FACTOR_LU;
233   C->assembled = PETSC_TRUE;
234   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
235   return 0;
236 }
237 
238 /* ----------------------------------------------------------- */
239 /*
240      Version for when blocks are 1 by 1.
241 */
242 int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B)
243 {
244   Mat         C = *B;
245   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data;
246   IS          iscol = b->col, isrow = b->row, isicol;
247   int         *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
248   int         *ajtmpold, *ajtmp, nz, row;
249   int         *diag_offset = b->diag,diag;
250   register Scalar *pv,*v,*rtmp,multiplier,*pc;
251   register int    *pj;
252 
253   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
254   PLogObjectParent(*B,isicol);
255   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
256   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
257   rtmp  = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
258 
259   for ( i=0; i<n; i++ ) {
260     nz    = ai[i+1] - ai[i];
261     ajtmp = aj + ai[i];
262     for  ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0;
263 
264     /* load in initial (unfactored row) */
265     nz       = a->i[r[i]+1] - a->i[r[i]];
266     ajtmpold = a->j + a->i[r[i]];
267     v        = a->a + a->i[r[i]];
268     for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] =  v[j];
269 
270     row = *ajtmp++;
271     while (row < i) {
272       pc = rtmp + row;
273       if (*pc != 0.0) {
274         pv         = b->a + diag_offset[row];
275         pj         = b->j + diag_offset[row] + 1;
276         multiplier = *pc * *pv++;
277         *pc        = multiplier;
278         nz         = ai[row+1] - diag_offset[row] - 1;
279         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
280         PLogFlops(2*nz);
281       }
282       row = *ajtmp++;
283     }
284     /* finished row so stick it into b->a */
285     pv = b->a + ai[i];
286     pj = b->j + ai[i];
287     nz = ai[i+1] - ai[i];
288     for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];}
289     diag = diag_offset[i] - ai[i];
290     /* check pivot entry for current row */
291     if (pv[diag] == 0.0) {
292       SETERRQ(1,"MatLUFactorNumeric_SeqAIJ:Zero pivot");
293     }
294     pv[diag] = 1.0/pv[diag];
295   }
296 
297   PetscFree(rtmp);
298   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
299   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
300   ierr = ISDestroy(isicol); CHKERRQ(ierr);
301   C->factor    = FACTOR_LU;
302   C->assembled = PETSC_TRUE;
303   PLogFlops(b->n);
304   return 0;
305 }
306 
307 /* ----------------------------------------------------------- */
308 int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f)
309 {
310   Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data;
311   int         ierr;
312   Mat         C;
313 
314   ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr);
315   ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr);
316 
317   /* free all the data structures from mat */
318   PetscFree(mat->a);
319   if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);}
320   if (mat->diag) PetscFree(mat->diag);
321   if (mat->ilen) PetscFree(mat->ilen);
322   if (mat->imax) PetscFree(mat->imax);
323   if (mat->solve_work) PetscFree(mat->solve_work);
324   PetscFree(mat);
325 
326   PetscMemcpy(A,C,sizeof(struct _Mat));
327   PetscHeaderDestroy(C);
328   return 0;
329 }
330 /* ----------------------------------------------------------- */
331 int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx)
332 {
333   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
334   IS          iscol = a->col, isrow = a->row;
335   int         *r,*c, ierr, i,  n = a->mbs, *vi, *ai = a->i, *aj = a->j;
336   int         nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1;
337   Scalar      *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0;
338   Scalar      _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
339   register Scalar *x, *b, *lsum, *tmp, *v;
340 
341   if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix");
342 
343   ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba;
344   ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa;
345   tmp  = a->solve_work;
346 
347   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
348   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1);
349 
350   switch (bs) {
351     case 1:
352       /* forward solve the lower triangular */
353       tmp[0] = b[*r++];
354       for ( i=1; i<n; i++ ) {
355         v     = aa + ai[i];
356         vi    = aj + ai[i];
357         nz    = a->diag[i] - ai[i];
358         sum1  = b[*r++];
359         while (nz--) {
360           sum1 -= (*v++)*tmp[*vi++];
361         }
362         tmp[i] = sum1;
363       }
364       /* backward solve the upper triangular */
365       for ( i=n-1; i>=0; i-- ){
366         v    = aa + a->diag[i] + 1;
367         vi   = aj + a->diag[i] + 1;
368         nz   = ai[i+1] - a->diag[i] - 1;
369         sum1 = tmp[i];
370         while (nz--) {
371           sum1 -= (*v++)*tmp[*vi++];
372         }
373         x[*c--] = tmp[i] = aa[a->diag[i]]*sum1;
374       }
375       break;
376     case 2:
377       /* forward solve the lower triangular */
378       idx    = 2*(*r++);
379       tmp[0] = b[idx]; tmp[1] = b[1+idx];
380       for ( i=1; i<n; i++ ) {
381         v     = aa + 4*ai[i];
382         vi    = aj + ai[i];
383         nz    = a->diag[i] - ai[i];
384         idx   = 2*(*r++);
385         sum1  = b[idx]; sum2 = b[1+idx];
386         while (nz--) {
387           idx   = 2*(*vi++);
388           x1    = tmp[idx]; x2 = tmp[1+idx];
389           sum1 -= v[0]*x1 + v[2]*x2;
390           sum2 -= v[1]*x1 + v[3]*x2;
391           v += 4;
392         }
393         idx = 2*i;
394         tmp[idx] = sum1; tmp[1+idx] = sum2;
395       }
396       /* backward solve the upper triangular */
397       for ( i=n-1; i>=0; i-- ){
398         v    = aa + 4*a->diag[i] + 4;
399         vi   = aj + a->diag[i] + 1;
400         nz   = ai[i+1] - a->diag[i] - 1;
401         idt  = 2*i;
402         sum1 = tmp[idt]; sum2 = tmp[1+idt];
403         while (nz--) {
404           idx   = 2*(*vi++);
405           x1    = tmp[idx]; x2 = tmp[1+idx];
406           sum1 -= v[0]*x1 + v[2]*x2;
407           sum2 -= v[1]*x1 + v[3]*x2;
408           v += 4;
409         }
410         idc = 2*(*c--);
411         v   = aa + 4*a->diag[i];
412         x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
413         x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
414       }
415       break;
416     case 3:
417       /* forward solve the lower triangular */
418       idx    = 3*(*r++);
419       tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
420       for ( i=1; i<n; i++ ) {
421         v     = aa + 9*ai[i];
422         vi    = aj + ai[i];
423         nz    = a->diag[i] - ai[i];
424         idx   = 3*(*r++);
425         sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
426         while (nz--) {
427           idx   = 3*(*vi++);
428           x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
429           sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
430           sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
431           sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
432           v += 9;
433         }
434         idx = 3*i;
435         tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
436       }
437       /* backward solve the upper triangular */
438       for ( i=n-1; i>=0; i-- ){
439         v    = aa + 9*a->diag[i] + 9;
440         vi   = aj + a->diag[i] + 1;
441         nz   = ai[i+1] - a->diag[i] - 1;
442         idt  = 3*i;
443         sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
444         while (nz--) {
445           idx   = 3*(*vi++);
446           x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
447           sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
448           sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
449           sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
450           v += 9;
451         }
452         idc = 3*(*c--);
453         v   = aa + 9*a->diag[i];
454         x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
455         x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
456         x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
457       }
458       break;
459     case 4:
460       /* forward solve the lower triangular */
461       idx    = 4*(*r++);
462       tmp[0] = b[idx];   tmp[1] = b[1+idx];
463       tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
464       for ( i=1; i<n; i++ ) {
465         v     = aa + 16*ai[i];
466         vi    = aj + ai[i];
467         nz    = a->diag[i] - ai[i];
468         idx   = 4*(*r++);
469         sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
470         while (nz--) {
471           idx   = 4*(*vi++);
472           x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
473           sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
474           sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
475           sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
476           sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
477           v += 16;
478         }
479         idx = 4*i;
480         tmp[idx]   = sum1;tmp[1+idx] = sum2;
481         tmp[2+idx] = sum3;tmp[3+idx] = sum4;
482       }
483       /* backward solve the upper triangular */
484       for ( i=n-1; i>=0; i-- ){
485         v    = aa + 16*a->diag[i] + 16;
486         vi   = aj + a->diag[i] + 1;
487         nz   = ai[i+1] - a->diag[i] - 1;
488         idt  = 4*i;
489         sum1 = tmp[idt];  sum2 = tmp[1+idt];
490         sum3 = tmp[2+idt];sum4 = tmp[3+idt];
491         while (nz--) {
492           idx   = 4*(*vi++);
493           x1    = tmp[idx];   x2 = tmp[1+idx];
494           x3    = tmp[2+idx]; x4 = tmp[3+idx];
495           sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
496           sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
497           sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
498           sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
499           v += 16;
500         }
501         idc = 4*(*c--);
502         v   = aa + 16*a->diag[i];
503         x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
504         x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
505         x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
506         x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
507       }
508       break;
509     case 5:
510       /* forward solve the lower triangular */
511       idx    = 5*(*r++);
512       tmp[0] = b[idx];   tmp[1] = b[1+idx];
513       tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
514       for ( i=1; i<n; i++ ) {
515         v     = aa + 25*ai[i];
516         vi    = aj + ai[i];
517         nz    = a->diag[i] - ai[i];
518         idx   = 5*(*r++);
519         sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
520         sum5  = b[4+idx];
521         while (nz--) {
522           idx   = 5*(*vi++);
523           x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
524           x4    = tmp[3+idx];x5 = tmp[4+idx];
525           sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
526           sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
527           sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
528           sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
529           sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
530           v += 25;
531         }
532         idx = 5*i;
533         tmp[idx]   = sum1;tmp[1+idx] = sum2;
534         tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
535       }
536       /* backward solve the upper triangular */
537       for ( i=n-1; i>=0; i-- ){
538         v    = aa + 25*a->diag[i] + 25;
539         vi   = aj + a->diag[i] + 1;
540         nz   = ai[i+1] - a->diag[i] - 1;
541         idt  = 5*i;
542         sum1 = tmp[idt];  sum2 = tmp[1+idt];
543         sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
544         while (nz--) {
545           idx   = 5*(*vi++);
546           x1    = tmp[idx];   x2 = tmp[1+idx];
547           x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
548           sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
549           sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
550           sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
551           sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
552           sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
553           v += 25;
554         }
555         idc = 5*(*c--);
556         v   = aa + 25*a->diag[i];
557         x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
558                                 v[15]*sum4+v[20]*sum5;
559         x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
560                                 v[16]*sum4+v[21]*sum5;
561         x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
562                                 v[17]*sum4+v[22]*sum5;
563         x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
564                                 v[18]*sum4+v[23]*sum5;
565         x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
566                                 v[19]*sum4+v[24]*sum5;
567       }
568       break;
569     default: {
570       /* forward solve the lower triangular */
571       PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar));
572       for ( i=1; i<n; i++ ) {
573         v   = aa + bs2*ai[i];
574         vi  = aj + ai[i];
575         nz  = a->diag[i] - ai[i];
576         sum = tmp + bs*i;
577         PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));
578         while (nz--) {
579           LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One);
580           v += bs2;
581         }
582       }
583       /* backward solve the upper triangular */
584       lsum = a->solve_work + a->n;
585       for ( i=n-1; i>=0; i-- ){
586         v   = aa + bs2*(a->diag[i] + 1);
587         vi  = aj + a->diag[i] + 1;
588         nz  = ai[i+1] - a->diag[i] - 1;
589         PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));
590         while (nz--) {
591           LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One);
592           v += bs2;
593         }
594         LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero,
595                 tmp+i*bs,&_One);
596         PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));
597       }
598     }
599   }
600 
601   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
602   ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr);
603   PLogFlops(2*bs2*a->nz - a->n);
604   return 0;
605 }
606 
607 /* ----------------------------------------------------------------*/
608 /*
609      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
610    except that the data structure of Mat_SeqAIJ is slightly different.
611    Not a good example of code reuse.
612    */
613 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
614                                  Mat *fact)
615 {
616   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
617   IS          isicol;
618   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
619   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
620   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
621   int         incrlev,nnz,i,bs = a->bs;
622 
623   if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square");
624   if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation");
625   if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation");
626 
627   /* special case that simply copies fill pattern */
628   if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) {
629     ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr);
630     (*fact)->factor = FACTOR_LU;
631     b               = (Mat_SeqBAIJ *) (*fact)->data;
632     if (!b->diag) {
633       ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
634     }
635     b->row        = isrow;
636     b->col        = iscol;
637     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
638     return 0;
639   }
640 
641   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
642   ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
643   ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
644 
645   /* get new row pointers */
646   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
647   ainew[0] = 0;
648   /* don't know how many column pointers are needed so estimate */
649   jmax = (int) (f*ai[n] + 1);
650   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
651   /* ajfill is level of fill for each fill entry */
652   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
653   /* fill is a linked list of nonzeros in active row */
654   fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
655   /* im is level for each filled value */
656   im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
657   /* dloc is location of diagonal in factor */
658   dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
659   dloc[0]  = 0;
660   for ( prow=0; prow<n; prow++ ) {
661     /* first copy previous fill into linked list */
662     nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
663     xi      = aj + ai[r[prow]];
664     fill[n] = n;
665     while (nz--) {
666       fm  = n;
667       idx = ic[*xi++];
668       do {
669         m  = fm;
670         fm = fill[m];
671       } while (fm < idx);
672       fill[m]   = idx;
673       fill[idx] = fm;
674       im[idx]   = 0;
675     }
676     nzi = 0;
677     row = fill[n];
678     while ( row < prow ) {
679       incrlev = im[row] + 1;
680       nz      = dloc[row];
681       xi      = ajnew  + ainew[row] + nz;
682       flev    = ajfill + ainew[row] + nz + 1;
683       nnz     = ainew[row+1] - ainew[row] - nz - 1;
684       if (*xi++ != row) {
685         SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot");
686       }
687       fm      = row;
688       while (nnz-- > 0) {
689         idx = *xi++;
690         if (*flev + incrlev > levels) {
691           flev++;
692           continue;
693         }
694         do {
695           m  = fm;
696           fm = fill[m];
697         } while (fm < idx);
698         if (fm != idx) {
699           im[idx]   = *flev + incrlev;
700           fill[m]   = idx;
701           fill[idx] = fm;
702           fm        = idx;
703           nzf++;
704         }
705         else {
706           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
707         }
708         flev++;
709       }
710       row = fill[row];
711       nzi++;
712     }
713     /* copy new filled row into permanent storage */
714     ainew[prow+1] = ainew[prow] + nzf;
715     if (ainew[prow+1] > jmax) {
716       /* allocate a longer ajnew */
717       int maxadd;
718       maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n);
719       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
720       jmax += maxadd;
721       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
722       PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
723       PetscFree(ajnew);
724       ajnew = xi;
725       /* allocate a longer ajfill */
726       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
727       PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
728       PetscFree(ajfill);
729       ajfill = xi;
730       realloc++;
731     }
732     xi          = ajnew + ainew[prow];
733     flev        = ajfill + ainew[prow];
734     dloc[prow]  = nzi;
735     fm          = fill[n];
736     while (nzf--) {
737       *xi++   = fm;
738       *flev++ = im[fm];
739       fm      = fill[fm];
740     }
741   }
742   PetscFree(ajfill);
743   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
744   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
745   ierr = ISDestroy(isicol); CHKERRQ(ierr);
746   PetscFree(fill); PetscFree(im);
747 
748   PLogInfo((PetscObject)A,
749     "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n",
750                              realloc,f,((double)ainew[n])/((double)ai[prow]));
751 
752   /* put together the new matrix */
753   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
754   b = (Mat_SeqBAIJ *) (*fact)->data;
755   PetscFree(b->imax);
756   b->singlemalloc = 0;
757   len = bs*bs*ainew[n]*sizeof(Scalar);
758   /* the next line frees the default space generated by the Create() */
759   PetscFree(b->a); PetscFree(b->ilen);
760   b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
761   b->j          = ajnew;
762   b->i          = ainew;
763   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
764   b->diag       = dloc;
765   b->ilen       = 0;
766   b->imax       = 0;
767   b->row        = isrow;
768   b->col        = iscol;
769   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
770   CHKPTRQ(b->solve_work);
771   /* In b structure:  Free imax, ilen, old a, old j.
772      Allocate dloc, solve_work, new a, new j */
773   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar));
774   b->maxnz          = b->nz = ainew[n];
775   (*fact)->factor   = FACTOR_LU;
776   return 0;
777 }
778 
779 
780 
781 
782