xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision 5768c4f9b9846e1de8cfa49a3a79d804fb4ff497)
1 #ifndef lint
2 static char vcid[] = "$Id: baijfact.c,v 1.5 1996/02/19 03:51:17 bsmith Exp curfman $";
3 #endif
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "baij.h"
9 
10 /*
11     The symbolic factorization code is identical to that for AIJ format,
12   except for very small changes since this is now a SeqBAIJ datastructure.
13   NOT good code reuse.
14 */
15 int MatLUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,Mat *B)
16 {
17   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
18   IS          isicol;
19   int         *r,*ic, ierr, i, n = a->mbs, *ai = a->i, *aj = a->j;
20   int         *ainew,*ajnew, jmax,*fill, *ajtmp, nz, bs = a->bs;
21   int         *idnew, idx, row,m,fm, nnz, nzi,len, realloc = 0,nzbd,*im;
22 
23   if (a->m != a->n) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must be square");
24   if (!isrow) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have row permutation");
25   if (!iscol) SETERRQ(1,"MatLUFactorSymbolic_SeqBAIJ:Must have col. permutation");
26 
27   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
28   ISGetIndices(isrow,&r); ISGetIndices(isicol,&ic);
29 
30   /* get new row pointers */
31   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
32   ainew[0] = 0;
33   /* don't know how many column pointers are needed so estimate */
34   jmax = (int) (f*ai[n] + 1);
35   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
36   /* fill is a linked list of nonzeros in active row */
37   fill = (int *) PetscMalloc( (2*n+1)*sizeof(int)); CHKPTRQ(fill);
38   im = fill + n + 1;
39   /* idnew is location of diagonal in factor */
40   idnew = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(idnew);
41   idnew[0] = 0;
42 
43   for ( i=0; i<n; i++ ) {
44     /* first copy previous fill into linked list */
45     nnz     = nz    = ai[r[i]+1] - ai[r[i]];
46     ajtmp   = aj + ai[r[i]];
47     fill[n] = n;
48     while (nz--) {
49       fm  = n;
50       idx = ic[*ajtmp++];
51       do {
52         m  = fm;
53         fm = fill[m];
54       } while (fm < idx);
55       fill[m]   = idx;
56       fill[idx] = fm;
57     }
58     row = fill[n];
59     while ( row < i ) {
60       ajtmp = ajnew + idnew[row] + 1;
61       nzbd  = 1 + idnew[row] - ainew[row];
62       nz    = im[row] - nzbd;
63       fm    = row;
64       while (nz-- > 0) {
65         idx = *ajtmp++;
66         nzbd++;
67         if (idx == i) im[row] = nzbd;
68         do {
69           m  = fm;
70           fm = fill[m];
71         } while (fm < idx);
72         if (fm != idx) {
73           fill[m]   = idx;
74           fill[idx] = fm;
75           fm        = idx;
76           nnz++;
77         }
78       }
79       row = fill[row];
80     }
81     /* copy new filled row into permanent storage */
82     ainew[i+1] = ainew[i] + nnz;
83     if (ainew[i+1] > jmax+1) {
84       /* allocate a longer ajnew */
85       int maxadd;
86       maxadd = (int) ((f*(ai[n]+1)*(n-i+5))/n);
87       if (maxadd < nnz) maxadd = (n-i)*(nnz+1);
88       jmax += maxadd;
89       ajtmp = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(ajtmp);
90       PetscMemcpy(ajtmp,ajnew,ainew[i]*sizeof(int));
91       PetscFree(ajnew);
92       ajnew = ajtmp;
93       realloc++; /* count how many times we realloc */
94     }
95     ajtmp = ajnew + ainew[i];
96     fm    = fill[n];
97     nzi   = 0;
98     im[i] = nnz;
99     while (nnz--) {
100       if (fm < i) nzi++;
101       *ajtmp++ = fm;
102       fm       = fill[fm];
103     }
104     idnew[i] = ainew[i] + nzi;
105   }
106 
107   PLogInfo((PetscObject)A,
108     "Info:MatLUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",
109                              realloc,f,((double)ainew[n])/((double)ai[i]));
110 
111   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
112   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
113 
114   PetscFree(fill);
115 
116   /* put together the new matrix */
117   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,B); CHKERRQ(ierr);
118   PLogObjectParent(*B,isicol);
119   ierr = ISDestroy(isicol); CHKERRQ(ierr);
120   b = (Mat_SeqBAIJ *) (*B)->data;
121   PetscFree(b->imax);
122   b->singlemalloc = 0;
123   len             = ainew[n]*sizeof(Scalar);
124   /* the next line frees the default space generated by the Create() */
125   PetscFree(b->a); PetscFree(b->ilen);
126   b->a          = (Scalar *) PetscMalloc( len*bs*bs ); CHKPTRQ(b->a);
127   b->j          = ajnew;
128   b->i          = ainew;
129   b->diag       = idnew;
130   b->ilen       = 0;
131   b->imax       = 0;
132   b->row        = isrow;
133   b->col        = iscol;
134   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
135   CHKPTRQ(b->solve_work);
136   /* In b structure:  Free imax, ilen, old a, old j.
137      Allocate idnew, solve_work, new a, new j */
138   PLogObjectMemory(*B,(ainew[n]-n)*(sizeof(int)+sizeof(Scalar)));
139   b->maxnz = b->nz = ainew[n];
140 
141   return 0;
142 }
143 
144 #include "pinclude/plapack.h"
145 /* ----------------------------------------------------------- */
146 int MatLUFactorNumeric_SeqBAIJ_N(Mat A,Mat *B)
147 {
148   Mat             C = *B;
149   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
150   IS              iscol = b->col, isrow = b->row, isicol;
151   int             *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
152   int             *ajtmpold, *ajtmp, nz, row, bslog,info;
153   int             *diag_offset=b->diag,diag,bs=a->bs,bs2 = bs*bs,*v_pivots;
154   register Scalar *pv,*v,*rtmp,*multiplier,*v_work,*pc,*w;
155   Scalar          one = 1.0, zero = 0.0, mone = -1.0;
156   register int    *pj;
157 
158   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
159   PLogObjectParent(*B,isicol);
160   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
161   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
162   rtmp  = (Scalar *) PetscMalloc(bs2*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
163 
164   /* generate work space needed by dense LU factorization */
165   v_work     = (Scalar *) PetscMalloc((bs+2*bs2)*sizeof(Scalar));CHKPTRQ(v_work);
166   multiplier = v_work + bs2;
167   v_pivots   = (int *) (multiplier + bs2);
168 
169   /* flops in while loop */
170   bslog = 2*bs*bs2;
171 
172   for ( i=0; i<n; i++ ) {
173     nz    = ai[i+1] - ai[i];
174     ajtmp = aj + ai[i];
175     for  ( j=0; j<nz; j++ ) {
176       PetscMemzero(rtmp+bs2*ajtmp[j],bs2*sizeof(Scalar));
177     }
178     /* load in initial (unfactored row) */
179     nz       = a->i[r[i]+1] - a->i[r[i]];
180     ajtmpold = a->j + a->i[r[i]];
181     v        = a->a + bs2*a->i[r[i]];
182     for ( j=0; j<nz; j++ ) {
183       PetscMemcpy(rtmp+bs2*ic[ajtmpold[j]],v+bs2*j,bs2*sizeof(Scalar));
184     }
185     row = *ajtmp++;
186     while (row < i) {
187       pc = rtmp + bs2*row;
188 /*      if (*pc) { */
189         pv = b->a + bs2*diag_offset[row];
190         pj = b->j + diag_offset[row] + 1;
191         BLgemm_("N","N",&bs,&bs,&bs,&one,pc,&bs,pv,&bs,&zero,
192                 multiplier,&bs);
193         PetscMemcpy(pc,multiplier,bs2*sizeof(Scalar));
194         nz = ai[row+1] - diag_offset[row] - 1;
195         pv += bs2;
196         for (j=0; j<nz; j++) {
197           BLgemm_("N","N",&bs,&bs,&bs,&mone,multiplier,&bs,pv+bs2*j,&bs,
198                   &one,rtmp+bs2*pj[j],&bs);
199         }
200         PLogFlops(bslog*(nz+1)-bs);
201 /*      } */
202         row = *ajtmp++;
203     }
204     /* finished row so stick it into b->a */
205     pv = b->a + bs2*ai[i];
206     pj = b->j + ai[i];
207     nz = ai[i+1] - ai[i];
208     for ( j=0; j<nz; j++ ) {
209       PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(Scalar));
210     }
211     diag = diag_offset[i] - ai[i];
212     /* invert diagonal block */
213     w = pv + bs2*diag;
214     LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info);
215     PetscMemzero(v_work,bs2*sizeof(Scalar));
216     for ( j=0; j<bs; j++ ) { v_work[j + bs*j] = 1.0; }
217     LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);
218     PetscMemcpy(w,v_work,bs2*sizeof(Scalar));
219   }
220 
221   PetscFree(rtmp); PetscFree(v_work);
222   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
223   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
224   ierr = ISDestroy(isicol); CHKERRQ(ierr);
225   C->factor = FACTOR_LU;
226   C->assembled = PETSC_TRUE;
227   PLogFlops(1.3333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
228   return 0;
229 }
230 /* ------------------------------------------------------------*/
231 /*
232       Version for when blocks are 2 by 2
233 */
234 int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B)
235 {
236   Mat             C = *B;
237   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *) A->data,*b = (Mat_SeqBAIJ *)C->data;
238   IS              iscol = b->col, isrow = b->row, isicol;
239   int             *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
240   int             *ajtmpold, *ajtmp, nz, row, info;
241   int             *diag_offset=b->diag,*v_pivots,bs = 2,idx;
242   register Scalar *pv,*v,*rtmp,m1,m2,m3,m4,*v_work,*pc,*w,*x,x1,x2,x3,x4;
243   Scalar          p1,p2,p3,p4;
244   register int    *pj;
245 
246   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
247   PLogObjectParent(*B,isicol);
248   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
249   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
250   rtmp  = (Scalar *) PetscMalloc(4*(n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
251 
252   /* generate work space needed by dense LU factorization */
253   v_work     = (Scalar *) PetscMalloc(6*sizeof(Scalar));CHKPTRQ(v_work);
254   v_pivots   = (int *) (v_work + 4);
255 
256   for ( i=0; i<n; i++ ) {
257     nz    = ai[i+1] - ai[i];
258     ajtmp = aj + ai[i];
259     for  ( j=0; j<nz; j++ ) {
260       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
261     }
262     /* load in initial (unfactored row) */
263     idx      = r[i];
264     nz       = a->i[idx+1] - a->i[idx];
265     ajtmpold = a->j + a->i[idx];
266     v        = a->a + 4*a->i[idx];
267     for ( j=0; j<nz; j++ ) {
268       x    = rtmp+4*ic[ajtmpold[j]];
269       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
270       v    += 4;
271     }
272     row = *ajtmp++;
273     while (row < i) {
274       pc = rtmp + 4*row;
275       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
276       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
277         pv = b->a + 4*diag_offset[row];
278         pj = b->j + diag_offset[row] + 1;
279         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
280         pc[0] = m1 = p1*x1 + p3*x2;
281         pc[1] = m2 = p2*x1 + p4*x2;
282         pc[2] = m3 = p1*x3 + p3*x4;
283         pc[3] = m4 = p2*x3 + p4*x4;
284         nz = ai[row+1] - diag_offset[row] - 1;
285         pv += 4;
286         for (j=0; j<nz; j++) {
287           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
288           x    = rtmp + 4*pj[j];
289           x[0] -= m1*x1 + m3*x2;
290           x[1] -= m2*x1 + m4*x2;
291           x[2] -= m1*x3 + m3*x4;
292           x[3] -= m2*x3 + m4*x4;
293           pv   += 4;
294         }
295         PLogFlops(16*nz+12);
296       }
297       row = *ajtmp++;
298     }
299     /* finished row so stick it into b->a */
300     pv = b->a + 4*ai[i];
301     pj = b->j + ai[i];
302     nz = ai[i+1] - ai[i];
303     for ( j=0; j<nz; j++ ) {
304       x     = rtmp+4*pj[j];
305       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
306       pv   += 4;
307     }
308     /* invert diagonal block */
309     w = b->a + 4*diag_offset[i];
310     LAgetrf_(&bs,&bs,w,&bs,v_pivots,&info); CHKERRQ(info);
311     v_work[0] = 1.0; v_work[1] = 0.0; v_work[2] = 0.0; v_work[3] = 1.0;
312     LAgetrs_("N",&bs,&bs,w,&bs,v_pivots,v_work,&bs, &info);CHKERRQ(info);
313     w[0] = v_work[0]; w[1] = v_work[1]; w[2] = v_work[2]; w[3] = v_work[3];
314   }
315 
316   PetscFree(rtmp); PetscFree(v_work);
317   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
318   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
319   ierr = ISDestroy(isicol); CHKERRQ(ierr);
320   C->factor = FACTOR_LU;
321   C->assembled = PETSC_TRUE;
322   PLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */
323   return 0;
324 }
325 
326 /* ----------------------------------------------------------- */
327 /*
328      Version for when blocks are 1 by 1.
329 */
330 int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B)
331 {
332   Mat         C = *B;
333   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b = (Mat_SeqBAIJ *)C->data;
334   IS          iscol = b->col, isrow = b->row, isicol;
335   int         *r,*ic, ierr, i, j, n = a->mbs, *ai = b->i, *aj = b->j;
336   int         *ajtmpold, *ajtmp, nz, row;
337   int         *diag_offset = b->diag,diag;
338   register Scalar *pv,*v,*rtmp,multiplier,*pc;
339   register int    *pj;
340 
341   ierr  = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
342   PLogObjectParent(*B,isicol);
343   ierr  = ISGetIndices(isrow,&r); CHKERRQ(ierr);
344   ierr  = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
345   rtmp  = (Scalar *) PetscMalloc((n+1)*sizeof(Scalar));CHKPTRQ(rtmp);
346 
347   for ( i=0; i<n; i++ ) {
348     nz    = ai[i+1] - ai[i];
349     ajtmp = aj + ai[i];
350     for  ( j=0; j<nz; j++ ) rtmp[ajtmp[j]] = 0.0;
351 
352     /* load in initial (unfactored row) */
353     nz       = a->i[r[i]+1] - a->i[r[i]];
354     ajtmpold = a->j + a->i[r[i]];
355     v        = a->a + a->i[r[i]];
356     for ( j=0; j<nz; j++ ) rtmp[ic[ajtmpold[j]]] =  v[j];
357 
358     row = *ajtmp++;
359     while (row < i) {
360       pc = rtmp + row;
361       if (*pc != 0.0) {
362         pv         = b->a + diag_offset[row];
363         pj         = b->j + diag_offset[row] + 1;
364         multiplier = *pc * *pv++;
365         *pc        = multiplier;
366         nz         = ai[row+1] - diag_offset[row] - 1;
367         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
368         PLogFlops(1+2*nz);
369       }
370       row = *ajtmp++;
371     }
372     /* finished row so stick it into b->a */
373     pv = b->a + ai[i];
374     pj = b->j + ai[i];
375     nz = ai[i+1] - ai[i];
376     for ( j=0; j<nz; j++ ) {pv[j] = rtmp[pj[j]];}
377     diag = diag_offset[i] - ai[i];
378     /* check pivot entry for current row */
379     if (pv[diag] == 0.0) {
380       SETERRQ(1,"MatLUFactorNumeric_SeqAIJ:Zero pivot");
381     }
382     pv[diag] = 1.0/pv[diag];
383   }
384 
385   PetscFree(rtmp);
386   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
387   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
388   ierr = ISDestroy(isicol); CHKERRQ(ierr);
389   C->factor    = FACTOR_LU;
390   C->assembled = PETSC_TRUE;
391   PLogFlops(b->n);
392   return 0;
393 }
394 
395 /* ----------------------------------------------------------- */
396 int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,double f)
397 {
398   Mat_SeqBAIJ *mat = (Mat_SeqBAIJ *) A->data;
399   int         ierr;
400   Mat         C;
401 
402   ierr = MatLUFactorSymbolic_SeqBAIJ(A,row,col,f,&C); CHKERRQ(ierr);
403   ierr = MatLUFactorNumeric(A,&C); CHKERRQ(ierr);
404 
405   /* free all the data structures from mat */
406   PetscFree(mat->a);
407   if (!mat->singlemalloc) {PetscFree(mat->i); PetscFree(mat->j);}
408   if (mat->diag) PetscFree(mat->diag);
409   if (mat->ilen) PetscFree(mat->ilen);
410   if (mat->imax) PetscFree(mat->imax);
411   if (mat->solve_work) PetscFree(mat->solve_work);
412   PetscFree(mat);
413 
414   PetscMemcpy(A,C,sizeof(struct _Mat));
415   PetscHeaderDestroy(C);
416   return 0;
417 }
418 /* ----------------------------------------------------------- */
419 int MatSolve_SeqBAIJ(Mat A,Vec bb, Vec xx)
420 {
421   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data;
422   IS          iscol = a->col, isrow = a->row;
423   int         *r,*c, ierr, i,  n = a->mbs, *vi, *ai = a->i, *aj = a->j;
424   int         nz,bs = a->bs,bs2 = bs*bs,idx,idt,idc, _One = 1;
425   Scalar      *xa,*ba,*aa = a->a, *sum, _DOne = 1.0, _DMOne = -1.0;
426   Scalar      _DZero = 0.0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
427   register Scalar *x, *b, *lsum, *tmp, *v;
428 
429   if (A->factor != FACTOR_LU) SETERRQ(1,"MatSolve_SeqBAIJ:Not for unfactored matrix");
430 
431   ierr = VecGetArray(bb,&ba); CHKERRQ(ierr); b = ba;
432   ierr = VecGetArray(xx,&xa); CHKERRQ(ierr); x = xa;
433   tmp  = a->solve_work;
434 
435   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
436   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); c = c + (n-1);
437 
438   switch (bs) {
439     case 1:
440       /* forward solve the lower triangular */
441       tmp[0] = b[*r++];
442       for ( i=1; i<n; i++ ) {
443         v     = aa + ai[i];
444         vi    = aj + ai[i];
445         nz    = a->diag[i] - ai[i];
446         sum1  = b[*r++];
447         while (nz--) {
448           sum1 -= (*v++)*tmp[*vi++];
449         }
450         tmp[i] = sum1;
451       }
452       /* backward solve the upper triangular */
453       for ( i=n-1; i>=0; i-- ){
454         v    = aa + a->diag[i] + 1;
455         vi   = aj + a->diag[i] + 1;
456         nz   = ai[i+1] - a->diag[i] - 1;
457         sum1 = tmp[i];
458         while (nz--) {
459           sum1 -= (*v++)*tmp[*vi++];
460         }
461         x[*c--] = tmp[i] = aa[a->diag[i]]*sum1;
462       }
463       break;
464     case 2:
465       /* forward solve the lower triangular */
466       idx    = 2*(*r++);
467       tmp[0] = b[idx]; tmp[1] = b[1+idx];
468       for ( i=1; i<n; i++ ) {
469         v     = aa + 4*ai[i];
470         vi    = aj + ai[i];
471         nz    = a->diag[i] - ai[i];
472         idx   = 2*(*r++);
473         sum1  = b[idx]; sum2 = b[1+idx];
474         while (nz--) {
475           idx   = 2*(*vi++);
476           x1    = tmp[idx]; x2 = tmp[1+idx];
477           sum1 -= v[0]*x1 + v[2]*x2;
478           sum2 -= v[1]*x1 + v[3]*x2;
479           v += 4;
480         }
481         idx = 2*i;
482         tmp[idx] = sum1; tmp[1+idx] = sum2;
483       }
484       /* backward solve the upper triangular */
485       for ( i=n-1; i>=0; i-- ){
486         v    = aa + 4*a->diag[i] + 4;
487         vi   = aj + a->diag[i] + 1;
488         nz   = ai[i+1] - a->diag[i] - 1;
489         idt  = 2*i;
490         sum1 = tmp[idt]; sum2 = tmp[1+idt];
491         while (nz--) {
492           idx   = 2*(*vi++);
493           x1    = tmp[idx]; x2 = tmp[1+idx];
494           sum1 -= v[0]*x1 + v[2]*x2;
495           sum2 -= v[1]*x1 + v[3]*x2;
496           v += 4;
497         }
498         idc = 2*(*c--);
499         v   = aa + 4*a->diag[i];
500         x[idc]   = tmp[idt]   = v[0]*sum1 + v[2]*sum2;
501         x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[3]*sum2;
502       }
503       break;
504     case 3:
505       /* forward solve the lower triangular */
506       idx    = 3*(*r++);
507       tmp[0] = b[idx]; tmp[1] = b[1+idx]; tmp[2] = b[2+idx];
508       for ( i=1; i<n; i++ ) {
509         v     = aa + 9*ai[i];
510         vi    = aj + ai[i];
511         nz    = a->diag[i] - ai[i];
512         idx   = 3*(*r++);
513         sum1  = b[idx]; sum2 = b[1+idx]; sum3 = b[2+idx];
514         while (nz--) {
515           idx   = 3*(*vi++);
516           x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
517           sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
518           sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
519           sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
520           v += 9;
521         }
522         idx = 3*i;
523         tmp[idx] = sum1; tmp[1+idx] = sum2; tmp[2+idx] = sum3;
524       }
525       /* backward solve the upper triangular */
526       for ( i=n-1; i>=0; i-- ){
527         v    = aa + 9*a->diag[i] + 9;
528         vi   = aj + a->diag[i] + 1;
529         nz   = ai[i+1] - a->diag[i] - 1;
530         idt  = 3*i;
531         sum1 = tmp[idt]; sum2 = tmp[1+idt]; sum3 = tmp[2+idt];
532         while (nz--) {
533           idx   = 3*(*vi++);
534           x1    = tmp[idx]; x2 = tmp[1+idx]; x3 = tmp[2+idx];
535           sum1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
536           sum2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
537           sum3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
538           v += 9;
539         }
540         idc = 3*(*c--);
541         v   = aa + 9*a->diag[i];
542         x[idc]   = tmp[idt]   = v[0]*sum1 + v[3]*sum2 + v[6]*sum3;
543         x[1+idc] = tmp[1+idt] = v[1]*sum1 + v[4]*sum2 + v[7]*sum3;
544         x[2+idc] = tmp[2+idt] = v[2]*sum1 + v[5]*sum2 + v[8]*sum3;
545       }
546       break;
547     case 4:
548       /* forward solve the lower triangular */
549       idx    = 4*(*r++);
550       tmp[0] = b[idx];   tmp[1] = b[1+idx];
551       tmp[2] = b[2+idx]; tmp[3] = b[3+idx];
552       for ( i=1; i<n; i++ ) {
553         v     = aa + 16*ai[i];
554         vi    = aj + ai[i];
555         nz    = a->diag[i] - ai[i];
556         idx   = 4*(*r++);
557         sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
558         while (nz--) {
559           idx   = 4*(*vi++);
560           x1    = tmp[idx];x2 = tmp[1+idx];x3 = tmp[2+idx];x4 = tmp[3+idx];
561           sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
562           sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
563           sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
564           sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
565           v += 16;
566         }
567         idx = 4*i;
568         tmp[idx]   = sum1;tmp[1+idx] = sum2;
569         tmp[2+idx] = sum3;tmp[3+idx] = sum4;
570       }
571       /* backward solve the upper triangular */
572       for ( i=n-1; i>=0; i-- ){
573         v    = aa + 16*a->diag[i] + 16;
574         vi   = aj + a->diag[i] + 1;
575         nz   = ai[i+1] - a->diag[i] - 1;
576         idt  = 4*i;
577         sum1 = tmp[idt];  sum2 = tmp[1+idt];
578         sum3 = tmp[2+idt];sum4 = tmp[3+idt];
579         while (nz--) {
580           idx   = 4*(*vi++);
581           x1    = tmp[idx];   x2 = tmp[1+idx];
582           x3    = tmp[2+idx]; x4 = tmp[3+idx];
583           sum1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
584           sum2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
585           sum3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
586           sum4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
587           v += 16;
588         }
589         idc = 4*(*c--);
590         v   = aa + 16*a->diag[i];
591         x[idc]   = tmp[idt]   = v[0]*sum1+v[4]*sum2+v[8]*sum3+v[12]*sum4;
592         x[1+idc] = tmp[1+idt] = v[1]*sum1+v[5]*sum2+v[9]*sum3+v[13]*sum4;
593         x[2+idc] = tmp[2+idt] = v[2]*sum1+v[6]*sum2+v[10]*sum3+v[14]*sum4;
594         x[3+idc] = tmp[3+idt] = v[3]*sum1+v[7]*sum2+v[11]*sum3+v[15]*sum4;
595       }
596       break;
597     case 5:
598       /* forward solve the lower triangular */
599       idx    = 5*(*r++);
600       tmp[0] = b[idx];   tmp[1] = b[1+idx];
601       tmp[2] = b[2+idx]; tmp[3] = b[3+idx]; tmp[4] = b[4+idx];
602       for ( i=1; i<n; i++ ) {
603         v     = aa + 25*ai[i];
604         vi    = aj + ai[i];
605         nz    = a->diag[i] - ai[i];
606         idx   = 5*(*r++);
607         sum1  = b[idx];sum2 = b[1+idx];sum3 = b[2+idx];sum4 = b[3+idx];
608         sum5  = b[4+idx];
609         while (nz--) {
610           idx   = 5*(*vi++);
611           x1    = tmp[idx];  x2 = tmp[1+idx];x3 = tmp[2+idx];
612           x4    = tmp[3+idx];x5 = tmp[4+idx];
613           sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
614           sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
615           sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
616           sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
617           sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
618           v += 25;
619         }
620         idx = 5*i;
621         tmp[idx]   = sum1;tmp[1+idx] = sum2;
622         tmp[2+idx] = sum3;tmp[3+idx] = sum4; tmp[4+idx] = sum5;
623       }
624       /* backward solve the upper triangular */
625       for ( i=n-1; i>=0; i-- ){
626         v    = aa + 25*a->diag[i] + 25;
627         vi   = aj + a->diag[i] + 1;
628         nz   = ai[i+1] - a->diag[i] - 1;
629         idt  = 5*i;
630         sum1 = tmp[idt];  sum2 = tmp[1+idt];
631         sum3 = tmp[2+idt];sum4 = tmp[3+idt]; sum5 = tmp[4+idt];
632         while (nz--) {
633           idx   = 5*(*vi++);
634           x1    = tmp[idx];   x2 = tmp[1+idx];
635           x3    = tmp[2+idx]; x4 = tmp[3+idx]; x5 = tmp[4+idx];
636           sum1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
637           sum2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
638           sum3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
639           sum4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
640           sum5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
641           v += 25;
642         }
643         idc = 5*(*c--);
644         v   = aa + 25*a->diag[i];
645         x[idc]   = tmp[idt]   = v[0]*sum1+v[5]*sum2+v[10]*sum3+
646                                 v[15]*sum4+v[20]*sum5;
647         x[1+idc] = tmp[1+idt] = v[1]*sum1+v[6]*sum2+v[11]*sum3+
648                                 v[16]*sum4+v[21]*sum5;
649         x[2+idc] = tmp[2+idt] = v[2]*sum1+v[7]*sum2+v[12]*sum3+
650                                 v[17]*sum4+v[22]*sum5;
651         x[3+idc] = tmp[3+idt] = v[3]*sum1+v[8]*sum2+v[13]*sum3+
652                                 v[18]*sum4+v[23]*sum5;
653         x[4+idc] = tmp[4+idt] = v[4]*sum1+v[9]*sum2+v[14]*sum3+
654                                 v[19]*sum4+v[24]*sum5;
655       }
656       break;
657     default: {
658       /* forward solve the lower triangular */
659       PetscMemcpy(tmp,b + bs*(*r++), bs*sizeof(Scalar));
660       for ( i=1; i<n; i++ ) {
661         v   = aa + bs2*ai[i];
662         vi  = aj + ai[i];
663         nz  = a->diag[i] - ai[i];
664         sum = tmp + bs*i;
665         PetscMemcpy(sum,b+bs*(*r++),bs*sizeof(Scalar));
666         while (nz--) {
667           LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,sum,&_One);
668           v += bs2;
669         }
670       }
671       /* backward solve the upper triangular */
672       lsum = a->solve_work + a->n;
673       for ( i=n-1; i>=0; i-- ){
674         v   = aa + bs2*(a->diag[i] + 1);
675         vi  = aj + a->diag[i] + 1;
676         nz  = ai[i+1] - a->diag[i] - 1;
677         PetscMemcpy(lsum,tmp+i*bs,bs*sizeof(Scalar));
678         while (nz--) {
679           LAgemv_("N",&bs,&bs,&_DMOne,v,&bs,tmp+bs*(*vi++),&_One,&_DOne,lsum,&_One);
680           v += bs2;
681         }
682         LAgemv_("N",&bs,&bs,&_DOne,aa+bs2*a->diag[i],&bs,lsum,&_One,&_DZero,
683                 tmp+i*bs,&_One);
684         PetscMemcpy(x + bs*(*c--),tmp+i*bs,bs*sizeof(Scalar));
685       }
686     }
687   }
688 
689   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
690   ierr = ISRestoreIndices(iscol,&c); CHKERRQ(ierr);
691   PLogFlops(2*bs2*a->nz - a->n);
692   return 0;
693 }
694 
695 /* ----------------------------------------------------------------*/
696 /*
697      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
698    except that the data structure of Mat_SeqAIJ is slightly different.
699    Not a good example of code reuse.
700    */
701 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,double f,int levels,
702                                  Mat *fact)
703 {
704   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *) A->data, *b;
705   IS          isicol;
706   int         *r,*ic, ierr, prow, n = a->mbs, *ai = a->i, *aj = a->j;
707   int         *ainew,*ajnew, jmax,*fill, *xi, nz, *im,*ajfill,*flev;
708   int         *dloc, idx, row,m,fm, nzf, nzi,len,  realloc = 0;
709   int         incrlev,nnz,i,bs = a->bs;
710 
711   if (a->m != a->n) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Matrix must be square");
712   if (!isrow) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have row permutation");
713   if (!iscol) SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:Must have column permutation");
714 
715   /* special case that simply copies fill pattern */
716   if (levels == 0 && ISIsIdentity(isrow) && ISIsIdentity(iscol)) {
717     ierr = MatConvertSameType_SeqBAIJ(A,fact,DO_NOT_COPY_VALUES); CHKERRQ(ierr);
718     (*fact)->factor = FACTOR_LU;
719     b               = (Mat_SeqBAIJ *) (*fact)->data;
720     if (!b->diag) {
721       ierr = MatMarkDiag_SeqBAIJ(*fact); CHKERRQ(ierr);
722     }
723     b->row        = isrow;
724     b->col        = iscol;
725     b->solve_work = (Scalar *) PetscMalloc((b->m+1+b->bs)*sizeof(Scalar));CHKPTRQ(b->solve_work);
726     return 0;
727   }
728 
729   ierr = ISInvertPermutation(iscol,&isicol); CHKERRQ(ierr);
730   ierr = ISGetIndices(isrow,&r); CHKERRQ(ierr);
731   ierr = ISGetIndices(isicol,&ic); CHKERRQ(ierr);
732 
733   /* get new row pointers */
734   ainew = (int *) PetscMalloc( (n+1)*sizeof(int) ); CHKPTRQ(ainew);
735   ainew[0] = 0;
736   /* don't know how many column pointers are needed so estimate */
737   jmax = (int) (f*ai[n] + 1);
738   ajnew = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajnew);
739   /* ajfill is level of fill for each fill entry */
740   ajfill = (int *) PetscMalloc( (jmax)*sizeof(int) ); CHKPTRQ(ajfill);
741   /* fill is a linked list of nonzeros in active row */
742   fill = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(fill);
743   /* im is level for each filled value */
744   im = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(im);
745   /* dloc is location of diagonal in factor */
746   dloc = (int *) PetscMalloc( (n+1)*sizeof(int)); CHKPTRQ(dloc);
747   dloc[0]  = 0;
748   for ( prow=0; prow<n; prow++ ) {
749     /* first copy previous fill into linked list */
750     nzf     = nz  = ai[r[prow]+1] - ai[r[prow]];
751     xi      = aj + ai[r[prow]];
752     fill[n] = n;
753     while (nz--) {
754       fm  = n;
755       idx = ic[*xi++];
756       do {
757         m  = fm;
758         fm = fill[m];
759       } while (fm < idx);
760       fill[m]   = idx;
761       fill[idx] = fm;
762       im[idx]   = 0;
763     }
764     nzi = 0;
765     row = fill[n];
766     while ( row < prow ) {
767       incrlev = im[row] + 1;
768       nz      = dloc[row];
769       xi      = ajnew  + ainew[row] + nz;
770       flev    = ajfill + ainew[row] + nz + 1;
771       nnz     = ainew[row+1] - ainew[row] - nz - 1;
772       if (*xi++ != row) {
773         SETERRQ(1,"MatILUFactorSymbolic_SeqBAIJ:zero pivot");
774       }
775       fm      = row;
776       while (nnz-- > 0) {
777         idx = *xi++;
778         if (*flev + incrlev > levels) {
779           flev++;
780           continue;
781         }
782         do {
783           m  = fm;
784           fm = fill[m];
785         } while (fm < idx);
786         if (fm != idx) {
787           im[idx]   = *flev + incrlev;
788           fill[m]   = idx;
789           fill[idx] = fm;
790           fm        = idx;
791           nzf++;
792         }
793         else {
794           if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
795         }
796         flev++;
797       }
798       row = fill[row];
799       nzi++;
800     }
801     /* copy new filled row into permanent storage */
802     ainew[prow+1] = ainew[prow] + nzf;
803     if (ainew[prow+1] > jmax) {
804       /* allocate a longer ajnew */
805       int maxadd;
806       maxadd = (int) (((f*ai[n]+1)*(n-prow+5))/n);
807       if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
808       jmax += maxadd;
809       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
810       PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));
811       PetscFree(ajnew);
812       ajnew = xi;
813       /* allocate a longer ajfill */
814       xi = (int *) PetscMalloc( jmax*sizeof(int) );CHKPTRQ(xi);
815       PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));
816       PetscFree(ajfill);
817       ajfill = xi;
818       realloc++;
819     }
820     xi          = ajnew + ainew[prow];
821     flev        = ajfill + ainew[prow];
822     dloc[prow]  = nzi;
823     fm          = fill[n];
824     while (nzf--) {
825       *xi++   = fm;
826       *flev++ = im[fm];
827       fm      = fill[fm];
828     }
829   }
830   PetscFree(ajfill);
831   ierr = ISRestoreIndices(isrow,&r); CHKERRQ(ierr);
832   ierr = ISRestoreIndices(isicol,&ic); CHKERRQ(ierr);
833   ierr = ISDestroy(isicol); CHKERRQ(ierr);
834   PetscFree(fill); PetscFree(im);
835 
836   PLogInfo((PetscObject)A,
837     "Info:MatILUFactorSymbolic_SeqBAIJ:Realloc %d Fill ratio:given %g needed %g\n",
838                              realloc,f,((double)ainew[n])/((double)ai[prow]));
839 
840   /* put together the new matrix */
841   ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
842   b = (Mat_SeqBAIJ *) (*fact)->data;
843   PetscFree(b->imax);
844   b->singlemalloc = 0;
845   len = bs*bs*ainew[n]*sizeof(Scalar);
846   /* the next line frees the default space generated by the Create() */
847   PetscFree(b->a); PetscFree(b->ilen);
848   b->a          = (Scalar *) PetscMalloc( len ); CHKPTRQ(b->a);
849   b->j          = ajnew;
850   b->i          = ainew;
851   for ( i=0; i<n; i++ ) dloc[i] += ainew[i];
852   b->diag       = dloc;
853   b->ilen       = 0;
854   b->imax       = 0;
855   b->row        = isrow;
856   b->col        = iscol;
857   b->solve_work = (Scalar *) PetscMalloc( (bs*n+bs)*sizeof(Scalar));
858   CHKPTRQ(b->solve_work);
859   /* In b structure:  Free imax, ilen, old a, old j.
860      Allocate dloc, solve_work, new a, new j */
861   PLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs*bs*ainew[n]*sizeof(Scalar));
862   b->maxnz          = b->nz = ainew[n];
863   (*fact)->factor   = FACTOR_LU;
864   return 0;
865 }
866 
867 
868 
869 
870