xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision 911f9fe8bd6cba67889e436249bb2d60a401f759)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 2 by 2
12 */
13 /* MatLUFactorNumeric_SeqBAIJ_2_newdatastruct -
14      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
15        Kernel_A_gets_A_times_B()
16        Kernel_A_gets_A_minus_B_times_C()
17        Kernel_A_gets_inverse_A()
18 */
19 #undef __FUNCT__
20 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_newdatastruct"
21 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
22 {
23   Mat            C=B;
24   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
25   IS             isrow = b->row,isicol = b->icol;
26   PetscErrorCode ierr;
27   const PetscInt *r,*ic,*ics;
28   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
29   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
30   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
31   PetscInt       bs2 = a->bs2,flg;
32   PetscReal      shift = info->shiftinblocks;
33 
34   PetscFunctionBegin;
35   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
36   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
37 
38   /* generate work space needed by the factorization */
39   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
40   mwork = rtmp + bs2*n;
41   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
42   ics  = ic;
43 
44   for (i=0; i<n; i++){
45     /* zero rtmp */
46     /* L part */
47     nz    = bi[i+1] - bi[i];
48     bjtmp = bj + bi[i];
49     for  (j=0; j<nz; j++){
50       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51     }
52 
53     /* U part */
54     nz = bi[2*n-i+1] - bi[2*n-i];
55     bjtmp = bj + bi[2*n-i];
56     for  (j=0; j<nz; j++){
57       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
58     }
59 
60     /* load in initial (unfactored row) */
61     nz    = ai[r[i]+1] - ai[r[i]];
62     ajtmp = aj + ai[r[i]];
63     v     = aa + bs2*ai[r[i]];
64     for (j=0; j<nz; j++) {
65       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
66     }
67 
68     /* elimination */
69     bjtmp = bj + bi[i];
70     row   = *bjtmp++;
71     nzL   = bi[i+1] - bi[i];
72     k   = 0;
73     while  (k < nzL) {
74       pc = rtmp + bs2*row;
75       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
76       if (flg) {
77         pv = b->a + bs2*bdiag[row];
78         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
79         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
80 
81         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
82         pv = b->a + bs2*bi[2*n-row];
83         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
84         for (j=0; j<nz; j++) {
85           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
86           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
87           v    = rtmp + 4*pj[j];
88           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
89           pv  += 4;
90         }
91         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
92       }
93       row = *bjtmp++; k++;
94     }
95 
96     /* finished row so stick it into b->a */
97     /* L part */
98     pv   = b->a + bs2*bi[i] ;
99     pj   = b->j + bi[i] ;
100     nz   = bi[i+1] - bi[i];
101     for (j=0; j<nz; j++) {
102       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
103     }
104 
105     /* Mark diagonal and invert diagonal for simplier triangular solves */
106     pv   = b->a + bs2*bdiag[i];
107     pj   = b->j + bdiag[i];
108     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
109     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
110     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
111 
112     /* U part */
113     pv = b->a + bs2*bi[2*n-i];
114     pj = b->j + bi[2*n-i];
115     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
116     for (j=0; j<nz; j++){
117       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
118     }
119   }
120 
121   ierr = PetscFree(rtmp);CHKERRQ(ierr);
122   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
123   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
124 
125   C->assembled = PETSC_TRUE;
126   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
127   PetscFunctionReturn(0);
128 }
129 
130 /*
131   MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct -
132     copied from MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(), then remove ordering
133 */
134 #undef __FUNCT__
135 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct"
136 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
137 {
138   Mat            C=B;
139   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
140   PetscErrorCode ierr;
141   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
142   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
143   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
144   PetscInt       bs2 = a->bs2,flg;
145   PetscReal      shift = info->shiftinblocks;
146 
147   PetscFunctionBegin;
148   /* generate work space needed by the factorization */
149   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
150   mwork = rtmp + bs2*n;
151   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
152 
153   for (i=0; i<n; i++){
154     /* zero rtmp */
155     /* L part */
156     nz    = bi[i+1] - bi[i];
157     bjtmp = bj + bi[i];
158     for  (j=0; j<nz; j++){
159       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
160     }
161 
162     /* U part */
163     nz = bi[2*n-i+1] - bi[2*n-i];
164     bjtmp = bj + bi[2*n-i];
165     for  (j=0; j<nz; j++){
166       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
167     }
168 
169     /* load in initial (unfactored row) */
170     nz    = ai[i+1] - ai[i];
171     ajtmp = aj + ai[i];
172     v     = aa + bs2*ai[i];
173     for (j=0; j<nz; j++) {
174       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
175     }
176 
177     /* elimination */
178     bjtmp = bj + bi[i];
179     row   = *bjtmp++;
180     nzL   = bi[i+1] - bi[i];
181     k   = 0;
182     while  (k < nzL) {
183       pc = rtmp + bs2*row;
184       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
185       if (flg) {
186         pv = b->a + bs2*bdiag[row];
187         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
188         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
189 
190         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
191         pv = b->a + bs2*bi[2*n-row];
192         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
193         for (j=0; j<nz; j++) {
194           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
195           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
196           v    = rtmp + 4*pj[j];
197           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
198           pv  += 4;
199         }
200         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
201       }
202       row = *bjtmp++; k++;
203     }
204 
205     /* finished row so stick it into b->a */
206     /* L part */
207     pv   = b->a + bs2*bi[i] ;
208     pj   = b->j + bi[i] ;
209     nz   = bi[i+1] - bi[i];
210     for (j=0; j<nz; j++) {
211       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
212     }
213 
214     /* Mark diagonal and invert diagonal for simplier triangular solves */
215     pv   = b->a + bs2*bdiag[i];
216     pj   = b->j + bdiag[i];
217     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
218     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
219     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
220 
221     /* U part */
222     pv = b->a + bs2*bi[2*n-i];
223     pj = b->j + bi[2*n-i];
224     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
225     for (j=0; j<nz; j++){
226       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
227     }
228   }
229 
230   ierr = PetscFree(rtmp);CHKERRQ(ierr);
231   C->assembled = PETSC_TRUE;
232   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
233   PetscFunctionReturn(0);
234 }
235 
236 #undef __FUNCT__
237 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2"
238 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B,Mat A,const MatFactorInfo *info)
239 {
240   Mat            C = B;
241   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
242   IS             isrow = b->row,isicol = b->icol;
243   PetscErrorCode ierr;
244   const PetscInt *r,*ic;
245   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
246   PetscInt       *ajtmpold,*ajtmp,nz,row;
247   PetscInt       *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj;
248   MatScalar      *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4;
249   MatScalar      p1,p2,p3,p4;
250   MatScalar      *ba = b->a,*aa = a->a;
251   PetscReal      shift = info->shiftinblocks;
252 
253   PetscFunctionBegin;
254   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
255   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
256   ierr  = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
257 
258   for (i=0; i<n; i++) {
259     nz    = bi[i+1] - bi[i];
260     ajtmp = bj + bi[i];
261     for  (j=0; j<nz; j++) {
262       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
263     }
264     /* load in initial (unfactored row) */
265     idx      = r[i];
266     nz       = ai[idx+1] - ai[idx];
267     ajtmpold = aj + ai[idx];
268     v        = aa + 4*ai[idx];
269     for (j=0; j<nz; j++) {
270       x    = rtmp+4*ic[ajtmpold[j]];
271       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
272       v    += 4;
273     }
274     row = *ajtmp++;
275     while (row < i) {
276       pc = rtmp + 4*row;
277       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
278       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
279         pv = ba + 4*diag_offset[row];
280         pj = bj + diag_offset[row] + 1;
281         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
282         pc[0] = m1 = p1*x1 + p3*x2;
283         pc[1] = m2 = p2*x1 + p4*x2;
284         pc[2] = m3 = p1*x3 + p3*x4;
285         pc[3] = m4 = p2*x3 + p4*x4;
286         nz = bi[row+1] - diag_offset[row] - 1;
287         pv += 4;
288         for (j=0; j<nz; j++) {
289           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
290           x    = rtmp + 4*pj[j];
291           x[0] -= m1*x1 + m3*x2;
292           x[1] -= m2*x1 + m4*x2;
293           x[2] -= m1*x3 + m3*x4;
294           x[3] -= m2*x3 + m4*x4;
295           pv   += 4;
296         }
297         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
298       }
299       row = *ajtmp++;
300     }
301     /* finished row so stick it into b->a */
302     pv = ba + 4*bi[i];
303     pj = bj + bi[i];
304     nz = bi[i+1] - bi[i];
305     for (j=0; j<nz; j++) {
306       x     = rtmp+4*pj[j];
307       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
308       pv   += 4;
309     }
310     /* invert diagonal block */
311     w = ba + 4*diag_offset[i];
312     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
313   }
314 
315   ierr = PetscFree(rtmp);CHKERRQ(ierr);
316   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
317   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
318   C->ops->solve          = MatSolve_SeqBAIJ_2;
319   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
320   C->assembled = PETSC_TRUE;
321   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
322   PetscFunctionReturn(0);
323 }
324 /*
325       Version for when blocks are 2 by 2 Using natural ordering
326 */
327 #undef __FUNCT__
328 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering"
329 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
330 {
331   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
332   PetscErrorCode ierr;
333   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
334   PetscInt       *ajtmpold,*ajtmp,nz,row;
335   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
336   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
337   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4;
338   MatScalar      *ba = b->a,*aa = a->a;
339   PetscReal      shift = info->shiftinblocks;
340 
341   PetscFunctionBegin;
342   ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
343   for (i=0; i<n; i++) {
344     nz    = bi[i+1] - bi[i];
345     ajtmp = bj + bi[i];
346     for  (j=0; j<nz; j++) {
347       x = rtmp+4*ajtmp[j];
348       x[0]  = x[1]  = x[2]  = x[3]  = 0.0;
349     }
350     /* load in initial (unfactored row) */
351     nz       = ai[i+1] - ai[i];
352     ajtmpold = aj + ai[i];
353     v        = aa + 4*ai[i];
354     for (j=0; j<nz; j++) {
355       x    = rtmp+4*ajtmpold[j];
356       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
357       v    += 4;
358     }
359     row = *ajtmp++;
360     while (row < i) {
361       pc  = rtmp + 4*row;
362       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
363       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
364         pv = ba + 4*diag_offset[row];
365         pj = bj + diag_offset[row] + 1;
366         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
367         pc[0] = m1 = p1*x1 + p3*x2;
368         pc[1] = m2 = p2*x1 + p4*x2;
369         pc[2] = m3 = p1*x3 + p3*x4;
370         pc[3] = m4 = p2*x3 + p4*x4;
371         nz = bi[row+1] - diag_offset[row] - 1;
372         pv += 4;
373         for (j=0; j<nz; j++) {
374           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
375           x    = rtmp + 4*pj[j];
376           x[0] -= m1*x1 + m3*x2;
377           x[1] -= m2*x1 + m4*x2;
378           x[2] -= m1*x3 + m3*x4;
379           x[3] -= m2*x3 + m4*x4;
380           pv   += 4;
381         }
382         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
383       }
384       row = *ajtmp++;
385     }
386     /* finished row so stick it into b->a */
387     pv = ba + 4*bi[i];
388     pj = bj + bi[i];
389     nz = bi[i+1] - bi[i];
390     for (j=0; j<nz; j++) {
391       x      = rtmp+4*pj[j];
392       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
393       /*
394       printf(" col %d:",pj[j]);
395       PetscInt j1;
396       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
397       printf("\n");
398       */
399       pv   += 4;
400     }
401     /* invert diagonal block */
402     w = ba + 4*diag_offset[i];
403     /*
404     printf(" \n%d -th: diag: ",i);
405     for (j=0; j<4; j++){
406       printf(" %g,",w[j]);
407     }
408     printf("\n----------------------------\n");
409     */
410     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
411   }
412 
413   ierr = PetscFree(rtmp);CHKERRQ(ierr);
414   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
415   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
416   C->assembled = PETSC_TRUE;
417   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
418   PetscFunctionReturn(0);
419 }
420 
421 /* ----------------------------------------------------------- */
422 /*
423      Version for when blocks are 1 by 1.
424 */
425 #undef __FUNCT__
426 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_1"
427 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat C,Mat A,const MatFactorInfo *info)
428 {
429   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
430   IS             isrow = b->row,isicol = b->icol;
431   PetscErrorCode ierr;
432   const PetscInt *r,*ic;
433   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
434   PetscInt       *ajtmpold,*ajtmp,nz,row,*ai = a->i,*aj = a->j;
435   PetscInt       *diag_offset = b->diag,diag,*pj;
436   MatScalar      *pv,*v,*rtmp,multiplier,*pc;
437   MatScalar      *ba = b->a,*aa = a->a;
438   PetscTruth     row_identity, col_identity;
439 
440   PetscFunctionBegin;
441   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
442   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
443   ierr  = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
444 
445   for (i=0; i<n; i++) {
446     nz    = bi[i+1] - bi[i];
447     ajtmp = bj + bi[i];
448     for  (j=0; j<nz; j++) rtmp[ajtmp[j]] = 0.0;
449 
450     /* load in initial (unfactored row) */
451     nz       = ai[r[i]+1] - ai[r[i]];
452     ajtmpold = aj + ai[r[i]];
453     v        = aa + ai[r[i]];
454     for (j=0; j<nz; j++) rtmp[ic[ajtmpold[j]]] =  v[j];
455 
456     row = *ajtmp++;
457     while (row < i) {
458       pc = rtmp + row;
459       if (*pc != 0.0) {
460         pv         = ba + diag_offset[row];
461         pj         = bj + diag_offset[row] + 1;
462         multiplier = *pc * *pv++;
463         *pc        = multiplier;
464         nz         = bi[row+1] - diag_offset[row] - 1;
465         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
466         ierr = PetscLogFlops(1.0+2.0*nz);CHKERRQ(ierr);
467       }
468       row = *ajtmp++;
469     }
470     /* finished row so stick it into b->a */
471     pv = ba + bi[i];
472     pj = bj + bi[i];
473     nz = bi[i+1] - bi[i];
474     for (j=0; j<nz; j++) {pv[j] = rtmp[pj[j]];}
475     diag = diag_offset[i] - bi[i];
476     /* check pivot entry for current row */
477     if (pv[diag] == 0.0) {
478       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot: row in original ordering %D in permuted ordering %D",r[i],i);
479     }
480     pv[diag] = 1.0/pv[diag];
481   }
482 
483   ierr = PetscFree(rtmp);CHKERRQ(ierr);
484   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
485   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
486   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
487   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
488   if (row_identity && col_identity) {
489     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
490     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
491   } else {
492     C->ops->solve          = MatSolve_SeqBAIJ_1;
493     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
494   }
495   C->assembled = PETSC_TRUE;
496   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
497   PetscFunctionReturn(0);
498 }
499 
500 EXTERN_C_BEGIN
501 #undef __FUNCT__
502 #define __FUNCT__ "MatGetFactor_seqbaij_petsc"
503 PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A,MatFactorType ftype,Mat *B)
504 {
505   PetscInt           n = A->rmap->n;
506   PetscErrorCode     ierr;
507 
508   PetscFunctionBegin;
509   ierr = MatCreate(((PetscObject)A)->comm,B);CHKERRQ(ierr);
510   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
511   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
512     ierr = MatSetType(*B,MATSEQBAIJ);CHKERRQ(ierr);
513     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
514     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
515     (*B)->ops->iludtfactor       = MatILUDTFactor_SeqBAIJ;
516   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
517     ierr = MatSetType(*B,MATSEQSBAIJ);CHKERRQ(ierr);
518     ierr = MatSeqSBAIJSetPreallocation(*B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
519     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
520     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
521   } else SETERRQ(PETSC_ERR_SUP,"Factor type not supported");
522   (*B)->factor = ftype;
523   PetscFunctionReturn(0);
524 }
525 EXTERN_C_END
526 
527 EXTERN_C_BEGIN
528 #undef __FUNCT__
529 #define __FUNCT__ "MatGetFactorAvailable_seqaij_petsc"
530 PetscErrorCode MatGetFactorAvailable_seqbaij_petsc(Mat A,MatFactorType ftype,PetscTruth *flg)
531 {
532   PetscFunctionBegin;
533   *flg = PETSC_TRUE;
534   PetscFunctionReturn(0);
535 }
536 EXTERN_C_END
537 
538 /* ----------------------------------------------------------- */
539 #undef __FUNCT__
540 #define __FUNCT__ "MatLUFactor_SeqBAIJ"
541 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,const MatFactorInfo *info)
542 {
543   PetscErrorCode ierr;
544   Mat            C;
545 
546   PetscFunctionBegin;
547   ierr = MatGetFactor(A,MAT_SOLVER_PETSC,MAT_FACTOR_LU,&C);CHKERRQ(ierr);
548   ierr = MatLUFactorSymbolic(C,A,row,col,info);CHKERRQ(ierr);
549   ierr = MatLUFactorNumeric(C,A,info);CHKERRQ(ierr);
550   A->ops->solve            = C->ops->solve;
551   A->ops->solvetranspose   = C->ops->solvetranspose;
552   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
553   ierr = PetscLogObjectParent(A,((Mat_SeqBAIJ*)(A->data))->icol);CHKERRQ(ierr);
554   PetscFunctionReturn(0);
555 }
556 
557 #include "../src/mat/impls/sbaij/seq/sbaij.h"
558 #undef __FUNCT__
559 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N"
560 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
561 {
562   PetscErrorCode ierr;
563   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
564   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
565   IS             ip=b->row;
566   const PetscInt *rip;
567   PetscInt       i,j,mbs=a->mbs,bs=A->rmap->bs,*bi=b->i,*bj=b->j,*bcol;
568   PetscInt       *ai=a->i,*aj=a->j;
569   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
570   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
571   PetscReal      zeropivot,rs,shiftnz;
572   PetscReal      shiftpd;
573   ChShift_Ctx    sctx;
574   PetscInt       newshift;
575 
576   PetscFunctionBegin;
577   if (bs > 1) {
578     if (!a->sbaijMat){
579       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
580     }
581     ierr = (a->sbaijMat)->ops->choleskyfactornumeric(C,a->sbaijMat,info);CHKERRQ(ierr);
582     ierr = MatDestroy(a->sbaijMat);CHKERRQ(ierr);
583     a->sbaijMat = PETSC_NULL;
584     PetscFunctionReturn(0);
585   }
586 
587   /* initialization */
588   shiftnz   = info->shiftnz;
589   shiftpd   = info->shiftpd;
590   zeropivot = info->zeropivot;
591 
592   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
593   nz   = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
594   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
595   jl   = il + mbs;
596   rtmp = (MatScalar*)(jl + mbs);
597 
598   sctx.shift_amount = 0;
599   sctx.nshift       = 0;
600   do {
601     sctx.chshift = PETSC_FALSE;
602     for (i=0; i<mbs; i++) {
603       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
604     }
605 
606     for (k = 0; k<mbs; k++){
607       bval = ba + bi[k];
608       /* initialize k-th row by the perm[k]-th row of A */
609       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
610       for (j = jmin; j < jmax; j++){
611         col = rip[aj[j]];
612         if (col >= k){ /* only take upper triangular entry */
613           rtmp[col] = aa[j];
614           *bval++  = 0.0; /* for in-place factorization */
615         }
616       }
617 
618       /* shift the diagonal of the matrix */
619       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
620 
621       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
622       dk = rtmp[k];
623       i = jl[k]; /* first row to be added to k_th row  */
624 
625       while (i < k){
626         nexti = jl[i]; /* next row to be added to k_th row */
627 
628         /* compute multiplier, update diag(k) and U(i,k) */
629         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
630         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
631         dk += uikdi*ba[ili];
632         ba[ili] = uikdi; /* -U(i,k) */
633 
634         /* add multiple of row i to k-th row */
635         jmin = ili + 1; jmax = bi[i+1];
636         if (jmin < jmax){
637           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
638           /* update il and jl for row i */
639           il[i] = jmin;
640           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
641         }
642         i = nexti;
643       }
644 
645       /* shift the diagonals when zero pivot is detected */
646       /* compute rs=sum of abs(off-diagonal) */
647       rs   = 0.0;
648       jmin = bi[k]+1;
649       nz   = bi[k+1] - jmin;
650       if (nz){
651         bcol = bj + jmin;
652         while (nz--){
653           rs += PetscAbsScalar(rtmp[*bcol]);
654           bcol++;
655         }
656       }
657 
658       sctx.rs = rs;
659       sctx.pv = dk;
660       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
661       if (newshift == 1) break;
662 
663       /* copy data into U(k,:) */
664       ba[bi[k]] = 1.0/dk; /* U(k,k) */
665       jmin = bi[k]+1; jmax = bi[k+1];
666       if (jmin < jmax) {
667         for (j=jmin; j<jmax; j++){
668           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
669         }
670         /* add the k-th row into il and jl */
671         il[k] = jmin;
672         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
673       }
674     }
675   } while (sctx.chshift);
676   ierr = PetscFree(il);CHKERRQ(ierr);
677 
678   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
679   C->assembled    = PETSC_TRUE;
680   C->preallocated = PETSC_TRUE;
681   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
682   if (sctx.nshift){
683     if (shiftpd) {
684       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
685     } else if (shiftnz) {
686       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
687     }
688   }
689   PetscFunctionReturn(0);
690 }
691 
692 #undef __FUNCT__
693 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering"
694 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
695 {
696   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
697   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
698   PetscErrorCode ierr;
699   PetscInt       i,j,am=a->mbs;
700   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
701   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
702   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
703   PetscReal      zeropivot,rs,shiftnz;
704   PetscReal      shiftpd;
705   ChShift_Ctx    sctx;
706   PetscInt       newshift;
707 
708   PetscFunctionBegin;
709   /* initialization */
710   shiftnz   = info->shiftnz;
711   shiftpd   = info->shiftpd;
712   zeropivot = info->zeropivot;
713 
714   nz   = (2*am+1)*sizeof(PetscInt)+am*sizeof(MatScalar);
715   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
716   jl   = il + am;
717   rtmp = (MatScalar*)(jl + am);
718 
719   sctx.shift_amount = 0;
720   sctx.nshift       = 0;
721   do {
722     sctx.chshift = PETSC_FALSE;
723     for (i=0; i<am; i++) {
724       rtmp[i] = 0.0; jl[i] = am; il[0] = 0;
725     }
726 
727     for (k = 0; k<am; k++){
728     /* initialize k-th row with elements nonzero in row perm(k) of A */
729       nz   = ai[k+1] - ai[k];
730       acol = aj + ai[k];
731       aval = aa + ai[k];
732       bval = ba + bi[k];
733       while (nz -- ){
734         if (*acol < k) { /* skip lower triangular entries */
735           acol++; aval++;
736         } else {
737           rtmp[*acol++] = *aval++;
738           *bval++       = 0.0; /* for in-place factorization */
739         }
740       }
741 
742       /* shift the diagonal of the matrix */
743       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
744 
745       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
746       dk = rtmp[k];
747       i  = jl[k]; /* first row to be added to k_th row  */
748 
749       while (i < k){
750         nexti = jl[i]; /* next row to be added to k_th row */
751         /* compute multiplier, update D(k) and U(i,k) */
752         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
753         uikdi = - ba[ili]*ba[bi[i]];
754         dk   += uikdi*ba[ili];
755         ba[ili] = uikdi; /* -U(i,k) */
756 
757         /* add multiple of row i to k-th row ... */
758         jmin = ili + 1;
759         nz   = bi[i+1] - jmin;
760         if (nz > 0){
761           bcol = bj + jmin;
762           bval = ba + jmin;
763           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
764           /* update il and jl for i-th row */
765           il[i] = jmin;
766           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
767         }
768         i = nexti;
769       }
770 
771       /* shift the diagonals when zero pivot is detected */
772       /* compute rs=sum of abs(off-diagonal) */
773       rs   = 0.0;
774       jmin = bi[k]+1;
775       nz   = bi[k+1] - jmin;
776       if (nz){
777         bcol = bj + jmin;
778         while (nz--){
779           rs += PetscAbsScalar(rtmp[*bcol]);
780           bcol++;
781         }
782       }
783 
784       sctx.rs = rs;
785       sctx.pv = dk;
786       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
787       if (newshift == 1) break;    /* sctx.shift_amount is updated */
788 
789       /* copy data into U(k,:) */
790       ba[bi[k]] = 1.0/dk;
791       jmin      = bi[k]+1;
792       nz        = bi[k+1] - jmin;
793       if (nz){
794         bcol = bj + jmin;
795         bval = ba + jmin;
796         while (nz--){
797           *bval++       = rtmp[*bcol];
798           rtmp[*bcol++] = 0.0;
799         }
800         /* add k-th row into il and jl */
801         il[k] = jmin;
802         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
803       }
804     }
805   } while (sctx.chshift);
806   ierr = PetscFree(il);CHKERRQ(ierr);
807 
808   C->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
809   C->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering;
810   C->assembled    = PETSC_TRUE;
811   C->preallocated = PETSC_TRUE;
812   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
813     if (sctx.nshift){
814     if (shiftnz) {
815       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
816     } else if (shiftpd) {
817       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
818     }
819   }
820   PetscFunctionReturn(0);
821 }
822 
823 #include "petscbt.h"
824 #include "../src/mat/utils/freespace.h"
825 #undef __FUNCT__
826 #define __FUNCT__ "MatICCFactorSymbolic_SeqBAIJ"
827 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
828 {
829   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
830   Mat_SeqSBAIJ       *b;
831   Mat                B;
832   PetscErrorCode     ierr;
833   PetscTruth         perm_identity;
834   PetscInt           reallocs=0,i,*ai=a->i,*aj=a->j,am=a->mbs,bs=A->rmap->bs,*ui;
835   const PetscInt     *rip;
836   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
837   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,ncols_upper,*cols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
838   PetscReal          fill=info->fill,levels=info->levels;
839   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
840   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
841   PetscBT            lnkbt;
842 
843   PetscFunctionBegin;
844   if (bs > 1){
845     if (!a->sbaijMat){
846       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
847     }
848     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ;  /* undue the change made in MatGetFactor_seqbaij_petsc */
849     ierr = MatICCFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
850     PetscFunctionReturn(0);
851   }
852 
853   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
854   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
855 
856   /* special case that simply copies fill pattern */
857   if (!levels && perm_identity) {
858     ierr = MatMarkDiagonal_SeqBAIJ(A);CHKERRQ(ierr);
859     ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
860     for (i=0; i<am; i++) {
861       ui[i] = ai[i+1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
862     }
863     B = fact;
864     ierr = MatSeqSBAIJSetPreallocation(B,1,0,ui);CHKERRQ(ierr);
865 
866 
867     b  = (Mat_SeqSBAIJ*)B->data;
868     uj = b->j;
869     for (i=0; i<am; i++) {
870       aj = a->j + a->diag[i];
871       for (j=0; j<ui[i]; j++){
872         *uj++ = *aj++;
873       }
874       b->ilen[i] = ui[i];
875     }
876     ierr = PetscFree(ui);CHKERRQ(ierr);
877     B->factor = MAT_FACTOR_NONE;
878     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
879     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
880     B->factor = MAT_FACTOR_ICC;
881 
882     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
883     PetscFunctionReturn(0);
884   }
885 
886   /* initialization */
887   ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
888   ui[0] = 0;
889   ierr  = PetscMalloc((2*am+1)*sizeof(PetscInt),&cols_lvl);CHKERRQ(ierr);
890 
891   /* jl: linked list for storing indices of the pivot rows
892      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
893   ierr = PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
894   il         = jl + am;
895   uj_ptr     = (PetscInt**)(il + am);
896   uj_lvl_ptr = (PetscInt**)(uj_ptr + am);
897   for (i=0; i<am; i++){
898     jl[i] = am; il[i] = 0;
899   }
900 
901   /* create and initialize a linked list for storing column indices of the active row k */
902   nlnk = am + 1;
903   ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
904 
905   /* initial FreeSpace size is fill*(ai[am]+1) */
906   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
907   current_space = free_space;
908   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
909   current_space_lvl = free_space_lvl;
910 
911   for (k=0; k<am; k++){  /* for each active row k */
912     /* initialize lnk by the column indices of row rip[k] of A */
913     nzk   = 0;
914     ncols = ai[rip[k]+1] - ai[rip[k]];
915     ncols_upper = 0;
916     cols        = cols_lvl + am;
917     for (j=0; j<ncols; j++){
918       i = rip[*(aj + ai[rip[k]] + j)];
919       if (i >= k){ /* only take upper triangular entry */
920         cols[ncols_upper] = i;
921         cols_lvl[ncols_upper] = -1;  /* initialize level for nonzero entries */
922         ncols_upper++;
923       }
924     }
925     ierr = PetscIncompleteLLAdd(ncols_upper,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
926     nzk += nlnk;
927 
928     /* update lnk by computing fill-in for each pivot row to be merged in */
929     prow = jl[k]; /* 1st pivot row */
930 
931     while (prow < k){
932       nextprow = jl[prow];
933 
934       /* merge prow into k-th row */
935       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
936       jmax = ui[prow+1];
937       ncols = jmax-jmin;
938       i     = jmin - ui[prow];
939       cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
940       for (j=0; j<ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
941       ierr = PetscIncompleteLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
942       nzk += nlnk;
943 
944       /* update il and jl for prow */
945       if (jmin < jmax){
946         il[prow] = jmin;
947         j = *cols; jl[prow] = jl[j]; jl[j] = prow;
948       }
949       prow = nextprow;
950     }
951 
952     /* if free space is not available, make more free space */
953     if (current_space->local_remaining<nzk) {
954       i = am - k + 1; /* num of unfactored rows */
955       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
956       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
957       ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
958       reallocs++;
959     }
960 
961     /* copy data into free_space and free_space_lvl, then initialize lnk */
962     ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
963 
964     /* add the k-th row into il and jl */
965     if (nzk-1 > 0){
966       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
967       jl[k] = jl[i]; jl[i] = k;
968       il[k] = ui[k] + 1;
969     }
970     uj_ptr[k]     = current_space->array;
971     uj_lvl_ptr[k] = current_space_lvl->array;
972 
973     current_space->array           += nzk;
974     current_space->local_used      += nzk;
975     current_space->local_remaining -= nzk;
976 
977     current_space_lvl->array           += nzk;
978     current_space_lvl->local_used      += nzk;
979     current_space_lvl->local_remaining -= nzk;
980 
981     ui[k+1] = ui[k] + nzk;
982   }
983 
984 #if defined(PETSC_USE_INFO)
985   if (ai[am] != 0) {
986     PetscReal af = ((PetscReal)(2*ui[am]-am))/((PetscReal)ai[am]);
987     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
988     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
989     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
990   } else {
991     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
992   }
993 #endif
994 
995   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
996   ierr = PetscFree(jl);CHKERRQ(ierr);
997   ierr = PetscFree(cols_lvl);CHKERRQ(ierr);
998 
999   /* destroy list of free space and other temporary array(s) */
1000   ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1001   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1002   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1003   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
1004 
1005   /* put together the new matrix in MATSEQSBAIJ format */
1006   B = fact;
1007   ierr = MatSeqSBAIJSetPreallocation(B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1008 
1009   b = (Mat_SeqSBAIJ*)B->data;
1010   b->singlemalloc = PETSC_FALSE;
1011   b->free_a       = PETSC_TRUE;
1012   b->free_ij       = PETSC_TRUE;
1013   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1014   b->j    = uj;
1015   b->i    = ui;
1016   b->diag = 0;
1017   b->ilen = 0;
1018   b->imax = 0;
1019   b->row  = perm;
1020   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1021   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1022   b->icol = perm;
1023   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1024   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1025   ierr    = PetscLogObjectMemory(B,(ui[am]-am)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1026   b->maxnz = b->nz = ui[am];
1027 
1028   B->info.factor_mallocs    = reallocs;
1029   B->info.fill_ratio_given  = fill;
1030   if (ai[am] != 0) {
1031     B->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1032   } else {
1033     B->info.fill_ratio_needed = 0.0;
1034   }
1035   if (perm_identity){
1036     B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1037     B->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1038     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1039   } else {
1040     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1041   }
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 #undef __FUNCT__
1046 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqBAIJ"
1047 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
1048 {
1049   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1050   Mat_SeqSBAIJ       *b;
1051   Mat                B;
1052   PetscErrorCode     ierr;
1053   PetscTruth         perm_identity;
1054   PetscReal          fill = info->fill;
1055   const PetscInt     *rip;
1056   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,*ai=a->i,*aj=a->j,reallocs=0,prow;
1057   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
1058   PetscInt           nlnk,*lnk,ncols,ncols_upper,*cols,*uj,**ui_ptr,*uj_ptr;
1059   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1060   PetscBT            lnkbt;
1061 
1062   PetscFunctionBegin;
1063   if (bs > 1) { /* convert to seqsbaij */
1064     if (!a->sbaijMat){
1065       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
1066     }
1067     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1068     ierr = MatCholeskyFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
1069     PetscFunctionReturn(0);
1070   }
1071 
1072   /* check whether perm is the identity mapping */
1073   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
1074   if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported");
1075   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
1076 
1077   /* initialization */
1078   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
1079   ui[0] = 0;
1080 
1081   /* jl: linked list for storing indices of the pivot rows
1082      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1083   ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
1084   il     = jl + mbs;
1085   cols   = il + mbs;
1086   ui_ptr = (PetscInt**)(cols + mbs);
1087   for (i=0; i<mbs; i++){
1088     jl[i] = mbs; il[i] = 0;
1089   }
1090 
1091   /* create and initialize a linked list for storing column indices of the active row k */
1092   nlnk = mbs + 1;
1093   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1094 
1095   /* initial FreeSpace size is fill*(ai[mbs]+1) */
1096   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
1097   current_space = free_space;
1098 
1099   for (k=0; k<mbs; k++){  /* for each active row k */
1100     /* initialize lnk by the column indices of row rip[k] of A */
1101     nzk   = 0;
1102     ncols = ai[rip[k]+1] - ai[rip[k]];
1103     ncols_upper = 0;
1104     for (j=0; j<ncols; j++){
1105       i = rip[*(aj + ai[rip[k]] + j)];
1106       if (i >= k){ /* only take upper triangular entry */
1107         cols[ncols_upper] = i;
1108         ncols_upper++;
1109       }
1110     }
1111     ierr = PetscLLAdd(ncols_upper,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1112     nzk += nlnk;
1113 
1114     /* update lnk by computing fill-in for each pivot row to be merged in */
1115     prow = jl[k]; /* 1st pivot row */
1116 
1117     while (prow < k){
1118       nextprow = jl[prow];
1119       /* merge prow into k-th row */
1120       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1121       jmax = ui[prow+1];
1122       ncols = jmax-jmin;
1123       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1124       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1125       nzk += nlnk;
1126 
1127       /* update il and jl for prow */
1128       if (jmin < jmax){
1129         il[prow] = jmin;
1130         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
1131       }
1132       prow = nextprow;
1133     }
1134 
1135     /* if free space is not available, make more free space */
1136     if (current_space->local_remaining<nzk) {
1137       i = mbs - k + 1; /* num of unfactored rows */
1138       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1139       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
1140       reallocs++;
1141     }
1142 
1143     /* copy data into free space, then initialize lnk */
1144     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1145 
1146     /* add the k-th row into il and jl */
1147     if (nzk-1 > 0){
1148       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1149       jl[k] = jl[i]; jl[i] = k;
1150       il[k] = ui[k] + 1;
1151     }
1152     ui_ptr[k] = current_space->array;
1153     current_space->array           += nzk;
1154     current_space->local_used      += nzk;
1155     current_space->local_remaining -= nzk;
1156 
1157     ui[k+1] = ui[k] + nzk;
1158   }
1159 
1160 #if defined(PETSC_USE_INFO)
1161   if (ai[mbs] != 0) {
1162     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1163     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
1164     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
1165     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
1166   } else {
1167     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
1168   }
1169 #endif
1170 
1171   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1172   ierr = PetscFree(jl);CHKERRQ(ierr);
1173 
1174   /* destroy list of free space and other temporary array(s) */
1175   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1176   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1177   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1178 
1179   /* put together the new matrix in MATSEQSBAIJ format */
1180   B    = fact;
1181   ierr = MatSeqSBAIJSetPreallocation(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1182 
1183   b = (Mat_SeqSBAIJ*)B->data;
1184   b->singlemalloc = PETSC_FALSE;
1185   b->free_a       = PETSC_TRUE;
1186   b->free_ij      = PETSC_TRUE;
1187   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1188   b->j    = uj;
1189   b->i    = ui;
1190   b->diag = 0;
1191   b->ilen = 0;
1192   b->imax = 0;
1193   b->row  = perm;
1194   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1195   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1196   b->icol = perm;
1197   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1198   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1199   ierr    = PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1200   b->maxnz = b->nz = ui[mbs];
1201 
1202   B->info.factor_mallocs    = reallocs;
1203   B->info.fill_ratio_given  = fill;
1204   if (ai[mbs] != 0) {
1205     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1206   } else {
1207     B->info.fill_ratio_needed = 0.0;
1208   }
1209   if (perm_identity){
1210     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1211   } else {
1212     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1213   }
1214   PetscFunctionReturn(0);
1215 }
1216 
1217 /* --------------------------------------------------------- */
1218 #undef __FUNCT__
1219 #define __FUNCT__ "MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct"
1220 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1221 {
1222   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1223   PetscErrorCode ierr;
1224   const PetscInt *ai=a->i,*aj=a->j,*vi;
1225   PetscInt       i,k,n=a->mbs;
1226   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,*adiag=a->diag;
1227   MatScalar      *aa=a->a,*v;
1228   PetscScalar    *x,*b,*s,*t,*ls;
1229 
1230   PetscFunctionBegin;
1231   /* printf("MatSolve_SeqBAIJ_NaturalOrdering_iludt..bs %d\n",bs); */
1232   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1233   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1234   t  = a->solve_work;
1235 
1236   /* forward solve the lower triangular */
1237   ierr = PetscMemcpy(t,b,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy 1st block of b to t */
1238 
1239   for (i=1; i<n; i++) {
1240     v   = aa + bs2*ai[i];
1241     vi  = aj + ai[i];
1242     nz = ai[i+1] - ai[i];
1243     s = t + bs*i;
1244     ierr = PetscMemcpy(s,b+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy i_th block of b to t */
1245     for(k=0;k<nz;k++){
1246       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[k]);
1247       v += bs2;
1248     }
1249   }
1250 
1251   /* backward solve the upper triangular */
1252   ls = a->solve_work + A->cmap->n;
1253   for (i=n-1; i>=0; i--){
1254     v  = aa + bs2*ai[2*n-i];
1255     vi = aj + ai[2*n-i];
1256     nz = ai[2*n-i +1] - ai[2*n-i]-1;
1257     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1258     for(k=0;k<nz;k++){
1259       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[k]);
1260       v += bs2;
1261     }
1262     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*adiag[i],t+i*bs); /* *inv(diagonal[i]) */
1263     ierr = PetscMemcpy(x+i*bs,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1264   }
1265 
1266   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1267   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1268   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1269   PetscFunctionReturn(0);
1270 }
1271 
1272 #undef __FUNCT__
1273 #define __FUNCT__ "MatSolve_SeqBAIJ_N_newdatastruct"
1274 PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx)
1275 {
1276   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1277   IS             iscol=a->col,isrow=a->row;
1278   PetscErrorCode ierr;
1279   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1280   PetscInt       i,m,n=a->mbs;
1281   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,k;
1282   MatScalar      *aa=a->a,*v;
1283   PetscScalar    *x,*b,*s,*t,*ls;
1284 
1285   PetscFunctionBegin;
1286   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1287   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1288   t  = a->solve_work;
1289 
1290   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1291   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1292 
1293   /* forward solve the lower triangular */
1294   ierr = PetscMemcpy(t,b+bs*r[0],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1295   for (i=1; i<n; i++) {
1296     v   = aa + bs2*ai[i];
1297     vi  = aj + ai[i];
1298     nz = ai[i+1] - ai[i];
1299     s = t + bs*i;
1300     ierr = PetscMemcpy(s,b+bs*r[i],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1301     for(m=0;m<nz;m++){
1302       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[m]);
1303       v += bs2;
1304     }
1305   }
1306 
1307   /* backward solve the upper triangular */
1308   ls = a->solve_work + A->cmap->n;
1309   for (i=n-1; i>=0; i--){
1310     k  = 2*n-i;
1311     v  = aa + bs2*ai[k];
1312     vi = aj + ai[k];
1313     nz = ai[k+1] - ai[k] - 1;
1314     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1315     for(m=0;m<nz;m++){
1316       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[m]);
1317       v += bs2;
1318     }
1319     Kernel_w_gets_A_times_v(bs,ls,v,t+i*bs); /* *inv(diagonal[i]) */
1320     ierr = PetscMemcpy(x + bs*c[i],t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1321   }
1322   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1323   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1324   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1325   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1326   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1327   PetscFunctionReturn(0);
1328 }
1329 
1330 #undef __FUNCT__
1331 #define __FUNCT__ "BlockAbs_privat"
1332 PetscErrorCode BlockAbs_private(PetscInt nbs,PetscInt bs2,PetscScalar *blockarray,PetscReal *absarray)
1333 {
1334   PetscErrorCode     ierr;
1335   PetscInt           i,j;
1336   PetscFunctionBegin;
1337   ierr = PetscMemzero(absarray,(nbs+1)*sizeof(PetscReal));CHKERRQ(ierr);
1338   for (i=0; i<nbs; i++){
1339     for (j=0; j<bs2; j++){
1340       if (absarray[i] < PetscAbsScalar(blockarray[i*nbs+j])) absarray[i] = PetscAbsScalar(blockarray[i*nbs+j]);
1341     }
1342   }
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 #undef __FUNCT__
1347 #define __FUNCT__ "MatILUDTFactor_SeqBAIJ"
1348 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A,IS isrow,IS iscol,const MatFactorInfo *info,Mat *fact)
1349 {
1350   Mat                B = *fact;
1351   Mat_SeqBAIJ        *a=(Mat_SeqBAIJ*)A->data,*b;
1352   IS                 isicol;
1353   PetscErrorCode     ierr;
1354   const PetscInt     *r,*ic;
1355   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,bs2=a->bs2,*ai=a->i,*aj=a->j,*ajtmp,*adiag;
1356   PetscInt           *bi,*bj,*bdiag;
1357 
1358   PetscInt           row,nzi,nzi_bl,nzi_bu,*im,dtcount,nzi_al,nzi_au;
1359   PetscInt           nlnk,*lnk;
1360   PetscBT            lnkbt;
1361   PetscTruth         row_identity,icol_identity,both_identity;
1362   MatScalar          *aatmp,*pv,*batmp,*ba,*rtmp,*pc,*multiplier,*vtmp;
1363   const PetscInt     *ics;
1364   PetscInt           j,nz,*pj,*bjtmp,k,ncut,*jtmp;
1365 
1366   PetscReal          dt=info->dt; /* shift=info->shiftinblocks; */
1367   PetscInt           nnz_max;
1368   PetscTruth         missing;
1369   PetscReal          *vtmp_abs;
1370   MatScalar          *v_work;
1371   PetscInt           *v_pivots;
1372 
1373   PetscFunctionBegin;
1374   /* ------- symbolic factorization, can be reused ---------*/
1375   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
1376   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
1377   adiag=a->diag;
1378 
1379   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
1380 
1381   /* bdiag is location of diagonal in factor */
1382   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
1383 
1384   /* allocate row pointers bi */
1385   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1386 
1387   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1388   dtcount = (PetscInt)info->dtcount;
1389   if (dtcount > mbs-1) dtcount = mbs-1;
1390   nnz_max  = ai[mbs]+2*mbs*dtcount +2;
1391   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1392   ierr = PetscMalloc(nnz_max*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1393   nnz_max = nnz_max*bs2;
1394   ierr = PetscMalloc(nnz_max*sizeof(MatScalar),&ba);CHKERRQ(ierr);
1395 
1396   /* put together the new matrix */
1397   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1398   ierr = PetscLogObjectParent(B,isicol);CHKERRQ(ierr);
1399   b    = (Mat_SeqBAIJ*)(B)->data;
1400   b->free_a       = PETSC_TRUE;
1401   b->free_ij      = PETSC_TRUE;
1402   b->singlemalloc = PETSC_FALSE;
1403   b->a          = ba;
1404   b->j          = bj;
1405   b->i          = bi;
1406   b->diag       = bdiag;
1407   b->ilen       = 0;
1408   b->imax       = 0;
1409   b->row        = isrow;
1410   b->col        = iscol;
1411   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
1412   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
1413   b->icol       = isicol;
1414   ierr = PetscMalloc((bs*(mbs+1))*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1415 
1416   ierr = PetscLogObjectMemory(B,nnz_max*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1417   b->maxnz = nnz_max/bs2;
1418 
1419   (B)->factor                = MAT_FACTOR_ILUDT;
1420   (B)->info.factor_mallocs   = 0;
1421   (B)->info.fill_ratio_given = ((PetscReal)nnz_max)/((PetscReal)(ai[mbs]*bs2));
1422   CHKMEMQ;
1423   /* ------- end of symbolic factorization ---------*/
1424   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1425   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1426   ics  = ic;
1427 
1428   /* linked list for storing column indices of the active row */
1429   nlnk = mbs + 1;
1430   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1431 
1432   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1433   ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
1434   jtmp = im + mbs;
1435   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1436   ierr = PetscMalloc((2*mbs*bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1437   vtmp = rtmp + bs2*mbs;
1438   ierr = PetscMalloc((mbs+1)*sizeof(PetscReal),&vtmp_abs);CHKERRQ(ierr);
1439 
1440   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
1441   multiplier = v_work + bs;
1442   v_pivots   = (PetscInt*)(multiplier + bs2);
1443 
1444   bi[0]    = 0;
1445   bdiag[0] = (nnz_max/bs2)-1; /* location of diagonal in factor B */
1446   bi[2*mbs+1] = bdiag[0]+1; /* endof bj and ba array */
1447   for (i=0; i<mbs; i++) {
1448     /* copy initial fill into linked list */
1449     nzi = 0; /* nonzeros for active row i */
1450     nzi = ai[r[i]+1] - ai[r[i]];
1451     if (!nzi) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
1452     nzi_al = adiag[r[i]] - ai[r[i]];
1453     nzi_au = ai[r[i]+1] - adiag[r[i]] -1;
1454     /* printf("row %d, nzi_al/au %d %d\n",i,nzi_al,nzi_au); */
1455 
1456     /* load in initial unfactored row */
1457     ajtmp = aj + ai[r[i]];
1458     ierr = PetscLLAddPerm(nzi,ajtmp,ic,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1459     ierr = PetscMemzero(rtmp,(mbs*bs2+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1460     aatmp = a->a + bs2*ai[r[i]];
1461     for (j=0; j<nzi; j++) {
1462       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],aatmp+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1463     }
1464 
1465     /* add pivot rows into linked list */
1466     row = lnk[mbs];
1467     while (row < i) {
1468       nzi_bl = bi[row+1] - bi[row] + 1;
1469       bjtmp = bj + bdiag[row+1]+1; /* points to 1st column next to the diagonal in U */
1470       ierr  = PetscLLAddSortedLU(bjtmp,row,nlnk,lnk,lnkbt,i,nzi_bl,im);CHKERRQ(ierr);
1471       nzi  += nlnk;
1472       row   = lnk[row];
1473     }
1474 
1475     /* copy data from lnk into jtmp, then initialize lnk */
1476     ierr = PetscLLClean(mbs,mbs,nzi,lnk,jtmp,lnkbt);CHKERRQ(ierr);
1477 
1478     /* numerical factorization */
1479     bjtmp = jtmp;
1480     row   = *bjtmp++; /* 1st pivot row */
1481 
1482     while  (row < i) {
1483       pc = rtmp + bs2*row;
1484       pv = ba + bs2*bdiag[row]; /* inv(diag) of the pivot row */
1485       Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1486       ierr = BlockAbs_private(1,bs2,pc,vtmp_abs);CHKERRQ(ierr);
1487       if (vtmp_abs[0] > dt){ /* apply tolerance dropping rule */
1488         pj         = bj + bdiag[row+1] + 1; /* point to 1st entry of U(row,:) */
1489         pv         = ba + bs2*(bdiag[row+1] + 1);
1490         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diagonal */
1491         for (j=0; j<nz; j++){
1492           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
1493         }
1494         /* ierr = PetscLogFlops(bslog*(nz+1.0)-bs);CHKERRQ(ierr); */
1495       }
1496       row = *bjtmp++;
1497     }
1498 
1499     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1500     nzi_bl = 0; j = 0;
1501     while (jtmp[j] < i){ /* L-part. Note: jtmp is sorted */
1502       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1503       nzi_bl++; j++;
1504     }
1505     nzi_bu = nzi - nzi_bl -1;
1506     /* printf("nzi %d, nzi_bl %d, nzi_bu %d\n",nzi,nzi_bl,nzi_bu); */
1507 
1508     while (j < nzi){ /* U-part */
1509       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1510       /*
1511       printf(" col %d: ",jtmp[j]);
1512       for (j1=0; j1<bs2; j1++) printf(" %g",*(vtmp+bs2*j+j1));
1513       printf(" \n");
1514       */
1515       j++;
1516     }
1517 
1518     ierr = BlockAbs_private(nzi,bs2,vtmp,vtmp_abs);CHKERRQ(ierr);
1519     /*
1520     printf(" row %d, nzi %d, vtmp_abs\n",i,nzi);
1521     for (j1=0; j1<nzi; j1++) printf(" (%d %g),",jtmp[j1],vtmp_abs[j1]);
1522     printf(" \n");
1523     */
1524     bjtmp = bj + bi[i];
1525     batmp = ba + bs2*bi[i];
1526     /* apply level dropping rule to L part */
1527     ncut = nzi_al + dtcount;
1528     if (ncut < nzi_bl){
1529       ierr = PetscSortSplitReal(ncut,nzi_bl,vtmp_abs,jtmp);CHKERRQ(ierr);
1530       ierr = PetscSortIntWithScalarArray(ncut,jtmp,vtmp);CHKERRQ(ierr);
1531     } else {
1532       ncut = nzi_bl;
1533     }
1534     for (j=0; j<ncut; j++){
1535       bjtmp[j] = jtmp[j];
1536       ierr = PetscMemcpy(batmp+bs2*j,rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1537       /*
1538       printf(" col %d: ",bjtmp[j]);
1539       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*j+j1));
1540       printf("\n");
1541       */
1542     }
1543     bi[i+1] = bi[i] + ncut;
1544     nzi = ncut + 1;
1545 
1546     /* apply level dropping rule to U part */
1547     ncut = nzi_au + dtcount;
1548     if (ncut < nzi_bu){
1549       ierr = PetscSortSplitReal(ncut,nzi_bu,vtmp_abs+nzi_bl+1,jtmp+nzi_bl+1);CHKERRQ(ierr);
1550       ierr = PetscSortIntWithScalarArray(ncut,jtmp+nzi_bl+1,vtmp+nzi_bl+1);CHKERRQ(ierr);
1551     } else {
1552       ncut = nzi_bu;
1553     }
1554     nzi += ncut;
1555 
1556     /* mark bdiagonal */
1557     bdiag[i+1]    = bdiag[i] - (ncut + 1);
1558     bi[2*mbs - i] = bi[2*mbs - i +1] - (ncut + 1);
1559 
1560     bjtmp = bj + bdiag[i];
1561     batmp = ba + bs2*bdiag[i];
1562     ierr = PetscMemcpy(batmp,rtmp+bs2*i,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1563     *bjtmp = i;
1564     /*
1565     printf(" diag %d: ",*bjtmp);
1566     for (j=0; j<bs2; j++){
1567       printf(" %g,",batmp[j]);
1568     }
1569     printf("\n");
1570     */
1571     bjtmp = bj + bdiag[i+1]+1;
1572     batmp = ba + (bdiag[i+1]+1)*bs2;
1573 
1574     for (k=0; k<ncut; k++){
1575       bjtmp[k] = jtmp[nzi_bl+1+k];
1576       ierr = PetscMemcpy(batmp+bs2*k,rtmp+bs2*bjtmp[k],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1577       /*
1578       printf(" col %d:",bjtmp[k]);
1579       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*k+j1));
1580       printf("\n");
1581       */
1582     }
1583 
1584     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1585 
1586     /* invert diagonal block for simplier triangular solves - add shift??? */
1587     batmp = ba + bs2*bdiag[i];
1588     ierr = Kernel_A_gets_inverse_A(bs,batmp,v_pivots,v_work);CHKERRQ(ierr);
1589   } /* for (i=0; i<mbs; i++) */
1590 
1591   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1592   if (bi[mbs] >= bdiag[mbs]) SETERRQ2(PETSC_ERR_ARG_SIZ,"end of L array %d cannot >= the beginning of U array %d",bi[mbs],bdiag[mbs]);
1593 
1594   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1595   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1596 
1597   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1598 
1599   ierr = PetscFree(im);CHKERRQ(ierr);
1600   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1601 
1602   ierr = PetscLogFlops(bs2*B->cmap->n);CHKERRQ(ierr);
1603   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1604 
1605   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1606   ierr = ISIdentity(isicol,&icol_identity);CHKERRQ(ierr);
1607   both_identity = (PetscTruth) (row_identity && icol_identity);
1608   if (row_identity && icol_identity) {
1609     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
1610   } else {
1611     B->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
1612   }
1613 
1614   B->ops->solveadd          = 0;
1615   B->ops->solvetranspose    = 0;
1616   B->ops->solvetransposeadd = 0;
1617   B->ops->matsolve          = 0;
1618   B->assembled              = PETSC_TRUE;
1619   B->preallocated           = PETSC_TRUE;
1620   PetscFunctionReturn(0);
1621 }
1622