xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision eeffb40d691afbdd57a8091619e7ddd44ac5fdca)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for BAIJ format.
5 */
6 #include "../src/mat/impls/baij/seq/baij.h"
7 #include "../src/mat/blockinvert.h"
8 
9 /* ------------------------------------------------------------*/
10 /*
11       Version for when blocks are 2 by 2
12 */
13 /* MatLUFactorNumeric_SeqBAIJ_2_newdatastruct -
14      copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented
15        Kernel_A_gets_A_times_B()
16        Kernel_A_gets_A_minus_B_times_C()
17        Kernel_A_gets_inverse_A()
18 */
19 #undef __FUNCT__
20 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_newdatastruct"
21 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
22 {
23   Mat            C=B;
24   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
25   IS             isrow = b->row,isicol = b->icol;
26   PetscErrorCode ierr;
27   const PetscInt *r,*ic,*ics;
28   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
29   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
30   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
31   PetscInt       bs2 = a->bs2,flg;
32   PetscReal      shift = info->shiftinblocks;
33 
34   PetscFunctionBegin;
35   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
36   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
37 
38   /* generate work space needed by the factorization */
39   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
40   mwork = rtmp + bs2*n;
41   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
42   ics  = ic;
43 
44   for (i=0; i<n; i++){
45     /* zero rtmp */
46     /* L part */
47     nz    = bi[i+1] - bi[i];
48     bjtmp = bj + bi[i];
49     for  (j=0; j<nz; j++){
50       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51     }
52 
53     /* U part */
54     nz = bi[2*n-i+1] - bi[2*n-i];
55     bjtmp = bj + bi[2*n-i];
56     for  (j=0; j<nz; j++){
57       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
58     }
59 
60     /* load in initial (unfactored row) */
61     nz    = ai[r[i]+1] - ai[r[i]];
62     ajtmp = aj + ai[r[i]];
63     v     = aa + bs2*ai[r[i]];
64     for (j=0; j<nz; j++) {
65       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
66     }
67 
68     /* elimination */
69     bjtmp = bj + bi[i];
70     nzL   = bi[i+1] - bi[i];
71     for(k=0;k < nzL;k++) {
72       row = bjtmp[k];
73       pc = rtmp + bs2*row;
74       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
75       if (flg) {
76         pv = b->a + bs2*bdiag[row];
77         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
78         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
79 
80         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
81         pv = b->a + bs2*bi[2*n-row];
82         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
83         for (j=0; j<nz; j++) {
84           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
85           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
86           v    = rtmp + 4*pj[j];
87           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
88           pv  += 4;
89         }
90         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
91       }
92     }
93 
94     /* finished row so stick it into b->a */
95     /* L part */
96     pv   = b->a + bs2*bi[i] ;
97     pj   = b->j + bi[i] ;
98     nz   = bi[i+1] - bi[i];
99     for (j=0; j<nz; j++) {
100       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
101     }
102 
103     /* Mark diagonal and invert diagonal for simplier triangular solves */
104     pv   = b->a + bs2*bdiag[i];
105     pj   = b->j + bdiag[i];
106     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
107     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
108     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
109 
110     /* U part */
111     pv = b->a + bs2*bi[2*n-i];
112     pj = b->j + bi[2*n-i];
113     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
114     for (j=0; j<nz; j++){
115       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
116     }
117   }
118 
119   ierr = PetscFree(rtmp);CHKERRQ(ierr);
120   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
121   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
122 
123   C->assembled = PETSC_TRUE;
124   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
125   PetscFunctionReturn(0);
126 }
127 
128 #undef __FUNCT__
129 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_newdatastruct_v2"
130 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
131 {
132   Mat            C=B;
133   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
134   IS             isrow = b->row,isicol = b->icol;
135   PetscErrorCode ierr;
136   const PetscInt *r,*ic,*ics;
137   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
138   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
139   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
140   PetscInt       bs2 = a->bs2,flg;
141   PetscReal      shift = info->shiftinblocks;
142 
143   PetscFunctionBegin;
144   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
145   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
146 
147   /* generate work space needed by the factorization */
148   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
149   mwork = rtmp + bs2*n;
150   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
151   ics  = ic;
152 
153   for (i=0; i<n; i++){
154     /* zero rtmp */
155     /* L part */
156     nz    = bi[i+1] - bi[i];
157     bjtmp = bj + bi[i];
158     for  (j=0; j<nz; j++){
159       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
160     }
161 
162     /* U part */
163     nz = bdiag[i] - bdiag[i+1];
164     bjtmp = bj + bdiag[i+1]+1;
165     for  (j=0; j<nz; j++){
166       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
167     }
168 
169     /* load in initial (unfactored row) */
170     nz    = ai[r[i]+1] - ai[r[i]];
171     ajtmp = aj + ai[r[i]];
172     v     = aa + bs2*ai[r[i]];
173     for (j=0; j<nz; j++) {
174       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
175     }
176 
177     /* elimination */
178     bjtmp = bj + bi[i];
179     nzL   = bi[i+1] - bi[i];
180     for(k=0;k < nzL;k++) {
181       row = bjtmp[k];
182       pc = rtmp + bs2*row;
183       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
184       if (flg) {
185         pv = b->a + bs2*bdiag[row];
186         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
187         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
188 
189         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
190         pv = b->a + bs2*(bdiag[row+1]+1);
191         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
192         for (j=0; j<nz; j++) {
193           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
194           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
195           v    = rtmp + 4*pj[j];
196           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
197           pv  += 4;
198         }
199         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
200       }
201     }
202 
203     /* finished row so stick it into b->a */
204     /* L part */
205     pv   = b->a + bs2*bi[i] ;
206     pj   = b->j + bi[i] ;
207     nz   = bi[i+1] - bi[i];
208     for (j=0; j<nz; j++) {
209       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
210     }
211 
212     /* Mark diagonal and invert diagonal for simplier triangular solves */
213     pv   = b->a + bs2*bdiag[i];
214     pj   = b->j + bdiag[i];
215     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
216     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
217     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
218 
219     /* U part */
220     pv = b->a + bs2*(bdiag[i+1]+1);
221     pj = b->j + bdiag[i+1]+1;
222     nz = bdiag[i] - bdiag[i+1] - 1;
223     for (j=0; j<nz; j++){
224       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
225     }
226   }
227 
228   ierr = PetscFree(rtmp);CHKERRQ(ierr);
229   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
230   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
231 
232   C->assembled = PETSC_TRUE;
233   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
234   PetscFunctionReturn(0);
235 }
236 
237 /*
238   MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct -
239     copied from MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(), then remove ordering
240 */
241 #undef __FUNCT__
242 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct"
243 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
244 {
245   Mat            C=B;
246   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
247   PetscErrorCode ierr;
248   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
249   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
250   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
251   PetscInt       bs2 = a->bs2,flg;
252   PetscReal      shift = info->shiftinblocks;
253 
254   PetscFunctionBegin;
255   /* generate work space needed by the factorization */
256   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
257   mwork = rtmp + bs2*n;
258   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
259 
260   for (i=0; i<n; i++){
261     /* zero rtmp */
262     /* L part */
263     nz    = bi[i+1] - bi[i];
264     bjtmp = bj + bi[i];
265     for  (j=0; j<nz; j++){
266       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
267     }
268 
269     /* U part */
270     nz = bi[2*n-i+1] - bi[2*n-i];
271     bjtmp = bj + bi[2*n-i];
272     for  (j=0; j<nz; j++){
273       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
274     }
275 
276     /* load in initial (unfactored row) */
277     nz    = ai[i+1] - ai[i];
278     ajtmp = aj + ai[i];
279     v     = aa + bs2*ai[i];
280     for (j=0; j<nz; j++) {
281       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
282     }
283 
284     /* elimination */
285     bjtmp = bj + bi[i];
286     nzL   = bi[i+1] - bi[i];
287     for(k=0;k < nzL;k++) {
288       row = bjtmp[k];
289       pc = rtmp + bs2*row;
290       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
291       if (flg) {
292         pv = b->a + bs2*bdiag[row];
293         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
294         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
295 
296         pj = b->j + bi[2*n-row]; /* begining of U(row,:) */
297         pv = b->a + bs2*bi[2*n-row];
298         nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
299         for (j=0; j<nz; j++) {
300           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
301           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
302           v    = rtmp + 4*pj[j];
303           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
304           pv  += 4;
305         }
306         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
307       }
308     }
309 
310     /* finished row so stick it into b->a */
311     /* L part */
312     pv   = b->a + bs2*bi[i] ;
313     pj   = b->j + bi[i] ;
314     nz   = bi[i+1] - bi[i];
315     for (j=0; j<nz; j++) {
316       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
317     }
318 
319     /* Mark diagonal and invert diagonal for simplier triangular solves */
320     pv   = b->a + bs2*bdiag[i];
321     pj   = b->j + bdiag[i];
322     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
323     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
324     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
325 
326     /* U part */
327     pv = b->a + bs2*bi[2*n-i];
328     pj = b->j + bi[2*n-i];
329     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
330     for (j=0; j<nz; j++){
331       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
332     }
333   }
334 
335   ierr = PetscFree(rtmp);CHKERRQ(ierr);
336   C->assembled = PETSC_TRUE;
337   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
338   PetscFunctionReturn(0);
339 }
340 
341 #undef __FUNCT__
342 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
343 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
344 {
345   Mat            C=B;
346   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
347   PetscErrorCode ierr;
348   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
349   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
350   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
351   PetscInt       bs2 = a->bs2,flg;
352   PetscReal      shift = info->shiftinblocks;
353 
354   PetscFunctionBegin;
355   /* generate work space needed by the factorization */
356   ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
357   mwork = rtmp + bs2*n;
358   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
359 
360   for (i=0; i<n; i++){
361     /* zero rtmp */
362     /* L part */
363     nz    = bi[i+1] - bi[i];
364     bjtmp = bj + bi[i];
365     for  (j=0; j<nz; j++){
366       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
367     }
368 
369     /* U part */
370     nz = bdiag[i] - bdiag[i+1];
371     bjtmp = bj + bdiag[i+1]+1;
372     for  (j=0; j<nz; j++){
373       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
374     }
375 
376     /* load in initial (unfactored row) */
377     nz    = ai[i+1] - ai[i];
378     ajtmp = aj + ai[i];
379     v     = aa + bs2*ai[i];
380     for (j=0; j<nz; j++) {
381       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
382     }
383 
384     /* elimination */
385     bjtmp = bj + bi[i];
386     nzL   = bi[i+1] - bi[i];
387     for(k=0;k < nzL;k++) {
388       row = bjtmp[k];
389       pc = rtmp + bs2*row;
390       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
391       if (flg) {
392         pv = b->a + bs2*bdiag[row];
393         /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
394         ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr);
395 
396         pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
397 	pv = b->a + bs2*(bdiag[row+1]+1);
398 	nz = bdiag[row]-bdiag[row+1] - 1; /* num of entries in U(row,:) excluding diag */
399         for (j=0; j<nz; j++) {
400           /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
401           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
402           v    = rtmp + 4*pj[j];
403           ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr);
404           pv  += 4;
405         }
406         ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
407       }
408     }
409 
410     /* finished row so stick it into b->a */
411     /* L part */
412     pv   = b->a + bs2*bi[i] ;
413     pj   = b->j + bi[i] ;
414     nz   = bi[i+1] - bi[i];
415     for (j=0; j<nz; j++) {
416       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
417     }
418 
419     /* Mark diagonal and invert diagonal for simplier triangular solves */
420     pv   = b->a + bs2*bdiag[i];
421     pj   = b->j + bdiag[i];
422     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
423     /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */
424     ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr);
425 
426     /* U part */
427     /*
428     pv = b->a + bs2*bi[2*n-i];
429     pj = b->j + bi[2*n-i];
430     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
431     */
432     pv = b->a + bs2*(bdiag[i+1]+1);
433     pj = b->j + bdiag[i+1]+1;
434     nz = bdiag[i] - bdiag[i+1] - 1;
435     for (j=0; j<nz; j++){
436       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
437     }
438   }
439 
440   ierr = PetscFree(rtmp);CHKERRQ(ierr);
441   C->assembled = PETSC_TRUE;
442   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
443   PetscFunctionReturn(0);
444 }
445 
446 #undef __FUNCT__
447 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2"
448 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B,Mat A,const MatFactorInfo *info)
449 {
450   Mat            C = B;
451   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
452   IS             isrow = b->row,isicol = b->icol;
453   PetscErrorCode ierr;
454   const PetscInt *r,*ic;
455   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
456   PetscInt       *ajtmpold,*ajtmp,nz,row;
457   PetscInt       *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj;
458   MatScalar      *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4;
459   MatScalar      p1,p2,p3,p4;
460   MatScalar      *ba = b->a,*aa = a->a;
461   PetscReal      shift = info->shiftinblocks;
462 
463   PetscFunctionBegin;
464   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
465   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
466   ierr  = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
467 
468   for (i=0; i<n; i++) {
469     nz    = bi[i+1] - bi[i];
470     ajtmp = bj + bi[i];
471     for  (j=0; j<nz; j++) {
472       x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0;
473     }
474     /* load in initial (unfactored row) */
475     idx      = r[i];
476     nz       = ai[idx+1] - ai[idx];
477     ajtmpold = aj + ai[idx];
478     v        = aa + 4*ai[idx];
479     for (j=0; j<nz; j++) {
480       x    = rtmp+4*ic[ajtmpold[j]];
481       x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
482       v    += 4;
483     }
484     row = *ajtmp++;
485     while (row < i) {
486       pc = rtmp + 4*row;
487       p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
488       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
489         pv = ba + 4*diag_offset[row];
490         pj = bj + diag_offset[row] + 1;
491         x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
492         pc[0] = m1 = p1*x1 + p3*x2;
493         pc[1] = m2 = p2*x1 + p4*x2;
494         pc[2] = m3 = p1*x3 + p3*x4;
495         pc[3] = m4 = p2*x3 + p4*x4;
496         nz = bi[row+1] - diag_offset[row] - 1;
497         pv += 4;
498         for (j=0; j<nz; j++) {
499           x1   = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
500           x    = rtmp + 4*pj[j];
501           x[0] -= m1*x1 + m3*x2;
502           x[1] -= m2*x1 + m4*x2;
503           x[2] -= m1*x3 + m3*x4;
504           x[3] -= m2*x3 + m4*x4;
505           pv   += 4;
506         }
507         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
508       }
509       row = *ajtmp++;
510     }
511     /* finished row so stick it into b->a */
512     pv = ba + 4*bi[i];
513     pj = bj + bi[i];
514     nz = bi[i+1] - bi[i];
515     for (j=0; j<nz; j++) {
516       x     = rtmp+4*pj[j];
517       pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
518       pv   += 4;
519     }
520     /* invert diagonal block */
521     w = ba + 4*diag_offset[i];
522     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
523   }
524 
525   ierr = PetscFree(rtmp);CHKERRQ(ierr);
526   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
527   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
528   C->ops->solve          = MatSolve_SeqBAIJ_2;
529   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
530   C->assembled = PETSC_TRUE;
531   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
532   PetscFunctionReturn(0);
533 }
534 /*
535       Version for when blocks are 2 by 2 Using natural ordering
536 */
537 #undef __FUNCT__
538 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering"
539 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
540 {
541   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
542   PetscErrorCode ierr;
543   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
544   PetscInt       *ajtmpold,*ajtmp,nz,row;
545   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
546   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
547   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4;
548   MatScalar      *ba = b->a,*aa = a->a;
549   PetscReal      shift = info->shiftinblocks;
550 
551   PetscFunctionBegin;
552   ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
553   for (i=0; i<n; i++) {
554     nz    = bi[i+1] - bi[i];
555     ajtmp = bj + bi[i];
556     for  (j=0; j<nz; j++) {
557       x = rtmp+4*ajtmp[j];
558       x[0]  = x[1]  = x[2]  = x[3]  = 0.0;
559     }
560     /* load in initial (unfactored row) */
561     nz       = ai[i+1] - ai[i];
562     ajtmpold = aj + ai[i];
563     v        = aa + 4*ai[i];
564     for (j=0; j<nz; j++) {
565       x    = rtmp+4*ajtmpold[j];
566       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
567       v    += 4;
568     }
569     row = *ajtmp++;
570     while (row < i) {
571       pc  = rtmp + 4*row;
572       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
573       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) {
574         pv = ba + 4*diag_offset[row];
575         pj = bj + diag_offset[row] + 1;
576         x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
577         pc[0] = m1 = p1*x1 + p3*x2;
578         pc[1] = m2 = p2*x1 + p4*x2;
579         pc[2] = m3 = p1*x3 + p3*x4;
580         pc[3] = m4 = p2*x3 + p4*x4;
581         nz = bi[row+1] - diag_offset[row] - 1;
582         pv += 4;
583         for (j=0; j<nz; j++) {
584           x1   = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
585           x    = rtmp + 4*pj[j];
586           x[0] -= m1*x1 + m3*x2;
587           x[1] -= m2*x1 + m4*x2;
588           x[2] -= m1*x3 + m3*x4;
589           x[3] -= m2*x3 + m4*x4;
590           pv   += 4;
591         }
592         ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr);
593       }
594       row = *ajtmp++;
595     }
596     /* finished row so stick it into b->a */
597     pv = ba + 4*bi[i];
598     pj = bj + bi[i];
599     nz = bi[i+1] - bi[i];
600     for (j=0; j<nz; j++) {
601       x      = rtmp+4*pj[j];
602       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
603       /*
604       printf(" col %d:",pj[j]);
605       PetscInt j1;
606       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
607       printf("\n");
608       */
609       pv   += 4;
610     }
611     /* invert diagonal block */
612     w = ba + 4*diag_offset[i];
613     /*
614     printf(" \n%d -th: diag: ",i);
615     for (j=0; j<4; j++){
616       printf(" %g,",w[j]);
617     }
618     printf("\n----------------------------\n");
619     */
620     ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr);
621   }
622 
623   ierr = PetscFree(rtmp);CHKERRQ(ierr);
624   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
625   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
626   C->assembled = PETSC_TRUE;
627   ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
628   PetscFunctionReturn(0);
629 }
630 
631 /* ----------------------------------------------------------- */
632 /*
633      Version for when blocks are 1 by 1.
634 */
635 #undef __FUNCT__
636 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_1"
637 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat C,Mat A,const MatFactorInfo *info)
638 {
639   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data;
640   IS             isrow = b->row,isicol = b->icol;
641   PetscErrorCode ierr;
642   const PetscInt *r,*ic;
643   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
644   PetscInt       *ajtmpold,*ajtmp,nz,row,*ai = a->i,*aj = a->j;
645   PetscInt       *diag_offset = b->diag,diag,*pj;
646   MatScalar      *pv,*v,*rtmp,multiplier,*pc;
647   MatScalar      *ba = b->a,*aa = a->a;
648   PetscTruth     row_identity, col_identity;
649 
650   PetscFunctionBegin;
651   ierr  = ISGetIndices(isrow,&r);CHKERRQ(ierr);
652   ierr  = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
653   ierr  = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
654 
655   for (i=0; i<n; i++) {
656     nz    = bi[i+1] - bi[i];
657     ajtmp = bj + bi[i];
658     for  (j=0; j<nz; j++) rtmp[ajtmp[j]] = 0.0;
659 
660     /* load in initial (unfactored row) */
661     nz       = ai[r[i]+1] - ai[r[i]];
662     ajtmpold = aj + ai[r[i]];
663     v        = aa + ai[r[i]];
664     for (j=0; j<nz; j++) rtmp[ic[ajtmpold[j]]] =  v[j];
665 
666     row = *ajtmp++;
667     while (row < i) {
668       pc = rtmp + row;
669       if (*pc != 0.0) {
670         pv         = ba + diag_offset[row];
671         pj         = bj + diag_offset[row] + 1;
672         multiplier = *pc * *pv++;
673         *pc        = multiplier;
674         nz         = bi[row+1] - diag_offset[row] - 1;
675         for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
676         ierr = PetscLogFlops(1.0+2.0*nz);CHKERRQ(ierr);
677       }
678       row = *ajtmp++;
679     }
680     /* finished row so stick it into b->a */
681     pv = ba + bi[i];
682     pj = bj + bi[i];
683     nz = bi[i+1] - bi[i];
684     for (j=0; j<nz; j++) {pv[j] = rtmp[pj[j]];}
685     diag = diag_offset[i] - bi[i];
686     /* check pivot entry for current row */
687     if (pv[diag] == 0.0) {
688       SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot: row in original ordering %D in permuted ordering %D",r[i],i);
689     }
690     pv[diag] = 1.0/pv[diag];
691   }
692 
693   ierr = PetscFree(rtmp);CHKERRQ(ierr);
694   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
695   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
696   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
697   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
698   if (row_identity && col_identity) {
699     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
700     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
701   } else {
702     C->ops->solve          = MatSolve_SeqBAIJ_1;
703     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
704   }
705   C->assembled = PETSC_TRUE;
706   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
707   PetscFunctionReturn(0);
708 }
709 
710 EXTERN_C_BEGIN
711 #undef __FUNCT__
712 #define __FUNCT__ "MatGetFactor_seqbaij_petsc"
713 PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A,MatFactorType ftype,Mat *B)
714 {
715   PetscInt           n = A->rmap->n;
716   PetscErrorCode     ierr;
717 
718   PetscFunctionBegin;
719   ierr = MatCreate(((PetscObject)A)->comm,B);CHKERRQ(ierr);
720   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
721   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
722     ierr = MatSetType(*B,MATSEQBAIJ);CHKERRQ(ierr);
723     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
724     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
725     (*B)->ops->iludtfactor       = MatILUDTFactor_SeqBAIJ;
726   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
727     ierr = MatSetType(*B,MATSEQSBAIJ);CHKERRQ(ierr);
728     ierr = MatSeqSBAIJSetPreallocation(*B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
729     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
730     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
731   } else SETERRQ(PETSC_ERR_SUP,"Factor type not supported");
732   (*B)->factor = ftype;
733   PetscFunctionReturn(0);
734 }
735 EXTERN_C_END
736 
737 EXTERN_C_BEGIN
738 #undef __FUNCT__
739 #define __FUNCT__ "MatGetFactorAvailable_seqaij_petsc"
740 PetscErrorCode MatGetFactorAvailable_seqbaij_petsc(Mat A,MatFactorType ftype,PetscTruth *flg)
741 {
742   PetscFunctionBegin;
743   *flg = PETSC_TRUE;
744   PetscFunctionReturn(0);
745 }
746 EXTERN_C_END
747 
748 /* ----------------------------------------------------------- */
749 #undef __FUNCT__
750 #define __FUNCT__ "MatLUFactor_SeqBAIJ"
751 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,const MatFactorInfo *info)
752 {
753   PetscErrorCode ierr;
754   Mat            C;
755 
756   PetscFunctionBegin;
757   ierr = MatGetFactor(A,MAT_SOLVER_PETSC,MAT_FACTOR_LU,&C);CHKERRQ(ierr);
758   ierr = MatLUFactorSymbolic(C,A,row,col,info);CHKERRQ(ierr);
759   ierr = MatLUFactorNumeric(C,A,info);CHKERRQ(ierr);
760   A->ops->solve            = C->ops->solve;
761   A->ops->solvetranspose   = C->ops->solvetranspose;
762   ierr = MatHeaderCopy(A,C);CHKERRQ(ierr);
763   ierr = PetscLogObjectParent(A,((Mat_SeqBAIJ*)(A->data))->icol);CHKERRQ(ierr);
764   PetscFunctionReturn(0);
765 }
766 
767 #include "../src/mat/impls/sbaij/seq/sbaij.h"
768 #undef __FUNCT__
769 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N"
770 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C,Mat A,const MatFactorInfo *info)
771 {
772   PetscErrorCode ierr;
773   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
774   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
775   IS             ip=b->row;
776   const PetscInt *rip;
777   PetscInt       i,j,mbs=a->mbs,bs=A->rmap->bs,*bi=b->i,*bj=b->j,*bcol;
778   PetscInt       *ai=a->i,*aj=a->j;
779   PetscInt       k,jmin,jmax,*jl,*il,col,nexti,ili,nz;
780   MatScalar      *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi;
781   PetscReal      zeropivot,rs,shiftnz;
782   PetscReal      shiftpd;
783   ChShift_Ctx    sctx;
784   PetscInt       newshift;
785 
786   PetscFunctionBegin;
787   if (bs > 1) {
788     if (!a->sbaijMat){
789       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
790     }
791     ierr = (a->sbaijMat)->ops->choleskyfactornumeric(C,a->sbaijMat,info);CHKERRQ(ierr);
792     ierr = MatDestroy(a->sbaijMat);CHKERRQ(ierr);
793     a->sbaijMat = PETSC_NULL;
794     PetscFunctionReturn(0);
795   }
796 
797   /* initialization */
798   shiftnz   = info->shiftnz;
799   shiftpd   = info->shiftpd;
800   zeropivot = info->zeropivot;
801 
802   ierr  = ISGetIndices(ip,&rip);CHKERRQ(ierr);
803   nz   = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar);
804   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
805   jl   = il + mbs;
806   rtmp = (MatScalar*)(jl + mbs);
807 
808   sctx.shift_amount = 0;
809   sctx.nshift       = 0;
810   do {
811     sctx.chshift = PETSC_FALSE;
812     for (i=0; i<mbs; i++) {
813       rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0;
814     }
815 
816     for (k = 0; k<mbs; k++){
817       bval = ba + bi[k];
818       /* initialize k-th row by the perm[k]-th row of A */
819       jmin = ai[rip[k]]; jmax = ai[rip[k]+1];
820       for (j = jmin; j < jmax; j++){
821         col = rip[aj[j]];
822         if (col >= k){ /* only take upper triangular entry */
823           rtmp[col] = aa[j];
824           *bval++  = 0.0; /* for in-place factorization */
825         }
826       }
827 
828       /* shift the diagonal of the matrix */
829       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
830 
831       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
832       dk = rtmp[k];
833       i = jl[k]; /* first row to be added to k_th row  */
834 
835       while (i < k){
836         nexti = jl[i]; /* next row to be added to k_th row */
837 
838         /* compute multiplier, update diag(k) and U(i,k) */
839         ili = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
840         uikdi = - ba[ili]*ba[bi[i]];  /* diagonal(k) */
841         dk += uikdi*ba[ili];
842         ba[ili] = uikdi; /* -U(i,k) */
843 
844         /* add multiple of row i to k-th row */
845         jmin = ili + 1; jmax = bi[i+1];
846         if (jmin < jmax){
847           for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j];
848           /* update il and jl for row i */
849           il[i] = jmin;
850           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
851         }
852         i = nexti;
853       }
854 
855       /* shift the diagonals when zero pivot is detected */
856       /* compute rs=sum of abs(off-diagonal) */
857       rs   = 0.0;
858       jmin = bi[k]+1;
859       nz   = bi[k+1] - jmin;
860       if (nz){
861         bcol = bj + jmin;
862         while (nz--){
863           rs += PetscAbsScalar(rtmp[*bcol]);
864           bcol++;
865         }
866       }
867 
868       sctx.rs = rs;
869       sctx.pv = dk;
870       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
871       if (newshift == 1) break;
872 
873       /* copy data into U(k,:) */
874       ba[bi[k]] = 1.0/dk; /* U(k,k) */
875       jmin = bi[k]+1; jmax = bi[k+1];
876       if (jmin < jmax) {
877         for (j=jmin; j<jmax; j++){
878           col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0;
879         }
880         /* add the k-th row into il and jl */
881         il[k] = jmin;
882         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
883       }
884     }
885   } while (sctx.chshift);
886   ierr = PetscFree(il);CHKERRQ(ierr);
887 
888   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
889   C->assembled    = PETSC_TRUE;
890   C->preallocated = PETSC_TRUE;
891   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
892   if (sctx.nshift){
893     if (shiftpd) {
894       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
895     } else if (shiftnz) {
896       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
897     }
898   }
899   PetscFunctionReturn(0);
900 }
901 
902 #undef __FUNCT__
903 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering"
904 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info)
905 {
906   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data;
907   Mat_SeqSBAIJ   *b=(Mat_SeqSBAIJ*)C->data;
908   PetscErrorCode ierr;
909   PetscInt       i,j,am=a->mbs;
910   PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
911   PetscInt       k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz;
912   MatScalar      *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval;
913   PetscReal      zeropivot,rs,shiftnz;
914   PetscReal      shiftpd;
915   ChShift_Ctx    sctx;
916   PetscInt       newshift;
917 
918   PetscFunctionBegin;
919   /* initialization */
920   shiftnz   = info->shiftnz;
921   shiftpd   = info->shiftpd;
922   zeropivot = info->zeropivot;
923 
924   nz   = (2*am+1)*sizeof(PetscInt)+am*sizeof(MatScalar);
925   ierr = PetscMalloc(nz,&il);CHKERRQ(ierr);
926   jl   = il + am;
927   rtmp = (MatScalar*)(jl + am);
928 
929   sctx.shift_amount = 0;
930   sctx.nshift       = 0;
931   do {
932     sctx.chshift = PETSC_FALSE;
933     for (i=0; i<am; i++) {
934       rtmp[i] = 0.0; jl[i] = am; il[0] = 0;
935     }
936 
937     for (k = 0; k<am; k++){
938     /* initialize k-th row with elements nonzero in row perm(k) of A */
939       nz   = ai[k+1] - ai[k];
940       acol = aj + ai[k];
941       aval = aa + ai[k];
942       bval = ba + bi[k];
943       while (nz -- ){
944         if (*acol < k) { /* skip lower triangular entries */
945           acol++; aval++;
946         } else {
947           rtmp[*acol++] = *aval++;
948           *bval++       = 0.0; /* for in-place factorization */
949         }
950       }
951 
952       /* shift the diagonal of the matrix */
953       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
954 
955       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
956       dk = rtmp[k];
957       i  = jl[k]; /* first row to be added to k_th row  */
958 
959       while (i < k){
960         nexti = jl[i]; /* next row to be added to k_th row */
961         /* compute multiplier, update D(k) and U(i,k) */
962         ili   = il[i];  /* index of first nonzero element in U(i,k:bms-1) */
963         uikdi = - ba[ili]*ba[bi[i]];
964         dk   += uikdi*ba[ili];
965         ba[ili] = uikdi; /* -U(i,k) */
966 
967         /* add multiple of row i to k-th row ... */
968         jmin = ili + 1;
969         nz   = bi[i+1] - jmin;
970         if (nz > 0){
971           bcol = bj + jmin;
972           bval = ba + jmin;
973           while (nz --) rtmp[*bcol++] += uikdi*(*bval++);
974           /* update il and jl for i-th row */
975           il[i] = jmin;
976           j = bj[jmin]; jl[i] = jl[j]; jl[j] = i;
977         }
978         i = nexti;
979       }
980 
981       /* shift the diagonals when zero pivot is detected */
982       /* compute rs=sum of abs(off-diagonal) */
983       rs   = 0.0;
984       jmin = bi[k]+1;
985       nz   = bi[k+1] - jmin;
986       if (nz){
987         bcol = bj + jmin;
988         while (nz--){
989           rs += PetscAbsScalar(rtmp[*bcol]);
990           bcol++;
991         }
992       }
993 
994       sctx.rs = rs;
995       sctx.pv = dk;
996       ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr);
997       if (newshift == 1) break;    /* sctx.shift_amount is updated */
998 
999       /* copy data into U(k,:) */
1000       ba[bi[k]] = 1.0/dk;
1001       jmin      = bi[k]+1;
1002       nz        = bi[k+1] - jmin;
1003       if (nz){
1004         bcol = bj + jmin;
1005         bval = ba + jmin;
1006         while (nz--){
1007           *bval++       = rtmp[*bcol];
1008           rtmp[*bcol++] = 0.0;
1009         }
1010         /* add k-th row into il and jl */
1011         il[k] = jmin;
1012         i = bj[jmin]; jl[k] = jl[i]; jl[i] = k;
1013       }
1014     }
1015   } while (sctx.chshift);
1016   ierr = PetscFree(il);CHKERRQ(ierr);
1017 
1018   C->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1019   C->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1020   C->assembled    = PETSC_TRUE;
1021   C->preallocated = PETSC_TRUE;
1022   ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr);
1023     if (sctx.nshift){
1024     if (shiftnz) {
1025       ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1026     } else if (shiftpd) {
1027       ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr);
1028     }
1029   }
1030   PetscFunctionReturn(0);
1031 }
1032 
1033 #include "petscbt.h"
1034 #include "../src/mat/utils/freespace.h"
1035 #undef __FUNCT__
1036 #define __FUNCT__ "MatICCFactorSymbolic_SeqBAIJ"
1037 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
1038 {
1039   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1040   Mat_SeqSBAIJ       *b;
1041   Mat                B;
1042   PetscErrorCode     ierr;
1043   PetscTruth         perm_identity;
1044   PetscInt           reallocs=0,i,*ai=a->i,*aj=a->j,am=a->mbs,bs=A->rmap->bs,*ui;
1045   const PetscInt     *rip;
1046   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
1047   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,ncols_upper,*cols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
1048   PetscReal          fill=info->fill,levels=info->levels;
1049   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1050   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
1051   PetscBT            lnkbt;
1052 
1053   PetscFunctionBegin;
1054   if (bs > 1){
1055     if (!a->sbaijMat){
1056       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
1057     }
1058     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ;  /* undue the change made in MatGetFactor_seqbaij_petsc */
1059     ierr = MatICCFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
1060     PetscFunctionReturn(0);
1061   }
1062 
1063   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
1064   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
1065 
1066   /* special case that simply copies fill pattern */
1067   if (!levels && perm_identity) {
1068     ierr = MatMarkDiagonal_SeqBAIJ(A);CHKERRQ(ierr);
1069     ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
1070     for (i=0; i<am; i++) {
1071       ui[i] = ai[i+1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1072     }
1073     B = fact;
1074     ierr = MatSeqSBAIJSetPreallocation(B,1,0,ui);CHKERRQ(ierr);
1075 
1076 
1077     b  = (Mat_SeqSBAIJ*)B->data;
1078     uj = b->j;
1079     for (i=0; i<am; i++) {
1080       aj = a->j + a->diag[i];
1081       for (j=0; j<ui[i]; j++){
1082         *uj++ = *aj++;
1083       }
1084       b->ilen[i] = ui[i];
1085     }
1086     ierr = PetscFree(ui);CHKERRQ(ierr);
1087     B->factor = MAT_FACTOR_NONE;
1088     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1089     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1090     B->factor = MAT_FACTOR_ICC;
1091 
1092     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1093     PetscFunctionReturn(0);
1094   }
1095 
1096   /* initialization */
1097   ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
1098   ui[0] = 0;
1099   ierr  = PetscMalloc((2*am+1)*sizeof(PetscInt),&cols_lvl);CHKERRQ(ierr);
1100 
1101   /* jl: linked list for storing indices of the pivot rows
1102      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1103   ierr = PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
1104   il         = jl + am;
1105   uj_ptr     = (PetscInt**)(il + am);
1106   uj_lvl_ptr = (PetscInt**)(uj_ptr + am);
1107   for (i=0; i<am; i++){
1108     jl[i] = am; il[i] = 0;
1109   }
1110 
1111   /* create and initialize a linked list for storing column indices of the active row k */
1112   nlnk = am + 1;
1113   ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1114 
1115   /* initial FreeSpace size is fill*(ai[am]+1) */
1116   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
1117   current_space = free_space;
1118   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
1119   current_space_lvl = free_space_lvl;
1120 
1121   for (k=0; k<am; k++){  /* for each active row k */
1122     /* initialize lnk by the column indices of row rip[k] of A */
1123     nzk   = 0;
1124     ncols = ai[rip[k]+1] - ai[rip[k]];
1125     ncols_upper = 0;
1126     cols        = cols_lvl + am;
1127     for (j=0; j<ncols; j++){
1128       i = rip[*(aj + ai[rip[k]] + j)];
1129       if (i >= k){ /* only take upper triangular entry */
1130         cols[ncols_upper] = i;
1131         cols_lvl[ncols_upper] = -1;  /* initialize level for nonzero entries */
1132         ncols_upper++;
1133       }
1134     }
1135     ierr = PetscIncompleteLLAdd(ncols_upper,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1136     nzk += nlnk;
1137 
1138     /* update lnk by computing fill-in for each pivot row to be merged in */
1139     prow = jl[k]; /* 1st pivot row */
1140 
1141     while (prow < k){
1142       nextprow = jl[prow];
1143 
1144       /* merge prow into k-th row */
1145       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
1146       jmax = ui[prow+1];
1147       ncols = jmax-jmin;
1148       i     = jmin - ui[prow];
1149       cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1150       for (j=0; j<ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1151       ierr = PetscIncompleteLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
1152       nzk += nlnk;
1153 
1154       /* update il and jl for prow */
1155       if (jmin < jmax){
1156         il[prow] = jmin;
1157         j = *cols; jl[prow] = jl[j]; jl[j] = prow;
1158       }
1159       prow = nextprow;
1160     }
1161 
1162     /* if free space is not available, make more free space */
1163     if (current_space->local_remaining<nzk) {
1164       i = am - k + 1; /* num of unfactored rows */
1165       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1166       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
1167       ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
1168       reallocs++;
1169     }
1170 
1171     /* copy data into free_space and free_space_lvl, then initialize lnk */
1172     ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
1173 
1174     /* add the k-th row into il and jl */
1175     if (nzk-1 > 0){
1176       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1177       jl[k] = jl[i]; jl[i] = k;
1178       il[k] = ui[k] + 1;
1179     }
1180     uj_ptr[k]     = current_space->array;
1181     uj_lvl_ptr[k] = current_space_lvl->array;
1182 
1183     current_space->array           += nzk;
1184     current_space->local_used      += nzk;
1185     current_space->local_remaining -= nzk;
1186 
1187     current_space_lvl->array           += nzk;
1188     current_space_lvl->local_used      += nzk;
1189     current_space_lvl->local_remaining -= nzk;
1190 
1191     ui[k+1] = ui[k] + nzk;
1192   }
1193 
1194 #if defined(PETSC_USE_INFO)
1195   if (ai[am] != 0) {
1196     PetscReal af = ((PetscReal)(2*ui[am]-am))/((PetscReal)ai[am]);
1197     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
1198     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
1199     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
1200   } else {
1201     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
1202   }
1203 #endif
1204 
1205   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1206   ierr = PetscFree(jl);CHKERRQ(ierr);
1207   ierr = PetscFree(cols_lvl);CHKERRQ(ierr);
1208 
1209   /* destroy list of free space and other temporary array(s) */
1210   ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1211   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1212   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1213   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
1214 
1215   /* put together the new matrix in MATSEQSBAIJ format */
1216   B = fact;
1217   ierr = MatSeqSBAIJSetPreallocation(B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1218 
1219   b = (Mat_SeqSBAIJ*)B->data;
1220   b->singlemalloc = PETSC_FALSE;
1221   b->free_a       = PETSC_TRUE;
1222   b->free_ij       = PETSC_TRUE;
1223   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1224   b->j    = uj;
1225   b->i    = ui;
1226   b->diag = 0;
1227   b->ilen = 0;
1228   b->imax = 0;
1229   b->row  = perm;
1230   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1231   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1232   b->icol = perm;
1233   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1234   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1235   ierr    = PetscLogObjectMemory(B,(ui[am]-am)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1236   b->maxnz = b->nz = ui[am];
1237 
1238   B->info.factor_mallocs    = reallocs;
1239   B->info.fill_ratio_given  = fill;
1240   if (ai[am] != 0) {
1241     B->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
1242   } else {
1243     B->info.fill_ratio_needed = 0.0;
1244   }
1245   if (perm_identity){
1246     B->ops->solve           = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1247     B->ops->solvetranspose  = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1248     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1249   } else {
1250     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1251   }
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 #undef __FUNCT__
1256 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqBAIJ"
1257 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
1258 {
1259   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
1260   Mat_SeqSBAIJ       *b;
1261   Mat                B;
1262   PetscErrorCode     ierr;
1263   PetscTruth         perm_identity;
1264   PetscReal          fill = info->fill;
1265   const PetscInt     *rip;
1266   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,*ai=a->i,*aj=a->j,reallocs=0,prow;
1267   PetscInt           *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow;
1268   PetscInt           nlnk,*lnk,ncols,ncols_upper,*cols,*uj,**ui_ptr,*uj_ptr;
1269   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
1270   PetscBT            lnkbt;
1271 
1272   PetscFunctionBegin;
1273   if (bs > 1) { /* convert to seqsbaij */
1274     if (!a->sbaijMat){
1275       ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr);
1276     }
1277     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1278     ierr = MatCholeskyFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr);
1279     PetscFunctionReturn(0);
1280   }
1281 
1282   /* check whether perm is the identity mapping */
1283   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
1284   if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported");
1285   ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
1286 
1287   /* initialization */
1288   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
1289   ui[0] = 0;
1290 
1291   /* jl: linked list for storing indices of the pivot rows
1292      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1293   ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr);
1294   il     = jl + mbs;
1295   cols   = il + mbs;
1296   ui_ptr = (PetscInt**)(cols + mbs);
1297   for (i=0; i<mbs; i++){
1298     jl[i] = mbs; il[i] = 0;
1299   }
1300 
1301   /* create and initialize a linked list for storing column indices of the active row k */
1302   nlnk = mbs + 1;
1303   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1304 
1305   /* initial FreeSpace size is fill*(ai[mbs]+1) */
1306   ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr);
1307   current_space = free_space;
1308 
1309   for (k=0; k<mbs; k++){  /* for each active row k */
1310     /* initialize lnk by the column indices of row rip[k] of A */
1311     nzk   = 0;
1312     ncols = ai[rip[k]+1] - ai[rip[k]];
1313     ncols_upper = 0;
1314     for (j=0; j<ncols; j++){
1315       i = rip[*(aj + ai[rip[k]] + j)];
1316       if (i >= k){ /* only take upper triangular entry */
1317         cols[ncols_upper] = i;
1318         ncols_upper++;
1319       }
1320     }
1321     ierr = PetscLLAdd(ncols_upper,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1322     nzk += nlnk;
1323 
1324     /* update lnk by computing fill-in for each pivot row to be merged in */
1325     prow = jl[k]; /* 1st pivot row */
1326 
1327     while (prow < k){
1328       nextprow = jl[prow];
1329       /* merge prow into k-th row */
1330       jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1331       jmax = ui[prow+1];
1332       ncols = jmax-jmin;
1333       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1334       ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1335       nzk += nlnk;
1336 
1337       /* update il and jl for prow */
1338       if (jmin < jmax){
1339         il[prow] = jmin;
1340         j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow;
1341       }
1342       prow = nextprow;
1343     }
1344 
1345     /* if free space is not available, make more free space */
1346     if (current_space->local_remaining<nzk) {
1347       i = mbs - k + 1; /* num of unfactored rows */
1348       i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1349       ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
1350       reallocs++;
1351     }
1352 
1353     /* copy data into free space, then initialize lnk */
1354     ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1355 
1356     /* add the k-th row into il and jl */
1357     if (nzk-1 > 0){
1358       i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1359       jl[k] = jl[i]; jl[i] = k;
1360       il[k] = ui[k] + 1;
1361     }
1362     ui_ptr[k] = current_space->array;
1363     current_space->array           += nzk;
1364     current_space->local_used      += nzk;
1365     current_space->local_remaining -= nzk;
1366 
1367     ui[k+1] = ui[k] + nzk;
1368   }
1369 
1370 #if defined(PETSC_USE_INFO)
1371   if (ai[mbs] != 0) {
1372     PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1373     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
1374     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
1375     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
1376   } else {
1377     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
1378   }
1379 #endif
1380 
1381   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1382   ierr = PetscFree(jl);CHKERRQ(ierr);
1383 
1384   /* destroy list of free space and other temporary array(s) */
1385   ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
1386   ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
1387   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1388 
1389   /* put together the new matrix in MATSEQSBAIJ format */
1390   B    = fact;
1391   ierr = MatSeqSBAIJSetPreallocation(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1392 
1393   b = (Mat_SeqSBAIJ*)B->data;
1394   b->singlemalloc = PETSC_FALSE;
1395   b->free_a       = PETSC_TRUE;
1396   b->free_ij      = PETSC_TRUE;
1397   ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
1398   b->j    = uj;
1399   b->i    = ui;
1400   b->diag = 0;
1401   b->ilen = 0;
1402   b->imax = 0;
1403   b->row  = perm;
1404   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1405   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1406   b->icol = perm;
1407   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1408   ierr    = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1409   ierr    = PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1410   b->maxnz = b->nz = ui[mbs];
1411 
1412   B->info.factor_mallocs    = reallocs;
1413   B->info.fill_ratio_given  = fill;
1414   if (ai[mbs] != 0) {
1415     B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]);
1416   } else {
1417     B->info.fill_ratio_needed = 0.0;
1418   }
1419   if (perm_identity){
1420     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1421   } else {
1422     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1423   }
1424   PetscFunctionReturn(0);
1425 }
1426 
1427 /* --------------------------------------------------------- */
1428 #undef __FUNCT__
1429 #define __FUNCT__ "MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct"
1430 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1431 {
1432   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1433   PetscErrorCode ierr;
1434   const PetscInt *ai=a->i,*aj=a->j,*vi;
1435   PetscInt       i,k,n=a->mbs;
1436   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,*adiag=a->diag;
1437   MatScalar      *aa=a->a,*v;
1438   PetscScalar    *x,*b,*s,*t,*ls;
1439 
1440   PetscFunctionBegin;
1441   /* printf("MatSolve_SeqBAIJ_NaturalOrdering_iludt..bs %d\n",bs); */
1442   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1443   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1444   t  = a->solve_work;
1445 
1446   /* forward solve the lower triangular */
1447   ierr = PetscMemcpy(t,b,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy 1st block of b to t */
1448 
1449   for (i=1; i<n; i++) {
1450     v   = aa + bs2*ai[i];
1451     vi  = aj + ai[i];
1452     nz = ai[i+1] - ai[i];
1453     s = t + bs*i;
1454     ierr = PetscMemcpy(s,b+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy i_th block of b to t */
1455     for(k=0;k<nz;k++){
1456       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[k]);
1457       v += bs2;
1458     }
1459   }
1460 
1461   /* backward solve the upper triangular */
1462   ls = a->solve_work + A->cmap->n;
1463   for (i=n-1; i>=0; i--){
1464     v  = aa + bs2*ai[2*n-i];
1465     vi = aj + ai[2*n-i];
1466     nz = ai[2*n-i +1] - ai[2*n-i]-1;
1467     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1468     for(k=0;k<nz;k++){
1469       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[k]);
1470       v += bs2;
1471     }
1472     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*adiag[i],t+i*bs); /* *inv(diagonal[i]) */
1473     ierr = PetscMemcpy(x+i*bs,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1474   }
1475 
1476   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1477   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1478   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1479   PetscFunctionReturn(0);
1480 }
1481 
1482 #undef __FUNCT__
1483 #define __FUNCT__ "MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2"
1484 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1485 {
1486   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1487   PetscErrorCode ierr;
1488   const PetscInt *ai=a->i,*aj=a->j,*adiag=a->diag,*vi;
1489   PetscInt       i,k,n=a->mbs;
1490   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1491   MatScalar      *aa=a->a,*v;
1492   PetscScalar    *x,*b,*s,*t,*ls;
1493 
1494   PetscFunctionBegin;
1495   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1496   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1497   t  = a->solve_work;
1498 
1499   /* forward solve the lower triangular */
1500   ierr = PetscMemcpy(t,b,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy 1st block of b to t */
1501 
1502   for (i=1; i<n; i++) {
1503     v   = aa + bs2*ai[i];
1504     vi  = aj + ai[i];
1505     nz = ai[i+1] - ai[i];
1506     s = t + bs*i;
1507     ierr = PetscMemcpy(s,b+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy i_th block of b to t */
1508     for(k=0;k<nz;k++){
1509       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[k]);
1510       v += bs2;
1511     }
1512   }
1513 
1514   /* backward solve the upper triangular */
1515   ls = a->solve_work + A->cmap->n;
1516   for (i=n-1; i>=0; i--){
1517     v  = aa + bs2*(adiag[i+1]+1);
1518     vi = aj + adiag[i+1]+1;
1519     nz = adiag[i] - adiag[i+1]-1;
1520     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1521     for(k=0;k<nz;k++){
1522       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[k]);
1523       v += bs2;
1524     }
1525     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*adiag[i],t+i*bs); /* *inv(diagonal[i]) */
1526     ierr = PetscMemcpy(x+i*bs,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1527   }
1528 
1529   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1530   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1531   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1532   PetscFunctionReturn(0);
1533 }
1534 
1535 #undef __FUNCT__
1536 #define __FUNCT__ "MatSolve_SeqBAIJ_N_newdatastruct"
1537 PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx)
1538 {
1539   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1540   IS             iscol=a->col,isrow=a->row;
1541   PetscErrorCode ierr;
1542   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1543   PetscInt       i,m,n=a->mbs;
1544   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,k;
1545   MatScalar      *aa=a->a,*v;
1546   PetscScalar    *x,*b,*s,*t,*ls;
1547 
1548   PetscFunctionBegin;
1549   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1550   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1551   t  = a->solve_work;
1552 
1553   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1554   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1555 
1556   /* forward solve the lower triangular */
1557   ierr = PetscMemcpy(t,b+bs*r[0],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1558   for (i=1; i<n; i++) {
1559     v   = aa + bs2*ai[i];
1560     vi  = aj + ai[i];
1561     nz = ai[i+1] - ai[i];
1562     s = t + bs*i;
1563     ierr = PetscMemcpy(s,b+bs*r[i],bs*sizeof(PetscScalar));CHKERRQ(ierr);
1564     for(m=0;m<nz;m++){
1565       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[m]);
1566       v += bs2;
1567     }
1568   }
1569 
1570   /* backward solve the upper triangular */
1571   ls = a->solve_work + A->cmap->n;
1572   for (i=n-1; i>=0; i--){
1573     k  = 2*n-i;
1574     v  = aa + bs2*ai[k];
1575     vi = aj + ai[k];
1576     nz = ai[k+1] - ai[k] - 1;
1577     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1578     for(m=0;m<nz;m++){
1579       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[m]);
1580       v += bs2;
1581     }
1582     Kernel_w_gets_A_times_v(bs,ls,v,t+i*bs); /* *inv(diagonal[i]) */
1583     ierr = PetscMemcpy(x + bs*c[i],t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1584   }
1585   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1586   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1587   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1588   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1589   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 #undef __FUNCT__
1594 #define __FUNCT__ "BlockAbs_privat"
1595 PetscErrorCode BlockAbs_private(PetscInt nbs,PetscInt bs2,PetscScalar *blockarray,PetscReal *absarray)
1596 {
1597   PetscErrorCode     ierr;
1598   PetscInt           i,j;
1599   PetscFunctionBegin;
1600   ierr = PetscMemzero(absarray,(nbs+1)*sizeof(PetscReal));CHKERRQ(ierr);
1601   for (i=0; i<nbs; i++){
1602     for (j=0; j<bs2; j++){
1603       if (absarray[i] < PetscAbsScalar(blockarray[i*nbs+j])) absarray[i] = PetscAbsScalar(blockarray[i*nbs+j]);
1604     }
1605   }
1606   PetscFunctionReturn(0);
1607 }
1608 
1609 #undef __FUNCT__
1610 #define __FUNCT__ "MatILUDTFactor_SeqBAIJ"
1611 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A,IS isrow,IS iscol,const MatFactorInfo *info,Mat *fact)
1612 {
1613   Mat                B = *fact;
1614   Mat_SeqBAIJ        *a=(Mat_SeqBAIJ*)A->data,*b;
1615   IS                 isicol;
1616   PetscErrorCode     ierr;
1617   const PetscInt     *r,*ic;
1618   PetscInt           i,mbs=a->mbs,bs=A->rmap->bs,bs2=a->bs2,*ai=a->i,*aj=a->j,*ajtmp,*adiag;
1619   PetscInt           *bi,*bj,*bdiag;
1620 
1621   PetscInt           row,nzi,nzi_bl,nzi_bu,*im,dtcount,nzi_al,nzi_au;
1622   PetscInt           nlnk,*lnk;
1623   PetscBT            lnkbt;
1624   PetscTruth         row_identity,icol_identity,both_identity;
1625   MatScalar          *aatmp,*pv,*batmp,*ba,*rtmp,*pc,*multiplier,*vtmp;
1626   const PetscInt     *ics;
1627   PetscInt           j,nz,*pj,*bjtmp,k,ncut,*jtmp;
1628 
1629   PetscReal          dt=info->dt; /* shift=info->shiftinblocks; */
1630   PetscInt           nnz_max;
1631   PetscTruth         missing;
1632   PetscReal          *vtmp_abs;
1633   MatScalar          *v_work;
1634   PetscInt           *v_pivots;
1635 
1636   PetscFunctionBegin;
1637   /* ------- symbolic factorization, can be reused ---------*/
1638   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
1639   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
1640   adiag=a->diag;
1641 
1642   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
1643 
1644   /* bdiag is location of diagonal in factor */
1645   ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
1646 
1647   /* allocate row pointers bi */
1648   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1649 
1650   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1651   dtcount = (PetscInt)info->dtcount;
1652   if (dtcount > mbs-1) dtcount = mbs-1;
1653   nnz_max  = ai[mbs]+2*mbs*dtcount +2;
1654   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1655   ierr = PetscMalloc(nnz_max*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1656   nnz_max = nnz_max*bs2;
1657   ierr = PetscMalloc(nnz_max*sizeof(MatScalar),&ba);CHKERRQ(ierr);
1658 
1659   /* put together the new matrix */
1660   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
1661   ierr = PetscLogObjectParent(B,isicol);CHKERRQ(ierr);
1662   b    = (Mat_SeqBAIJ*)(B)->data;
1663   b->free_a       = PETSC_TRUE;
1664   b->free_ij      = PETSC_TRUE;
1665   b->singlemalloc = PETSC_FALSE;
1666   b->a          = ba;
1667   b->j          = bj;
1668   b->i          = bi;
1669   b->diag       = bdiag;
1670   b->ilen       = 0;
1671   b->imax       = 0;
1672   b->row        = isrow;
1673   b->col        = iscol;
1674   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
1675   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
1676   b->icol       = isicol;
1677   ierr = PetscMalloc((bs*(mbs+1))*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1678 
1679   ierr = PetscLogObjectMemory(B,nnz_max*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
1680   b->maxnz = nnz_max/bs2;
1681 
1682   (B)->factor                = MAT_FACTOR_ILUDT;
1683   (B)->info.factor_mallocs   = 0;
1684   (B)->info.fill_ratio_given = ((PetscReal)nnz_max)/((PetscReal)(ai[mbs]*bs2));
1685   CHKMEMQ;
1686   /* ------- end of symbolic factorization ---------*/
1687   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1688   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1689   ics  = ic;
1690 
1691   /* linked list for storing column indices of the active row */
1692   nlnk = mbs + 1;
1693   ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1694 
1695   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1696   ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
1697   jtmp = im + mbs;
1698   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1699   ierr = PetscMalloc((2*mbs*bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
1700   vtmp = rtmp + bs2*mbs;
1701   ierr = PetscMalloc((mbs+1)*sizeof(PetscReal),&vtmp_abs);CHKERRQ(ierr);
1702 
1703   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
1704   multiplier = v_work + bs;
1705   v_pivots   = (PetscInt*)(multiplier + bs2);
1706 
1707   bi[0]    = 0;
1708   bdiag[0] = (nnz_max/bs2)-1; /* location of diagonal in factor B */
1709   bi[2*mbs+1] = bdiag[0]+1; /* endof bj and ba array */
1710   for (i=0; i<mbs; i++) {
1711     /* copy initial fill into linked list */
1712     nzi = 0; /* nonzeros for active row i */
1713     nzi = ai[r[i]+1] - ai[r[i]];
1714     if (!nzi) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
1715     nzi_al = adiag[r[i]] - ai[r[i]];
1716     nzi_au = ai[r[i]+1] - adiag[r[i]] -1;
1717     /* printf("row %d, nzi_al/au %d %d\n",i,nzi_al,nzi_au); */
1718 
1719     /* load in initial unfactored row */
1720     ajtmp = aj + ai[r[i]];
1721     ierr = PetscLLAddPerm(nzi,ajtmp,ic,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1722     ierr = PetscMemzero(rtmp,(mbs*bs2+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1723     aatmp = a->a + bs2*ai[r[i]];
1724     for (j=0; j<nzi; j++) {
1725       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],aatmp+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1726     }
1727 
1728     /* add pivot rows into linked list */
1729     row = lnk[mbs];
1730     while (row < i) {
1731       nzi_bl = bi[row+1] - bi[row] + 1;
1732       bjtmp = bj + bdiag[row+1]+1; /* points to 1st column next to the diagonal in U */
1733       ierr  = PetscLLAddSortedLU(bjtmp,row,nlnk,lnk,lnkbt,i,nzi_bl,im);CHKERRQ(ierr);
1734       nzi  += nlnk;
1735       row   = lnk[row];
1736     }
1737 
1738     /* copy data from lnk into jtmp, then initialize lnk */
1739     ierr = PetscLLClean(mbs,mbs,nzi,lnk,jtmp,lnkbt);CHKERRQ(ierr);
1740 
1741     /* numerical factorization */
1742     bjtmp = jtmp;
1743     row   = *bjtmp++; /* 1st pivot row */
1744 
1745     while  (row < i) {
1746       pc = rtmp + bs2*row;
1747       pv = ba + bs2*bdiag[row]; /* inv(diag) of the pivot row */
1748       Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1749       ierr = BlockAbs_private(1,bs2,pc,vtmp_abs);CHKERRQ(ierr);
1750       if (vtmp_abs[0] > dt){ /* apply tolerance dropping rule */
1751         pj         = bj + bdiag[row+1] + 1; /* point to 1st entry of U(row,:) */
1752         pv         = ba + bs2*(bdiag[row+1] + 1);
1753         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diagonal */
1754         for (j=0; j<nz; j++){
1755           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
1756         }
1757         /* ierr = PetscLogFlops(bslog*(nz+1.0)-bs);CHKERRQ(ierr); */
1758       }
1759       row = *bjtmp++;
1760     }
1761 
1762     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1763     nzi_bl = 0; j = 0;
1764     while (jtmp[j] < i){ /* L-part. Note: jtmp is sorted */
1765       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1766       nzi_bl++; j++;
1767     }
1768     nzi_bu = nzi - nzi_bl -1;
1769     /* printf("nzi %d, nzi_bl %d, nzi_bu %d\n",nzi,nzi_bl,nzi_bu); */
1770 
1771     while (j < nzi){ /* U-part */
1772       ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1773       /*
1774       printf(" col %d: ",jtmp[j]);
1775       for (j1=0; j1<bs2; j1++) printf(" %g",*(vtmp+bs2*j+j1));
1776       printf(" \n");
1777       */
1778       j++;
1779     }
1780 
1781     ierr = BlockAbs_private(nzi,bs2,vtmp,vtmp_abs);CHKERRQ(ierr);
1782     /*
1783     printf(" row %d, nzi %d, vtmp_abs\n",i,nzi);
1784     for (j1=0; j1<nzi; j1++) printf(" (%d %g),",jtmp[j1],vtmp_abs[j1]);
1785     printf(" \n");
1786     */
1787     bjtmp = bj + bi[i];
1788     batmp = ba + bs2*bi[i];
1789     /* apply level dropping rule to L part */
1790     ncut = nzi_al + dtcount;
1791     if (ncut < nzi_bl){
1792       ierr = PetscSortSplitReal(ncut,nzi_bl,vtmp_abs,jtmp);CHKERRQ(ierr);
1793       ierr = PetscSortIntWithScalarArray(ncut,jtmp,vtmp);CHKERRQ(ierr);
1794     } else {
1795       ncut = nzi_bl;
1796     }
1797     for (j=0; j<ncut; j++){
1798       bjtmp[j] = jtmp[j];
1799       ierr = PetscMemcpy(batmp+bs2*j,rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1800       /*
1801       printf(" col %d: ",bjtmp[j]);
1802       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*j+j1));
1803       printf("\n");
1804       */
1805     }
1806     bi[i+1] = bi[i] + ncut;
1807     nzi = ncut + 1;
1808 
1809     /* apply level dropping rule to U part */
1810     ncut = nzi_au + dtcount;
1811     if (ncut < nzi_bu){
1812       ierr = PetscSortSplitReal(ncut,nzi_bu,vtmp_abs+nzi_bl+1,jtmp+nzi_bl+1);CHKERRQ(ierr);
1813       ierr = PetscSortIntWithScalarArray(ncut,jtmp+nzi_bl+1,vtmp+nzi_bl+1);CHKERRQ(ierr);
1814     } else {
1815       ncut = nzi_bu;
1816     }
1817     nzi += ncut;
1818 
1819     /* mark bdiagonal */
1820     bdiag[i+1]    = bdiag[i] - (ncut + 1);
1821     bi[2*mbs - i] = bi[2*mbs - i +1] - (ncut + 1);
1822 
1823     bjtmp = bj + bdiag[i];
1824     batmp = ba + bs2*bdiag[i];
1825     ierr = PetscMemcpy(batmp,rtmp+bs2*i,bs2*sizeof(MatScalar));CHKERRQ(ierr);
1826     *bjtmp = i;
1827     /*
1828     printf(" diag %d: ",*bjtmp);
1829     for (j=0; j<bs2; j++){
1830       printf(" %g,",batmp[j]);
1831     }
1832     printf("\n");
1833     */
1834     bjtmp = bj + bdiag[i+1]+1;
1835     batmp = ba + (bdiag[i+1]+1)*bs2;
1836 
1837     for (k=0; k<ncut; k++){
1838       bjtmp[k] = jtmp[nzi_bl+1+k];
1839       ierr = PetscMemcpy(batmp+bs2*k,rtmp+bs2*bjtmp[k],bs2*sizeof(MatScalar));CHKERRQ(ierr);
1840       /*
1841       printf(" col %d:",bjtmp[k]);
1842       for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*k+j1));
1843       printf("\n");
1844       */
1845     }
1846 
1847     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1848 
1849     /* invert diagonal block for simplier triangular solves - add shift??? */
1850     batmp = ba + bs2*bdiag[i];
1851     ierr = Kernel_A_gets_inverse_A(bs,batmp,v_pivots,v_work);CHKERRQ(ierr);
1852   } /* for (i=0; i<mbs; i++) */
1853 
1854   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1855   if (bi[mbs] >= bdiag[mbs]) SETERRQ2(PETSC_ERR_ARG_SIZ,"end of L array %d cannot >= the beginning of U array %d",bi[mbs],bdiag[mbs]);
1856 
1857   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1858   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1859 
1860   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1861 
1862   ierr = PetscFree(im);CHKERRQ(ierr);
1863   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1864 
1865   ierr = PetscLogFlops(bs2*B->cmap->n);CHKERRQ(ierr);
1866   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1867 
1868   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
1869   ierr = ISIdentity(isicol,&icol_identity);CHKERRQ(ierr);
1870   both_identity = (PetscTruth) (row_identity && icol_identity);
1871   if (row_identity && icol_identity) {
1872     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
1873   } else {
1874     B->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
1875   }
1876 
1877   B->ops->solveadd          = 0;
1878   B->ops->solvetranspose    = 0;
1879   B->ops->solvetransposeadd = 0;
1880   B->ops->matsolve          = 0;
1881   B->assembled              = PETSC_TRUE;
1882   B->preallocated           = PETSC_TRUE;
1883   PetscFunctionReturn(0);
1884 }
1885